From d2baf5f8194303e1151d6c59ff37c0e9fd39b965 Mon Sep 17 00:00:00 2001 From: Simon Blackburn Date: Mon, 5 Aug 2024 10:32:21 -0400 Subject: [PATCH 01/13] branch init --- README_active_learning.md | 1 + 1 file changed, 1 insertion(+) create mode 100644 README_active_learning.md diff --git a/README_active_learning.md b/README_active_learning.md new file mode 100644 index 00000000..89f9f177 --- /dev/null +++ b/README_active_learning.md @@ -0,0 +1 @@ +Instructions to run the active learning benchmark. \ No newline at end of file From a69c6e56ef46e842cd248078d045a44308977a0c Mon Sep 17 00:00:00 2001 From: Simon Blackburn Date: Mon, 5 Aug 2024 15:19:38 -0400 Subject: [PATCH 02/13] adding hydra to the requirements --- .../{train_mtp.py => mlip/mtp_train.py} | 0 crystal_diffusion/mlip/mtp_utils.py | 59 +++++++++++++++++++ requirements.txt | 3 +- 3 files changed, 61 insertions(+), 1 deletion(-) rename crystal_diffusion/{train_mtp.py => mlip/mtp_train.py} (100%) create mode 100644 crystal_diffusion/mlip/mtp_utils.py diff --git a/crystal_diffusion/train_mtp.py b/crystal_diffusion/mlip/mtp_train.py similarity index 100% rename from crystal_diffusion/train_mtp.py rename to crystal_diffusion/mlip/mtp_train.py diff --git a/crystal_diffusion/mlip/mtp_utils.py b/crystal_diffusion/mlip/mtp_utils.py new file mode 100644 index 00000000..71076e52 --- /dev/null +++ b/crystal_diffusion/mlip/mtp_utils.py @@ -0,0 +1,59 @@ +from typing import Any, Dict, List, Tuple + +import numpy as np +import yaml +from pymatgen.core import Structure + + +def extract_structure_and_forces_from_file(filename: str, atom_dict: Dict[int, Any]) -> \ + Tuple[List[Structure], List[List[float]]]: + """Convert LAMMPS yaml output in a format compatible with MTP training and evaluation methods. 
+ + Args: + filename: path to LAMMPS output file in yaml format + atom_dict: mapping from LAMMPS atom indices to atom type (atomic number as int or atom name as str) + + Returns: + list of pymatgen Structure containing the atoms and their positions + list of forces (n x 3) for each atom + """ + structures = [] + forces = [] + with (open(filename, 'r') as f): + l_yaml = yaml.safe_load_all(f) + for d in l_yaml: # loop over LAMMPS outputs and convert in pymatgen Structure objects + # lattice in yaml is 3 x 2 [0, x_lim] + # we assume a rectangular lattice for now with the 2nd coordinates as the lattice vectors + lattice = np.zeros((3, 3)) + for i, x in enumerate(d['box']): + lattice[i, i] = x[1] + type_idx = d['keywords'].index('type') + species = [atom_dict[x[type_idx]] for x in d['data']] # convert to atom type + coords_idx = [d['keywords'].index(x) for x in ['x', 'y', 'z']] + coords = [[x[i] for i in coords_idx] for x in d['data']] + pm_structure = Structure(lattice=lattice, + species=species, + coords=coords, + coords_are_cartesian=True) + structures.append(pm_structure) + force_idx = [d['keywords'].index(x) for x in ['fx', 'fy', 'fz']] + structure_forces = [[x[i] for i in force_idx] for x in d['data']] + forces.append(structure_forces) + return structures, forces + + +def extract_energy_from_thermo_log(filename: str) -> List[float]: + """Read energies from LAMMPS thermodynamic output file. + + Args: + filename: path to LAMMPS thermodynamic output file in yaml format. 
+ + Returns: + list of energies (1 value per configuration) + """ + with open(filename, 'r') as f: + log_yaml = yaml.safe_load(f) + kin_idx = log_yaml['keywords'].index('KinEng') + pot_idx = log_yaml['keywords'].index('PotEng') + energies = [x[kin_idx] + x[pot_idx] for x in log_yaml['data']] + return energies diff --git a/requirements.txt b/requirements.txt index 34680d3e..7bc45fec 100644 --- a/requirements.txt +++ b/requirements.txt @@ -2,6 +2,7 @@ datasets==2.17.1 flake8==4.0.1 flake8-docstrings==1.6.0 gitpython==3.1.27 +hydra-core==1.3.2 isort==5.13.2 jupyter==1.0.0 jinja2==3.1.2 @@ -36,4 +37,4 @@ pykeops==2.2.3 comet_ml einops==0.8.0 torchode==0.2.0 -torchsde==0.2.6 \ No newline at end of file +torchsde==0.2.6 From 44b06ffa4f8fed31440754b0620219a665ead743 Mon Sep 17 00:00:00 2001 From: Simon Blackburn Date: Wed, 7 Aug 2024 10:47:53 -0400 Subject: [PATCH 03/13] wip: active learning loop with mostly placholders --- .../active_learning_loop/benchmark.py | 160 ++++++++++++++++++ crystal_diffusion/mlip/mtp_train.py | 99 +---------- crystal_diffusion/mlip/mtp_utils.py | 64 +++++++ 3 files changed, 231 insertions(+), 92 deletions(-) create mode 100644 crystal_diffusion/active_learning_loop/benchmark.py diff --git a/crystal_diffusion/active_learning_loop/benchmark.py b/crystal_diffusion/active_learning_loop/benchmark.py new file mode 100644 index 00000000..5e0e2ea1 --- /dev/null +++ b/crystal_diffusion/active_learning_loop/benchmark.py @@ -0,0 +1,160 @@ +import argparse +import os +from dataclasses import dataclass +from typing import Dict, List, Optional + +import pandas as pd +import yaml +from hydra.utils import instantiate + +from crystal_diffusion.mlip.mtp_train import train_mtp, prepare_dataset, evaluate_mtp +from crystal_diffusion.models.mtp import MTPWithMLIP3 + + +def get_arguments() -> argparse.Namespace: + """Parse arguments. 
+ + Returns: + args: arguments + """ + parser = argparse.ArgumentParser() + parser.add_argument('--mtp_config', help='path to data directory', required=True) + args = parser.parse_args() + return args + + +@dataclass(kw_only=True) +class MTPArguments: + training_data_dir: str # training data directory + evaluation_data_dir: str # evaluation data directory + mlip_dir: str # directory with the mlp executable + output_dir: str # directory where to save the MTP outputs + + +@dataclass(kw_only=True) +class StructureEvaluationArguments: + evaluation_criteria: str ='nbh_grades' + criteria_threshold: float = 10 + number_of_structures: int = None + extraction_radius: float = 3 + + +def train_mlip(mtp_args: MTPArguments, atom_dict: Dict[int, str]) -> MTPWithMLIP3: + """Train a MTP model using the specified data. + + Args: + mtp_args: MTPArguments data class + atom_dict: map between atom names and indices used by LAMMPS + + Returns: + trained MTP model + """ + # TODO make it more configurable + train_datasets = prepare_dataset(mtp_args.training_data_dir, atom_dict, mode="train") + # create the output directory if it doesn't exist already + trained_mtp = train_mtp(train_datasets, mlip_folder_path=mtp_args.mlip_dir, save_dir=mtp_args.output_dir) + return trained_mtp + + +def evaluate_mlip(mtp_args: MTPArguments, atom_dict: Dict[int, str], mtp: MTPWithMLIP3) -> pd.DataFrame: + """Evaluate a MTP model using the specified data. 
+ + Args: + mtp_args: MTPArguments data class + atom_dict: map between atom names and indices used by LAMMPS + mtp: trained MTP model + + Returns: + dataframe with a column specified the structure, a column with the atom index, 3 columns with the x,y,z + coordinates and a column with the MaxVol criteria (nbh_grades) + """ + evaluation_datasets = prepare_dataset(mtp_args.evaluation_data_dir, atom_dict, mode="evaluation") + # TODO the current evaluation method also returns the ground truth informations - this won't always be the case + # TODO make more configurable + _, prediction_df = evaluate_mtp(evaluation_datasets, mtp) + return prediction_df + + +def get_structures_for_retraining(prediction_df: pd.DataFrame, + criteria_threshold: Optional[float] = None, + number_of_structures: Optional[int] = None, + evaluation_criteria: str = 'nbh_grades', + structure_index: str = 'structure_index' + ) -> List[pd.DataFrame]: + assert criteria_threshold is not None or number_of_structures is not None, \ + "criteria_threshold or number_of_structures should be set." + # get the highest evaluation_criteria for each structure i.e. only the worst atom counts for structure selection + criteria_by_structure = prediction_df[[evaluation_criteria, structure_index]].groupby(structure_index).max() + # find the top number_of_structures + structures_indices = criteria_by_structure.sort_values(by=evaluation_criteria, ascending=False) + if number_of_structures is not None: + structures_indices = structures_indices[:number_of_structures] + else: # criteria_threshold is not None + structures_indices = structures_indices[structures_indices[evaluation_criteria] >= criteria_threshold] + structures_indices = structures_indices.index.to_list() + assert len(structures_indices) > 0, "No structure meet the criteria." 
+ structures_to_retrain = [] + for idx in structures_indices: + structures_to_retrain.append(prediction_df[prediction_df[structure_index] == idx]) + return structures_to_retrain + + +def extract_target_region(structure_df: pd.DataFrame, + extraction_radius: float, + evaluation_criteria: str ='nbh_grades') -> pd.DataFrame: + """Extract the atom with the worst evaluation criteria and all the atoms within a distance extraction_radious. + + Args: + structure_df: dataframe with the atomic positions and the evaluation criteria (e.g. MaxVol value) + extraction_radius: include all atoms within this distance of the targeted atom + evaluation_criteria: name of the evaluation criteria. Defaults to nbh_grades (maxvol in MTP) + + Returns: + dataframe with the atomic coordinates in columns x, y, z + """ + # extract the worst ato and a region around of radius extraction_radius + # TODO better method to determine radius: number of atoms ? + target_atom = structure_df[evaluation_criteria].idxmax() + target_position = structure_df.loc[target_atom][['x', 'y', 'z']] + structure_df.loc[:, 'distance_squared'] = structure_df.apply( + lambda x: sum([(x[i] - target_position[i]) ** 2 for i in ['x', 'y', 'z']]), axis=1) + atom_positions = structure_df.loc[structure_df['distance_squared'] <= extraction_radius ** 2, ['x', 'y', 'z']] + return atom_positions + + +def generate_new_structures(fixed_atoms: List[pd.DataFrame]) -> None: + pass + + +def main(): + # args = get_arguments() + # TODO get mtp_config_path from the args + mtp_config_path = "/Users/simonb/ic-collab/courtois_collab/crystal_diffusion/experiments/active_learning_benchmark/" + mtp_config_path = os.path.join(mtp_config_path, "config", "mtp_training.yaml") + with open(mtp_config_path, 'r') as stream: + mtp_config = yaml.load(stream, Loader=yaml.FullLoader) + # use hydra to convert the yaml file in a dataclass format + mtp_config = instantiate(mtp_config) + os.makedirs(mtp_config.output_dir, exist_ok=True) + atom_dict = {1: "Si"} # 
TODO this should be define somewhere smart + # STEP 1: train a MLIP + trained_mtp = train_mlip(mtp_config, atom_dict) + # STEP 2: evaluate the MLIP + prediction_df = evaluate_mlip(mtp_config, atom_dict, trained_mtp) + # STEP 3: identify the problematic structures + # TODO extraction_params should come from a config file with hydra instantiate + extraction_params = StructureEvaluationArguments() + structures_to_retrain = get_structures_for_retraining(prediction_df, + criteria_threshold=extraction_params.criteria_threshold, + number_of_structures=extraction_params.number_of_structures, + evaluation_criteria=extraction_params.evaluation_criteria) + # STEP 4: extract the region + bad_regions = [extract_target_region(s, extraction_radius=extraction_params.extraction_radius) + for s in structures_to_retrain] + # STEP 5: call the generative model to create new candidates + + print('hello') + + +if __name__ == '__main__': + main() diff --git a/crystal_diffusion/mlip/mtp_train.py b/crystal_diffusion/mlip/mtp_train.py index 59d65bf3..3153ccee 100644 --- a/crystal_diffusion/mlip/mtp_train.py +++ b/crystal_diffusion/mlip/mtp_train.py @@ -1,8 +1,9 @@ """Script to train and evaluate a MTP. -Running the main() runs a debugging example. Entry points are train_mtp and evaluate_mtp. +Running the main() runs a debugging example. Entry points are train_mtp. """ import argparse +from dataclasses import dataclass from typing import Any, Dict, List, NamedTuple, Tuple import numpy as np @@ -12,101 +13,15 @@ from sklearn.metrics import mean_absolute_error from crystal_diffusion.models.mtp import MTPWithMLIP3 +from crystal_diffusion.mlip.mtp_utils import prepare_mtp_inputs_from_lammps, crawl_lammps_directory, MTP_Inputs atom_dict = {1: 'Si'} -def extract_structure_and_forces_from_file(filename: str, atom_dict: Dict[int, Any]) -> \ - Tuple[List[Structure], List[List[float]]]: - """Convert LAMMPS yaml output in a format compatible with MTP training and evaluation methods. 
- - Args: - filename: path to LAMMPS output file in yaml format - atom_dict: mapping from LAMMPS atom indices to atom type (atomic number as int or atom name as str) - - Returns: - list of pymatgen Structure containing the atoms and their positions - list of forces (n x 3) for each atom - """ - structures = [] - forces = [] - with (open(filename, 'r') as f): - l_yaml = yaml.safe_load_all(f) - for d in l_yaml: # loop over LAMMPS outputs and convert in pymatgen Structure objects - # lattice in yaml is 3 x 2 [0, x_lim] - # we assume a rectangular lattice for now with the 2nd coordinates as the lattice vectors - lattice = np.zeros((3, 3)) - for i, x in enumerate(d['box']): - lattice[i, i] = x[1] - type_idx = d['keywords'].index('type') - species = [atom_dict[x[type_idx]] for x in d['data']] # convert to atom type - coords_idx = [d['keywords'].index(x) for x in ['x', 'y', 'z']] - coords = [[x[i] for i in coords_idx] for x in d['data']] - pm_structure = Structure(lattice=lattice, - species=species, - coords=coords, - coords_are_cartesian=True) - structures.append(pm_structure) - force_idx = [d['keywords'].index(x) for x in ['fx', 'fy', 'fz']] - structure_forces = [[x[i] for i in force_idx] for x in d['data']] - forces.append(structure_forces) - return structures, forces - - -def extract_energy_from_thermo_log(filename: str) -> List[float]: - """Read energies from LAMMPS thermodynamic output file. - - Args: - filename: path to LAMMPS thermodynamic output file in yaml format. 
- - Returns: - list of energies (1 value per configuration) - """ - with open(filename, 'r') as f: - log_yaml = yaml.safe_load(f) - kin_idx = log_yaml['keywords'].index('KinEng') - pot_idx = log_yaml['keywords'].index('PotEng') - energies = [x[kin_idx] + x[pot_idx] for x in log_yaml['data']] - return energies - - -class MTP_Inputs(NamedTuple): - """Create a namedtuple instance for MTP inputs.""" - - structure: List[Structure] - forces: List[List[float]] - energy: List[float] - - -def prepare_mtp_inputs_from_lammps(output_yaml: List[str], - thermo_yaml: List[str], - atom_dict: Dict[int, Any] - ) -> MTP_Inputs: - """Convert a list of LAMMPS output files and thermodynamic output files to MTP input format. - - Args: - output_yaml: list of LAMMPS output files as yaml. - thermo_yaml: list of LAMMPS thermodynamic output files as yaml. - atom_dict: mapping of LAMMPS indices to atom type. - - Returns: - namedtuple with structure, energies and forces usable by MTP. - """ - mtp_inputs = { - 'structure': [], - 'energy': [], - 'forces': [] - } - for filename in output_yaml: - structures, forces = extract_structure_and_forces_from_file(filename, atom_dict) - mtp_inputs['structure'] += structures - mtp_inputs['forces'] += forces - for filename in thermo_yaml: - mtp_inputs['energy'] += extract_energy_from_thermo_log(filename) - mtp_inputs = MTP_Inputs(structure=mtp_inputs['structure'], - energy=mtp_inputs['energy'], - forces=mtp_inputs['forces']) - return mtp_inputs +def prepare_dataset(root_data_dir: str, atom_dict: Dict[int, str], mode: str = "train") -> MTP_Inputs: + lammps_outputs, thermo_outputs = crawl_lammps_directory(root_data_dir, mode) + mtp_dataset = prepare_mtp_inputs_from_lammps(lammps_outputs, thermo_outputs, atom_dict) + return mtp_dataset def train_mtp(train_inputs: MTP_Inputs, mlip_folder_path: str, save_dir: str) -> MTPWithMLIP3: diff --git a/crystal_diffusion/mlip/mtp_utils.py b/crystal_diffusion/mlip/mtp_utils.py index 71076e52..f717d8f6 100644 --- 
a/crystal_diffusion/mlip/mtp_utils.py +++ b/crystal_diffusion/mlip/mtp_utils.py @@ -1,3 +1,6 @@ +import os +import re +from dataclasses import dataclass from typing import Any, Dict, List, Tuple import numpy as np @@ -5,6 +8,14 @@ from pymatgen.core import Structure +@dataclass(kw_only=True) +class MTP_Inputs: + """Create a dataclass to train or evaluate a MTP model.""" + structure: List[Structure] + forces: List[List[float]] + energy: List[float] + + def extract_structure_and_forces_from_file(filename: str, atom_dict: Dict[int, Any]) -> \ Tuple[List[Structure], List[List[float]]]: """Convert LAMMPS yaml output in a format compatible with MTP training and evaluation methods. @@ -57,3 +68,56 @@ def extract_energy_from_thermo_log(filename: str) -> List[float]: pot_idx = log_yaml['keywords'].index('PotEng') energies = [x[kin_idx] + x[pot_idx] for x in log_yaml['data']] return energies + + +def prepare_mtp_inputs_from_lammps(output_yaml: List[str], + thermo_yaml: List[str], + atom_dict: Dict[int, Any] + ) -> MTP_Inputs: + """Convert a list of LAMMPS output files and thermodynamic output files to MTP input format. + + Args: + output_yaml: list of LAMMPS output files as yaml. + thermo_yaml: list of LAMMPS thermodynamic output files as yaml. + atom_dict: mapping of LAMMPS indices to atom type. 
+ + Returns: + dataclass used to + """ + mtp_inputs = { + 'structure': [], + 'energy': [], + 'forces': [] + } + for filename in output_yaml: + structures, forces = extract_structure_and_forces_from_file(filename, atom_dict) + mtp_inputs['structure'] += structures + mtp_inputs['forces'] += forces + for filename in thermo_yaml: + mtp_inputs['energy'] += extract_energy_from_thermo_log(filename) + mtp_inputs = MTP_Inputs(structure=mtp_inputs['structure'], + energy=mtp_inputs['energy'], + forces=mtp_inputs['forces']) + return mtp_inputs + + +def crawl_lammps_directory(folder_name: str, folder_name_pattern: str="train") -> Tuple[List[str], List[str]]: + """Crawl through a folder and find the LAMMPS output files in folders containing a specified pattern in their name. + + LAMMPS outputs should end with dump.yaml and Thermondynamics variables files should end with thermo.yaml + + Args: + folder_name: folder to crawl + folder_name_pattern (optional): name of the subfolder to keep. Defaults to train. + + Returns: + list of LAMMPS dump outputs and list of LAMMPS thermo outputs + + """ + assert os.path.exists(folder_name), "Invalid folder name provided." 
+ lammps_output_files, thermo_output_files = [], [] + for dirpath, _, filenames in os.walk(folder_name): + if re.search(folder_name_pattern, dirpath): + lammps_output_files.extend([os.path.join(dirpath, f) for f in filenames if f.endswith("dump.yaml")]) + thermo_output_files.extend([os.path.join(dirpath, f) for f in filenames if f.endswith("thermo.yaml")]) + return lammps_output_files, thermo_output_files From 3c307d66d6e30437a5ff33ec88e1d662d641590f Mon Sep 17 00:00:00 2001 From: Simon Blackburn Date: Fri, 9 Aug 2024 10:19:47 -0400 Subject: [PATCH 04/13] wip: active learning loop - training a MTP with modular classes --- .../active_learning_loop/benchmark.py | 143 +++++++++++++---- crystal_diffusion/mlip/mtp_train.py | 4 +- crystal_diffusion/mlip/mtp_utils.py | 12 +- crystal_diffusion/models/{ => mlip}/mtp.py | 145 +++++++++--------- .../config/mtp_training.yaml | 33 ++++ 5 files changed, 222 insertions(+), 115 deletions(-) rename crystal_diffusion/models/{ => mlip}/mtp.py (73%) create mode 100644 experiments/active_learning_benchmark/config/mtp_training.yaml diff --git a/crystal_diffusion/active_learning_loop/benchmark.py b/crystal_diffusion/active_learning_loop/benchmark.py index 5e0e2ea1..b17f6110 100644 --- a/crystal_diffusion/active_learning_loop/benchmark.py +++ b/crystal_diffusion/active_learning_loop/benchmark.py @@ -3,12 +3,103 @@ from dataclasses import dataclass from typing import Dict, List, Optional +import numpy as np import pandas as pd import yaml from hydra.utils import instantiate -from crystal_diffusion.mlip.mtp_train import train_mtp, prepare_dataset, evaluate_mtp -from crystal_diffusion.models.mtp import MTPWithMLIP3 +# from crystal_diffusion.mlip.mtp_train import prepare_dataset, evaluate_mtp +from crystal_diffusion.models.mlip.mtp import MTPWithMLIP3 +from crystal_diffusion.mlip.mtp_utils import MTPInputs + + +@dataclass(kw_only=True) +class ActiveLearningDataArguments: + training_data_dir: str # training data directory + 
evaluation_data_dir: str # evaluation data directory + output_dir: str # directory where to save the results + + +@dataclass(kw_only=True) +class StructureEvaluationArguments: + evaluation_criteria: str = 'nbh_grades' + criteria_threshold: float = 10 + number_of_structures: int = None + extraction_radius: float = 3 + + +class ActiveLearningLoop: + def __init__(self, + meta_config: str, + evaluation_config: str): + assert os.path.exists(meta_config), "configuration file for active learning loop does not exist." + self.data_paths, self.mlip_model, self.evaluation_args = None, None, None + # use hydra to convert the yaml file in a dataclass format + self.parse_config(meta_config) + self.atom_dict = {1: "Si"} # TODO this should be define somewhere smart + self.trained_mlips = [] # history of trained MLIPs (optional - not sure if we should keep this) + + def parse_config(self, meta_config: str): + with open(meta_config, 'r') as stream: + meta_config = yaml.load(stream, Loader=yaml.FullLoader) + self.data_paths = instantiate(meta_config['active_learning_data']) + self.mlip_model = instantiate(meta_config['mlip']) + self.evaluation_args = instantiate(meta_config['structure_evaluation']) + + def train_mlip(self, round: int = 1) -> MTPWithMLIP3: + train_dataset = self.mlip_model.prepare_dataset_from_lammps( + root_data_dir=self.data_paths.training_data_dir, + atom_dict=self.atom_dict, + mode="train" + ) + trained_mtp = self.mlip_model.train(train_dataset, mlip_name=f'mlip_round_{round}') + self.trained_mlips.append(trained_mtp) # history of trained MLIPs ... 
not sure if useful + return trained_mtp + + def evaluate_mlip(self, trained_mtp: MTPWithMLIP3) -> pd.DataFrame: + prediction_df = evaluate_mlip(self.mtp_config, self.atom_dict, trained_mtp) + return prediction_df + + def get_bad_structures(self, prediction_df: pd.DataFrame) -> List[pd.DataFrame]: + structures_to_retrain = get_structures_for_retraining(prediction_df, + criteria_threshold=self.eval_config.criteria_threshold, + number_of_structures=self.eval_config.number_of_structures, + evaluation_criteria=self.eval_config.evaluation_criteria) + return structures_to_retrain + + def excise_worst_atom(self, structures_to_retrain: List[pd.DataFrame]) -> List[pd.DataFrame]: + bad_regions = [extract_target_region(s, extraction_radius=self.eval_config.extraction_radius) + for s in structures_to_retrain] + return bad_regions + + def get_structure_candidate_from_generative_model(self, + fixed_atoms: pd.DataFrame, + number_of_candidates: int =1 + ) -> None: + # TODO: call the diffusion model and get number_of_candidates samples with repaint using the fixed_atoms + # the output should be a list of np.ndarray (?) 
+ return None + + def update_mlip_training_set(self, new_structures: List[np.ndarray]): + # TODO take the examples generated by the diffusion model and append them to the MLIP training set + return None + + def round_of_active_learning_loop(self, trained_mtp: Optional[MTPWithMLIP3]=None, training_set=None): + # one round from a known mtp (or train from provided training set) + # evaluate, find candidates and update MTP + # return the updated MTP + if trained_mtp is None: + trained_mtp = self.train_mlip(training_set) + pred_df = self.evaluate_mlip(trained_mtp) + bad_structures = self.get_bad_structures(pred_df) + bad_regions = self.excise_worst_atom(bad_structures) + new_candidates = [self.get_structure_candidate_from_generative_model(x) for x in bad_regions] + new_mlip_training_set = self.update_mlip_training_set(self, new_candidates) + new_mtp = self.trained_mlip(new_mlip_training_set) + return new_mtp + + def evaluate_mtp_update(self): + pass def get_arguments() -> argparse.Namespace: @@ -23,23 +114,10 @@ def get_arguments() -> argparse.Namespace: return args -@dataclass(kw_only=True) -class MTPArguments: - training_data_dir: str # training data directory - evaluation_data_dir: str # evaluation data directory - mlip_dir: str # directory with the mlp executable - output_dir: str # directory where to save the MTP outputs - -@dataclass(kw_only=True) -class StructureEvaluationArguments: - evaluation_criteria: str ='nbh_grades' - criteria_threshold: float = 10 - number_of_structures: int = None - extraction_radius: float = 3 -def train_mlip(mtp_args: MTPArguments, atom_dict: Dict[int, str]) -> MTPWithMLIP3: +def train_mlip(mtp_args, atom_dict: Dict[int, str]) -> MTPWithMLIP3: """Train a MTP model using the specified data. 
Args: @@ -56,7 +134,7 @@ def train_mlip(mtp_args: MTPArguments, atom_dict: Dict[int, str]) -> MTPWithMLIP return trained_mtp -def evaluate_mlip(mtp_args: MTPArguments, atom_dict: Dict[int, str], mtp: MTPWithMLIP3) -> pd.DataFrame: +def evaluate_mlip(mtp_args, atom_dict: Dict[int, str], mtp: MTPWithMLIP3) -> pd.DataFrame: """Evaluate a MTP model using the specified data. Args: @@ -129,30 +207,27 @@ def generate_new_structures(fixed_atoms: List[pd.DataFrame]) -> None: def main(): # args = get_arguments() # TODO get mtp_config_path from the args - mtp_config_path = "/Users/simonb/ic-collab/courtois_collab/crystal_diffusion/experiments/active_learning_benchmark/" - mtp_config_path = os.path.join(mtp_config_path, "config", "mtp_training.yaml") - with open(mtp_config_path, 'r') as stream: - mtp_config = yaml.load(stream, Loader=yaml.FullLoader) - # use hydra to convert the yaml file in a dataclass format - mtp_config = instantiate(mtp_config) - os.makedirs(mtp_config.output_dir, exist_ok=True) + config_path = "/Users/simonb/ic-collab/courtois_collab/crystal_diffusion/experiments/active_learning_benchmark/" + config_path = os.path.join(config_path, "config", "mtp_training.yaml") + al_loop = ActiveLearningLoop(config_path, 'hello') + al_loop.train_mlip() + atom_dict = {1: "Si"} # TODO this should be define somewhere smart # STEP 1: train a MLIP - trained_mtp = train_mlip(mtp_config, atom_dict) + # trained_mtp = train_mlip(mtp_config, atom_dict) # STEP 2: evaluate the MLIP - prediction_df = evaluate_mlip(mtp_config, atom_dict, trained_mtp) + # prediction_df = evaluate_mlip(mtp_config, atom_dict, trained_mtp) # STEP 3: identify the problematic structures # TODO extraction_params should come from a config file with hydra instantiate - extraction_params = StructureEvaluationArguments() - structures_to_retrain = get_structures_for_retraining(prediction_df, - criteria_threshold=extraction_params.criteria_threshold, - number_of_structures=extraction_params.number_of_structures, - 
evaluation_criteria=extraction_params.evaluation_criteria) + # extraction_params = StructureEvaluationArguments() + #structures_to_retrain = get_structures_for_retraining(prediction_df, + # criteria_threshold=extraction_params.criteria_threshold, + # number_of_structures=extraction_params.number_of_structures, + # evaluation_criteria=extraction_params.evaluation_criteria) # STEP 4: extract the region - bad_regions = [extract_target_region(s, extraction_radius=extraction_params.extraction_radius) - for s in structures_to_retrain] + #bad_regions = [extract_target_region(s, extraction_radius=extraction_params.extraction_radius) + # for s in structures_to_retrain] # STEP 5: call the generative model to create new candidates - print('hello') diff --git a/crystal_diffusion/mlip/mtp_train.py b/crystal_diffusion/mlip/mtp_train.py index 3153ccee..8e449f80 100644 --- a/crystal_diffusion/mlip/mtp_train.py +++ b/crystal_diffusion/mlip/mtp_train.py @@ -12,8 +12,8 @@ from pymatgen.core import Structure from sklearn.metrics import mean_absolute_error -from crystal_diffusion.models.mtp import MTPWithMLIP3 -from crystal_diffusion.mlip.mtp_utils import prepare_mtp_inputs_from_lammps, crawl_lammps_directory, MTP_Inputs +from crystal_diffusion.models.mlip.mtp import MTPWithMLIP3 +from crystal_diffusion.mlip.mtp_utils import prepare_mtp_inputs_from_lammps, crawl_lammps_directory, MTPInputs atom_dict = {1: 'Si'} diff --git a/crystal_diffusion/mlip/mtp_utils.py b/crystal_diffusion/mlip/mtp_utils.py index f717d8f6..9f2f974c 100644 --- a/crystal_diffusion/mlip/mtp_utils.py +++ b/crystal_diffusion/mlip/mtp_utils.py @@ -9,7 +9,7 @@ @dataclass(kw_only=True) -class MTP_Inputs: +class MTPInputs: """Create a dataclass to train or evaluate a MTP model.""" structure: List[Structure] forces: List[List[float]] @@ -73,7 +73,7 @@ def extract_energy_from_thermo_log(filename: str) -> List[float]: def prepare_mtp_inputs_from_lammps(output_yaml: List[str], thermo_yaml: List[str], atom_dict: Dict[int, Any] 
- ) -> MTP_Inputs: + ) -> MTPInputs: """Convert a list of LAMMPS output files and thermodynamic output files to MTP input format. Args: @@ -95,13 +95,13 @@ def prepare_mtp_inputs_from_lammps(output_yaml: List[str], mtp_inputs['forces'] += forces for filename in thermo_yaml: mtp_inputs['energy'] += extract_energy_from_thermo_log(filename) - mtp_inputs = MTP_Inputs(structure=mtp_inputs['structure'], - energy=mtp_inputs['energy'], - forces=mtp_inputs['forces']) + mtp_inputs = MTPInputs(structure=mtp_inputs['structure'], + energy=mtp_inputs['energy'], + forces=mtp_inputs['forces']) return mtp_inputs -def crawl_lammps_directory(folder_name: str, folder_name_pattern: str="train") -> Tuple[List[str], List[str]]: +def crawl_lammps_directory(folder_name: str, folder_name_pattern: str= "train") -> Tuple[List[str], List[str]]: """Crawl through a folder and find the LAMMPS output files in folders containing a specified pattern in their name. LAMMPS outputs should end with dump.yaml and Thermondynamics variables files should end with thermo.yaml diff --git a/crystal_diffusion/models/mtp.py b/crystal_diffusion/models/mlip/mtp.py similarity index 73% rename from crystal_diffusion/models/mtp.py rename to crystal_diffusion/models/mlip/mtp.py index 67570495..474b6c25 100644 --- a/crystal_diffusion/models/mtp.py +++ b/crystal_diffusion/models/mlip/mtp.py @@ -10,6 +10,7 @@ import shutil import subprocess from collections import defaultdict +from dataclasses import dataclass from typing import Any, Dict, List, Optional, TextIO, Tuple import numpy as np @@ -20,30 +21,47 @@ from monty.tempfile import ScratchDir from pymatgen.core import Structure +from crystal_diffusion.mlip.mtp_utils import MTPInputs, crawl_lammps_directory, prepare_mtp_inputs_from_lammps + + +@dataclass(kw_only=True) +class MTPArguments: + mlip_path: str # path to MLIP3 library + name: Optional[str]= None # MTP + param: Optional[Dict[Any, Any]] = None + unfitted_mtp: str = "08.almtp" # Define the initial mtp file. 
Default to 08g.amltp + fitted_mtp_savedir: str = '../' # save directory for the fitted MTP. Defaults to '../' (current wd) + max_dist: float = 5 # The actual radial cutoff. Defaults to 5. + radial_basis_size: int = 8 # Relevant to number of radial basis function. Defaults to 8. + max_iter: int = 1000 # The number of maximum iteration. Defaults to 1000. + energy_weight: float = 1 # The weight of energy. Defaults to 1 + force_weight: float = 1e-2 # The weight of forces. Defaults to 1e-2 + stress_weight: float = 1e-3 # The weight of stresses. Zero-weight can be assigned. Defaults to 1e-3. + init_params: str = "same" # how to initialize parameters if a potential was not pre-fitted: "same" or "random". + scale_by_force: float = 0 # If > 0 then configurations near equilibrium get more weight. Defaults to 0. + bfgs_conv_tol: float = 1e-3 # Stop training if error dropped by a factor smaller than this over 50 BFGS iterations. + weighting: str = "vibration" # How to weight configuration with different sizes relative to each other. + # Choose from "vibrations", "molecules" and "structures". Defaults to "vibration". + class MTPWithMLIP3(MTPotential): """MTP with MLIP-3.""" - def __init__(self, - mlip_path: str, - name: Optional[str] = None, - param: Optional[Dict[Any, Any]] = None, - version: Optional[str] = None): + def __init__(self, mtp_args: MTPArguments): """Modifications to maml.apps.pes._mtp.MTPotential to be compatible with mlip-3. 
Args: - mlip_path: path to mlip3 library - name: MTPotential argument - param: MTPotential argument - version: MTPotential argument + mtp_args: MTP arguments from the class MTPArguments """ - super().__init__(name, param, version) - self.mlp_command = os.path.join(mlip_path, "build", "mlp") + super().__init__(mtp_args.name, mtp_args.param) + self.mlp_command = os.path.join(mtp_args.mlip_path, "build", "mlp") assert os.path.exists(self.mlp_command), "mlp command not found in mlip-3 build folder" - self.mlp_templates = os.path.join(mlip_path, "MTP_templates") + self.mlp_templates = os.path.join(mtp_args.mlip_path, "MTP_templates") assert os.path.exists(self.mlp_templates), "MTP templates not found in mlip-3 folder" self.fitted_mtp = None self.elements = None + self.mtp_args = mtp_args + os.makedirs(mtp_args.fitted_mtp_savedir, exist_ok=True) def to_lammps_format(self): """Write the trained MTP in a LAMMPS compatible format.""" @@ -235,72 +253,50 @@ def _call_cmd_to_stdout(cmd: List[str], output_file: TextIO): with subprocess.Popen(cmd, stdout=output_file) as p: p.communicate()[0] - def train( - self, - train_structures: List[Structure], - train_energies: List[float], - train_forces: List[List[float]], - train_stresses: Optional[List[List[float]]] = None, - unfitted_mtp: str = "08.almtp", - fitted_mtp_savedir: str = '../', - max_dist: float = 5, - radial_basis_size: int = 8, - max_iter: int = 1000, # TODO check the next kwargs in mlip3 - energy_weight: float = 1, - force_weight: float = 1e-2, - stress_weight: float = 1e-3, - init_params: str = "same", - scale_by_force: float = 0, - bfgs_conv_tol: float = 1e-3, - weighting: str = "vibration", - ) -> int: + @staticmethod + def prepare_dataset_from_lammps( + root_data_dir: str, + atom_dict: Dict[int, str], + mode: str = "train" + ) -> MTPInputs: + lammps_outputs, thermo_outputs = crawl_lammps_directory(root_data_dir, mode) + mtp_dataset = prepare_mtp_inputs_from_lammps(lammps_outputs, thermo_outputs, atom_dict) + return 
mtp_dataset + + def train(self, dataset: MTPInputs, mlip_name: str = 'mtp_fitted.almtp') -> str: """Training data with moment tensor method using MLIP-3. Override the base class method. Args: - train_structures: The list of Pymatgen Structure object. - train_energies: List of total energies of each structure in structures list. - train_forces: List of (m, 3) forces array of each structure with m atoms in structures list. - m can be varied with each single structure case. - train_stresses (optional): List of (6, ) virial stresses of each structure in structures list. - Defaults to None. - unfitted_mtp (optional): Define the initial mtp file. Default to 08g.amltp - fitted_mtp_savedir (optional): save directory for the fitted MTP. Defaults to '../' (current wd) - max_dist (optional): The actual radial cutoff. Defaults to 5. - radial_basis_size (optional): Relevant to number of radial basis function. Defaults to 8. - max_iter (optional): The number of maximum iteration. Defaults to 1000. - energy_weight (optional): The weight of energy. Defaults to 1 - force_weight (optional): The weight of forces. Defaults to 1e-2 - stress_weight (optional): The weight of stresses. Zero-weight can be assigned. Defaults to 1e-3. - init_params (optional): How to initialize parameters if a potential was not - pre-fitted. Choose from "same" and "random". Defaults to "same". - scale_by_force (optional): If >0 then configurations near equilibrium - (with roughly force < scale_by_force) get more weight. Defaults to 0. - bfgs_conv_tol (optional): Stop training if error dropped by a factor smaller than this - over 50 BFGS iterations. Defaults to 1e-3. - weighting (optional): How to weight configuration with different sizes relative to each other. - Choose from "vibrations", "molecules" and "structures". Defaults to "vibration". + dataset: MTPInputs dataclass with the following elements: + structures: The list of Pymatgen Structure object. 
+ energies: List of total energies of each structure in structures list. + forces: List of (m, 3) forces array of each structure with m atoms in structures list. + m can be varied with each single structure case. + mlip_name: str : filename for the trained MTP. Defaults to mtp_fitted.almtp Returns: - rc : return code of the mlp training script + fitted_mtp: path to the fitted MTP """ train_structures, train_forces, train_stresses = check_structures_forces_stresses( - train_structures, train_forces, train_stresses + dataset.structure, dataset.forces, None ) - train_pool = pool_from(train_structures, train_energies, train_forces, train_stresses) + # last argument is for stresses - not used currently + train_pool = pool_from(train_structures, dataset.energy, train_forces) + elements = sorted(set(itertools.chain(*[struct.species for struct in train_structures]))) self.elements = [str(element) for element in elements] # TODO move to __init__ atoms_filename = "train.cfgs" - with (ScratchDir(".")): # create a tmpdir - deleted afterwards + with ((ScratchDir("."))): # create a tmpdir - deleted afterwards atoms_filename = self.write_cfg(filename=atoms_filename, cfg_pool=train_pool) - if not unfitted_mtp: + if not self.mtp_args.unfitted_mtp: raise RuntimeError("No specific parameter file provided.") - mtp_file_path = os.path.join(self.mlp_templates, unfitted_mtp) - shutil.copyfile(mtp_file_path, os.path.join(os.getcwd(), unfitted_mtp)) + mtp_file_path = os.path.join(self.mlp_templates, self.mtp_args.unfitted_mtp) + shutil.copyfile(mtp_file_path, os.path.join(os.getcwd(), self.mtp_args.unfitted_mtp)) commands = [self.mlp_command, "mindist", atoms_filename] with open("min_dist", "w") as f: self._call_cmd_to_stdout(commands, f) @@ -311,23 +307,26 @@ def train( # split_symbol = "=" # different for mlip-2 (":") and mlip-3 ("=") # min_dist = float(lines[-1].split(split_symbol)[1]) - save_fitted_mtp = ".".join([unfitted_mtp.split(".")[0] + "_fitted", unfitted_mtp.split(".")[1]]) + 
save_fitted_mtp = mlip_name + if not save_fitted_mtp.endswith('.almtp'): + save_fitted_mtp += '.almtp' + cmds_list = [ self.mlp_command, "train", - unfitted_mtp, + self.mtp_args.unfitted_mtp, atoms_filename, f"--save_to={save_fitted_mtp}", - f"--iteration_limit={max_iter}", + f"--iteration_limit={self.mtp_args.max_iter}", "--al_mode=nbh", # active learning mode - required to get extrapolation grade - # f"--curr-pot-name={unfitted_mtp}", # TODO check those kwargs - # f"--energy-weight={energy_weight}", - # f"--force-weight={force_weight}", - # f"--stress-weight={stress_weight}", - # f"--init-params={init_params}", - # f"--scale-by-force={scale_by_force}", - # f"--bfgs-conv-tol={bfgs_conv_tol}", - # f"--weighting={weighting}", + f"--curr-pot-name={self.mtp_args.unfitted_mtp}", + f"--energy-weight={self.mtp_args.energy_weight}", + f"--force-weight={self.mtp_args.force_weight}", + f"--stress-weight={self.mtp_args.stress_weight}", + f"--init-params={self.mtp_args.init_params}", + f"--scale-by-force={self.mtp_args.scale_by_force}", + f"--bfgs-conv-tol={self.mtp_args.bfgs_conv_tol}", + f"--weighting={self.mtp_args.weighting}", ] stdout, rc = self._call_mlip(cmds_list) if rc != 0: @@ -340,6 +339,6 @@ def train( error_msg += msg[-1] raise RuntimeError(error_msg) # copy the fitted mtp outside the working directory - self.fitted_mtp = os.path.join(fitted_mtp_savedir, save_fitted_mtp) + self.fitted_mtp = os.path.join(self.mtp_args.fitted_mtp_savedir, save_fitted_mtp) shutil.copyfile(save_fitted_mtp, self.fitted_mtp) - return rc + return self.fitted_mtp diff --git a/experiments/active_learning_benchmark/config/mtp_training.yaml b/experiments/active_learning_benchmark/config/mtp_training.yaml new file mode 100644 index 00000000..1c835a78 --- /dev/null +++ b/experiments/active_learning_benchmark/config/mtp_training.yaml @@ -0,0 +1,33 @@ +active_learning_data: + _target_: crystal_diffusion.active_learning_loop.benchmark.ActiveLearningDataArguments + training_data_dir: 
/Users/simonb/ic-collab/courtois_collab/crystal_diffusion/data/active_loop_debug/ + evaluation_data_dir: /Users/simonb/ic-collab/courtois_collab/crystal_diffusion/data/active_loop_debug/ + output_dir: /Users/simonb/ic-collab/courtois_collab/crystal_diffusion/experiments/benchmark/debug/ + +mlip: + _target_: crystal_diffusion.models.mlip.mtp.MTPWithMLIP3 + mtp_args: + _target_: crystal_diffusion.models.mlip.mtp.MTPArguments + mlip_path: /Users/simonb/ic-collab/courtois_collab/crystal_diffusion/mlip-3/ # path to MLIP3 library + name: None # MTP + param: None + unfitted_mtp: 08.almtp # Define the initial mtp file. Default to 08g.amltp + fitted_mtp_savedir: /Users/simonb/ic-collab/courtois_collab/crystal_diffusion/experiments/benchmark/debug/mtp/ # save directory for the fitted MTP. Defaults to '../' (current wd) + max_dist: 5.0 # The actual radial cutoff. Defaults to 5. + radial_basis_size: 8 # Relevant to number of radial basis function. Defaults to 8. + max_iter: 1000 # The number of maximum iteration. Defaults to 1000. + energy_weight: 1.0 # The weight of energy. Defaults to 1 + force_weight: 1e-2 # The weight of forces. Defaults to 1e-2 + stress_weight: 1e-3 # The weight of stresses. Zero-weight can be assigned. Defaults to 1e-3. + init_params: same # how to initialize parameters if a potential was not pre-fitted: "same" or "random". + scale_by_force: 0 # If > 0 then configurations near equilibrium get more weight. Defaults to 0. + bfgs_conv_tol: 1e-3 # Stop training if error dropped by a factor smaller than this over 50 BFGS iterations. + weighting: vibration # How to weight configuration with different sizes relative to each other. + # Choose from "vibrations", "molecules" and "structures". Defaults to "vibration". 
+ +structure_evaluation: + _target_: crystal_diffusion.active_learning_loop.benchmark.StructureEvaluationArguments + evaluation_criteria: 'nbh_grades' # maxvol coefficient name in the mtp outputs + criteria_threshold: 10.0 # atoms with a MaxVol value under this value are considered bad + number_of_structures: None # alternatively, take the N worst samples from the evaluation set + extraction_radius: float = 3 # extract atoms within this radius from the \ No newline at end of file From f3ac433ded5342b871b95566208917612eb5adc2 Mon Sep 17 00:00:00 2001 From: Simon Blackburn Date: Thu, 15 Aug 2024 08:43:28 -0400 Subject: [PATCH 05/13] active learning loop basic implementation --- .../activelearning_dataclasses.py | 21 + .../active_learning_loop/benchmark.py | 365 ++++++++++-------- .../active_learning_loop/oracle.py | 53 +++ .../active_learning_loop/utils.py | 68 ++++ crystal_diffusion/models/mlip/mtp.py | 98 +++-- 5 files changed, 425 insertions(+), 180 deletions(-) create mode 100644 crystal_diffusion/active_learning_loop/activelearning_dataclasses.py create mode 100644 crystal_diffusion/active_learning_loop/oracle.py create mode 100644 crystal_diffusion/active_learning_loop/utils.py diff --git a/crystal_diffusion/active_learning_loop/activelearning_dataclasses.py b/crystal_diffusion/active_learning_loop/activelearning_dataclasses.py new file mode 100644 index 00000000..d4880f80 --- /dev/null +++ b/crystal_diffusion/active_learning_loop/activelearning_dataclasses.py @@ -0,0 +1,21 @@ +from dataclasses import dataclass + + +@dataclass(kw_only=True) +class ActiveLearningDataArguments: + training_data_dir: str # training data directory + evaluation_data_dir: str # evaluation data directory + output_dir: str # directory where to save the results + + +@dataclass(kw_only=True) +class StructureEvaluationArguments: + evaluation_criteria: str ='nbh_grades' + criteria_threshold: float = 10 + number_of_structures: int = None + extraction_radius: float = 3 + + 
+@dataclass(kw_only=True) +class RepaintingArguments: + model: str = 'dev_dummy' diff --git a/crystal_diffusion/active_learning_loop/benchmark.py b/crystal_diffusion/active_learning_loop/benchmark.py index b17f6110..a01fb8c2 100644 --- a/crystal_diffusion/active_learning_loop/benchmark.py +++ b/crystal_diffusion/active_learning_loop/benchmark.py @@ -1,66 +1,122 @@ import argparse import os from dataclasses import dataclass -from typing import Dict, List, Optional +from typing import Any, List, Optional, Tuple import numpy as np import pandas as pd import yaml from hydra.utils import instantiate -# from crystal_diffusion.mlip.mtp_train import prepare_dataset, evaluate_mtp -from crystal_diffusion.models.mlip.mtp import MTPWithMLIP3 -from crystal_diffusion.mlip.mtp_utils import MTPInputs - - -@dataclass(kw_only=True) -class ActiveLearningDataArguments: - training_data_dir: str # training data directory - evaluation_data_dir: str # evaluation data directory - output_dir: str # directory where to save the results - - -@dataclass(kw_only=True) -class StructureEvaluationArguments: - evaluation_criteria: str = 'nbh_grades' - criteria_threshold: float = 10 - number_of_structures: int = None - extraction_radius: float = 3 +from crystal_diffusion.active_learning_loop.utils import get_structures_for_retraining, extract_target_region +from crystal_diffusion.models.mlip.mtp import MTPWithMLIP3, MTPInputs class ActiveLearningLoop: def __init__(self, meta_config: str, - evaluation_config: str): + ): + """Active learning benchmark with methods to train & evaluate a MLIP, isolate bad sub-structures, repaint + new structures and retrain the MLIP + + Args: + meta_config: path to a yaml configuration with the parameters for the modules in the class + """ assert os.path.exists(meta_config), "configuration file for active learning loop does not exist." 
- self.data_paths, self.mlip_model, self.evaluation_args = None, None, None - # use hydra to convert the yaml file in a dataclass format + # define the modules in the __init__ function + self.data_paths, self.mlip_model, self.eval_config, self.structure_generation = None, None, None, None + self.oracle = None + # use hydra to convert the yaml into modules and other data classes self.parse_config(meta_config) self.atom_dict = {1: "Si"} # TODO this should be define somewhere smart self.trained_mlips = [] # history of trained MLIPs (optional - not sure if we should keep this) + self.training_sets = [] # history of training sets def parse_config(self, meta_config: str): + """Read a configuration file and instantiate the different blocks with hydra. + + The configuration file should have the following blocks of parameters: + active_learning_data: dataset paths + mlip_model: MLIP module training parameters + structure_evaluation: identification and isolation of the atomic regions to finetune the MLIP + + Args: + meta_config: path to configuration yaml file + """ with open(meta_config, 'r') as stream: meta_config = yaml.load(stream, Loader=yaml.FullLoader) + # paths to the training & evaluation datasets self.data_paths = instantiate(meta_config['active_learning_data']) + # MLIP model - for example MTP self.mlip_model = instantiate(meta_config['mlip']) - self.evaluation_args = instantiate(meta_config['structure_evaluation']) + # parameters to find and isolate the problematic regions in the evaluation dataset + self.eval_config = instantiate(meta_config['structure_evaluation']) + # structure generation module + self.structure_generation = instantiate(meta_config['repainting_model']) + # force labeling module + self.oracle = instantiate(meta_config['oracle']) + + def train_mlip(self, round: int = 1, training_set: Optional[Any] = None) -> str: + """Train a MLIP using the parameters specified in the configuration file. + + Args: + round (optional): current round of training. 
Used to track now configurations in the training set. A round + includes the initial training and the evaluation process. + training_set (optional): if specified, use this dataset for training. Otherwise, use the dataset from the + paths in the configuration file. Defaults to None. + + Returns: + path to the trained MLIP model + """ + if training_set is None: + if len(self.training_sets) == 0: + self.training_sets = [self.mlip_model.prepare_dataset_from_lammps( + root_data_dir=self.data_paths.training_data_dir, + atom_dict=self.atom_dict, + mode="train" + )] + training_set = self.mlip_model.merge_inputs(self.training_sets) + + trained_mtp = self.mlip_model.train(training_set, mlip_name=f'mlip_round_{round}') + self.trained_mlips.append(trained_mtp) # history of trained MLIPs ... not sure if useful + return trained_mtp + + def evaluate_mlip(self, round: int = 1, mlip_name: Optional[str] = None, forces_available: bool = True + ) -> pd.DataFrame: + """Evaluate a MLIP using the parameters specified in the configuration file. - def train_mlip(self, round: int = 1) -> MTPWithMLIP3: - train_dataset = self.mlip_model.prepare_dataset_from_lammps( - root_data_dir=self.data_paths.training_data_dir, + Args: + round (optional): current round of training. Defaults to 1. + mlip_name (optional): if not None, use this MTP to evaluate the dataset. + forces_available (optional): if True, get the ground truth forces from the dataset. + + Returns: + dataframe with the atomic indices, positions, forces and evaluation criteria + """ + evaluation_dataset = self.mlip_model.prepare_dataset_from_lammps( + root_data_dir=self.data_paths.evaluation_data_dir, atom_dict=self.atom_dict, - mode="train" + mode="evaluation", + get_forces=forces_available ) - trained_mtp = self.mlip_model.train(train_dataset, mlip_name=f'mlip_round_{round}') - self.trained_mlips.append(trained_mtp) # history of trained MLIPs ... 
not sure if useful - return trained_mtp + # first returned element is the ground truth DF + # TODO make sure this works even if the GT is not available... + if mlip_name is None: + mlip_name = os.path.join(self.mlip_model.savedir, f'mlip_round_{round}.almtp') + _, prediction_df = self.mlip_model.evaluate(evaluation_dataset, mlip_name=mlip_name) - def evaluate_mlip(self, trained_mtp: MTPWithMLIP3) -> pd.DataFrame: - prediction_df = evaluate_mlip(self.mtp_config, self.atom_dict, trained_mtp) return prediction_df def get_bad_structures(self, prediction_df: pd.DataFrame) -> List[pd.DataFrame]: + """Find the structures with a high uncertainty based on the configuration file parameters. + + Args: + prediction_df: evaluation outputs of the MLIP model. Should contain atomic positions, uncertainty criteria + and structure indices. + + Returns: + list of structures with a high uncertainty criteria. + """ structures_to_retrain = get_structures_for_retraining(prediction_df, criteria_threshold=self.eval_config.criteria_threshold, number_of_structures=self.eval_config.number_of_structures, @@ -68,37 +124,138 @@ def get_bad_structures(self, prediction_df: pd.DataFrame) -> List[pd.DataFrame]: return structures_to_retrain def excise_worst_atom(self, structures_to_retrain: List[pd.DataFrame]) -> List[pd.DataFrame]: - bad_regions = [extract_target_region(s, extraction_radius=self.eval_config.extraction_radius) + """For a given structure, isolate the atom with the highest uncertainty criteria. 
+ + Args: + structures_to_retrain: list of dataframes with the atomic positions and evaluate criteria + + Returns: + list of dataframes with only the targeted region + """ + # we assume the extraction region to be a sphere of radius extraction_radius around the worst atoms + # if more than 1 atom are bad in a structure, we only extract the worst + # TODO implement other extraction methods + bad_regions = [extract_target_region(s, + extraction_radius=self.eval_config.extraction_radius, + evaluation_criteria=self.eval_config.evaluation_criteria) for s in structures_to_retrain] return bad_regions def get_structure_candidate_from_generative_model(self, fixed_atoms: pd.DataFrame, - number_of_candidates: int =1 - ) -> None: - # TODO: call the diffusion model and get number_of_candidates samples with repaint using the fixed_atoms - # the output should be a list of np.ndarray (?) - return None + number_of_candidates: int = 1 + ) -> pd.DataFrame: + """Generate new structures around the specified fixed atoms. + + Args: + fixed_atoms: dataframe with the atom type, coordinates and unit cell information + number_of_candidates: how many structure to generate. Defaults to 1. 
- def update_mlip_training_set(self, new_structures: List[np.ndarray]): - # TODO take the examples generated by the diffusion model and append them to the MLIP training set - return None + Returns: + dataframe with the atom type, coordinates and unit cell - def round_of_active_learning_loop(self, trained_mtp: Optional[MTPWithMLIP3]=None, training_set=None): + """ + # TODO: call the diffusion model and get number_of_candidates samples with repaint using the fixed_atoms + if self.structure_generation.model == 'dev_dummy': # replace with a wrapper around the diffusion model + # and hydra instantiate + return fixed_atoms + else: + raise NotImplemented('Only dev_dummy is supported at the moment.') + + def new_structure_to_csv(self, new_structures: List[pd.DataFrame], round: int = 1): + """Save the generated structures in a csv format in the output dir + + Args: + new_structures: structures proposed by the generative model + round: current round of training. Defaults to 1. + """ + root_data_dir = os.path.join(self.data_paths.output_dir, f'new_structures_round_{round}') + os.makedirs(root_data_dir, exist_ok=True) + for i, new_struc in enumerate(new_structures): + new_struc.to_csv(os.path.join(root_data_dir, f'structure_{i}.csv'), index=False) + + def get_labels_from_oracle(self, round: int = 1) -> Any: + """Compute energy and forces from an oracle such as LAMMPS for the new candidates generated in a round of AL. + + Args: + round (optional): round of retraining. Defaults to 1. 
+ + Returns: + mlip data input (for example, MTPInputs) + """ + new_labeled_samples = [] + for file in os.listdir(os.path.join(self.data_paths.output_dir, f'new_structures_round_{round}')): + if file.endswith('.csv'): + new_labeled_samples.append(self.call_oracle( + os.path.join(self.data_paths.output_dir, f'new_structures_round_{round}', file) + )) + new_labeled_samples = self.mlip_model.merge_inputs(new_labeled_samples) + return new_labeled_samples + + def call_oracle(self, path_to_file: str) -> Any: + """Compute energy and forces for a given atomic structure. + + Args: + path_to_file: path to csv file containing the atomic positions and structure information + + Returns: + mlip data inputs (for example, MTPInputs) + """ + data = pd.read_csv(path_to_file) + cartesian_positions = data[['x', 'y', 'z']].to_numpy() + box = np.eye(3, 3) * 5.43 # TODO this is bad - fix this + atom_type = np.ones(cartesian_positions.shape[0], dtype=np.integer) # TODO also bad + energy, forces = self.oracle(cartesian_positions, box, atom_type) + labels_as_mtp = self.mlip_model.prepare_dataset_from_numpy( + cartesian_positions, + box, + forces, + energy, + atom_type, + ) + return labels_as_mtp + + def round_of_active_learning_loop(self, trained_mlip: Optional[MTPWithMLIP3] = None + ) -> Tuple[pd.DataFrame, pd.DataFrame]: + """Do a full loop of activate learning. + + The following steps are done in sequence: + - train a MLIP from the training set specified in the config file if trained_mlip is not specified + - evaluate the MLIP with the evaluation set specified in the config file + - find the "bad" structures in the evaluation set based on the criteria from the config file + - excise the problematic regions + - generate new candidates based on these regions + - call the oracle to get the labels for the new generated candidates + - retrain the MLIP + - evaluate the MLIP again + + Args: + trained_mlip (optional): if not None, use this MLIP as a starting point. 
If None, train a MLIP from scratch + using the training data specified in the config file. + + Returns: + dataframe with the MLIP evaluation results before finetuning with the generated structures + dataframe with the MLIP evaluation results after finetuning with the generated structures + """ # one round from a known mtp (or train from provided training set) # evaluate, find candidates and update MTP # return the updated MTP - if trained_mtp is None: - trained_mtp = self.train_mlip(training_set) - pred_df = self.evaluate_mlip(trained_mtp) + if trained_mlip is None: + trained_mlip = self.train_mlip() + pred_df = self.evaluate_mlip(mlip_name=trained_mlip) bad_structures = self.get_bad_structures(pred_df) bad_regions = self.excise_worst_atom(bad_structures) new_candidates = [self.get_structure_candidate_from_generative_model(x) for x in bad_regions] - new_mlip_training_set = self.update_mlip_training_set(self, new_candidates) - new_mtp = self.trained_mlip(new_mlip_training_set) - return new_mtp + self.new_structure_to_csv(new_candidates) + new_labeled_candidates = self.get_labels_from_oracle() + new_training_set = self.mlip_model.merge_inputs([self.training_sets[-1], new_labeled_candidates]) + self.training_sets.append(new_training_set) + new_mtp = self.train_mlip() + new_pred_df = self.evaluate_mlip(mlip_name=new_mtp) + return pred_df, new_pred_df def evaluate_mtp_update(self): + # TODO for benchmarking, compare results before and after finetuning pass @@ -114,121 +271,13 @@ def get_arguments() -> argparse.Namespace: return args - - - -def train_mlip(mtp_args, atom_dict: Dict[int, str]) -> MTPWithMLIP3: - """Train a MTP model using the specified data. 
- - Args: - mtp_args: MTPArguments data class - atom_dict: map between atom names and indices used by LAMMPS - - Returns: - trained MTP model - """ - # TODO make it more configurable - train_datasets = prepare_dataset(mtp_args.training_data_dir, atom_dict, mode="train") - # create the output directory if it doesn't exist already - trained_mtp = train_mtp(train_datasets, mlip_folder_path=mtp_args.mlip_dir, save_dir=mtp_args.output_dir) - return trained_mtp - - -def evaluate_mlip(mtp_args, atom_dict: Dict[int, str], mtp: MTPWithMLIP3) -> pd.DataFrame: - """Evaluate a MTP model using the specified data. - - Args: - mtp_args: MTPArguments data class - atom_dict: map between atom names and indices used by LAMMPS - mtp: trained MTP model - - Returns: - dataframe with a column specified the structure, a column with the atom index, 3 columns with the x,y,z - coordinates and a column with the MaxVol criteria (nbh_grades) - """ - evaluation_datasets = prepare_dataset(mtp_args.evaluation_data_dir, atom_dict, mode="evaluation") - # TODO the current evaluation method also returns the ground truth informations - this won't always be the case - # TODO make more configurable - _, prediction_df = evaluate_mtp(evaluation_datasets, mtp) - return prediction_df - - -def get_structures_for_retraining(prediction_df: pd.DataFrame, - criteria_threshold: Optional[float] = None, - number_of_structures: Optional[int] = None, - evaluation_criteria: str = 'nbh_grades', - structure_index: str = 'structure_index' - ) -> List[pd.DataFrame]: - assert criteria_threshold is not None or number_of_structures is not None, \ - "criteria_threshold or number_of_structures should be set." - # get the highest evaluation_criteria for each structure i.e. 
only the worst atom counts for structure selection - criteria_by_structure = prediction_df[[evaluation_criteria, structure_index]].groupby(structure_index).max() - # find the top number_of_structures - structures_indices = criteria_by_structure.sort_values(by=evaluation_criteria, ascending=False) - if number_of_structures is not None: - structures_indices = structures_indices[:number_of_structures] - else: # criteria_threshold is not None - structures_indices = structures_indices[structures_indices[evaluation_criteria] >= criteria_threshold] - structures_indices = structures_indices.index.to_list() - assert len(structures_indices) > 0, "No structure meet the criteria." - structures_to_retrain = [] - for idx in structures_indices: - structures_to_retrain.append(prediction_df[prediction_df[structure_index] == idx]) - return structures_to_retrain - - -def extract_target_region(structure_df: pd.DataFrame, - extraction_radius: float, - evaluation_criteria: str ='nbh_grades') -> pd.DataFrame: - """Extract the atom with the worst evaluation criteria and all the atoms within a distance extraction_radious. - - Args: - structure_df: dataframe with the atomic positions and the evaluation criteria (e.g. MaxVol value) - extraction_radius: include all atoms within this distance of the targeted atom - evaluation_criteria: name of the evaluation criteria. Defaults to nbh_grades (maxvol in MTP) - - Returns: - dataframe with the atomic coordinates in columns x, y, z - """ - # extract the worst ato and a region around of radius extraction_radius - # TODO better method to determine radius: number of atoms ? 
- target_atom = structure_df[evaluation_criteria].idxmax() - target_position = structure_df.loc[target_atom][['x', 'y', 'z']] - structure_df.loc[:, 'distance_squared'] = structure_df.apply( - lambda x: sum([(x[i] - target_position[i]) ** 2 for i in ['x', 'y', 'z']]), axis=1) - atom_positions = structure_df.loc[structure_df['distance_squared'] <= extraction_radius ** 2, ['x', 'y', 'z']] - return atom_positions - - -def generate_new_structures(fixed_atoms: List[pd.DataFrame]) -> None: - pass - - def main(): # args = get_arguments() # TODO get mtp_config_path from the args config_path = "/Users/simonb/ic-collab/courtois_collab/crystal_diffusion/experiments/active_learning_benchmark/" config_path = os.path.join(config_path, "config", "mtp_training.yaml") - al_loop = ActiveLearningLoop(config_path, 'hello') - al_loop.train_mlip() - - atom_dict = {1: "Si"} # TODO this should be define somewhere smart - # STEP 1: train a MLIP - # trained_mtp = train_mlip(mtp_config, atom_dict) - # STEP 2: evaluate the MLIP - # prediction_df = evaluate_mlip(mtp_config, atom_dict, trained_mtp) - # STEP 3: identify the problematic structures - # TODO extraction_params should come from a config file with hydra instantiate - # extraction_params = StructureEvaluationArguments() - #structures_to_retrain = get_structures_for_retraining(prediction_df, - # criteria_threshold=extraction_params.criteria_threshold, - # number_of_structures=extraction_params.number_of_structures, - # evaluation_criteria=extraction_params.evaluation_criteria) - # STEP 4: extract the region - #bad_regions = [extract_target_region(s, extraction_radius=extraction_params.extraction_radius) - # for s in structures_to_retrain] - # STEP 5: call the generative model to create new candidates - print('hello') + al_loop = ActiveLearningLoop(config_path) + al_loop.round_of_active_learning_loop() if __name__ == '__main__': diff --git a/crystal_diffusion/active_learning_loop/oracle.py 
b/crystal_diffusion/active_learning_loop/oracle.py new file mode 100644 index 00000000..cb6edbe5 --- /dev/null +++ b/crystal_diffusion/active_learning_loop/oracle.py @@ -0,0 +1,53 @@ +from pathlib import Path +from typing import Dict, Tuple + +import numpy as np + +from crystal_diffusion import DATA_DIR +from crystal_diffusion.oracle.lammps import get_energy_and_forces_from_lammps + + +class LAMMPS_for_active_learning: + def __init__(self): + pass + + def __call__(self, + cartesian_positions: np.ndarray, + box: np.ndarray, + atom_types: np.ndarray, + atom_type_map: Dict[int, str] = {1: 'Si'}, + tmp_work_dir: str = './', + pair_coeff_dir: Path = DATA_DIR) -> Tuple[float, np.ndarray]: + """Call LAMMPS to get energy and forces for a given set of atoms. + + Args: + cartesian_positions: atomic positions as a n_atom x 3 array + box: unit cell definition as a 3x3 array. Assumed to be diagonal. + atom_types: integers defining each atoms as an array of length n_atom + atom_type_map: map between indices and atom type. Defaults to {1: 'Si'} + tmp_work_dir: temporary work directory for LAMMPS. Defaults to ./ + pair_coeff_dir: path to stilinger-weber potential. Defaults to DATA_DIR. + + Returns: + energy and forces on each atom (n_atom x 3) + """ + shifted_positions = self.shift_positions(cartesian_positions, box) + energy, forces = get_energy_and_forces_from_lammps(shifted_positions, box, atom_types, atom_type_map, + tmp_work_dir, pair_coeff_dir) + return energy, forces[['fx', 'fy', 'fz']].to_numpy() + + def shift_positions(self, cartesian_positions: np.ndarray, box: np.ndarray) -> np.ndarray: + """Shift the positions of the atoms so all coordinates are positives. + + This is because LAMMPS will ignore atoms with coordinates outside the [0, a] range (a = size of the unit cell). 
+ + Args: + cartesian_positions: atomic positions (n_atom x 3 array) + box: unit cell (3x3 array) - assumed to be diagonal + + Returns: + array with shifted positions + """ + for i, cell_size in enumerate(np.diag(box)): + cartesian_positions[:, i] = cartesian_positions[:, i] % cell_size + return cartesian_positions diff --git a/crystal_diffusion/active_learning_loop/utils.py b/crystal_diffusion/active_learning_loop/utils.py new file mode 100644 index 00000000..4fac563e --- /dev/null +++ b/crystal_diffusion/active_learning_loop/utils.py @@ -0,0 +1,68 @@ +from typing import List, Optional + +import pandas as pd + + +def get_structures_for_retraining(prediction_df: pd.DataFrame, + criteria_threshold: Optional[float] = None, + number_of_structures: Optional[int] = None, + evaluation_criteria: str = 'nbh_grades', + structure_index: str = 'structure_index' + ) -> List[pd.DataFrame]: + """Find the structures with the worst value of the evaluation criteria. + + Args: + prediction_df: dataframe with the atom positions, forces, uncertainty criteria (e.g. MaxVol coefficient), + indices and structure indices + criteria_threshold: value at or above which the evaluation_criteria is considered bad. Either this or + number_of_structures should be specified. If both are specified, the threshold is applied first and at most number_of_structures of the remaining structures are kept. + Defaults to None. + number_of_structures: number of structures to return. The top number_of_structures with the highest value of + evaluation_criteria are returned. Either this or criteria_threshold should be specified. Defaults to None. + evaluation_criteria: name of the evaluation criteria. Defaults to nbh_grades (MaxVol coefficient in MTP) + structure_index: name of the column in the dataframe with the index identifying the structure. Defaults to + structure_index. + + Returns: + list of the structures with a bad evaluation criteria. Length of the list depends on criteria_threshold and + number_of_structures. 
+ """ + assert criteria_threshold is not None or number_of_structures is not None, \ + "criteria_threshold or number_of_structures should be set." + # get the highest evaluation_criteria for each structure i.e. only the worst atom counts for structure selection + criteria_by_structure = prediction_df[[evaluation_criteria, structure_index]].groupby(structure_index).max() + # find the top number_of_structures + structures_indices = criteria_by_structure.sort_values(by=evaluation_criteria, ascending=False) + if criteria_threshold is not None: + structures_indices = structures_indices[structures_indices[evaluation_criteria] >= criteria_threshold] + structures_indices = structures_indices.index.to_list() + if number_of_structures is not None: + structures_indices = structures_indices[:number_of_structures] + structures_to_retrain = [] + for idx in structures_indices: + structures_to_retrain.append(prediction_df[prediction_df[structure_index] == idx]) + return structures_to_retrain + + +def extract_target_region(structure_df: pd.DataFrame, + extraction_radius: float, + evaluation_criteria: str ='nbh_grades') -> pd.DataFrame: + """Extract the atom with the worst evaluation criteria and all the atoms within a distance extraction_radius. + + Args: + structure_df: dataframe with the atomic positions and the evaluation criteria (e.g. MaxVol value) + extraction_radius: include all atoms within this distance of the targeted atom + evaluation_criteria: name of the evaluation criteria. Defaults to nbh_grades (maxvol in MTP) + + Returns: + dataframe with the atomic coordinates in columns x, y, z + """ + # extract the worst atom and a region around it of radius extraction_radius + # TODO better method to determine radius: number of atoms ? + target_atom = structure_df[evaluation_criteria].idxmax() + target_position = structure_df.loc[target_atom][['x', 'y', 'z']] + # TODO periodicity... 
+ structure_df.loc[:, 'distance_squared'] = structure_df.apply( + lambda x: sum([(x[i] - target_position[i]) ** 2 for i in ['x', 'y', 'z']]), axis=1) + atom_positions = structure_df.loc[structure_df['distance_squared'] <= extraction_radius ** 2, ['x', 'y', 'z']] + return atom_positions \ No newline at end of file diff --git a/crystal_diffusion/models/mlip/mtp.py b/crystal_diffusion/models/mlip/mtp.py index 474b6c25..5ff09839 100644 --- a/crystal_diffusion/models/mlip/mtp.py +++ b/crystal_diffusion/models/mlip/mtp.py @@ -21,7 +21,8 @@ from monty.tempfile import ScratchDir from pymatgen.core import Structure -from crystal_diffusion.mlip.mtp_utils import MTPInputs, crawl_lammps_directory, prepare_mtp_inputs_from_lammps +from crystal_diffusion.mlip.mtp_utils import (MTPInputs, crawl_lammps_directory, prepare_mtp_inputs_from_lammps, + concat_mtp_inputs) @dataclass(kw_only=True) @@ -61,7 +62,8 @@ def __init__(self, mtp_args: MTPArguments): self.fitted_mtp = None self.elements = None self.mtp_args = mtp_args - os.makedirs(mtp_args.fitted_mtp_savedir, exist_ok=True) + self.savedir = mtp_args.fitted_mtp_savedir + os.makedirs(self.savedir, exist_ok=True) def to_lammps_format(self): """Write the trained MTP in a LAMMPS compatible format.""" @@ -78,44 +80,51 @@ def to_lammps_format(self): pass def evaluate(self, - test_structures: List[Structure], - test_energies: List[float], - test_forces: List[List[float]], - test_stresses: Optional[List[List[float]]] = None, + dataset: MTPInputs, + mlip_name: str = 'mtp_fitted.almtp' ) -> Tuple[pd.DataFrame, pd.DataFrame]: """Evaluate energies, forces, stresses and MaxVol gamma factor of structures with trained MTP. Args: - test_structures: evaluation set of pymatgen Structure Objects. - test_energies: list of total energies of each structure to evaluation in test_structures list. - test_forces: list of calculated (m, 3) forces of each evaluation structure with m atoms in structures list. - m can be varied with each single structure case. 
- test_stresses (optional): list of calculated (6, ) virial stresses of each evaluation structure in - test_structures list. If None, do not evaluate on stresses. Default to None. + dataset: MTPInputs dataclass with the following elements: + structures: The list of Pymatgen Structure object. + energies: List of total energies of each structure in structures list. + forces: List of (m, 3) forces array of each structure with m atoms in structures list. + m can be varied with each single structure case. + mlip_name: str : filename for the trained MTP. Defaults to mtp_fitted.almtp Returns: dataframe with ground truth energies, forces dataframe with predicted energies, forces, MaxVol gamma (nbh grades) """ - if self.fitted_mtp is None: - raise AttributeError('MTP was not trained. Please call train() before evaluate().') + if not mlip_name.endswith('.almtp'): + mlip_name += '.almtp' + assert os.path.exists(mlip_name), f"Trained MTP does not exists: {mlip_name}" original_file = "original.cfgs" predict_file = "predict.cfgs" - test_structures, test_forces, test_stresses = check_structures_forces_stresses( - test_structures, test_forces, test_stresses + + # TODO if forces are not available... 
+ test_structures, test_forces, _ = check_structures_forces_stresses( + dataset.structure, dataset.forces, stresses=None ) - predict_pool = pool_from(test_structures, test_energies, test_forces, test_stresses) + predict_pool = pool_from(test_structures, dataset.energy, test_forces) + local_mtp_name = "mtp.almtp" with ScratchDir("."): # mlip needs a tmp_work_dir - we will manually copy relevant outputs elsewhere # write the structures to evaluate in a mlp compatible format original_file = self.write_cfg(original_file, cfg_pool=predict_pool) + # TODO how to handle when GT is not available df_orig = self.read_cfgs(original_file, nbh_grade=False) # read original values as a DataFrame + # copy the trained mtp in the scratchdir + shutil.copyfile(mlip_name, os.path.join(os.getcwd(), local_mtp_name)) # calculate_grade is the method to get the forces, energy & maxvol values - cmd = [self.mlp_command, "calculate_grade", self.fitted_mtp, original_file, predict_file] + cmd = [self.mlp_command, "calculate_grade", local_mtp_name, original_file, predict_file] predict_file += '.0' # added by mlp... stdout, rc = self._call_mlip(cmd) + + # check that MTP was called properly if rc != 0: error_msg = f"mlp exited with return code {rc}" msg = stdout.decode("utf-8").split("\n")[:-1] @@ -125,7 +134,7 @@ def evaluate(self, except Exception: error_msg += msg[-1] raise RuntimeError(error_msg) - + # read the config df_predict = self.read_cfgs(predict_file, nbh_grade=True) return df_orig, df_predict @@ -257,12 +266,57 @@ def _call_cmd_to_stdout(cmd: List[str], output_file: TextIO): def prepare_dataset_from_lammps( root_data_dir: str, atom_dict: Dict[int, str], - mode: str = "train" + mode: str = "train", + get_forces: bool = True, ) -> MTPInputs: + """Get the LAMMPS in a folder and organize them as inputs for a MTP + + Args: + root_data_dir: folder to read. Each LAMMPS sample is expected to be in a subfolder. + atom_dict: map from LAMMPS index to atom name. e.g. 
{1: 'Si'} + mode: subset of samples to get. Data from root_data_dir/*mode*/ folders will be parsed. Defaults to train. + get_forces: if True, get the forces from the samples. Defaults to True. + + Returns: + inputs for MTP in the MTPInputs dataclass + """ lammps_outputs, thermo_outputs = crawl_lammps_directory(root_data_dir, mode) - mtp_dataset = prepare_mtp_inputs_from_lammps(lammps_outputs, thermo_outputs, atom_dict) + mtp_dataset = prepare_mtp_inputs_from_lammps(lammps_outputs, thermo_outputs, atom_dict, get_forces=get_forces) return mtp_dataset + @staticmethod + def prepare_dataset_from_numpy( + cartesian_positions: np.ndarray, + box: np.ndarray, + forces: np.ndarray, + energy: float, + atom_type: np.ndarray, + atom_dict: Dict[int, str] = {1: 'Si'} + ): + structure = Structure( + lattice=box, + species=[atom_dict[x] for x in atom_type], + coords=cartesian_positions, + coords_are_cartesian=True + ) + forces = forces.tolist() # from Nx3 np array to a list of length N where each element is a list of 3 forces + return MTPInputs(structure=[structure], forces=[forces], energy=[energy]) + + @staticmethod + def merge_inputs(mtp_inputs: List[MTPInputs]) -> MTPInputs: + """Merge a list of MTPInputs in a single MTPInputs. + + Args: + mtp_inputs: list of MTPInputs + + Returns: + merged MTPInputs + """ + merged_inputs = MTPInputs(structure=[], forces=[], energy=[]) + for x in mtp_inputs: + merged_inputs = concat_mtp_inputs(merged_inputs, x) + return merged_inputs + def train(self, dataset: MTPInputs, mlip_name: str = 'mtp_fitted.almtp') -> str: """Training data with moment tensor method using MLIP-3. 
@@ -339,6 +393,6 @@ def train(self, dataset: MTPInputs, mlip_name: str = 'mtp_fitted.almtp') -> str: error_msg += msg[-1] raise RuntimeError(error_msg) # copy the fitted mtp outside the working directory - self.fitted_mtp = os.path.join(self.mtp_args.fitted_mtp_savedir, save_fitted_mtp) + self.fitted_mtp = os.path.join(self.savedir, save_fitted_mtp) shutil.copyfile(save_fitted_mtp, self.fitted_mtp) return self.fitted_mtp From a7707c2f8a97cbfad8957fb121ea036ce20d0efb Mon Sep 17 00:00:00 2001 From: Simon Blackburn Date: Thu, 15 Aug 2024 08:46:56 -0400 Subject: [PATCH 06/13] missing files --- crystal_diffusion/mlip/mtp_utils.py | 48 ++++++++++++++----- .../config/mtp_training.yaml | 15 ++++-- 2 files changed, 46 insertions(+), 17 deletions(-) diff --git a/crystal_diffusion/mlip/mtp_utils.py b/crystal_diffusion/mlip/mtp_utils.py index 9f2f974c..eedae3fe 100644 --- a/crystal_diffusion/mlip/mtp_utils.py +++ b/crystal_diffusion/mlip/mtp_utils.py @@ -1,7 +1,7 @@ import os import re from dataclasses import dataclass -from typing import Any, Dict, List, Tuple +from typing import Any, Dict, List, Optional, Tuple import numpy as np import yaml @@ -12,24 +12,25 @@ class MTPInputs: """Create a dataclass to train or evaluate a MTP model.""" structure: List[Structure] - forces: List[List[float]] + forces: List[List[List[float]]] # num samples x num atoms x spatial dimension energy: List[float] -def extract_structure_and_forces_from_file(filename: str, atom_dict: Dict[int, Any]) -> \ - Tuple[List[Structure], List[List[float]]]: +def extract_structure_and_forces_from_file(filename: str, atom_dict: Dict[int, Any], forces_avail: bool = True) -> \ + Tuple[List[Structure], Optional[List[List[float]]]]: """Convert LAMMPS yaml output in a format compatible with MTP training and evaluation methods. 
Args: filename: path to LAMMPS output file in yaml format atom_dict: mapping from LAMMPS atom indices to atom type (atomic number as int or atom name as str) + forces_avail (optional): if True, get the forces from the LAMMPS output file. Defaults to True. Returns: list of pymatgen Structure containing the atoms and their positions - list of forces (n x 3) for each atom + list of forces (n x 3) for each atom. None if forces_avail is False """ structures = [] - forces = [] + forces = [] if forces_avail else None with (open(filename, 'r') as f): l_yaml = yaml.safe_load_all(f) for d in l_yaml: # loop over LAMMPS outputs and convert in pymatgen Structure objects @@ -47,9 +48,10 @@ def extract_structure_and_forces_from_file(filename: str, atom_dict: Dict[int, A coords=coords, coords_are_cartesian=True) structures.append(pm_structure) - force_idx = [d['keywords'].index(x) for x in ['fx', 'fy', 'fz']] - structure_forces = [[x[i] for i in force_idx] for x in d['data']] - forces.append(structure_forces) + if forces_avail: + force_idx = [d['keywords'].index(x) for x in ['fx', 'fy', 'fz']] + structure_forces = [[x[i] for i in force_idx] for x in d['data']] + forces.append(structure_forces) return structures, forces @@ -72,7 +74,8 @@ def extract_energy_from_thermo_log(filename: str) -> List[float]: def prepare_mtp_inputs_from_lammps(output_yaml: List[str], thermo_yaml: List[str], - atom_dict: Dict[int, Any] + atom_dict: Dict[int, Any], + get_forces: bool = True, ) -> MTPInputs: """Convert a list of LAMMPS output files and thermodynamic output files to MTP input format. @@ -80,9 +83,10 @@ def prepare_mtp_inputs_from_lammps(output_yaml: List[str], output_yaml: list of LAMMPS output files as yaml. thermo_yaml: list of LAMMPS thermodynamic output files as yaml. atom_dict: mapping of LAMMPS indices to atom type. + get_forces (optional): if True, get the forces. Defaults to True. 
Returns: - dataclass used to + dataclass used as inputs to train and evaluation a MTP model """ mtp_inputs = { 'structure': [], @@ -90,9 +94,9 @@ def prepare_mtp_inputs_from_lammps(output_yaml: List[str], 'forces': [] } for filename in output_yaml: - structures, forces = extract_structure_and_forces_from_file(filename, atom_dict) + structures, forces = extract_structure_and_forces_from_file(filename, atom_dict, get_forces) mtp_inputs['structure'] += structures - mtp_inputs['forces'] += forces + mtp_inputs['forces'] += forces # will be None if get_forces is False for filename in thermo_yaml: mtp_inputs['energy'] += extract_energy_from_thermo_log(filename) mtp_inputs = MTPInputs(structure=mtp_inputs['structure'], @@ -121,3 +125,21 @@ def crawl_lammps_directory(folder_name: str, folder_name_pattern: str= "train") lammps_output_files.extend([os.path.join(dirpath, f) for f in filenames if f.endswith("dump.yaml")]) thermo_output_files.extend([os.path.join(dirpath, f) for f in filenames if f.endswith("thermo.yaml")]) return lammps_output_files, thermo_output_files + + +def concat_mtp_inputs(input1: MTPInputs, input2: MTPInputs) -> MTPInputs: + """Merge two MTP inputs data class. 
+ + Args: + input1: first MTPInputs dataset + input2: second MTPInputs dataset + + Returns: + concatenated MTPInputs dataset + """ + concat_inputs = MTPInputs( + structure=input1.structure + input2.structure, + forces=input1.forces + input2.forces, + energy=input1.energy + input2.energy + ) + return concat_inputs diff --git a/experiments/active_learning_benchmark/config/mtp_training.yaml b/experiments/active_learning_benchmark/config/mtp_training.yaml index 1c835a78..b9e35944 100644 --- a/experiments/active_learning_benchmark/config/mtp_training.yaml +++ b/experiments/active_learning_benchmark/config/mtp_training.yaml @@ -1,5 +1,5 @@ active_learning_data: - _target_: crystal_diffusion.active_learning_loop.benchmark.ActiveLearningDataArguments + _target_: crystal_diffusion.active_learning_loop.activelearning_dataclasses.ActiveLearningDataArguments training_data_dir: /Users/simonb/ic-collab/courtois_collab/crystal_diffusion/data/active_loop_debug/ evaluation_data_dir: /Users/simonb/ic-collab/courtois_collab/crystal_diffusion/data/active_loop_debug/ output_dir: /Users/simonb/ic-collab/courtois_collab/crystal_diffusion/experiments/benchmark/debug/ @@ -26,8 +26,15 @@ mlip: # Choose from "vibrations", "molecules" and "structures". Defaults to "vibration". 
structure_evaluation: - _target_: crystal_diffusion.active_learning_loop.benchmark.StructureEvaluationArguments + _target_: crystal_diffusion.active_learning_loop.activelearning_dataclasses.StructureEvaluationArguments evaluation_criteria: 'nbh_grades' # maxvol coefficient name in the mtp outputs criteria_threshold: 10.0 # atoms with a MaxVol value under this value are considered bad - number_of_structures: None # alternatively, take the N worst samples from the evaluation set - extraction_radius: float = 3 # extract atoms within this radius from the \ No newline at end of file + # number_of_structures: None # alternatively, take the N worst samples from the evaluation set + extraction_radius: 3 # extract atoms within this radius from the worst atom in a structure + +repainting_model: + _target_: crystal_diffusion.active_learning_loop.activelearning_dataclasses.RepaintingArguments + model: dev_dummy # placeholder for development + +oracle: + _target_: crystal_diffusion.active_learning_loop.oracle.LAMMPS_for_active_learning From 80eb9702ffe3b1ed57abddf2cef4bbdf7fb5781a Mon Sep 17 00:00:00 2001 From: Simon Blackburn Date: Thu, 22 Aug 2024 09:29:44 -0400 Subject: [PATCH 07/13] add simple metric for MaxVol improvement --- .../active_learning_loop/benchmark.py | 31 ++++++++++++++++--- .../config/mtp_training.yaml | 4 +-- 2 files changed, 28 insertions(+), 7 deletions(-) diff --git a/crystal_diffusion/active_learning_loop/benchmark.py b/crystal_diffusion/active_learning_loop/benchmark.py index a01fb8c2..dc56e606 100644 --- a/crystal_diffusion/active_learning_loop/benchmark.py +++ b/crystal_diffusion/active_learning_loop/benchmark.py @@ -254,9 +254,26 @@ def round_of_active_learning_loop(self, trained_mlip: Optional[MTPWithMLIP3] = N new_pred_df = self.evaluate_mlip(mlip_name=new_mtp) return pred_df, new_pred_df - def evaluate_mtp_update(self): - # TODO for benchmarking, compare results before and after finetuning - pass + def evaluate_mtp_update(self, 
original_predictions: pd.DataFrame, updated_predictions) -> Tuple[float, float]: + """Find the evaluation criteria in the original predictions and the corresponding value after retraining. + + Args: + original_predictions: MLIP predictions before retraining + updated_predictions: MLIP predictions after retraining + + Returns: + worst evaluation_criteria (e.g. MaxVol) in the original evaluation + corresponding value after retraining with new samples. Not guaranteed to be the maximum value. + """ + # find the highest MaxVol in the original predictions - identified by the atom index and structure index + # TODO we assume a max - but it could be a min i + criteria = self.eval_config.evaluation_criteria + atom_index, structure_index, original_value = original_predictions.iloc[ + original_predictions[criteria].argmax()][['atom_index', 'structure_index', criteria]] + updated_value = updated_predictions.loc[ + (updated_predictions['atom_index'] == atom_index) & + (updated_predictions['structure_index'] == structure_index), criteria].values.item() + return original_value, updated_value def get_arguments() -> argparse.Namespace: @@ -277,8 +294,12 @@ def main(): config_path = "/Users/simonb/ic-collab/courtois_collab/crystal_diffusion/experiments/active_learning_benchmark/" config_path = os.path.join(config_path, "config", "mtp_training.yaml") al_loop = ActiveLearningLoop(config_path) - al_loop.round_of_active_learning_loop() - + # initial_df, new_df = al_loop.round_of_active_learning_loop() + # initial_df.to_csv(os.path.join(os.getcwd(), '../../experiments/active_learning_benchmark/debug/before_tuning.csv'), index=False) + # new_df.to_csv(os.path.join(os.getcwd(), '../../experiments/active_learning_benchmark/debug/after_tuning.csv'), index=False) + initial_df = pd.read_csv(os.path.join(os.getcwd(), '../../experiments/active_learning_benchmark/debug/before_tuning.csv')) + new_df = pd.read_csv(os.path.join(os.getcwd(), 
'../../experiments/active_learning_benchmark/debug/after_tuning.csv')) + al_loop.evaluate_mtp_update(initial_df, new_df) if __name__ == '__main__': main() diff --git a/experiments/active_learning_benchmark/config/mtp_training.yaml b/experiments/active_learning_benchmark/config/mtp_training.yaml index b9e35944..e6ab83b0 100644 --- a/experiments/active_learning_benchmark/config/mtp_training.yaml +++ b/experiments/active_learning_benchmark/config/mtp_training.yaml @@ -1,7 +1,7 @@ active_learning_data: _target_: crystal_diffusion.active_learning_loop.activelearning_dataclasses.ActiveLearningDataArguments - training_data_dir: /Users/simonb/ic-collab/courtois_collab/crystal_diffusion/data/active_loop_debug/ - evaluation_data_dir: /Users/simonb/ic-collab/courtois_collab/crystal_diffusion/data/active_loop_debug/ + training_data_dir: /Users/simonb/ic-collab/courtois_collab/crystal_diffusion/data/al_baseline_dev/v1/ + evaluation_data_dir: /Users/simonb/ic-collab/courtois_collab/crystal_diffusion/data/al_baseline_dev/v1/ output_dir: /Users/simonb/ic-collab/courtois_collab/crystal_diffusion/experiments/benchmark/debug/ mlip: From 4102713aab4479a17d0d8d56b2a44043af8f2161 Mon Sep 17 00:00:00 2001 From: Simon Blackburn Date: Wed, 28 Aug 2024 10:04:41 -0400 Subject: [PATCH 08/13] unit tests for al benchmark --- tests/active_learning_loop/test_benchmark.py | 147 +++++++++++++++++++ 1 file changed, 147 insertions(+) create mode 100644 tests/active_learning_loop/test_benchmark.py diff --git a/tests/active_learning_loop/test_benchmark.py b/tests/active_learning_loop/test_benchmark.py new file mode 100644 index 00000000..ac712970 --- /dev/null +++ b/tests/active_learning_loop/test_benchmark.py @@ -0,0 +1,147 @@ +import os +from unittest.mock import MagicMock, mock_open + +import pandas as pd +import pytest + +from crystal_diffusion.active_learning_loop.benchmark import ActiveLearningLoop + + +class TestActiveLearningLoop: + @pytest.fixture + def mock_yaml_config(self): + return """ + 
active_learning_data: + key1: value1 + mlip: + key2: value2 + structure_evaluation: + key3: value3 + repainting_model: + key4: value4 + oracle: + key5: value5 + """ + + @pytest.fixture + def meta_config(self): # mock a path to a meta_config yaml file + return "fake_config.yaml" + + @pytest.fixture + def mock_al_loop(self, mocker, mock_yaml_config, meta_config): + # Mock the open function to simulate reading the YAML file + mocker.patch("builtins.open", mock_open(read_data=mock_yaml_config)) + # Mock os.path.exists to always return True + mocker.patch("os.path.exists", return_value=True) + # Mock the instantiate function from hydra.utils + mock_instantiate = mocker.patch("crystal_diffusion.active_learning_loop.benchmark.instantiate") + mock_instantiate.side_effect = lambda x: x # Return the config itself for simplicity + + # Create an instance of ActiveLearningLoop + loop = ActiveLearningLoop(meta_config) + return loop + + def test_parse_config(self, mock_al_loop, mock_yaml_config, meta_config): + + # Assertions to verify that the attributes were correctly set + assert mock_al_loop.data_paths == {'key1': 'value1'} + assert mock_al_loop.mlip_model == {'key2': 'value2'} + assert mock_al_loop.eval_config == {'key3': 'value3'} + assert mock_al_loop.structure_generation == {'key4': 'value4'} + assert mock_al_loop.oracle == {'key5': 'value5'} + + # Verify that the file was opened and the path was checked + open.assert_called_once_with(meta_config, 'r') + os.path.exists.assert_called_once_with(meta_config) + + def test_train_mlip(self, mocker, mock_yaml_config, mock_al_loop): + # Mocking the mlip_model's methods + mock_mlip_model = MagicMock() + mock_mlip_model.prepare_dataset_from_lammps.return_value = "mock_training_set" + mock_mlip_model.train.return_value = "mock_trained_mlip_model" + mock_mlip_model.merge_inputs.return_value = "mock_training_set" + + # Inject the mocked mlip_model into the loop instance + mock_al_loop.mlip_model = mock_mlip_model + 
mock_al_loop.data_paths = MagicMock(training_data_dir="mock_training_data_dir") + + # Run the train_mlip method without providing a training_set + result = mock_al_loop.train_mlip(round=1) + + # Verify the methods were called with expected parameters + mock_mlip_model.prepare_dataset_from_lammps.assert_called_once_with( + root_data_dir="mock_training_data_dir", + atom_dict=mock_al_loop.atom_dict, + mode="train" + ) + + mock_mlip_model.train.assert_called_once_with("mock_training_set", mlip_name="mlip_round_1") + + # Verify the trained model path is correctly returned + assert result == "mock_trained_mlip_model" + + # Verify that the trained model is appended to the history + assert mock_al_loop.trained_mlips == ["mock_trained_mlip_model"] + + # Test when a training set is provided + custom_training_set = "custom_training_set" + result = mock_al_loop.train_mlip(round=2, training_set=custom_training_set) + + # The prepare_dataset_from_lammps should not be called since we provided a training_set + mock_mlip_model.prepare_dataset_from_lammps.assert_called_once() # No new call + mock_mlip_model.train.assert_called_with(custom_training_set, mlip_name="mlip_round_2") + + assert result == "mock_trained_mlip_model" + assert mock_al_loop.trained_mlips == ["mock_trained_mlip_model", "mock_trained_mlip_model"] + + def test_evaluate_mlip(self, mock_al_loop, tmpdir): + # Mocking the mlip_model's methods + mock_mlip_model = MagicMock() + mock_evaluation_dataset = "mock_evaluation_dataset" + mock_prediction_df = pd.DataFrame({"atom_index": [0, 1], "force": [1.0, 2.0]}) + + # Mocking return values for the prepare_dataset_from_lammps and evaluate methods + mock_mlip_model.prepare_dataset_from_lammps.return_value = mock_evaluation_dataset + mock_mlip_model.evaluate.return_value = (None, mock_prediction_df) + + loop = mock_al_loop + + # Inject the mocked mlip_model into the loop instance + loop.mlip_model = mock_mlip_model + loop.data_paths = 
MagicMock(evaluation_data_dir="mock_evaluation_data_dir") + + # Run the evaluate_mlip method without specifying mlip_name + result_df = loop.evaluate_mlip(round=1) + + # Verify the prepare_dataset_from_lammps method was called with expected parameters + mock_mlip_model.prepare_dataset_from_lammps.assert_called_once_with( + root_data_dir="mock_evaluation_data_dir", + atom_dict=loop.atom_dict, + mode="evaluation", + get_forces=True + ) + # Verify the evaluate method was called with the correct parameters + expected_mlip_name = os.path.join(mock_mlip_model.savedir, 'mlip_round_1.almtp') + mock_mlip_model.evaluate.assert_called_once_with(mock_evaluation_dataset, mlip_name=expected_mlip_name) + + # Verify the method returns the correct dataframe + pd.testing.assert_frame_equal(result_df, mock_prediction_df) + + # Run the evaluate_mlip method with a custom mlip_name + custom_mlip_name = "custom_mlip.almtp" + result_df = loop.evaluate_mlip(round=2, mlip_name=custom_mlip_name) + + # The evaluate method should be called with the custom mlip_name + mock_mlip_model.evaluate.assert_called_with(mock_evaluation_dataset, mlip_name=custom_mlip_name) + + pd.testing.assert_frame_equal(result_df, mock_prediction_df) + + # Test without forces_available + _ = loop.evaluate_mlip(round=3, forces_available=False) + + mock_mlip_model.prepare_dataset_from_lammps.assert_called_with( + root_data_dir="mock_evaluation_data_dir", + atom_dict=loop.atom_dict, + mode="evaluation", + get_forces=False + ) From 713ebdce2a50e852e62e5a1e85ab42751d1197ce Mon Sep 17 00:00:00 2001 From: Simon Blackburn Date: Wed, 28 Aug 2024 10:17:52 -0400 Subject: [PATCH 09/13] some flake8 and cleaning the main of benchmark --- .../active_learning_loop/benchmark.py | 29 ++++++------- .../active_learning_loop/config.yaml | 43 +++++++++++++++++++ .../local/active_learning/run_benchmark.sh | 5 +++ 3 files changed, 61 insertions(+), 16 deletions(-) create mode 100644 examples/config_files/active_learning_loop/config.yaml 
create mode 100644 examples/local/active_learning/run_benchmark.sh diff --git a/crystal_diffusion/active_learning_loop/benchmark.py b/crystal_diffusion/active_learning_loop/benchmark.py index dc56e606..af794e6e 100644 --- a/crystal_diffusion/active_learning_loop/benchmark.py +++ b/crystal_diffusion/active_learning_loop/benchmark.py @@ -1,6 +1,5 @@ import argparse import os -from dataclasses import dataclass from typing import Any, List, Optional, Tuple import numpy as np @@ -8,8 +7,9 @@ import yaml from hydra.utils import instantiate -from crystal_diffusion.active_learning_loop.utils import get_structures_for_retraining, extract_target_region -from crystal_diffusion.models.mlip.mtp import MTPWithMLIP3, MTPInputs +from crystal_diffusion.active_learning_loop.utils import ( + extract_target_region, get_structures_for_retraining) +from crystal_diffusion.models.mlip.mtp import MTPWithMLIP3 class ActiveLearningLoop: @@ -117,9 +117,10 @@ def get_bad_structures(self, prediction_df: pd.DataFrame) -> List[pd.DataFrame]: Returns: list of structures with a high uncertainty criteria. 
""" + num_structures = self.eval_config.number_of_structures structures_to_retrain = get_structures_for_retraining(prediction_df, criteria_threshold=self.eval_config.criteria_threshold, - number_of_structures=self.eval_config.number_of_structures, + number_of_structures=num_structures, evaluation_criteria=self.eval_config.evaluation_criteria) return structures_to_retrain @@ -160,7 +161,7 @@ def get_structure_candidate_from_generative_model(self, # and hydra instantiate return fixed_atoms else: - raise NotImplemented('Only dev_dummy is supported at the moment.') + raise NotImplementedError('Only dev_dummy is supported at the moment.') def new_structure_to_csv(self, new_structures: List[pd.DataFrame], round: int = 1): """Save the generated structures in a csv format in the output dir @@ -271,8 +272,8 @@ def evaluate_mtp_update(self, original_predictions: pd.DataFrame, updated_predic atom_index, structure_index, original_value = original_predictions.iloc[ original_predictions[criteria].argmax()][['atom_index', 'structure_index', criteria]] updated_value = updated_predictions.loc[ - (updated_predictions['atom_index'] == atom_index) & - (updated_predictions['structure_index'] == structure_index), criteria].values.item() + (updated_predictions['atom_index'] == atom_index) + & (updated_predictions['structure_index'] == structure_index), criteria].values.item() return original_value, updated_value @@ -283,23 +284,19 @@ def get_arguments() -> argparse.Namespace: args: arguments """ parser = argparse.ArgumentParser() - parser.add_argument('--mtp_config', help='path to data directory', required=True) + parser.add_argument('--config', help='path to data directory', required=True) args = parser.parse_args() return args def main(): - # args = get_arguments() + args = get_arguments() # TODO get mtp_config_path from the args - config_path = "/Users/simonb/ic-collab/courtois_collab/crystal_diffusion/experiments/active_learning_benchmark/" - config_path = os.path.join(config_path, 
"config", "mtp_training.yaml") + config_path = args.config al_loop = ActiveLearningLoop(config_path) - # initial_df, new_df = al_loop.round_of_active_learning_loop() - # initial_df.to_csv(os.path.join(os.getcwd(), '../../experiments/active_learning_benchmark/debug/before_tuning.csv'), index=False) - # new_df.to_csv(os.path.join(os.getcwd(), '../../experiments/active_learning_benchmark/debug/after_tuning.csv'), index=False) - initial_df = pd.read_csv(os.path.join(os.getcwd(), '../../experiments/active_learning_benchmark/debug/before_tuning.csv')) - new_df = pd.read_csv(os.path.join(os.getcwd(), '../../experiments/active_learning_benchmark/debug/after_tuning.csv')) + initial_df, new_df = al_loop.round_of_active_learning_loop() al_loop.evaluate_mtp_update(initial_df, new_df) + if __name__ == '__main__': main() diff --git a/examples/config_files/active_learning_loop/config.yaml b/examples/config_files/active_learning_loop/config.yaml new file mode 100644 index 00000000..04e70faf --- /dev/null +++ b/examples/config_files/active_learning_loop/config.yaml @@ -0,0 +1,43 @@ +active_learning_data: + _target_: crystal_diffusion.active_learning_loop.activelearning_dataclasses.ActiveLearningDataArguments + training_data_dir: /Users/simonb/ic-collab/courtois_collab/crystal_diffusion/data/al_baseline_dev/v1/ + evaluation_data_dir: /Users/simonb/ic-collab/courtois_collab/crystal_diffusion/data/al_baseline_dev/v1/ + output_dir: /Users/simonb/ic-collab/courtois_collab/crystal_diffusion/experiments/benchmark/debug/ + +mlip: + _target_: crystal_diffusion.models.mlip.mtp.MTPWithMLIP3 + mtp_args: + _target_: crystal_diffusion.models.mlip.mtp.MTPArguments + mlip_path: /Users/simonb/ic-collab/courtois_collab/crystal_diffusion/mlip-3/ # path to MLIP3 library + name: None # MTP + param: None + unfitted_mtp: 08.almtp # Define the initial mtp file. 
Default to 08g.amltp + fitted_mtp_savedir: /Users/simonb/ic-collab/courtois_collab/crystal_diffusion/experiments/benchmark/debug/mtp/ # save directory for the fitted MTP. Defaults to '../' (current wd) + max_dist: 5.0 # The actual radial cutoff. Defaults to 5. + radial_basis_size: 8 # Relevant to number of radial basis function. Defaults to 8. + max_iter: 1000 # The number of maximum iteration. Defaults to 1000. + energy_weight: 1.0 # The weight of energy. Defaults to 1 + force_weight: 1e-2 # The weight of forces. Defaults to 1e-2 + stress_weight: 1e-3 # The weight of stresses. Zero-weight can be assigned. Defaults to 1e-3. + init_params: same # how to initialize parameters if a potential was not pre-fitted: "same" or "random". + scale_by_force: 0 # If > 0 then configurations near equilibrium get more weight. Defaults to 0. + bfgs_conv_tol: 1e-3 # Stop training if error dropped by a factor smaller than this over 50 BFGS iterations. + weighting: vibration # How to weight configuration with different sizes relative to each other. + # Choose from "vibrations", "molecules" and "structures". Defaults to "vibration". 
+ +structure_evaluation: + _target_: crystal_diffusion.active_learning_loop.activelearning_dataclasses.StructureEvaluationArguments + evaluation_criteria: 'nbh_grades' # maxvol coefficient name in the mtp outputs + criteria_threshold: 10.0 # atoms with a MaxVol value under this value are considered bad + # number_of_structures: None # alternatively, take the N worst samples from the evaluation set + extraction_radius: 3 # extract atoms within this radius from the worst atom in a structure + +repainting_model: + _target_: crystal_diffusion.active_learning_loop.activelearning_dataclasses.RepaintingArguments + model: dev_dummy # placeholder for development + +oracle: + _target_: crystal_diffusion.active_learning_loop.oracle.LAMMPS_for_active_learning + +initial_df_save_path: /Users/simonb/ic-collab/courtois_collab/crystal_diffusion/experiments/active_learning_benchmark/debug/before_tuning.csv +updated_df_save_path: /Users/simonb/ic-collab/courtois_collab/crystal_diffusion/experiments/active_learning_benchmark/debug/after_tuning.csv \ No newline at end of file diff --git a/examples/local/active_learning/run_benchmark.sh b/examples/local/active_learning/run_benchmark.sh new file mode 100644 index 00000000..20c65901 --- /dev/null +++ b/examples/local/active_learning/run_benchmark.sh @@ -0,0 +1,5 @@ +#!/bin/bash + +CONFIG=../../config_files/active_learning_loop/config.yaml + +python ../../../crystal_diffusion/active_learning_loop/benchmark.py --config $CONFIG \ No newline at end of file From f83f766b78e4d794c49a86d6ebea44438a37d168 Mon Sep 17 00:00:00 2001 From: Simon Blackburn Date: Wed, 28 Aug 2024 10:59:18 -0400 Subject: [PATCH 10/13] fixing mtp unit tests and some flake errors --- crystal_diffusion/mlip/mtp_utils.py | 29 +++++++++++++- crystal_diffusion/models/mlip/mtp.py | 9 +++-- tests/models/test_mtp.py | 60 ++++++++++++++++------------ 3 files changed, 68 insertions(+), 30 deletions(-) diff --git a/crystal_diffusion/mlip/mtp_utils.py 
b/crystal_diffusion/mlip/mtp_utils.py index eedae3fe..d2cbf298 100644 --- a/crystal_diffusion/mlip/mtp_utils.py +++ b/crystal_diffusion/mlip/mtp_utils.py @@ -4,8 +4,10 @@ from typing import Any, Dict, List, Optional, Tuple import numpy as np +import pandas as pd import yaml from pymatgen.core import Structure +from sklearn.metrics import mean_absolute_error @dataclass(kw_only=True) @@ -105,7 +107,7 @@ def prepare_mtp_inputs_from_lammps(output_yaml: List[str], return mtp_inputs -def crawl_lammps_directory(folder_name: str, folder_name_pattern: str= "train") -> Tuple[List[str], List[str]]: +def crawl_lammps_directory(folder_name: str, folder_name_pattern: str = "train") -> Tuple[List[str], List[str]]: """Crawl through a folder and find the LAMMPS output files in folders containing a specified pattern in their name. LAMMPS outputs should end with dump.yaml and Thermondynamics variables files should end with thermo.yaml @@ -143,3 +145,28 @@ def concat_mtp_inputs(input1: MTPInputs, input2: MTPInputs) -> MTPInputs: energy=input1.energy + input2.energy ) return concat_inputs + + +def get_metrics_from_pred(df_orig: pd.DataFrame, df_predict: pd.DataFrame) -> Tuple[float, float]: + """Get mean absolute error on energy and forces from the outputs of MTP. 
+ + Args: + df_orig: dataframe with ground truth values + df_predict: dataframe with MTP predictions + + Returns: + MAE on energy in eV/atom and MAE on forces in eV/Å + """ + # from demo in maml + # get a single predicted energy per structure + predicted_energy = df_predict.groupby('structure_index').agg({'energy': 'mean', 'atom_index': 'count'}) + # normalize by number of atoms + predicted_energy = (predicted_energy['energy'] / predicted_energy['atom_index']).to_numpy() + # same for ground truth + gt_energy = df_orig.groupby('structure_index').agg({'energy': 'mean', 'atom_index': 'count'}) + gt_energy = (gt_energy['energy'] / gt_energy['atom_index']).to_numpy() + + predicted_forces = (df_predict[['fx', 'fy', 'fz']].to_numpy().flatten()) + gt_forces = (df_orig[['fx', 'fy', 'fz']].to_numpy().flatten()) + + return mean_absolute_error(predicted_energy, gt_energy), mean_absolute_error(predicted_forces, gt_forces) diff --git a/crystal_diffusion/models/mlip/mtp.py b/crystal_diffusion/models/mlip/mtp.py index 5ff09839..31ce468f 100644 --- a/crystal_diffusion/models/mlip/mtp.py +++ b/crystal_diffusion/models/mlip/mtp.py @@ -21,14 +21,15 @@ from monty.tempfile import ScratchDir from pymatgen.core import Structure -from crystal_diffusion.mlip.mtp_utils import (MTPInputs, crawl_lammps_directory, prepare_mtp_inputs_from_lammps, - concat_mtp_inputs) +from crystal_diffusion.mlip.mtp_utils import (MTPInputs, concat_mtp_inputs, + crawl_lammps_directory, + prepare_mtp_inputs_from_lammps) @dataclass(kw_only=True) class MTPArguments: mlip_path: str # path to MLIP3 library - name: Optional[str]= None # MTP + name: Optional[str] = None # MTP param: Optional[Dict[Any, Any]] = None unfitted_mtp: str = "08.almtp" # Define the initial mtp file. Default to 08g.amltp fitted_mtp_savedir: str = '../' # save directory for the fitted MTP. Defaults to '../' (current wd) @@ -106,7 +107,7 @@ def evaluate(self, # TODO if forces are not available... 
test_structures, test_forces, _ = check_structures_forces_stresses( - dataset.structure, dataset.forces, stresses=None + dataset.structure, dataset.forces, None ) predict_pool = pool_from(test_structures, dataset.energy, test_forces) local_mtp_name = "mtp.almtp" diff --git a/tests/models/test_mtp.py b/tests/models/test_mtp.py index 5431dd15..13a661ae 100644 --- a/tests/models/test_mtp.py +++ b/tests/models/test_mtp.py @@ -8,10 +8,11 @@ from pymatgen.core import Structure from sklearn.metrics import mean_absolute_error -from crystal_diffusion.models.mtp import MTPWithMLIP3 -from crystal_diffusion.train_mtp import ( - extract_energy_from_thermo_log, extract_structure_and_forces_from_file, - get_metrics_from_pred, prepare_mtp_inputs_from_lammps) +from crystal_diffusion.mlip.mtp_utils import ( + MTPInputs, extract_energy_from_thermo_log, + extract_structure_and_forces_from_file, get_metrics_from_pred, + prepare_mtp_inputs_from_lammps) +from crystal_diffusion.models.mlip.mtp import MTPArguments, MTPWithMLIP3 class FakeStructure: @@ -38,15 +39,15 @@ def mock_popen(mocker): # Mock the external dependencies and method calls within the MTPWithMLIP3.train method -def test_train(mocker, mock_popen): +def test_train(mocker, mock_popen, tmpdir): # Mock os.path.exists to always return True mocker.patch("os.path.exists", return_value=True) # Mock check_structures_forces_stresses to return a value without needing real input - mocker.patch("crystal_diffusion.models.mtp.check_structures_forces_stresses", side_effect=passthrough) + mocker.patch("crystal_diffusion.models.mlip.mtp.check_structures_forces_stresses", side_effect=passthrough) # Mock pool_from to return a simplified pool object - mocker.patch("crystal_diffusion.models.mtp.pool_from", return_value="simple_pool_object") + mocker.patch("crystal_diffusion.models.mlip.mtp.pool_from", return_value="simple_pool_object") # Mock self.write_cfg to simulate creating a config file without file operations 
mocker.patch.object(MTPWithMLIP3, "write_cfg", return_value="mock_filename.cfg") @@ -54,19 +55,23 @@ def test_train(mocker, mock_popen): mocker.patch("shutil.copyfile", return_value=None) # Initialize MTPWithMLIP3 with mock parameters - model = MTPWithMLIP3(mlip_path="/mock/path", name="test_model") + mtp_args = MTPArguments( + mlip_path="/mock/path", + name="test_model", + unfitted_mtp="08.almtp", + fitted_mtp_savedir=tmpdir + ) + model = MTPWithMLIP3(mtp_args) # Call the train method + mtp_inputs = MTPInputs( + structure=[FakeStructure(['H', 'O']), FakeStructure(['Si'])], + forces=[], + energy=[1, 2] + ) - return_code = model.train( - train_structures=[FakeStructure(['H', 'O']), FakeStructure(['Si'])], - train_energies=[1, 2], - train_forces=[], - train_stresses=[], - unfitted_mtp="08.almtp", - fitted_mtp_savedir="/mock/dir" + _ = model.train( + mtp_inputs, ) - # Assert the expected results - assert return_code == 0 # The train method should return the mocked subprocess success return code from mock_open # Assert that mocked methods were called model.write_cfg.assert_called() @@ -91,19 +96,17 @@ def mtp_instance(mocker): return instance -# def test_evaluate(mocker, fake_structure, mtp_instance, mock_popen): def test_evaluate(mocker, fake_structure, mtp_instance, mock_popen): test_structures = [fake_structure] test_energies = [1.0] test_forces = [[[0, 0, 0]]] - test_stresses = None # or appropriate mock stresses # Mock check_structures_forces_stresses to return the arguments unmodified - mocker.patch("crystal_diffusion.models.mtp.check_structures_forces_stresses", + mocker.patch("crystal_diffusion.models.mlip.mtp.check_structures_forces_stresses", side_effect=lambda s, f, st: (s, f, st)) # Mock pool_from to return a mocked value - mocker.patch("crystal_diffusion.models.mtp.pool_from", return_value="mock_pool") + mocker.patch("crystal_diffusion.models.mlip.mtp.pool_from", return_value="mock_pool") # Mock self.write_cfg to simulate creating a config file without 
file operations mocker.patch.object(MTPWithMLIP3, "write_cfg", return_value="mock_filename.cfg") @@ -116,8 +119,14 @@ def test_evaluate(mocker, fake_structure, mtp_instance, mock_popen): mocker.patch("shutil.copyfile", return_value=None) mocker.patch("os.path.exists", return_value=True) + mtp_inputs = MTPInputs( + structure=test_structures, + forces=test_forces, + energy=test_energies + ) + # Perform the test - df_orig, df_predict = mtp_instance.evaluate(test_structures, test_energies, test_forces, test_stresses) + df_orig, df_predict = mtp_instance.evaluate(mtp_inputs) # Assertions can vary based on the real output of `read_cfgs` # Here's an example assertion assuming `read_cfgs` returns a string in this mocked scenario @@ -209,12 +218,13 @@ def test_extract_energy_from_thermo_log(tmpdir): @pytest.fixture def mock_extract_energy_from_thermo_log(mocker): - return mocker.patch('crystal_diffusion.train_mtp.extract_energy_from_thermo_log', return_value=[]) + return mocker.patch('crystal_diffusion.mlip.mtp_utils.extract_energy_from_thermo_log', return_value=[]) @pytest.fixture def mock_extract_structure_and_forces(mocker): - return mocker.patch('crystal_diffusion.train_mtp.extract_structure_and_forces_from_file', return_value=([], [])) + return mocker.patch('crystal_diffusion.mlip.mtp_utils.extract_structure_and_forces_from_file', + return_value=([], [])) def test_prepare_mtp_inputs_from_lammps(mock_extract_structure_and_forces, mock_extract_energy_from_thermo_log, tmpdir): @@ -230,7 +240,7 @@ def test_prepare_mtp_inputs_from_lammps(mock_extract_structure_and_forces, mock_ # Verify that the mocks were called correctly assert mock_extract_structure_and_forces.call_count == 2 - mock_extract_structure_and_forces.assert_called_with(output_yaml_files[1], atom_dict) + mock_extract_structure_and_forces.assert_called_with(output_yaml_files[1], atom_dict, True) assert mock_extract_energy_from_thermo_log.call_count == 2 
mock_extract_energy_from_thermo_log.assert_called_with(thermo_yaml_files[1]) From 259c00307156ff1332de85953acc53770d9dc57e Mon Sep 17 00:00:00 2001 From: Simon Blackburn Date: Tue, 3 Sep 2024 12:33:15 -0400 Subject: [PATCH 11/13] flake8 fixes --- .../activelearning_dataclasses.py | 2 +- crystal_diffusion/active_learning_loop/utils.py | 4 ++-- crystal_diffusion/mlip/mtp_train.py | 16 +++++++--------- 3 files changed, 10 insertions(+), 12 deletions(-) diff --git a/crystal_diffusion/active_learning_loop/activelearning_dataclasses.py b/crystal_diffusion/active_learning_loop/activelearning_dataclasses.py index d4880f80..f0ffcd89 100644 --- a/crystal_diffusion/active_learning_loop/activelearning_dataclasses.py +++ b/crystal_diffusion/active_learning_loop/activelearning_dataclasses.py @@ -10,7 +10,7 @@ class ActiveLearningDataArguments: @dataclass(kw_only=True) class StructureEvaluationArguments: - evaluation_criteria: str ='nbh_grades' + evaluation_criteria: str = 'nbh_grades' criteria_threshold: float = 10 number_of_structures: int = None extraction_radius: float = 3 diff --git a/crystal_diffusion/active_learning_loop/utils.py b/crystal_diffusion/active_learning_loop/utils.py index 4fac563e..02833f89 100644 --- a/crystal_diffusion/active_learning_loop/utils.py +++ b/crystal_diffusion/active_learning_loop/utils.py @@ -46,7 +46,7 @@ def get_structures_for_retraining(prediction_df: pd.DataFrame, def extract_target_region(structure_df: pd.DataFrame, extraction_radius: float, - evaluation_criteria: str ='nbh_grades') -> pd.DataFrame: + evaluation_criteria: str = 'nbh_grades') -> pd.DataFrame: """Extract the atom with the worst evaluation criteria and all the atoms within a distance extraction_radious. 
Args: @@ -65,4 +65,4 @@ def extract_target_region(structure_df: pd.DataFrame, structure_df.loc[:, 'distance_squared'] = structure_df.apply( lambda x: sum([(x[i] - target_position[i]) ** 2 for i in ['x', 'y', 'z']]), axis=1) atom_positions = structure_df.loc[structure_df['distance_squared'] <= extraction_radius ** 2, ['x', 'y', 'z']] - return atom_positions \ No newline at end of file + return atom_positions diff --git a/crystal_diffusion/mlip/mtp_train.py b/crystal_diffusion/mlip/mtp_train.py index 8e449f80..ae70b0c9 100644 --- a/crystal_diffusion/mlip/mtp_train.py +++ b/crystal_diffusion/mlip/mtp_train.py @@ -3,28 +3,26 @@ Running the main() runs a debugging example. Entry points are train_mtp. """ import argparse -from dataclasses import dataclass -from typing import Any, Dict, List, NamedTuple, Tuple +from typing import Dict, Tuple -import numpy as np import pandas as pd -import yaml -from pymatgen.core import Structure from sklearn.metrics import mean_absolute_error +from crystal_diffusion.mlip.mtp_utils import (MTPInputs, + crawl_lammps_directory, + prepare_mtp_inputs_from_lammps) from crystal_diffusion.models.mlip.mtp import MTPWithMLIP3 -from crystal_diffusion.mlip.mtp_utils import prepare_mtp_inputs_from_lammps, crawl_lammps_directory, MTPInputs atom_dict = {1: 'Si'} -def prepare_dataset(root_data_dir: str, atom_dict: Dict[int, str], mode: str = "train") -> MTP_Inputs: +def prepare_dataset(root_data_dir: str, atom_dict: Dict[int, str], mode: str = "train") -> MTPInputs: lammps_outputs, thermo_outputs = crawl_lammps_directory(root_data_dir, mode) mtp_dataset = prepare_mtp_inputs_from_lammps(lammps_outputs, thermo_outputs, atom_dict) return mtp_dataset -def train_mtp(train_inputs: MTP_Inputs, mlip_folder_path: str, save_dir: str) -> MTPWithMLIP3: +def train_mtp(train_inputs: MTPInputs, mlip_folder_path: str, save_dir: str) -> MTPWithMLIP3: """Create and train an MTP potential. 
Args: @@ -52,7 +50,7 @@ def train_mtp(train_inputs: MTP_Inputs, mlip_folder_path: str, save_dir: str) -> return mtp -def evaluate_mtp(eval_inputs: MTP_Inputs, mtp: MTPWithMLIP3) -> Tuple[pd.DataFrame, pd.DataFrame]: +def evaluate_mtp(eval_inputs: MTPInputs, mtp: MTPWithMLIP3) -> Tuple[pd.DataFrame, pd.DataFrame]: """Evaluate a trained MTP potential. Args: From 9f6eca4602959f7b9691d889eb05fdb229dcff49 Mon Sep 17 00:00:00 2001 From: Simon Blackburn Date: Tue, 3 Sep 2024 12:52:59 -0400 Subject: [PATCH 12/13] some missing docstrings --- .../activelearning_dataclasses.py | 3 +++ .../active_learning_loop/benchmark.py | 10 +++++++--- .../active_learning_loop/oracle.py | 2 ++ crystal_diffusion/mlip/mtp_train.py | 11 +++++++++++ crystal_diffusion/models/mlip/mtp.py | 18 ++++++++++++++++-- 5 files changed, 39 insertions(+), 5 deletions(-) diff --git a/crystal_diffusion/active_learning_loop/activelearning_dataclasses.py b/crystal_diffusion/active_learning_loop/activelearning_dataclasses.py index f0ffcd89..c5333d3c 100644 --- a/crystal_diffusion/active_learning_loop/activelearning_dataclasses.py +++ b/crystal_diffusion/active_learning_loop/activelearning_dataclasses.py @@ -3,6 +3,7 @@ @dataclass(kw_only=True) class ActiveLearningDataArguments: + """Paths to the training, validaition datasets and output directory.""" training_data_dir: str # training data directory evaluation_data_dir: str # evaluation data directory output_dir: str # directory where to save the results @@ -10,6 +11,7 @@ class ActiveLearningDataArguments: @dataclass(kw_only=True) class StructureEvaluationArguments: + """Parameters related to the MLIP evaluation.""" evaluation_criteria: str = 'nbh_grades' criteria_threshold: float = 10 number_of_structures: int = None @@ -18,4 +20,5 @@ class StructureEvaluationArguments: @dataclass(kw_only=True) class RepaintingArguments: + """Parameters related to the structure generation model.""" model: str = 'dev_dummy' diff --git 
a/crystal_diffusion/active_learning_loop/benchmark.py b/crystal_diffusion/active_learning_loop/benchmark.py index af794e6e..7b231d3a 100644 --- a/crystal_diffusion/active_learning_loop/benchmark.py +++ b/crystal_diffusion/active_learning_loop/benchmark.py @@ -13,11 +13,14 @@ class ActiveLearningLoop: + """Method to train, evaluate and fine-tune a MLIP.""" def __init__(self, meta_config: str, ): - """Active learning benchmark with methods to train & evaluate a MLIP, isolate bad sub-structures, repaint - new structures and retrain the MLIP + """Active learning benchmark. + + Includes methods to train & evaluate a MLIP, isolate bad sub-structures, repaint new structures and retrain + the MLIP. Args: meta_config: path to a yaml configuration with the parameters for the modules in the class @@ -164,7 +167,7 @@ def get_structure_candidate_from_generative_model(self, raise NotImplementedError('Only dev_dummy is supported at the moment.') def new_structure_to_csv(self, new_structures: List[pd.DataFrame], round: int = 1): - """Save the generated structures in a csv format in the output dir + """Save the generated structures in a csv format in the output dir. 
Args: new_structures: structures proposed by the generative model @@ -290,6 +293,7 @@ def get_arguments() -> argparse.Namespace: def main(): + """Example to do an active learning loop once.""" args = get_arguments() # TODO get mtp_config_path from the args config_path = args.config diff --git a/crystal_diffusion/active_learning_loop/oracle.py b/crystal_diffusion/active_learning_loop/oracle.py index cb6edbe5..a1893f5c 100644 --- a/crystal_diffusion/active_learning_loop/oracle.py +++ b/crystal_diffusion/active_learning_loop/oracle.py @@ -8,7 +8,9 @@ class LAMMPS_for_active_learning: + """Oracle using LAMMPS to get the energy and forces on atoms.""" def __init__(self): + """Initialize the class.""" pass def __call__(self, diff --git a/crystal_diffusion/mlip/mtp_train.py b/crystal_diffusion/mlip/mtp_train.py index ae70b0c9..bcf527ed 100644 --- a/crystal_diffusion/mlip/mtp_train.py +++ b/crystal_diffusion/mlip/mtp_train.py @@ -17,6 +17,17 @@ def prepare_dataset(root_data_dir: str, atom_dict: Dict[int, str], mode: str = "train") -> MTPInputs: + """Prepare the dataset in a given directory into a MTP format. + + Args: + root_data_dir: root data directory to parse + atom_dict: map between an index and an atom type + mode: type of data to look for. e.g. train will only look at the folders in root_data_dirwith "train" in their + name. Defaults to train. 
+ + Returns: + data in the MTPInputs dataclass + """ lammps_outputs, thermo_outputs = crawl_lammps_directory(root_data_dir, mode) mtp_dataset = prepare_mtp_inputs_from_lammps(lammps_outputs, thermo_outputs, atom_dict) return mtp_dataset diff --git a/crystal_diffusion/models/mlip/mtp.py b/crystal_diffusion/models/mlip/mtp.py index 31ce468f..bc822cf6 100644 --- a/crystal_diffusion/models/mlip/mtp.py +++ b/crystal_diffusion/models/mlip/mtp.py @@ -28,6 +28,7 @@ @dataclass(kw_only=True) class MTPArguments: + """Arguments to train an MTP with the MLIP3 library.""" mlip_path: str # path to MLIP3 library name: Optional[str] = None # MTP param: Optional[Dict[Any, Any]] = None @@ -270,7 +271,7 @@ def prepare_dataset_from_lammps( mode: str = "train", get_forces: bool = True, ) -> MTPInputs: - """Get the LAMMPS in a folder and organize them as inputs for a MTP + """Get the LAMMPS in a folder and organize them as inputs for a MTP. Args: root_data_dir: folder to read. Each LAMMPS sample is expected to be in a subfolder. @@ -293,7 +294,20 @@ def prepare_dataset_from_numpy( energy: float, atom_type: np.ndarray, atom_dict: Dict[int, str] = {1: 'Si'} - ): + ) -> MTPInputs: + """Convert numpy array variables to a format compatible with MTP. + + Args: + cartesian_positions: atomic positions in Angstrom as a (n_atom, 3) array. + box: unit cell description as a (3, 3) array. + forces: forces on each atom as a (n_atom, 3) array + energy: energy of the configuration + atom_type: indices for each atom in the structure as a (n_atom,) array + atom_dict: map between atom indices and atom types + + Returns: + data formatted at an input for MTP. 
+ """ structure = Structure( lattice=box, species=[atom_dict[x] for x in atom_type], From 73c647df9098e049877ebdca5335b26e979fcab7 Mon Sep 17 00:00:00 2001 From: Simon Blackburn Date: Tue, 3 Sep 2024 13:19:37 -0400 Subject: [PATCH 13/13] fixing lightning version --- requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index 7bc45fec..d9681bfb 100644 --- a/requirements.txt +++ b/requirements.txt @@ -18,7 +18,7 @@ pyyaml==6.0.1 pytest==7.1.2 pytest-cov==3.0.0 pytest-mock==3.12.0 -pytorch_lightning>=2.2.0 +pytorch_lightning==2.2.1 pytype==2024.2.13 sphinx==7.2.6 sphinx-autoapi==3.0.0