
Build an iterative phonon flow #306

Open
wants to merge 54 commits into base: main
Commits (54 total; changes shown from 33 commits)
6a3a342
add some drafts
JaGeo Dec 21, 2024
fc9e7a5
pre-commit auto-fixes
pre-commit-ci[bot] Dec 21, 2024
25a95b4
restructure and rely on flow instead
JaGeo Dec 21, 2024
4ca1d1c
restructure and rely on flow instead
JaGeo Dec 21, 2024
55562d5
restructure and rely on flow instead
JaGeo Dec 21, 2024
e5787c7
pre-commit auto-fixes
pre-commit-ci[bot] Dec 21, 2024
4f3d6c4
more restructuring of completeworkflow
JaGeo Dec 21, 2024
52deaa2
more restructuring of completeworkflow
JaGeo Dec 21, 2024
6ba11e0
pre-commit auto-fixes
pre-commit-ci[bot] Dec 21, 2024
855dccc
more restructuring of completeworkflow
JaGeo Dec 21, 2024
fcf0cda
bring the workflow more in shape
JaGeo Dec 22, 2024
1958654
pre-commit auto-fixes
pre-commit-ci[bot] Dec 22, 2024
3c0d6a8
fix the workflow stepwise
JaGeo Dec 22, 2024
f85baa3
pre-commit auto-fixes
pre-commit-ci[bot] Dec 22, 2024
8b54a5c
fix some more problems
JaGeo Dec 22, 2024
af16c12
pre-commit auto-fixes
pre-commit-ci[bot] Dec 22, 2024
5e09a4f
hard code displacement to 0.01 in benchmark, fix other more issues in…
JaGeo Dec 22, 2024
7094b93
pre-commit auto-fixes
pre-commit-ci[bot] Dec 22, 2024
1bd5d49
fix more logic problems
JaGeo Dec 22, 2024
295478e
make outputs nicer
JaGeo Dec 22, 2024
f7ad583
pre-commit auto-fixes
pre-commit-ci[bot] Dec 22, 2024
e17fc92
fix a bunch of tests in auto phonons after the new changes
JaGeo Dec 22, 2024
75de5db
fix more tests
JaGeo Dec 22, 2024
829e674
fix benchmark tests
JaGeo Dec 22, 2024
e2bc0eb
daza
JaGeo Dec 22, 2024
bad024c
fix more tests
JaGeo Dec 22, 2024
00e20cd
fix pre database position for all workflows
JaGeo Dec 22, 2024
c80a5ed
add more documentation and fix number of jobs, addition of get_output
JaGeo Dec 22, 2024
9444190
pre-commit auto-fixes
pre-commit-ci[bot] Dec 22, 2024
d98727e
add more documentation
JaGeo Dec 22, 2024
215c0de
fix list comprehension
JaGeo Dec 22, 2024
1944eed
pre-commit auto-fixes
pre-commit-ci[bot] Dec 22, 2024
ec68520
mace stuff
JaGeo Dec 22, 2024
d2a1485
add to data
JaGeo Dec 23, 2024
834e175
add to data
JaGeo Dec 23, 2024
8c5aa8a
fix random seed for structures that are too similar
JaGeo Dec 24, 2024
13b1229
pre-commit auto-fixes
pre-commit-ci[bot] Dec 24, 2024
6eb620f
fix random seed for structures that are too similar
JaGeo Dec 24, 2024
bdf0c8b
fix random seed for structures that are too similar
JaGeo Dec 24, 2024
c59f853
pre-commit auto-fixes
pre-commit-ci[bot] Dec 24, 2024
453c75d
default random seed
JaGeo Dec 24, 2024
624f762
default random seed
JaGeo Dec 24, 2024
21f8ca2
fix write benchmark generation beyond runs in jobflow
JaGeo Dec 25, 2024
43f95d6
pre-commit auto-fixes
pre-commit-ci[bot] Dec 25, 2024
36af259
fix random seed and add strict tests
JaGeo Dec 25, 2024
a8e065b
fix lorbit problems
JaGeo Dec 25, 2024
6705f1b
fix test
JaGeo Dec 25, 2024
71c59d9
fix test
JaGeo Dec 25, 2024
82cd1e5
fix rms computation
JaGeo Dec 26, 2024
512cad7
pre-commit auto-fixes
pre-commit-ci[bot] Dec 26, 2024
1163b83
add one more test
JaGeo Dec 26, 2024
301a378
add documentation
JaGeo Dec 26, 2024
e562ce3
add hint on the default displacement
JaGeo Dec 26, 2024
74da25a
add hint on the default displacement
JaGeo Dec 26, 2024
268 changes: 196 additions & 72 deletions src/autoplex/auto/phonons/flows.py

Large diffs are not rendered by default.

189 changes: 186 additions & 3 deletions src/autoplex/auto/phonons/jobs.py
@@ -6,6 +6,7 @@
from pathlib import Path

import numpy as np
from atomate2.common.schemas.phonons import PhononBSDOSDoc
from atomate2.vasp.flows.core import DoubleRelaxMaker
from atomate2.vasp.jobs.base import BaseVaspMaker
from atomate2.vasp.jobs.core import StaticMaker, TightRelaxMaker
@@ -25,6 +26,128 @@
from autoplex.data.phonons.jobs import reduce_supercell_size


@job
def do_iterative_rattled_structures(
workflow_maker_gen_0,
workflow_maker_gen_1,
structure_list: list[Structure],
mp_ids,
dft_references: list[PhononBSDOSDoc] | None = None,
benchmark_structures: list[Structure] | None = None,
benchmark_mp_ids: list[str] | None = None,
pre_xyz_files: list[str] | None = None,
pre_database_dir: str | None = None,
random_seed: int | None = None,
fit_kwargs_list: list | None = None,
number_of_iteration: int = 0,
rms: float | None = 0.2,
max_iteration: int = 5,
rms_max: float = 0.2,
previous_output: dict | None = None,
):
"""
Job to run CompleteDFTvsMLBenchmarkWorkflow in an iterative manner.

Parameters
----------
workflow_maker_gen_0: CompleteDFTvsMLBenchmarkWorkflow
The first iteration will be performed with this flow.
workflow_maker_gen_1: CompleteDFTvsMLBenchmarkWorkflow
All iterations after the first one will be performed with this flow.
structure_list: list[Structure]
List of pymatgen structures.
mp_ids: list[str]
Materials Project IDs.
dft_references: list[PhononBSDOSDoc] | None
List of DFT reference files containing the PhononBSDOSDoc object.
Reference files have to refer to a finite displacement of 0.01.
For benchmarking, only 0.01 is supported.
benchmark_structures: list[Structure] | None
The pymatgen structures for benchmarking.
benchmark_mp_ids: list[str] | None
Materials Project IDs of the benchmark structures.
pre_xyz_files: list[str] | None
Names of the pre-database train xyz file and test xyz file.
pre_database_dir: str | None
The pre-database directory.
random_seed: int | None
Random seed.
fit_kwargs_list: list[dict] | None
List of dicts including the MLIP fit keyword arguments.
number_of_iteration: int
Index of the current iteration (starting at 0).
rms: float | None
Benchmark RMSE of the best potential from the previous iteration.
max_iteration: int
Maximum number of iterations to run.
rms_max: float
Will stop once the best potential has a maximum RMSE below this value.
previous_output: dict | None
Dict including the output of the previous flow.
"""
if rms is None or (number_of_iteration < max_iteration and rms > rms_max):
jobs = []

if number_of_iteration == 0:
workflow_maker = workflow_maker_gen_0
job1 = workflow_maker_gen_0.make(
structure_list=structure_list,
mp_ids=mp_ids,
dft_references=dft_references,
benchmark_structures=benchmark_structures,
benchmark_mp_ids=benchmark_mp_ids,
pre_xyz_files=pre_xyz_files,
pre_database_dir=pre_database_dir,
random_seed=random_seed,
fit_kwargs_list=fit_kwargs_list,
)
else:
workflow_maker = workflow_maker_gen_1
job1 = workflow_maker_gen_1.make(
structure_list=structure_list,
mp_ids=mp_ids,
dft_references=dft_references,
benchmark_structures=benchmark_structures,
benchmark_mp_ids=benchmark_mp_ids,
pre_xyz_files=pre_xyz_files,
pre_database_dir=pre_database_dir,
random_seed=random_seed,
fit_kwargs_list=fit_kwargs_list,
)

# rms needs to be computed somehow
job1.append_name("_" + str(number_of_iteration))
jobs.append(job1)
# order is the same as in the scaling function "scale_cell"
if workflow_maker.volume_custom_scale_factors is not None:
random_seed = random_seed + len(workflow_maker.volume_custom_scale_factors)
elif workflow_maker.n_structures is not None:
random_seed = random_seed + workflow_maker.n_structures

job2 = do_iterative_rattled_structures(
workflow_maker_gen_0=workflow_maker_gen_0,
workflow_maker_gen_1=workflow_maker_gen_1,
structure_list=structure_list,
mp_ids=mp_ids,
dft_references=job1.output["dft_references"],
# TODO: check if they should be optimized
benchmark_structures=job1.output["benchmark_structures"],
benchmark_mp_ids=job1.output["benchmark_mp_ids"],
pre_xyz_files=job1.output["pre_xyz_files"],
pre_database_dir=job1.output["pre_database_dir"],
random_seed=random_seed,
fit_kwargs_list=fit_kwargs_list,
number_of_iteration=number_of_iteration + 1,
rms=job1.output["rms"],
max_iteration=max_iteration,
rms_max=rms_max,
previous_output=job1.output,
)
jobs.append(job2)
# benchmark data has to be passed into the complete workflow later on
# instead of recalculating it every time;
# the random seed update might be the hardest part.
return Response(replace=Flow(jobs), output=job2.output)
# TODO: give a nicer output; what do we need to restart?
# It should be the same as for the complete workflow.
return previous_output
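The stopping rule of this job can be isolated into a small sketch; `should_continue` is a hypothetical helper name (not part of the PR), with defaults mirroring the job signature above:

```python
def should_continue(rms, number_of_iteration, max_iteration=5, rms_max=0.2):
    """Mirror the loop condition of do_iterative_rattled_structures.

    A new generation is spawned while no RMSE is known yet (first call)
    or while both the iteration budget and the RMSE target are unmet.
    """
    return rms is None or (number_of_iteration < max_iteration and rms > rms_max)


print(should_continue(rms=None, number_of_iteration=0))  # True: first generation always runs
print(should_continue(rms=0.1, number_of_iteration=2))   # False: converged below rms_max
print(should_continue(rms=0.5, number_of_iteration=5))   # False: iteration budget exhausted
```

Once the condition is false, the job returns `previous_output` instead of replacing itself with a new flow generation.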


@job
def complete_benchmark( # this function was put here to prevent circular import
ml_path: list,
@@ -137,6 +260,7 @@ def complete_benchmark( # this function was put here to prevent circular import
ml_potential = Path(path) / "deployed_nequip_model.pth"
else: # MACE
# treat finetuned potentials
# TODO: fix this naming issue (depends on input)
ml_potential_fine = Path(path) / "MACE_final.model"
ml_potential = (
ml_potential_fine
@@ -163,9 +287,10 @@ def complete_benchmark( # this function was put here to prevent circular import
if (
benchmark_mp_ids[ibenchmark_structure] in mp_ids
) and add_dft_phonon_struct:

dft_references = fit_input[benchmark_mp_ids[ibenchmark_structure]][
"phonon_data"
]["001"]
][f"{int(displacement * 100):03d}"]
else:
dft_phonons = dft_phonopy_gen_data(
structure=benchmark_structure,
@@ -178,7 +303,9 @@ def complete_benchmark( # this function was put here to prevent circular import
supercell_settings=supercell_settings,
)
jobs.append(dft_phonons)
dft_references = dft_phonons.output["phonon_data"]["001"]
dft_references = dft_phonons.output["phonon_data"][
f"{int(displacement * 100):03d}"
]

add_data_bm = PhononBenchmarkMaker(name="Benchmark").make(
ml_model=ml_model,
@@ -225,7 +352,10 @@ def complete_benchmark( # this function was put here to prevent circular import
jobs.append(add_data_bm)
collect_output.append(add_data_bm.output)

return Response(replace=Flow(jobs), output=collect_output)
return Response(
replace=Flow(jobs),
output={"bm_output": collect_output, "dft_references": dft_references},
)
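The f-string introduced in this hunk encodes the finite displacement as the zero-padded three-digit key of the phonon-data dictionary (previously hard-coded as "001"). A quick sketch of the mapping; `displacement_key` is an illustrative name, not part of the codebase:

```python
def displacement_key(displacement: float) -> str:
    """Encode a finite displacement as the zero-padded three-digit
    key used in the phonon_data dict, e.g. 0.01 -> "001"."""
    return f"{int(displacement * 100):03d}"


print(displacement_key(0.01))  # 001  (the only displacement supported for benchmarking)
print(displacement_key(0.05))  # 005
```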


@job
@@ -619,3 +749,56 @@ def get_iso_atom(
},
)
return Response(replace=flow)


@job
def get_output(
metrics: list,
benchmark_structures: list[Structure] | None = None,
benchmark_mp_ids: list[str] | None = None,
dft_references: list[PhononBSDOSDoc] | None = None,
pre_xyz_files: list[str] | None = None,
pre_database_dir: str | None = None,
fit_kwargs_list: list | None = None,
):
"""
Job to collect all output information for potential restarts.

Parameters
----------
metrics: list[dict]
List of metric dictionaries from complete_benchmark jobs.
dft_references: list[PhononBSDOSDoc] | None
List of DFT reference files containing the PhononBSDOSDoc object.
Reference files have to refer to a finite displacement of 0.01.
For benchmarking, only 0.01 is supported.
benchmark_structures: list[Structure] | None
The pymatgen structures for benchmarking.
benchmark_mp_ids: list[str] | None
Materials Project IDs of the benchmark structures.
pre_xyz_files: list[str] | None
Names of the pre-database train xyz file and test xyz file.
pre_database_dir: str | None
The pre-database directory.
fit_kwargs_list: list[dict] | None
List of dicts including the MLIP fit keyword arguments.
"""
# TODO: potentially evaluate imaginary modes as well
# for each benchmark structure, keep the lowest RMSE over all fitted potentials
rms_list = []
for metric in metrics:
rms = 1000.0  # sentinel value, replaced by the best (lowest) RMSE found
for run_metrics in metric:
rms = min(rms, run_metrics["benchmark_phonon_rmse"])
rms_list.append(rms)

return {
"metrics": metrics,
"rms": min(rms_list),
"benchmark_structures": benchmark_structures,
"benchmark_mp_ids": benchmark_mp_ids,
"dft_references": dft_references,
"pre_xyz_files": pre_xyz_files,
"pre_database_dir": pre_database_dir,
"fit_kwargs_list": fit_kwargs_list,
}
3 changes: 1 addition & 2 deletions src/autoplex/benchmark/phonons/jobs.py
@@ -25,9 +25,8 @@ def write_benchmark_metrics(
-------
A text file with root mean squared error between DFT and ML potential phonon band-structure
"""
# TODO: fix this part
metrics_flattened = [item for sublist in metrics for item in sublist]
# TODO: think about a better solution here

# the following code assumes all benchmark structures have the same composition
structure_composition = benchmark_structures[0].composition.reduced_formula
with open(
5 changes: 3 additions & 2 deletions src/autoplex/data/common/jobs.py
@@ -238,8 +238,9 @@ def generate_randomized_structures(
if supercell_matrix is None:
supercell_matrix = [[2, 0, 0], [0, 2, 0], [0, 0, 2]]

if n_structures < 10:
n_structures = 10
# TODO: remove this part
# if n_structures < 10:
# n_structures = 10

supercell = get_supercell(
unitcell=get_phonopy_structure(structure),
21 changes: 7 additions & 14 deletions src/autoplex/data/common/utils.py
@@ -165,8 +165,13 @@ def scale_cell(
atoms = AseAtomsAdaptor.get_atoms(structure)
distorted_cells = []

if volume_scale_factor_range is not None:
# range is specified
if volume_custom_scale_factors is not None:
scale_factors_defined = volume_custom_scale_factors
warnings.warn("Using your custom lattice scale factors", stacklevel=2)
if volume_custom_scale_factors is None:
if volume_scale_factor_range is None:
volume_scale_factor_range = [0.90, 1.1]

Review comment (Collaborator Author): add warning that default scale factor range is used

scale_factors_defined = np.arange(
volume_scale_factor_range[0],
volume_scale_factor_range[1]
@@ -185,18 +190,6 @@
stacklevel=2,
)

else: # range is not specified
if volume_custom_scale_factors is None:
# use default scale factors if not specified
scale_factors_defined = [0.90, 0.95, 0.98, 0.99, 1.01, 1.02, 1.05, 1.10]
warnings.warn(
"Using default lattice scale factors of [0.90, 0.95, 0.98, 0.99, 1.01, 1.02, 1.05, 1.10]",
stacklevel=2,
)
else:
scale_factors_defined = volume_custom_scale_factors
warnings.warn("Using your custom lattice scale factors", stacklevel=2)

for scale_factor in scale_factors_defined:
# make copy of ground state
cell = atoms.copy()
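The restructured branch logic in `scale_cell` can be summarized as: explicit custom factors take precedence, otherwise a (possibly defaulted) range is expanded into discrete factors with `np.arange`. A simplified sketch under that assumption; `resolve_scale_factors` and its `step` parameter are illustrative, since the actual step handling is collapsed in this diff:

```python
import numpy as np


def resolve_scale_factors(custom=None, factor_range=None, step=0.05):
    """Simplified sketch of the scale-factor selection in scale_cell:
    custom factors win; otherwise a range (default [0.90, 1.1]) is
    expanded into discrete factors."""
    if custom is not None:
        return list(custom)
    if factor_range is None:
        factor_range = [0.90, 1.1]
    # np.arange excludes the stop value, so pad it by one step
    return [
        round(float(f), 2)
        for f in np.arange(factor_range[0], factor_range[1] + step, step)
    ]


print(resolve_scale_factors(custom=[0.95, 1.05]))  # [0.95, 1.05]
```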
50 changes: 32 additions & 18 deletions src/autoplex/fitting/common/flows.py
@@ -183,8 +183,10 @@ def make(
jobs.append(mlip_fit_job)

return Flow(jobs=jobs, output=mlip_fit_job.output, name=self.name)
# this will only run if train.extxyz and test.extxyz files are present in the database_dir

# this will only run if train.extxyz and test.extxyz files are present in the database_dir
# TODO: shouldn't this be the exception rather than the default run?
# TODO: I assume we always want to use data from before?
if isinstance(self.database_dir, str):
self.database_dir = Path(self.database_dir)

@@ -300,6 +302,33 @@ def make(
logging.info(
f"File {file_name} has been copied to {destination_file_path}"
)
if len(self.pre_xyz_files) == 2:
# join into one file and then split again afterwards;
# otherwise, the split ratio will not be preserved
destination_file_path = os.path.join(
current_working_directory, "vasp_ref.extxyz"
)
for file_name in self.pre_xyz_files:
# TODO: check whether it makes sense to remove isolated atoms from other files as well
atoms_list = ase.io.read(
os.path.join(self.pre_database_dir, file_name), index=":"
)
new_atoms_list = [
atoms
for atoms in atoms_list
if atoms.info["config_type"] != "IsolatedAtom"
]

ase.io.write(destination_file_path, new_atoms_list, append=True)

logging.info(
f"Files {self.pre_xyz_files} have been merged into {destination_file_path}"
)
Review comment on lines +316 to +326 (Collaborator Author): is this a good universal solution? Maybe I can reduce it to just one set of IsolatedAtom energies to avoid failures.


elif len(self.pre_xyz_files) > 2:
raise ValueError(
"Please provide a train and a test extxyz file (two files in total) for the pre_xyz_files."
)
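The merge step added here concatenates the previous train and test files into one pool (so a later split re-applies the true split ratio) while dropping isolated-atom reference frames. A minimal sketch without ase, where plain dicts stand in for `ase.Atoms` objects and `merge_frames` is an illustrative name:

```python
def merge_frames(file_frames, drop_config_type="IsolatedAtom"):
    """Concatenate the frames of several extxyz files into one pool,
    dropping reference frames tagged with the given config_type.
    Splitting the merged pool afterwards keeps the split ratio true."""
    merged = []
    for frames in file_frames:
        merged.extend(
            frame for frame in frames
            if frame["info"].get("config_type") != drop_config_type
        )
    return merged


train = [{"info": {"config_type": "bulk"}}, {"info": {"config_type": "IsolatedAtom"}}]
test = [{"info": {"config_type": "rattled"}}]
print(len(merge_frames([train, test])))  # 2: the IsolatedAtom frame is dropped
```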

vaspoutput_2_extended_xyz(
path_to_vasp_static_calcs=list_of_vasp_calc_dirs,
@@ -310,28 +339,13 @@ def make(
atom_wise_regularization=self.atom_wise_regularization,
)

# TODO: remove too many isolated atoms
write_after_distillation_data_split(
self.distillation, self.force_max, self.split_ratio
)

# Merging database
if self.pre_database_dir and os.path.exists(self.pre_database_dir):
if len(self.pre_xyz_files) == 2:
files_new = ["train.extxyz", "test.extxyz"]
for file_name, file_new in zip(self.pre_xyz_files, files_new):
with (
open(
os.path.join(self.pre_database_dir, file_name)
) as pre_xyz_file,
open(file_new, "a") as xyz_file,
):
xyz_file.write(pre_xyz_file.read())
logging.info(f"File {file_name} has been copied to {file_new}")

elif len(self.pre_xyz_files) > 2:
raise ValueError(
"Please provide a train and a test extxyz file (two files in total) for the pre_xyz_files."
)
Review comment on lines -318 to -334 (Collaborator Author): did not keep the split ratio constant.

# TODO: does a merge happen here?
if self.regularization:
base_dir = os.getcwd()
folder_name = os.path.join(base_dir, "without_regularization")
1 change: 1 addition & 0 deletions src/autoplex/fitting/common/jobs.py
@@ -174,4 +174,5 @@ def machine_learning_fit(
"train_error": train_test_error["train_error"],
"test_error": train_test_error["test_error"],
"convergence": check_conv,
"database_dir": database_dir,
}