From a9f4a7593d23b2560d8c759939af760f696bf01b Mon Sep 17 00:00:00 2001 From: Sharp Londe <93334987+SharpLonde@users.noreply.github.com> Date: Tue, 19 Dec 2023 19:18:43 +0800 Subject: [PATCH] Added `DefaultDataset` (#12) * Prototype code for loading Hamiltonian * add 'ABACUSDataset' in data module * modified "basis.dat" storage & can load overlap * recover some original dataset settings * add ABACUSDataset in init * Add the in memory version of ABACUSDataset * add ABACUSInMemoryDataset in data package * Added `DefaultDataset` and unified `ABACUSDataset` * improved DefaultDataset & add `dptb data` entrypoint for preprocess * update `build_dataset` --- dptb/data/__init__.py | 2 + dptb/data/build.py | 72 ++++++++-- dptb/data/dataset/__init__.py | 5 +- dptb/data/dataset/_abacus_dataset.py | 129 ++++++++++------- dptb/data/dataset/_default_dataset.py | 200 ++++++++++++++++++++++++++ dptb/data/interfaces/abacus.py | 14 +- dptb/entrypoints/data.py | 46 ++++++ dptb/entrypoints/main.py | 18 +++ 8 files changed, 416 insertions(+), 70 deletions(-) create mode 100644 dptb/data/dataset/_default_dataset.py create mode 100644 dptb/entrypoints/data.py diff --git a/dptb/data/__init__.py b/dptb/data/__init__.py index 2318e9b3..2efca699 100644 --- a/dptb/data/__init__.py +++ b/dptb/data/__init__.py @@ -17,6 +17,7 @@ HDF5Dataset, ABACUSDataset, ABACUSInMemoryDataset, + DefaultDataset ) from .dataloader import DataLoader, Collater, PartialSampler from .build import dataset_from_config @@ -35,6 +36,7 @@ HDF5Dataset, ABACUSDataset, ABACUSInMemoryDataset, + DefaultDataset, DataLoader, Collater, PartialSampler, diff --git a/dptb/data/build.py b/dptb/data/build.py index d2db9d68..62cdcee0 100644 --- a/dptb/data/build.py +++ b/dptb/data/build.py @@ -1,6 +1,6 @@ import inspect from importlib import import_module -from dptb.data.dataset import ABACUSDataset +from dptb.data.dataset import ABACUSDataset, ABACUSInMemoryDataset, DefaultDataset from dptb import data from dptb.data.transforms import 
TypeMapper, OrbitalMapper from dptb.data import AtomicDataset, register_fields @@ -103,16 +103,68 @@ def build_dataset(set_options, common_options): "r_max": common_options["bond_cutoff"], "er_max": common_options.get("env_cutoff", None), "oer_max": common_options.get("onsite_cutoff", None), - "pbc": set_options["pbc"], - "reduce_edge": set_options["reduce_edge"], + "reduce_edge": set_options.get("reduce_edge", None) } - dataset = ABACUSDataset( - root=set_options["root"], - preprocess_path=set_options["preprocess_path"], - h5file_names=set_options["file_names"], - AtomicData_options=AtomicDataOptions, - type_mapper=OrbitalMapper(basis=common_options["basis"]), - ) + type = set_options["type"] + + # input in set_option needed for ABACUS Dataset: + # "root": `.pth` file is saved in root, NO data read from here. + # "preprocess_dir": the same as "preprocess_dir" assigned in `dptb data`, + # contains all necessary data files generated by `dptb data`. + # "pbc": must be specified here, true / false. + # "include_frames": optional list, for loading InMemory version. + # Example: + # "train": { + # "type": "ABACUSInMemoryDataset", + # "root": "no/AtomicData/files/here", + # "preprocess_dir": "same/as/in/dptb_data/input_json", + # "pbc": true, + # "include_frames": [1,2,3] + # } + if type == "ABACUSDataset": + assert "pbc" in set_options, "PBC must be provided in `data_options` when loading ABACUS dataset." + AtomicDataOptions["pbc"] = set_options["pbc"] + dataset = ABACUSDataset( + root=set_options["root"], + preprocess_dir=set_options["preprocess_dir"], + AtomicData_options=AtomicDataOptions, + ) + elif type == "ABACUSInMemoryDataset": + assert "pbc" in set_options, "PBC must be provided in `data_options` when loading ABACUS dataset."
+ AtomicDataOptions["pbc"] = set_options["pbc"] + dataset = ABACUSInMemoryDataset( + root=set_options["root"], + preprocess_dir=set_options["preprocess_dir"], + include_frames=set_options.get("include_frames"), + AtomicData_options=AtomicDataOptions, + ) + + # input in common_option for Default Dataset: + # "basis": optional, dict like {"C": "2s2p1d"}. + # Must be provided when loading Hamiltonian. + # input in set_option for Default Dataset: + # "root": main dir storing all trajectory folders. + # "prefix": optional, load selected trajectory folders. + # Example: + # "train": { + # "type": "DefaultDataset", + # "root": "foo/bar/data_files_here", + # "prefix": "traj" + # } + elif type == "DefaultDataset": + if "basis" in common_options: + idp = OrbitalMapper(common_options["basis"]) + else: + idp = None + dataset = DefaultDataset( + root=set_options["root"], + AtomicData_options=AtomicDataOptions, + type_mapper=idp, + prefix=set_options.get("prefix", None) + ) + + else: + raise ValueError(f"Not support dataset type: {type}.") return dataset diff --git a/dptb/data/dataset/__init__.py b/dptb/data/dataset/__init__.py index 129e1380..c8dca670 100644 --- a/dptb/data/dataset/__init__.py +++ b/dptb/data/dataset/__init__.py @@ -2,12 +2,13 @@ from ._ase_dataset import ASEDataset from ._npz_dataset import NpzDataset from ._hdf5_dataset import HDF5Dataset -from ._abacus_dataset import ABACUSDataset +from ._abacus_dataset import ABACUSDataset, ABACUSInMemoryDataset from ._deeph_dataset import DeePHE3Dataset -from ._abacus_dataset_mem import ABACUSInMemoryDataset +from ._default_dataset import DefaultDataset __all__ = [ + DefaultDataset, DeePHE3Dataset, ABACUSInMemoryDataset, ABACUSDataset, diff --git a/dptb/data/dataset/_abacus_dataset.py b/dptb/data/dataset/_abacus_dataset.py index d5daf591..68082655 100644 --- a/dptb/data/dataset/_abacus_dataset.py +++ b/dptb/data/dataset/_abacus_dataset.py @@ -1,5 +1,6 @@ from typing import Dict, Any, List, Callable, Union, Optional
import os + import numpy as np import h5py @@ -10,72 +11,98 @@ AtomicDataDict, ) from ..transforms import TypeMapper, OrbitalMapper -from ._base_datasets import AtomicDataset -from dptb.nn.hamiltonian import E3Hamiltonian +from ._base_datasets import AtomicDataset, AtomicInMemoryDataset +#from dptb.nn.hamiltonian import E3Hamiltonian from dptb.data.interfaces.ham_to_feature import ham_block_to_feature orbitalLId = {0:"s", 1:"p", 2:"d", 3:"f"} +def _abacus_h5_reader(h5file_path, AtomicData_options): + data = h5py.File(h5file_path, "r") + atomic_data = AtomicData.from_points( + pos = data["pos"][:], + cell = data["cell"][:], + atomic_numbers = data["atomic_numbers"][:], + **AtomicData_options, + ) + if "hamiltonian_blocks" in data: + basis = {} + for key, value in data["basis"].items(): + basis[key] = [(f"{i+1}" + orbitalLId[l]) for i, l in enumerate(value)] + idp = OrbitalMapper(basis) + # e3 = E3Hamiltonian(idp=idp, decompose=True) + ham_block_to_feature(atomic_data, idp, data.get("hamiltonian_blocks", False), data.get("overlap_blocks", False)) + # with torch.no_grad(): + # atomic_data = e3(atomic_data.to_dict()) + # atomic_data = AtomicData.from_dict(atomic_data) + + if "eigenvalues" in data and "kpoints" in data: # band data is attached only when both datasets exist + atomic_data[AtomicDataDict.KPOINT_KEY] = torch.as_tensor(data["kpoints"][:], dtype=torch.get_default_dtype()) + atomic_data[AtomicDataDict.ENERGY_EIGENVALUE_KEY] = torch.as_tensor(data["eigenvalues"][:], dtype=torch.get_default_dtype()) + return atomic_data + +# Lazy loading class, built for large dataset.
+ class ABACUSDataset(AtomicDataset): def __init__( self, root: str, - key_mapping: Dict[str, str] = { - "pos": AtomicDataDict.POSITIONS_KEY, - "energy": AtomicDataDict.TOTAL_ENERGY_KEY, - "atomic_numbers": AtomicDataDict.ATOMIC_NUMBERS_KEY, - "kpoints": AtomicDataDict.KPOINT_KEY, - "eigenvalues": AtomicDataDict.ENERGY_EIGENVALUE_KEY, - }, - preprocess_path: str = None, - h5file_names: Optional[str] = None, - AtomicData_options: Dict[str, Any] = {}, - type_mapper: Optional[TypeMapper] = None, + preprocess_dir: str, + AtomicData_options: Dict[str, Any] = {}, + type_mapper: Optional[TypeMapper] = None, ): super().__init__(root=root, type_mapper=type_mapper) - self.key_mapping = key_mapping - self.key_list = list(key_mapping.keys()) - self.value_list = list(key_mapping.values()) - self.file_names = h5file_names - self.preprocess_path = preprocess_path - + self.preprocess_dir = preprocess_dir + self.file_name = np.loadtxt(os.path.join(self.preprocess_dir, 'AtomicData_file.txt'), dtype=str, ndmin=1) self.AtomicData_options = AtomicData_options - # self.r_max = AtomicData_options["r_max"] - # self.er_max = AtomicData_options["er_max"] - # self.oer_max = AtomicData_options["oer_max"] - # self.pbc = AtomicData_options["pbc"] - - self.index = None - self.num_examples = len(h5file_names) + self.num_examples = len(self.file_name) def get(self, idx): - file_name = self.file_names[idx] - file = os.path.join(self.preprocess_path, file_name) - data = h5py.File(file, "r") + name = self.file_name[idx] + h5_file = os.path.join(self.preprocess_dir, name) + atomic_data = _abacus_h5_reader(h5_file, self.AtomicData_options) + return atomic_data + + def len(self) -> int: + return self.num_examples + +# In memory version.
- atomic_data = AtomicData.from_points( - pos = data["pos"][:], - cell = data["cell"][:], - atomic_numbers = data["atomic_numbers"][:], - **self.AtomicData_options, - ) +class ABACUSInMemoryDataset(AtomicInMemoryDataset): + + def __init__( + self, + root: str, + preprocess_dir: str, + url: Optional[str] = None, + AtomicData_options: Dict[str, Any] = {}, + include_frames: Optional[List[int]] = None, + type_mapper: TypeMapper = None, + ): + self.preprocess_dir = preprocess_dir + self.file_name = np.loadtxt(os.path.join(self.preprocess_dir, 'AtomicData_file.txt'), dtype=str, ndmin=1) # ndmin=1 keeps a one-line file list iterable (otherwise 0-d) - if data["hamiltonian_blocks"]: - basis = {} - for key, value in data["basis"].items(): - basis[key] = [(f"{i+1}" + orbitalLId[l]) for i, l in enumerate(value)] - idp = OrbitalMapper(basis) - # e3 = E3Hamiltonian(idp=idp, decompose=True) - ham_block_to_feature(atomic_data, idp, data.get("hamiltonian_blocks", False), data.get("overlap_blocks", False)) - # with torch.no_grad(): - # atomic_data = e3(atomic_data.to_dict()) - # atomic_data = AtomicData.from_dict(atomic_data) - if data.get("eigenvalue") and data.get("kpoint"): - atomic_data[AtomicDataDict.KPOINT_KEY] = torch.as_tensor(data["kpoint"][:], dtype=torch.get_default_dtype()) - atomic_data[AtomicDataDict.ENERGY_EIGENVALUE_KEY] = torch.as_tensor(data["eigenvalue"][:], dtype=torch.get_default_dtype()) + super(ABACUSInMemoryDataset, self).__init__( + file_name=self.file_name, + url=url, + root=root, + AtomicData_options=AtomicData_options, + include_frames=include_frames, + type_mapper=type_mapper, + ) - return atomic_data + def get_data(self): + data = [] + for name in self.file_name: + h5_file = os.path.join(self.preprocess_dir, name) + data.append(_abacus_h5_reader(h5_file, self.AtomicData_options)) + return data - def len(self) -> int: - return self.num_examples \ No newline at end of file + @property + def raw_file_names(self): + return "AtomicData.h5" + + @property + def raw_dir(self): + return self.root \ No newline at end of file diff
--git a/dptb/data/dataset/_default_dataset.py b/dptb/data/dataset/_default_dataset.py new file mode 100644 index 00000000..0fea13df --- /dev/null +++ b/dptb/data/dataset/_default_dataset.py @@ -0,0 +1,200 @@ +from typing import Dict, Any, List, Callable, Union, Optional +import os + +import numpy as np +import h5py + +import torch + +from .. import ( + AtomicData, + AtomicDataDict, +) +from ..transforms import TypeMapper, OrbitalMapper +from ._base_datasets import AtomicDataset, AtomicInMemoryDataset +#from dptb.nn.hamiltonian import E3Hamiltonian +from dptb.data.interfaces.ham_to_feature import ham_block_to_feature +from dptb.utils.tools import j_loader + +class _TrajData(object): + ''' + Input file format in a trajectory (shape): + "info.json": includes information about the data files. + "cell.dat": fixed cell (3, 3) or variable cells (nframes, 3, 3). Unit: Angstrom + "atomic_numbers.dat": (natoms) or (nframes, natoms) + "positions.dat": concatenate all positions in one file, (nframes * natoms, 3). Can be cart or frac. + + Optional: + "eigenvalues.npy": concatenate all eigenvalues in one file, (nframes, nkpoints, nbands) + "kpoints.npy": MUST be provided when loading `eigenvalues.npy`, (nkpoints, 3) or (nframes, nkpoints, 3) + "hamiltonians.h5": h5 file storing atom-wise hamiltonian blocks labeled by frames id and `i0_jR_Rx_Ry_Rz`. + "overlaps.h5": the same format of overlap blocks as `hamiltonians.h5` + ''' + def __init__(self, root: str, AtomicData_options: Dict[str, Any] = {},): + self.root = root + self.AtomicData_options = AtomicData_options + self.info = j_loader(os.path.join(root, "info.json")) + + self.data = {} + cell = np.loadtxt(os.path.join(root, "cell.dat")) + if cell.shape[0] == 3: + # same cell size, then copy it to all frames.
+ cell = np.expand_dims(cell, axis=0) + self.data["cell"] = np.broadcast_to(cell, (self.info["nframes"], 3, 3)) + elif cell.shape[0] == self.info["nframes"] * 3: + self.data["cell"] = cell.reshape(self.info["nframes"], 3, 3) + else: + raise ValueError("Wrong cell dimensions.") + atomic_numbers = np.loadtxt(os.path.join(root, "atomic_numbers.dat")) + if len(atomic_numbers.shape) == 1: + # same atomic_numbers, copy it to all frames. + if atomic_numbers.shape[0] == self.info["natoms"]: + atomic_numbers = np.expand_dims(atomic_numbers, axis=0) + self.data["atomic_numbers"] = np.broadcast_to(atomic_numbers, (self.info["nframes"], + self.info["natoms"])) + else: + raise ValueError("Atomic numbers not equal to natoms in info.json. ") + elif atomic_numbers.shape == (self.info["nframes"], self.info["natoms"]): # per-frame layout: one row per frame + self.data["atomic_numbers"] = atomic_numbers.reshape(self.info["nframes"], + self.info["natoms"]) + else: + raise ValueError("Wrong atomic_number dimensions.") + pos = np.loadtxt(os.path.join(root, "positions.dat")) + assert pos.shape[0] == self.info["nframes"] * self.info["natoms"] + pos = pos.reshape(self.info["nframes"], self.info["natoms"], 3) + if self.info["pos_type"] == "cart": + self.data["pos"] = pos + elif self.info["pos_type"] == "frac": + self.data["pos"] = pos @ self.data["cell"] + else: + raise NameError("Position type must be cart / frac.") + + if os.path.exists(os.path.join(self.root, "eigenvalues.npy")): + assert os.path.exists(os.path.join(self.root, "kpoints.npy")) + kpoints = np.load(os.path.join(self.root, "kpoints.npy")) + if len(kpoints.shape) == 2: + # same kpoints, then copy it to all frames. + if kpoints.shape[0] == self.info["bandinfo"]["nkpoints"]: + kpoints = np.expand_dims(kpoints, axis=0) + self.data["kpoints"] = np.broadcast_to(kpoints, (self.info["nframes"], + self.info["bandinfo"]["nkpoints"], 3)) + else: + raise ValueError("kpoints in .npy not equal to nkpoints in bandinfo. 
") + elif kpoints.shape[0] == self.info["nframes"]: # per-frame k-points: validate the k-point array itself + self.data["kpoints"] = kpoints + else: + raise ValueError("Wrong kpoint dimensions.") + eigenvalues = np.load(os.path.join(self.root, "eigenvalues.npy")) + assert eigenvalues.shape[0] == self.info["nframes"] + assert eigenvalues.shape[1] == self.info["bandinfo"]["nkpoints"] + assert eigenvalues.shape[2] == self.info["bandinfo"]["nbands"] + self.data["eigenvalues"] = eigenvalues + #self.data["eigenvalues"] = eigenvalues.reshape(self.info["nframes"], + # self.info["bandinfo"]["nkpoints"], + # self.info["bandinfo"]["nbands"]) + if os.path.exists(os.path.join(self.root, "hamiltonians.h5")): + self.data["hamiltonian_blocks"] = h5py.File(os.path.join(self.root, "hamiltonians.h5"), "r") + if os.path.exists(os.path.join(self.root, "overlaps.h5")): + self.data["overlap_blocks"] = h5py.File(os.path.join(self.root, "overlaps.h5"), "r") + + def toAtomicDataList(self, idp: TypeMapper = None): + data_list = [] + for frame in range(self.info["nframes"]): + atomic_data = AtomicData.from_points( + pos = self.data["pos"][frame][:], + cell = self.data["cell"][frame][:], + atomic_numbers = self.data["atomic_numbers"][frame], + pbc = self.info["pbc"], + **self.AtomicData_options) + if "hamiltonian_blocks" in self.data: + assert idp is not None, "LCAO Basis must be provided for loading Hamiltonian."
+ overlap_file = self.data.get("overlap_blocks") # None when overlaps.h5 is absent + overlap_frame = overlap_file[str(frame)] if overlap_file is not None else False + # e3 = E3Hamiltonian(idp=idp, decompose=True) + ham_block_to_feature(atomic_data, idp, + self.data["hamiltonian_blocks"][str(frame)], + overlap_frame) + # with torch.no_grad(): + # atomic_data = e3(atomic_data.to_dict()) + # atomic_data = AtomicData.from_dict(atomic_data) + if "eigenvalues" in self.data and "kpoints" in self.data: + bandinfo = self.info["bandinfo"] + atomic_data[AtomicDataDict.KPOINT_KEY] = torch.as_tensor(self.data["kpoints"][frame][:], + dtype=torch.get_default_dtype()) + if bandinfo["emin"] is not None and bandinfo["emax"] is not None: + atomic_data[AtomicDataDict.ENERGY_WINDOWS_KEY] = torch.as_tensor([bandinfo["emin"], bandinfo["emax"]], + dtype=torch.get_default_dtype()) + if bandinfo["band_min"] is not None and bandinfo["band_max"] is not None: # the window selects bands (last axis), not k-points + atomic_data[AtomicDataDict.BAND_WINDOW_KEY] = torch.as_tensor([bandinfo["band_min"], bandinfo["band_max"]], + dtype=torch.get_default_dtype()) + atomic_data[AtomicDataDict.ENERGY_EIGENVALUE_KEY] = torch.as_tensor(self.data["eigenvalues"][frame][:, bandinfo["band_min"]:bandinfo["band_max"]], + dtype=torch.get_default_dtype()) + else: + atomic_data[AtomicDataDict.ENERGY_EIGENVALUE_KEY] = torch.as_tensor(self.data["eigenvalues"][frame], + dtype=torch.get_default_dtype()) + data_list.append(atomic_data) + return data_list + + +class DefaultDataset(AtomicInMemoryDataset): + + def __init__( + self, + root: str, + prefix: Optional[str] = None, + url: Optional[str] = None, + AtomicData_options: Dict[str, Any] = {}, + include_frames: Optional[List[int]] = None, + type_mapper: TypeMapper = None, + ): + self.file_name = [] + for dir_name in os.listdir(root): + if os.path.isdir(os.path.join(root, dir_name)): + if prefix is not None: + if dir_name[:len(prefix)] == prefix: + self.file_name.append(dir_name) + else: + self.file_name.append(dir_name) + # the type_mapper must be stored here in order to load
Hamiltonian. + #all_basis = [] + #for file in self.file_name: + # file_info = j_loader(os.path.join(file, "info.json")) + # all_basis.append(file_info["basis"]) + #sort_basis = {} + #for basis in all_basis: + # for symbol, orbitals in basis.items(): + # if symbol not in sort_basis: + # sort_basis[symbol] = orbitals + #type_mapper = OrbitalMapper(sort_basis) + super().__init__( + file_name=self.file_name, + url=url, + root=root, + AtomicData_options=AtomicData_options, + include_frames=include_frames, + type_mapper=type_mapper, + ) + + def setup_data(self): + self.data = [] + for file in self.file_name: + subdata = _TrajData(os.path.join(self.root, file), self.AtomicData_options) + self.data.append(subdata) + + def get_data(self): + self.setup_data() + all_data = [] + for subdata in self.data: + # the type_mapper here is loaded in `dataset` type as `transform` attritube + subdata_list = subdata.toAtomicDataList(self.transform) + all_data += subdata_list + return all_data + + @property + def raw_file_names(self): + return "Null" + + @property + def raw_dir(self): + return self.root + \ No newline at end of file diff --git a/dptb/data/interfaces/abacus.py b/dptb/data/interfaces/abacus.py index 4d4eae80..0430ba39 100644 --- a/dptb/data/interfaces/abacus.py +++ b/dptb/data/interfaces/abacus.py @@ -45,7 +45,7 @@ def transform(self, mat, l_lefts, l_rights): block_rights = block_diag(*[self.get_U(l_right) for l_right in l_rights]) return block_lefts @ mat @ block_rights.T -def recursive_parse(input_dir, preprocess_dir, data_name="OUT.ABACUS", only_overlap=False, get_Ham=False, add_overlap=False, get_eigenvalues=False): +def recursive_parse(input_dir, preprocess_dir, data_name="OUT.ABACUS", only_overlap=False, get_Hamiltonian=False, add_overlap=False, get_eigenvalues=False): input_dir = os.path.abspath(input_dir) preprocess_dir = os.path.abspath(preprocess_dir) os.makedirs(preprocess_dir, exist_ok=True) @@ -57,9 +57,9 @@ def recursive_parse(input_dir, preprocess_dir, 
data_name="OUT.ABACUS", only_over if os.path.exists(os.path.join(input_dir, file, data_name, "hscsr.tgz")): os.system("cd "+os.path.join(input_dir, file, data_name) + " && tar -zxvf hscsr.tgz && mv OUT.ABACUS/* ./") try: - _abacus_parse(os.path.join(input_dir, file), os.path.join(preprocess_dir, file), data_name, only_S=only_overlap, get_Ham=get_Ham, + _abacus_parse(os.path.join(input_dir, file), os.path.join(preprocess_dir, file), data_name, only_S=only_overlap, get_Ham=get_Hamiltonian, add_overlap=add_overlap, get_eigenvalues=get_eigenvalues) - h5file_names.append(os.path.join(preprocess_dir, file, "AtomicData.h5")) + h5file_names.append(os.path.join(file, "AtomicData.h5")) except Exception as e: print(f"Error in {data_name}: {e}") continue @@ -128,7 +128,7 @@ def find_target_line(f, target): site_norbits_dict[atom_type] = current_site_norbits orbital_types_dict[atom_type] = current_orbital_types - print(orbital_types_dict) + #print(orbital_types_dict) line = find_target_line(f, "TOTAL ATOM NUMBER") assert line is not None, 'Cannot find "TOTAL ATOM NUMBER" in log file' @@ -305,7 +305,7 @@ def parse_matrix(matrix_path, factor, spinful=False): assert len(band) == len(kpts) np.savetxt(os.path.join(output_path, "kpoints.dat"), kpts) - np.savetxt(os.path.join(output_path, "eigenvalue.dat"), band) + np.savetxt(os.path.join(output_path, "eigenvalues.dat"), band) with h5py.File(os.path.join(output_path, "AtomicData.h5"), "w") as f: f["cell"] = lattice @@ -323,8 +323,8 @@ def parse_matrix(matrix_path, factor, spinful=False): # else: # f["hamiltonian_blocks"] = False if get_eigenvalues: - f["kpoint"] = kpts - f["eigenvalue"] = band + f["kpoints"] = kpts + f["eigenvalues"] = band # else: # f["kpoint"] = False # f["eigenvalue"] = False diff --git a/dptb/entrypoints/data.py b/dptb/entrypoints/data.py new file mode 100644 index 00000000..d53df6d0 --- /dev/null +++ b/dptb/entrypoints/data.py @@ -0,0 +1,46 @@ +import os +from typing import Dict, List, Optional, Any +from 
dptb.utils.tools import j_loader +from dptb.utils.argcheck import normalize +from dptb.data.interfaces.abacus import recursive_parse + +def data( + INPUT: str, + log_level: int, + log_path: Optional[str], + **kwargs +): + jdata = j_loader(INPUT) + + # ABACUS parsing input like: + # { "type": "ABACUS", + # "root": "foo/bar", + # "parse_arguments": { + # "input_dir": "alice/bob", + # "preprocess_dir": "charlie/david", + # "only_overlap": false, + # "get_Hamiltonian": true, + # "add_overlap": true, + # "get_eigenvalues": true } } + if jdata["type"] == "ABACUS": + root = jdata["root"] + abacus_args = jdata["parse_arguments"] + assert abacus_args.get("input_dir") is not None, "ABACUS calculation results MUST be provided." + + if abacus_args.get("preprocess_dir") is None: + # create a new preprocess dir under root if not given + print("Creating new preprocess directory...") + os.makedirs(os.path.join(root, "preprocess"), exist_ok=True) # idempotent: a re-run must not fail on an existing directory + abacus_args["preprocess_dir"] = os.path.join(root, "preprocess") + + print("Begin parsing ABACUS output...") + h5_filenames = recursive_parse(**abacus_args) + print("Finished parsing ABACUS output.") + + # write all h5 files to be used in building AtomicData + with open(os.path.join(abacus_args["preprocess_dir"], "AtomicData_file.txt"), "w") as f: + for filename in h5_filenames: + f.write(filename + "\n") + + else: + raise Exception("Not supported software output.") \ No newline at end of file diff --git a/dptb/entrypoints/main.py b/dptb/entrypoints/main.py index 338a56c3..e1464688 100644 --- a/dptb/entrypoints/main.py +++ b/dptb/entrypoints/main.py @@ -7,6 +7,7 @@ from dptb.entrypoints.test import _test from dptb.entrypoints.run import run from dptb.entrypoints.bond import bond +from dptb.entrypoints.data import data from dptb.utils.loggers import set_log_handles def get_ll(log_level: str) -> int: @@ -246,6 +247,20 @@ def main_parser() -> argparse.ArgumentParser: help="Use nnsktb correction when training dptb", ) + # preprocess data + parser_data =
subparsers.add_parser( + "data", + parents=[parser_log], + help="preprocess software output", + formatter_class=argparse.ArgumentDefaultsHelpFormatter, + ) + + parser_data.add_argument( + "INPUT", help="the input parameter file in json or yaml format", + type=str, + default=None + ) + return parser def parse_args(args: Optional[List[str]] = None) -> argparse.Namespace: @@ -293,3 +308,6 @@ def main(): elif args.command == 'run': run(**dict_args) + + elif args.command == 'data': + data(**dict_args)