Added DefaultDataset (#12)
* Prototype code for loading Hamiltonian

* add 'ABACUSDataset' in data module

* modified "basis.dat" storage & can load overlap

* recover some original dataset settings

* add ABACUSDataset in init

* Add the in memory version of ABACUSDataset

* add ABACUSInMemoryDataset in data package

* Added `DefaultDataset` and unified `ABACUSDataset`

* improved DefaultDataset & add `dptb data` entrypoint for preprocess

* update `build_dataset`
SharpLonde authored Dec 19, 2023
1 parent d02323e commit a9f4a75
Showing 8 changed files with 416 additions and 70 deletions.
2 changes: 2 additions & 0 deletions dptb/data/__init__.py
@@ -17,6 +17,7 @@
     HDF5Dataset,
     ABACUSDataset,
     ABACUSInMemoryDataset,
+    DefaultDataset
 )
 from .dataloader import DataLoader, Collater, PartialSampler
 from .build import dataset_from_config
@@ -35,6 +36,7 @@
     HDF5Dataset,
     ABACUSDataset,
     ABACUSInMemoryDataset,
+    DefaultDataset,
     DataLoader,
     Collater,
     PartialSampler,
72 changes: 62 additions & 10 deletions dptb/data/build.py
@@ -1,6 +1,6 @@
 import inspect
 from importlib import import_module
-from dptb.data.dataset import ABACUSDataset
+from dptb.data.dataset import ABACUSDataset, ABACUSInMemoryDataset, DefaultDataset
 from dptb import data
 from dptb.data.transforms import TypeMapper, OrbitalMapper
 from dptb.data import AtomicDataset, register_fields
@@ -103,16 +103,68 @@ def build_dataset(set_options, common_options):
         "r_max": common_options["bond_cutoff"],
         "er_max": common_options.get("env_cutoff", None),
         "oer_max": common_options.get("onsite_cutoff", None),
-        "pbc": set_options["pbc"],
-        "reduce_edge": set_options["reduce_edge"],
+        "reduce_edge": set_options.get("reduce_edge", None)
     }
 
-    dataset = ABACUSDataset(
-        root=set_options["root"],
-        preprocess_path=set_options["preprocess_path"],
-        h5file_names=set_options["file_names"],
-        AtomicData_options=AtomicDataOptions,
-        type_mapper=OrbitalMapper(basis=common_options["basis"]),
-    )
+    type = set_options["type"]
+
+    # Inputs in set_options needed for the ABACUS datasets:
+    # "root": the `.pth` file is saved under root; NO data is read from here.
+    # "preprocess_dir": the same "preprocess_dir" assigned in `dptb data`,
+    #     containing all necessary data files generated by `dptb data`.
+    # "pbc": must be specified here, true / false.
+    # "include_frames": optional list, for loading the in-memory version.
+    # Example:
+    # "train": {
+    #     "type": "ABACUSInMemoryDataset",
+    #     "root": "no/AtomicData/files/here",
+    #     "preprocess_dir": "same/as/in/dptb_data/input_json",
+    #     "pbc": true,
+    #     "include_frames": [1, 2, 3]
+    # }
+    if type == "ABACUSDataset":
+        assert "pbc" in set_options, "PBC must be provided in `data_options` when loading an ABACUS dataset."
+        AtomicDataOptions["pbc"] = set_options["pbc"]
+        dataset = ABACUSDataset(
+            root=set_options["root"],
+            preprocess_dir=set_options["preprocess_dir"],
+            AtomicData_options=AtomicDataOptions,
+        )
+    elif type == "ABACUSInMemoryDataset":
+        assert "pbc" in set_options, "PBC must be provided in `data_options` when loading an ABACUS dataset."
+        AtomicDataOptions["pbc"] = set_options["pbc"]
+        dataset = ABACUSInMemoryDataset(
+            root=set_options["root"],
+            preprocess_dir=set_options["preprocess_dir"],
+            include_frames=set_options.get("include_frames"),
+            AtomicData_options=AtomicDataOptions,
+        )
+
+    # Input in common_options for DefaultDataset:
+    # "basis": optional, dict like {"C": "2s2p1d"}.
+    #     Must be provided when loading Hamiltonians.
+    # Inputs in set_options for DefaultDataset:
+    # "root": main dir storing all trajectory folders.
+    # "prefix": optional, load only the selected trajectory folders.
+    # Example:
+    # "train": {
+    #     "type": "DefaultDataset",
+    #     "root": "foo/bar/data_files_here",
+    #     "prefix": "traj"
+    # }
+    elif type == "DefaultDataset":
+        if "basis" in common_options:
+            idp = OrbitalMapper(common_options["basis"])
+        else:
+            idp = None
+        dataset = DefaultDataset(
+            root=set_options["root"],
+            AtomicData_options=AtomicDataOptions,
+            type_mapper=idp,
+            prefix=set_options.get("prefix", None)
+        )
+
+    else:
+        raise ValueError(f"Unsupported dataset type: {type}.")
 
     return dataset
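
A minimal usage sketch of the new build_dataset entry point. The option dicts follow the commented examples in the diff above; the basis, paths, and cutoff values are illustrative assumptions, not taken from this commit.

from dptb.data.build import build_dataset

# common_options as consumed above: "basis" feeds OrbitalMapper,
# "bond_cutoff" maps to r_max; env/onsite cutoffs are optional.
common_options = {
    "basis": {"C": "2s2p1d"},
    "bond_cutoff": 5.0,
    "env_cutoff": 5.0,
    "onsite_cutoff": 5.0,
}

# set_options for the new DefaultDataset branch: "root" holds the
# trajectory folders, "prefix" optionally filters them.
set_options = {
    "type": "DefaultDataset",
    "root": "foo/bar/data_files_here",
    "prefix": "traj",
}

dataset = build_dataset(set_options, common_options)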
5 changes: 3 additions & 2 deletions dptb/data/dataset/__init__.py
@@ -2,12 +2,13 @@
 from ._ase_dataset import ASEDataset
 from ._npz_dataset import NpzDataset
 from ._hdf5_dataset import HDF5Dataset
-from ._abacus_dataset import ABACUSDataset
+from ._abacus_dataset import ABACUSDataset, ABACUSInMemoryDataset
 from ._deeph_dataset import DeePHE3Dataset
-from ._abacus_dataset_mem import ABACUSInMemoryDataset
+from ._default_dataset import DefaultDataset
 
 
 __all__ = [
+    DefaultDataset,
     DeePHE3Dataset,
     ABACUSInMemoryDataset,
     ABACUSDataset,
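
With this reshuffle both ABACUS classes live in _abacus_dataset and every dataset type is importable from one package; a quick sketch of the resulting import surface, using the names exactly as exported above:

from dptb.data.dataset import (
    DefaultDataset,
    DeePHE3Dataset,
    ABACUSDataset,
    ABACUSInMemoryDataset,
)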
129 changes: 78 additions & 51 deletions dptb/data/dataset/_abacus_dataset.py
@@ -1,5 +1,6 @@
 from typing import Dict, Any, List, Callable, Union, Optional
 import os
 
+import numpy as np
 import h5py
 
@@ -10,72 +11,98 @@
     AtomicDataDict,
 )
 from ..transforms import TypeMapper, OrbitalMapper
-from ._base_datasets import AtomicDataset
-from dptb.nn.hamiltonian import E3Hamiltonian
+from ._base_datasets import AtomicDataset, AtomicInMemoryDataset
+#from dptb.nn.hamiltonian import E3Hamiltonian
 from dptb.data.interfaces.ham_to_feature import ham_block_to_feature
 
 orbitalLId = {0:"s", 1:"p", 2:"d", 3:"f"}
 
+def _abacus_h5_reader(h5file_path, AtomicData_options):
+    data = h5py.File(h5file_path, "r")
+    atomic_data = AtomicData.from_points(
+        pos=data["pos"][:],
+        cell=data["cell"][:],
+        atomic_numbers=data["atomic_numbers"][:],
+        **AtomicData_options,
+    )
+    if "hamiltonian_blocks" in data:
+        basis = {}
+        for key, value in data["basis"].items():
+            basis[key] = [(f"{i+1}" + orbitalLId[l]) for i, l in enumerate(value)]
+        idp = OrbitalMapper(basis)
+        # e3 = E3Hamiltonian(idp=idp, decompose=True)
+        ham_block_to_feature(atomic_data, idp, data.get("hamiltonian_blocks", False), data.get("overlap_blocks", False))
+        # with torch.no_grad():
+        #     atomic_data = e3(atomic_data.to_dict())
+        #     atomic_data = AtomicData.from_dict(atomic_data)
+    if "eigenvalues" in data and "kpoints" in data:
+        atomic_data[AtomicDataDict.KPOINT_KEY] = torch.as_tensor(data["kpoints"][:], dtype=torch.get_default_dtype())
+        atomic_data[AtomicDataDict.ENERGY_EIGENVALUE_KEY] = torch.as_tensor(data["eigenvalues"][:], dtype=torch.get_default_dtype())
    return atomic_data
+
+# Lazy-loading class, built for large datasets.
 class ABACUSDataset(AtomicDataset):
 
     def __init__(
         self,
         root: str,
-        key_mapping: Dict[str, str] = {
-            "pos": AtomicDataDict.POSITIONS_KEY,
-            "energy": AtomicDataDict.TOTAL_ENERGY_KEY,
-            "atomic_numbers": AtomicDataDict.ATOMIC_NUMBERS_KEY,
-            "kpoints": AtomicDataDict.KPOINT_KEY,
-            "eigenvalues": AtomicDataDict.ENERGY_EIGENVALUE_KEY,
-        },
-        preprocess_path: str = None,
-        h5file_names: Optional[str] = None,
+        preprocess_dir: str,
         AtomicData_options: Dict[str, Any] = {},
         type_mapper: Optional[TypeMapper] = None,
     ):
         super().__init__(root=root, type_mapper=type_mapper)
-        self.key_mapping = key_mapping
-        self.key_list = list(key_mapping.keys())
-        self.value_list = list(key_mapping.values())
-        self.file_names = h5file_names
-        self.preprocess_path = preprocess_path
+        self.preprocess_dir = preprocess_dir
+        self.file_name = np.loadtxt(os.path.join(self.preprocess_dir, 'AtomicData_file.txt'), dtype=str)
         self.AtomicData_options = AtomicData_options
         # self.r_max = AtomicData_options["r_max"]
         # self.er_max = AtomicData_options["er_max"]
         # self.oer_max = AtomicData_options["oer_max"]
         # self.pbc = AtomicData_options["pbc"]
 
         self.index = None
-        self.num_examples = len(h5file_names)
+        self.num_examples = len(self.file_name)
 
     def get(self, idx):
-        file_name = self.file_names[idx]
-        file = os.path.join(self.preprocess_path, file_name)
-        data = h5py.File(file, "r")
-        atomic_data = AtomicData.from_points(
-            pos=data["pos"][:],
-            cell=data["cell"][:],
-            atomic_numbers=data["atomic_numbers"][:],
-            **self.AtomicData_options,
-        )
-        if data["hamiltonian_blocks"]:
-            basis = {}
-            for key, value in data["basis"].items():
-                basis[key] = [(f"{i+1}" + orbitalLId[l]) for i, l in enumerate(value)]
-            idp = OrbitalMapper(basis)
-            # e3 = E3Hamiltonian(idp=idp, decompose=True)
-            ham_block_to_feature(atomic_data, idp, data.get("hamiltonian_blocks", False), data.get("overlap_blocks", False))
-            # with torch.no_grad():
-            #     atomic_data = e3(atomic_data.to_dict())
-            #     atomic_data = AtomicData.from_dict(atomic_data)
-        if data.get("eigenvalue") and data.get("kpoint"):
-            atomic_data[AtomicDataDict.KPOINT_KEY] = torch.as_tensor(data["kpoint"][:], dtype=torch.get_default_dtype())
-            atomic_data[AtomicDataDict.ENERGY_EIGENVALUE_KEY] = torch.as_tensor(data["eigenvalue"][:], dtype=torch.get_default_dtype())
-        return atomic_data
+        name = self.file_name[idx]
+        h5_file = os.path.join(self.preprocess_dir, name)
+        atomic_data = _abacus_h5_reader(h5_file, self.AtomicData_options)
+        return atomic_data
 
     def len(self) -> int:
         return self.num_examples
+
+# In-memory version.
+class ABACUSInMemoryDataset(AtomicInMemoryDataset):
+
+    def __init__(
+        self,
+        root: str,
+        preprocess_dir: str,
+        url: Optional[str] = None,
+        AtomicData_options: Dict[str, Any] = {},
+        include_frames: Optional[List[int]] = None,
+        type_mapper: TypeMapper = None,
+    ):
+        self.preprocess_dir = preprocess_dir
+        self.file_name = np.loadtxt(os.path.join(self.preprocess_dir, 'AtomicData_file.txt'), dtype=str)
+
+        super(ABACUSInMemoryDataset, self).__init__(
+            file_name=self.file_name,
+            url=url,
+            root=root,
+            AtomicData_options=AtomicData_options,
+            include_frames=include_frames,
+            type_mapper=type_mapper,
+        )
+
+    def get_data(self):
+        data = []
+        for name in self.file_name:
+            h5_file = os.path.join(self.preprocess_dir, name)
+            data.append(_abacus_h5_reader(h5_file, self.AtomicData_options))
+        return data
+
+    @property
+    def raw_file_names(self):
+        return "AtomicData.h5"
+
+    @property
+    def raw_dir(self):
+        return self.root
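
A minimal sketch of driving the two classes directly; the paths, cutoffs, and frame indices are illustrative assumptions, not part of the commit. Per the reader above, each .h5 frame must carry "pos", "cell" and "atomic_numbers", optionally "hamiltonian_blocks"/"overlap_blocks" with a "basis" group, and optionally "eigenvalues"/"kpoints"; preprocess_dir must contain the AtomicData_file.txt index written by `dptb data`.

from dptb.data.dataset import ABACUSDataset, ABACUSInMemoryDataset

# Lazy variant: each get(idx) opens one .h5 file listed in
# preprocess_dir/AtomicData_file.txt.
lazy = ABACUSDataset(
    root="path/to/root",                   # only the processed output lives here
    preprocess_dir="path/to/preprocess",   # generated by `dptb data`
    AtomicData_options={"r_max": 5.0, "pbc": True},
)
frame = lazy.get(0)   # an AtomicData graph; Hamiltonian blocks become features

# In-memory variant: get_data() reads every listed frame once up front.
in_mem = ABACUSInMemoryDataset(
    root="path/to/root",
    preprocess_dir="path/to/preprocess",
    include_frames=[1, 2, 3],              # optional subset of frames
    AtomicData_options={"r_max": 5.0, "pbc": True},
)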
(Diffs for the remaining changed files were not loaded.)
