def parse_lammps_output(lammps_dump: str, lammps_thermo_log: str, output_name: str) -> None:
    """Parse LAMMPS yaml outputs and save them as a single parquet file.

    Every MD step (one yaml document of the dump file) becomes one row of the output. For each of the
    per-atom quantities id, type, x, y, z, fx, fy, fz present in the dump, the row holds the list of values over
    the atoms. An 'energy' column holds KinEng + PotEng of the matching entry of the thermo log.

    Args:
        lammps_dump: LAMMPS dump file in yaml format, one yaml document per MD step
        lammps_thermo_log: LAMMPS thermodynamic variables output file in yaml format (single document)
        output_name: name of parsed output written by the script. '.parquet' is appended if missing.

    Raises:
        ValueError: if an input file does not exist, if a dump document has no 'id' keyword, if the thermo log
            lacks KinEng or PotEng, or if the dump and the thermo log do not describe the same number of steps.
    """
    if not os.path.exists(lammps_dump):
        raise ValueError(f'{lammps_dump} does not exist. Please provide a valid LAMMPS dump file as yaml.')

    if not os.path.exists(lammps_thermo_log):
        raise ValueError(f'{lammps_thermo_log} does not exist. Please provide a valid LAMMPS thermo log file as yaml.')

    # per-atom quantities kept in the output; anything else in the dump is ignored
    atom_keys = ('id', 'type', 'x', 'y', 'z', 'fx', 'fy', 'fz')

    # get the atom information (positions and forces) from the LAMMPS 'dump' file
    pd_data = defaultdict(list)
    n_steps = 0
    with open(lammps_dump, 'r') as f:
        # every MD iteration is saved as a separate document in the yaml file.
        # safe_load_all is lazy, so the documents must be consumed while the file is still open.
        for doc in yaml.safe_load_all(f):  # loop over MD steps
            keywords = doc['keywords']
            if 'id' not in keywords:  # sanity check
                raise ValueError('id should be in LAMMPS dump file')
            # pre-seed one (possibly empty) list per kept quantity so that every step contributes exactly one
            # entry per column, even when the step has no atoms; this keeps rows aligned with the energies
            atoms_info = {key: [] for key in keywords if key in atom_keys}
            for atom in doc['data']:  # loop over the atoms to get their positions and forces
                for key, value in zip(keywords, atom):
                    if key in atoms_info:
                        atoms_info[key].append(value)
            # add the information about that MD step to the future dataframe
            for key, values in atoms_info.items():
                pd_data[key].append(values)
            n_steps += 1

    # get the total energy from the LAMMPS second output
    with open(lammps_thermo_log, 'r') as f:
        log_yaml = yaml.safe_load(f)
    kin_idx = log_yaml['keywords'].index('KinEng')  # list.index raises ValueError when the keyword is absent
    pot_idx = log_yaml['keywords'].index('PotEng')
    energies = [x[kin_idx] + x[pot_idx] for x in log_yaml['data']]

    # fail with an explicit message instead of pandas' generic length error (or a silent energy-only file)
    if len(energies) != n_steps:
        raise ValueError(f'{lammps_dump} contains {n_steps} MD steps but {lammps_thermo_log} contains '
                         f'{len(energies)} thermo entries. Both files should describe the same steps.')
    pd_data['energy'] = energies

    if not output_name.endswith('.parquet'):
        output_name += '.parquet'

    pd.DataFrame(pd_data).to_parquet(output_name, engine='pyarrow', index=False)


def main():
    """Main script to parse LAMMPS files and output a single parquet file."""
    parser = argparse.ArgumentParser(description="Convert LAMMPS outputs in parquet file compatible with a dataloader.")
    # all three options are mandatory: without them the parser would receive None and fail on path handling
    parser.add_argument("--dump_file", type=str, required=True, help="LAMMPS dump file in yaml format.")
    parser.add_argument("--thermo_file", type=str, required=True, help="LAMMPS thermo output file in yaml format.")
    parser.add_argument("--output_name", type=str, required=True, help="Output name")
    args = parser.parse_args()

    parse_lammps_output(args.dump_file, args.thermo_file, args.output_name)


if __name__ == '__main__':
    main()
+#!/bin/bash + +EXP_DIR="lammps_scripts/Si/si-custom/" +DUMP_FILENAME="dump.si-300-1.yaml" +THERMO_FILENAME="thermo_log.yaml" +OUTPUT_NAME="demo.parquet" + +python crystal_diffusion/data/parse_lammps_outputs.py \ + --dump_file ${EXP_DIR}/${DUMP_FILENAME} \ + --thermo_file ${EXP_DIR}/${THERMO_FILENAME} \ + --output_name ${EXP_DIR}/${OUTPUT_NAME} diff --git a/data/run_lammps_example.sh b/data/run_lammps_example.sh new file mode 100644 index 00000000..60ea1792 --- /dev/null +++ b/data/run_lammps_example.sh @@ -0,0 +1,9 @@ +#!/bin/bash + +TEMPERATURE=300 +BOX_SIZE=1 + +lmp < lammps_input_example.lammps -v STEP 10 -v T $TEMPERATURE -v S $BOX_SIZE + +# extract the thermodynamic outputs in a yaml file +egrep '^(keywords:|data:$|---$|\.\.\.$| - \[)' log.lammps > log.yaml diff --git a/data/si.sw b/data/si.sw new file mode 100755 index 00000000..26cfc25b --- /dev/null +++ b/data/si.sw @@ -0,0 +1,22 @@ +# Stillinger-Weber parameters for various elements and mixtures +# multiple entries can be added to this file, LAMMPS reads the ones it needs +# these entries are in LAMMPS "metal" units: +# epsilon = eV; sigma = Angstroms +# other quantities are unitless + +# format of a single entry (one or more lines): +# element 1, element 2, element 3, +# epsilon, sigma, a, lambda, gamma, costheta0, A, B, p, q, tol + +# Here are the original parameters in metal units, for Silicon from: +# +# Stillinger and Weber, Phys. Rev. B, v. 31, p. 5262, (1985) +# +# Parameters for 'dia' Si +Si Si Si 2.1683 2.0951 1.80 21.0 1.20 -0.333333333333 + 7.049556277 0.6022245584 4.0 0.0 0.0 +# +# Parameters for amorphous Si with the modified SW potential +#(R. L. C. Vink, G. T. Barkema, W. F. van der Weg et N. Mousseau, A semi-empirical potential for amorphous silicon, J. Non-Cryst. Sol. 
282, 248-255 (2001)) +#Si Si Si 1.64833 2.0951 1.80 31.5 1.20 -0.333333333333 +# 7.049556277 0.6022245584 4.0 0.0 0.0 diff --git a/requirements.txt b/requirements.txt new file mode 100644 index 00000000..cc765920 --- /dev/null +++ b/requirements.txt @@ -0,0 +1,24 @@ +datasets==2.17.1 +flake8==4.0.1 +flake8-docstrings==1.6.0 +gitpython==3.1.27 +isort==5.13.2 +jupyter==1.0.0 +jinja2==3.1.2 +myst-parser==2.0.0 +orion>=0.2.4.post1 +pyarrow==15.0.0 +pyyaml==6.0 +pytest==7.1.2 +pytest-cov==3.0.0 +pytorch_lightning>=2.2.0 +pytype==2024.2.13 +sphinx==7.2.6 +sphinx-autoapi==3.0.0 +sphinx-rtd-theme==2.0.0 +sphinxcontrib-napoleon==0.7 +sphinxcontrib-katex==0.8.6 +tensorboard==2.16.2 +tqdm==4.64.0 +torch==2.2.0 +torchvision>=0.17.0 diff --git a/setup.py b/setup.py index 0db43ec0..2a9715ba 100644 --- a/setup.py +++ b/setup.py @@ -1,34 +1,14 @@ from setuptools import find_packages, setup +with open('requirements.txt', 'r') as f: + requirements = f.readlines() + setup( name='crystal_diffusion', version='0.0.1', packages=find_packages(include=['crystal_diffusion', 'crystal_diffusion.*']), python_requires='>=3.11', - install_requires=[ - 'flake8==4.0.1', - 'flake8-docstrings==1.6.0', - 'isort==5.13.2', - 'gitpython==3.1.27', - 'jupyter==1.0.0', - 'jinja2==3.1.2', - 'myst-parser==2.0.0', - 'orion>=0.2.4.post1', - 'pyyaml==6.0', - 'pytest==7.1.2', - 'pytest-cov==3.0.0', - 'pytorch_lightning>=2.2.0', - 'pytype==2024.2.13', - 'sphinx==7.2.6', - 'sphinx-autoapi==3.0.0', - 'sphinx-rtd-theme==2.0.0', - 'sphinxcontrib-napoleon==0.7', - 'sphinxcontrib-katex==0.8.6', - 'tensorboard==2.16.2', - 'tqdm==4.64.0', - 'torch==2.2.0', - 'torchvision>=0.17.0', - ], + install_requires=requirements, entry_points={ 'console_scripts': [ 'cd-train=crystal_diffusion.train:main', diff --git a/tests/data/test_parse_lammps_output.py b/tests/data/test_parse_lammps_output.py new file mode 100644 index 00000000..bb82ae52 --- /dev/null +++ b/tests/data/test_parse_lammps_output.py @@ -0,0 +1,73 @@ +import os + 
def generate_fake_yaml(filename, documents, multiple_docs=True):
    """Write documents to filename, as a multi-document yaml stream or as a single yaml document."""
    writer = yaml.dump_all if multiple_docs else yaml.dump
    with open(filename, 'w') as yaml_file:
        writer(documents, yaml_file)


@pytest.fixture
def fake_lammps_yaml(tmpdir):
    """Path to a fake LAMMPS dump file with 4 MD steps in 1D for 3 atoms."""
    atoms_per_step = [
        [[0, 1, 0.1, 0.01], [1, 2, 0.2, 0.02], [2, 1, 0.3, 0.03]],
        [[0, 1, 1.1, 1.01], [1, 2, 1.2, 1.02], [2, 1, 1.3, 1.03]],
        [[0, 1, 2.1, 2.01], [1, 2, 2.2, 2.02], [2, 1, 2.3, 2.03]],
        [[0, 1, 3.1, 3.01], [1, 2, 3.2, 3.02], [2, 1, 3.3, 3.03]],
    ]
    # a fresh keywords list per document: a shared list object would be serialized with yaml anchors/aliases
    steps = [{'keywords': ['id', 'type', 'x', 'fx'], 'data': atoms} for atoms in atoms_per_step]
    dump_path = os.path.join(tmpdir, 'fake_lammps_dump.yaml')
    generate_fake_yaml(dump_path, steps)
    return dump_path


@pytest.fixture
def fake_thermo_yaml(tmpdir):
    """Path to a fake LAMMPS thermo file with 4 MD steps."""
    thermo_path = os.path.join(tmpdir, 'fake_lammps_thermo.yaml')
    thermo = {
        'keywords': ['KinEng', 'PotEng'],
        'data': [[0.4, 0.5], [1.4, 1.5], [2.4, 2.5], [3.4, 3.5]]
    }
    generate_fake_yaml(thermo_path, thermo, multiple_docs=False)
    return thermo_path


def test_parse_lammps_outputs(fake_lammps_yaml, fake_thermo_yaml, tmpdir):
    """Parse the fake dump and thermo files and check every column of the resulting parquet file."""
    output_name = os.path.join(tmpdir, 'test.parquet')
    parse_lammps_output(fake_lammps_yaml, fake_thermo_yaml, output_name)
    # check that a file exists
    assert os.path.exists(output_name)

    df = pd.read_parquet(output_name)
    assert not df.empty

    assert len(df) == 4

    for column in ('id', 'type', 'x', 'fx', 'energy'):
        assert column in df.keys()

    for step in range(4):
        # ids and types are the same at every step
        assert np.array_equal(df['id'][step], [0, 1, 2])
        assert np.array_equal(df['type'][step], [1, 2, 1])
        # positions and forces are offset by the step index
        assert np.allclose(df['x'][step], [step + 0.1 * atom for atom in range(1, 4)])
        assert np.allclose(df['fx'][step], [step + 0.01 * atom for atom in range(1, 4)])
        # energy is KinEng + PotEng of the fake thermo log
        assert np.allclose(df['energy'][step], [2 * step + 0.9])