Merge pull request #1 from mila-iqia/lammps_output_preprocess
Lammps output preprocess
sblackburn86 authored Feb 27, 2024
2 parents fb61240 + a75654d commit 6b76ed8
Showing 8 changed files with 242 additions and 24 deletions.
68 changes: 68 additions & 0 deletions crystal_diffusion/data/parse_lammps_outputs.py
@@ -0,0 +1,68 @@
import argparse
import os
from collections import defaultdict

import pandas as pd
import yaml


def parse_lammps_output(lammps_dump: str, lammps_thermo_log: str, output_name: str):
    """Parse LAMMPS output files and save the result in parquet format.

    Args:
        lammps_dump: LAMMPS dump file (yaml format) with per-atom positions and forces
        lammps_thermo_log: LAMMPS thermodynamic variables output file (yaml format)
        output_name: name of the parsed output written by the script
    """
    if not os.path.exists(lammps_dump):
        raise ValueError(f'{lammps_dump} does not exist. Please provide a valid LAMMPS dump file in yaml format.')

    if not os.path.exists(lammps_thermo_log):
        raise ValueError(f'{lammps_thermo_log} does not exist. Please provide a valid LAMMPS thermo log file in yaml format.')

    # get the atom information (positions and forces) from the LAMMPS 'dump' file
    with open(lammps_dump, 'r') as f:
        dump_yaml = yaml.safe_load_all(f)
        # every MD iteration is saved as a separate document in the yaml file
        # prepare a dataframe to collect all the data
        pd_data = defaultdict(list)
        for doc in dump_yaml:  # loop over MD steps
            if 'id' not in doc['keywords']:  # sanity check
                raise ValueError('id should be in LAMMPS dump file')
            atoms_info = defaultdict(list)  # store the atoms' positions and forces for this MD step
            for data in doc['data']:  # loop over the atoms to get their positions and forces
                for key, v in zip(doc['keywords'], data):
                    if key in ['id', 'type', 'x', 'y', 'z', 'fx', 'fy', 'fz']:
                        atoms_info[key].append(v)  # keep ids, types, positions and forces; skip other columns
            # add the information about that MD step to the dataframe
            for k, v in atoms_info.items():  # k should be id, type, x, y, z, fx, fy, fz
                pd_data[k].append(v)

    # get the total energy from the LAMMPS thermo log
    with open(lammps_thermo_log, 'r') as f:
        log_yaml = yaml.safe_load(f)
        kin_idx = log_yaml['keywords'].index('KinEng')
        pot_idx = log_yaml['keywords'].index('PotEng')
        pd_data['energy'] = [x[kin_idx] + x[pot_idx] for x in log_yaml['data']]

    if not output_name.endswith('.parquet'):
        output_name += '.parquet'

    pd.DataFrame(pd_data).to_parquet(output_name, engine='pyarrow', index=False)


def main():
    """Parse LAMMPS output files and write a single parquet file."""
    parser = argparse.ArgumentParser(description="Convert LAMMPS outputs into a parquet file compatible with a dataloader.")
    parser.add_argument("--dump_file", type=str, help="LAMMPS dump file in yaml format.")
    parser.add_argument("--thermo_file", type=str, help="LAMMPS thermo output file in yaml format.")
    parser.add_argument("--output_name", type=str, help="Output file name.")
    args = parser.parse_args()

    parse_lammps_output(args.dump_file, args.thermo_file, args.output_name)


if __name__ == '__main__':
    main()
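
For quick inspection, the parser can also be called directly from Python. A minimal sketch, assuming the dump and thermo yaml files from the example scripts below sit in the working directory (the file names here are illustrative; substitute the actual LAMMPS outputs):

import pandas as pd

from crystal_diffusion.data.parse_lammps_outputs import parse_lammps_output

parse_lammps_output(
    lammps_dump="dump.si-300-1.yaml",       # per-step atom positions and forces (yaml dump)
    lammps_thermo_log="thermo_log.yaml",    # thermo keywords/data extracted from log.lammps
    output_name="demo.parquet",             # '.parquet' is appended automatically if missing
)

df = pd.read_parquet("demo.parquet")
print(len(df))               # one row per MD step
print(df.columns.tolist())   # per-atom columns (id, type, x, ..., fx, ...) plus 'energy'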
31 changes: 31 additions & 0 deletions data/lammps_input_example.lammps
@@ -0,0 +1,31 @@
log log.si-${T}-${S}.lammps

units metal
atom_style atomic
atom_modify map array

lattice diamond 5.43
region simbox block 0 ${S} 0 ${S} 0 ${S}
create_box 1 simbox
create_atoms 1 region simbox

#read_dump ${DUMP} ${STEP} x y z vx vy vz fx fy fz box yes replace no purge yes add yes

mass 1 28.0855

group Si type 1

pair_style sw
pair_coeff * * si.sw Si

velocity all create ${T} 62177

dump 1 all yaml 1 dump.si-${T}-${S}.yaml id type x y z fx fy fz

thermo_style yaml
thermo 1
#==========================Output files========================

fix 1 all nvt temp ${T} ${T} 0.01
run ${STEP}
unfix 1
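
The dump command above writes one yaml document per MD step, each holding a keywords list and a data block of per-atom rows; this is the structure crystal_diffusion/data/parse_lammps_outputs.py iterates over. A small sketch to peek at the first snapshot, assuming the file name produced for T=300 and S=1 as in run_lammps_example.sh below:

import yaml

with open("dump.si-300-1.yaml", "r") as f:
    first_snapshot = next(yaml.safe_load_all(f))  # each MD step is a separate yaml document

print(first_snapshot["keywords"])  # ['id', 'type', 'x', 'y', 'z', 'fx', 'fy', 'fz'] per the dump command
print(first_snapshot["data"][0])   # values for the first atom at this MD step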
11 changes: 11 additions & 0 deletions data/parse_lammps.sh
@@ -0,0 +1,11 @@
#!/bin/bash

EXP_DIR="lammps_scripts/Si/si-custom/"
DUMP_FILENAME="dump.si-300-1.yaml"
THERMO_FILENAME="thermo_log.yaml"
OUTPUT_NAME="demo.parquet"

python crystal_diffusion/data/parse_lammps_outputs.py \
--dump_file ${EXP_DIR}/${DUMP_FILENAME} \
--thermo_file ${EXP_DIR}/${THERMO_FILENAME} \
--output_name ${EXP_DIR}/${OUTPUT_NAME}
9 changes: 9 additions & 0 deletions data/run_lammps_example.sh
@@ -0,0 +1,9 @@
#!/bin/bash

TEMPERATURE=300
BOX_SIZE=1

lmp < lammps_input_example.lammps -v STEP 10 -v T $TEMPERATURE -v S $BOX_SIZE

# extract the thermodynamic output into a yaml file
egrep '^(keywords:|data:$|---$|\.\.\.$| - \[)' log.lammps > log.yaml
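
The egrep filter keeps only the yaml-formatted thermo block from log.lammps. As a sanity-check sketch, assuming the extracted file is log.yaml as written above, the result can be loaded and checked for the columns that parse_lammps_output relies on:

import yaml

with open("log.yaml", "r") as f:
    thermo = yaml.safe_load(f)

# parse_lammps_output sums the 'KinEng' and 'PotEng' columns to get the total energy per step.
assert "KinEng" in thermo["keywords"] and "PotEng" in thermo["keywords"]
print(len(thermo["data"]), "thermo rows (one per MD step)")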
22 changes: 22 additions & 0 deletions data/si.sw
@@ -0,0 +1,22 @@
# Stillinger-Weber parameters for various elements and mixtures
# multiple entries can be added to this file, LAMMPS reads the ones it needs
# these entries are in LAMMPS "metal" units:
# epsilon = eV; sigma = Angstroms
# other quantities are unitless

# format of a single entry (one or more lines):
# element 1, element 2, element 3,
# epsilon, sigma, a, lambda, gamma, costheta0, A, B, p, q, tol

# Here are the original parameters in metal units, for Silicon from:
#
# Stillinger and Weber, Phys. Rev. B, v. 31, p. 5262, (1985)
#
# Parameters for 'dia' Si
Si Si Si 2.1683 2.0951 1.80 21.0 1.20 -0.333333333333
7.049556277 0.6022245584 4.0 0.0 0.0
#
# Parameters for amorphous Si with the modified SW potential
# (R. L. C. Vink, G. T. Barkema, W. F. van der Weg and N. Mousseau, A semi-empirical potential for amorphous silicon, J. Non-Cryst. Sol. 282, 248-255 (2001))
#Si Si Si 1.64833 2.0951 1.80 31.5 1.20 -0.333333333333
# 7.049556277 0.6022245584 4.0 0.0 0.0
24 changes: 24 additions & 0 deletions requirements.txt
@@ -0,0 +1,24 @@
datasets==2.17.1
flake8==4.0.1
flake8-docstrings==1.6.0
gitpython==3.1.27
isort==5.13.2
jupyter==1.0.0
jinja2==3.1.2
myst-parser==2.0.0
orion>=0.2.4.post1
pyarrow==15.0.0
pyyaml==6.0
pytest==7.1.2
pytest-cov==3.0.0
pytorch_lightning>=2.2.0
pytype==2024.2.13
sphinx==7.2.6
sphinx-autoapi==3.0.0
sphinx-rtd-theme==2.0.0
sphinxcontrib-napoleon==0.7
sphinxcontrib-katex==0.8.6
tensorboard==2.16.2
tqdm==4.64.0
torch==2.2.0
torchvision>=0.17.0
28 changes: 4 additions & 24 deletions setup.py
@@ -1,34 +1,14 @@
 from setuptools import find_packages, setup
 
+with open('requirements.txt', 'r') as f:
+    requirements = f.readlines()
+
 setup(
     name='crystal_diffusion',
     version='0.0.1',
     packages=find_packages(include=['crystal_diffusion', 'crystal_diffusion.*']),
     python_requires='>=3.11',
-    install_requires=[
-        'flake8==4.0.1',
-        'flake8-docstrings==1.6.0',
-        'isort==5.13.2',
-        'gitpython==3.1.27',
-        'jupyter==1.0.0',
-        'jinja2==3.1.2',
-        'myst-parser==2.0.0',
-        'orion>=0.2.4.post1',
-        'pyyaml==6.0',
-        'pytest==7.1.2',
-        'pytest-cov==3.0.0',
-        'pytorch_lightning>=2.2.0',
-        'pytype==2024.2.13',
-        'sphinx==7.2.6',
-        'sphinx-autoapi==3.0.0',
-        'sphinx-rtd-theme==2.0.0',
-        'sphinxcontrib-napoleon==0.7',
-        'sphinxcontrib-katex==0.8.6',
-        'tensorboard==2.16.2',
-        'tqdm==4.64.0',
-        'torch==2.2.0',
-        'torchvision>=0.17.0',
-    ],
+    install_requires=requirements,
     entry_points={
         'console_scripts': [
             'cd-train=crystal_diffusion.train:main',
Expand Down
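With this change the pinned dependencies live in one place: setup.py reads requirements.txt at install time, so pip install -e . and pip install -r requirements.txt resolve the same versions and the two lists cannot drift apart.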
73 changes: 73 additions & 0 deletions tests/data/test_parse_lammps_output.py
@@ -0,0 +1,73 @@
import os

import numpy as np
import pandas as pd
import pytest
import yaml

from crystal_diffusion.data.parse_lammps_outputs import parse_lammps_output


def generate_fake_yaml(filename, documents, multiple_docs=True):
    # Write the YAML content
    with open(filename, 'w') as yaml_file:
        if multiple_docs:
            yaml.dump_all(documents, yaml_file)
        else:
            yaml.dump(documents, yaml_file)


@pytest.fixture
def fake_lammps_yaml(tmpdir):
    # fake LAMMPS output file with 4 MD steps in 1D for 3 atoms
    yaml_content = [
        {'keywords': ['id', 'type', 'x', 'fx'],
         'data': [[0, 1, 0.1, 0.01], [1, 2, 0.2, 0.02], [2, 1, 0.3, 0.03]]},
        {'keywords': ['id', 'type', 'x', 'fx'],
         'data': [[0, 1, 1.1, 1.01], [1, 2, 1.2, 1.02], [2, 1, 1.3, 1.03]]},
        {'keywords': ['id', 'type', 'x', 'fx'],
         'data': [[0, 1, 2.1, 2.01], [1, 2, 2.2, 2.02], [2, 1, 2.3, 2.03]]},
        {'keywords': ['id', 'type', 'x', 'fx'],
         'data': [[0, 1, 3.1, 3.01], [1, 2, 3.2, 3.02], [2, 1, 3.3, 3.03]]},
    ]
    file = os.path.join(tmpdir, 'fake_lammps_dump.yaml')
    generate_fake_yaml(file, yaml_content)
    return file


@pytest.fixture
def fake_thermo_yaml(tmpdir):
    # fake LAMMPS thermo file with 4 MD steps
    yaml_content = {
        'keywords': ['KinEng', 'PotEng'],
        'data': [[0.4, 0.5], [1.4, 1.5], [2.4, 2.5], [3.4, 3.5]]
    }
    file = os.path.join(tmpdir, 'fake_lammps_thermo.yaml')
    generate_fake_yaml(file, yaml_content, multiple_docs=False)
    return file


def test_parse_lammps_outputs(fake_lammps_yaml, fake_thermo_yaml, tmpdir):
    output_name = os.path.join(tmpdir, 'test.parquet')
    parse_lammps_output(fake_lammps_yaml, fake_thermo_yaml, output_name)
    # check that the parquet file was written
    assert os.path.exists(output_name)

    df = pd.read_parquet(output_name)
    assert not df.empty

    # one row per MD step
    assert len(df) == 4

    for v in ['id', 'type', 'x', 'fx', 'energy']:
        assert v in df.keys()
        for x in range(4):
            if v == 'id':
                assert np.array_equal(df[v][x], [0, 1, 2])
            elif v == 'type':
                assert np.array_equal(df[v][x], [1, 2, 1])
            elif v == 'x':
                assert np.allclose(df[v][x], [x + 0.1 * y for y in range(1, 4)])
            elif v == 'fx':
                assert np.allclose(df[v][x], [x + 0.01 * y for y in range(1, 4)])
            else:  # v == 'energy'
                assert np.allclose(df[v][x], [2 * x + 0.9])
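
The new test can be run on its own with pytest tests/data/test_parse_lammps_output.py; pytest and pyarrow (needed for the parquet round-trip) are both pinned in requirements.txt.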
