Merge pull request #1 from mila-iqia/lammps_output_preprocess

Lammps output preprocess
mila-iqia · Feb 27, 2024 · 6b76ed8 · 6b76ed8
2 parents fb61240 + a75654d
commit 6b76ed8
Show file tree

Hide file tree

Showing 8 changed files with 242 additions and 24 deletions.
diff --git a/crystal_diffusion/data/parse_lammps_outputs.py b/crystal_diffusion/data/parse_lammps_outputs.py
@@ -0,0 +1,68 @@
+import argparse
+import os
+from collections import defaultdict
+
+import pandas as pd
+import yaml
+
+
+def parse_lammps_output(lammps_dump: str, lammps_thermo_log: str, output_name: str):
+    """Parse a LAMMPS output file and save in a .csv format.
+
+    Args:
+        lammps_dump: LAMMPS output file
+        lammps_thermo_log: LAMMPS thermodynamic variables output file
+        output_name: name of parsed output written by the script
+    """
+    if not os.path.exists(lammps_dump):
+        raise ValueError(f'{lammps_dump} does not exist. Please provide a valid LAMMPS dump file as yaml.')
+
+    if not os.path.exists(lammps_thermo_log):
+        raise ValueError(f'{lammps_thermo_log} does not exist. Please provide a valid LAMMPS thermo log file as yaml.')
+
+    # get the atom information (positions and forces) from the LAMMPS 'dump' file
+    with open(lammps_dump, 'r') as f:
+        dump_yaml = yaml.safe_load_all(f)
+        # every MD iteration is saved as a separate document in the yaml file
+        # prepare a dataframe to get all the data
+        pd_data = defaultdict(list)
+        for doc in dump_yaml:  # loop over MD steps
+            if 'id' not in doc['keywords']:  # sanity check
+                raise ValueError('id should be in LAMMPS dump file')
+            atoms_info = defaultdict(list)  # store information on atoms positions and forces here
+            for data in doc['data']:  # loop over the atoms to get their positions and forces
+                for key, v in zip(doc['keywords'], data):
+                    if key not in ['id', 'type', 'x', 'y', 'z', 'fx', 'fy', 'fz']:
+                        continue
+                    else:
+                        atoms_info[key].append(v)  # get positions or forces
+            # add the information about that MD step to the dataframe
+            for k, v in atoms_info.items():  # k should be id, type, x, y, z, fx, fy, fz
+                pd_data[k].append(v)
+
+    # get the total energy from the LAMMPS second output
+    with open(lammps_thermo_log, 'r') as f:
+        log_yaml = yaml.safe_load(f)
+        kin_idx = log_yaml['keywords'].index('KinEng')
+        pot_idx = log_yaml['keywords'].index('PotEng')
+        pd_data['energy'] = [x[kin_idx] + x[pot_idx] for x in log_yaml['data']]
+
+    if not output_name.endswith('.parquet'):
+        output_name += '.parquet'
+
+    pd.DataFrame(pd_data).to_parquet(output_name, engine='pyarrow', index=False)
+
+
+def main():
+    """Main script to parse LAMMPS files and output a single parquet file."""
+    parser = argparse.ArgumentParser(description="Convert LAMMPS outputs in parquet file compatible with a dataloader.")
+    parser.add_argument("--dump_file", type=str, help="LAMMPS dump file in yaml format.")
+    parser.add_argument("--thermo_file", type=str, help="LAMMPS thermo output file in yaml format.")
+    parser.add_argument("--output_name", type=str, help="Output name")
+    args = parser.parse_args()
+
+    parse_lammps_output(args.dump_file, args.thermo_file, args.output_name)
+
+
+# Run the conversion only when this file is executed as a script, not when it is imported.
+if __name__ == '__main__':
+    main()
diff --git a/data/lammps_input_example.lammps b/data/lammps_input_example.lammps
@@ -0,0 +1,31 @@
+# Example LAMMPS input for a silicon MD run.
+# The variables T, S and STEP are not defined in this file; they have to be supplied by the caller
+# (for example with -v on the lmp command line).
+
+# From here on, logging goes to a file named after T and S.
+log log.si-${T}-${S}.lammps
+
+units           metal
+atom_style      atomic
+atom_modify     map array
+
+# Diamond lattice with constant 5.43, filling a cubic box of S lattice cells per side with atoms of type 1.
+lattice         diamond 5.43
+region          simbox block    0 ${S} 0 ${S} 0 ${S}
+create_box      1 simbox
+create_atoms    1 region simbox
+
+# Disabled alternative: read the configuration from an existing dump file instead.
+#read_dump ${DUMP} ${STEP} x y z vx vy vz fx fy fz box yes replace no purge yes add yes
+
+mass 1 28.0855
+
+group Si type 1
+
+# Stillinger-Weber potential, with the parameters read from si.sw for the element Si.
+pair_style sw
+pair_coeff * * si.sw Si
+
+# Initial velocities created for T; 62177 is presumably the random seed - TODO confirm.
+velocity   all create ${T} 62177
+
+# Per-atom id, type, positions and forces written in yaml format, at an interval of 1.
+dump 1 all yaml 1 dump.si-${T}-${S}.yaml id type x y z fx fy fz
+
+# Thermodynamic output in yaml style, at an interval of 1.
+thermo_style yaml
+thermo 1
+#==========================Output files========================
+
+# nvt fix with start and end temperature both set to T, kept active for STEP steps and removed afterwards.
+fix 1 all nvt temp ${T} ${T} 0.01
+run ${STEP}
+unfix 1
diff --git a/data/parse_lammps.sh b/data/parse_lammps.sh
@@ -0,0 +1,11 @@
+#!/bin/bash
+# Convert the LAMMPS dump and thermo yaml files of one experiment into a single parquet file.
+
+# No trailing slash here: the paths below add the separator themselves.
+EXP_DIR="lammps_scripts/Si/si-custom"
+DUMP_FILENAME="dump.si-300-1.yaml"
+THERMO_FILENAME="thermo_log.yaml"
+OUTPUT_NAME="demo.parquet"
+
+# Every expansion is quoted so that a path containing spaces or glob characters stays a single argument.
+python crystal_diffusion/data/parse_lammps_outputs.py \
+    --dump_file "${EXP_DIR}/${DUMP_FILENAME}" \
+    --thermo_file "${EXP_DIR}/${THERMO_FILENAME}" \
+    --output_name "${EXP_DIR}/${OUTPUT_NAME}"
diff --git a/data/run_lammps_example.sh b/data/run_lammps_example.sh
@@ -0,0 +1,9 @@
+#!/bin/bash
+# Run the example LAMMPS simulation and extract its thermodynamic output as yaml.
+
+TEMPERATURE=300
+BOX_SIZE=1
+STEP=10
+
+lmp < lammps_input_example.lammps -v STEP "$STEP" -v T "$TEMPERATURE" -v S "$BOX_SIZE"
+
+# extract the thermodynamic outputs in a yaml file
+# The input script starts with `log log.si-${T}-${S}.lammps`, so the thermo block is written to that file
+# and not to the default log.lammps.
+grep -E '^(keywords:|data:$|---$|\.\.\.$|  - \[)' "log.si-${TEMPERATURE}-${BOX_SIZE}.lammps" > log.yaml
diff --git a/data/si.sw b/data/si.sw
@@ -0,0 +1,22 @@
+# Stillinger-Weber parameters for various elements and mixtures
+# multiple entries can be added to this file, LAMMPS reads the ones it needs
+# these entries are in LAMMPS "metal" units:
+#   epsilon = eV; sigma = Angstroms
+#   other quantities are unitless
+
+# format of a single entry (one or more lines):
+#   element 1, element 2, element 3, 
+#   epsilon, sigma, a, lambda, gamma, costheta0, A, B, p, q, tol
+
+# Here are the original parameters in metal units, for Silicon from:
+#
+# Stillinger and Weber,  Phys. Rev. B, v. 31, p. 5262, (1985)
+#
+# Parameters for 'dia' Si
+Si Si Si 2.1683  2.0951  1.80  21.0  1.20  -0.333333333333
+         7.049556277  0.6022245584  4.0  0.0 0.0
+#
+# Parameters for amorphous Si  with the modified SW potential
+# (R. L. C. Vink, G. T. Barkema, W. F. van der Weg and N. Mousseau, "A semi-empirical potential for amorphous silicon", J. Non-Cryst. Sol. 282, 248-255 (2001))
+#Si Si Si 1.64833  2.0951  1.80  31.5  1.20  -0.333333333333
+#         7.049556277  0.6022245584  4.0  0.0 0.0 
diff --git a/requirements.txt b/requirements.txt
@@ -0,0 +1,24 @@
+datasets==2.17.1
+flake8==4.0.1
+flake8-docstrings==1.6.0
+gitpython==3.1.27
+isort==5.13.2
+jupyter==1.0.0
+jinja2==3.1.2
+myst-parser==2.0.0
+orion>=0.2.4.post1
+pyarrow==15.0.0
+pyyaml==6.0
+pytest==7.1.2
+pytest-cov==3.0.0
+pytorch_lightning>=2.2.0
+pytype==2024.2.13
+sphinx==7.2.6
+sphinx-autoapi==3.0.0
+sphinx-rtd-theme==2.0.0
+sphinxcontrib-napoleon==0.7
+sphinxcontrib-katex==0.8.6
+tensorboard==2.16.2
+tqdm==4.64.0
+torch==2.2.0
+torchvision>=0.17.0
diff --git a/setup.py b/setup.py
@@ -1,34 +1,14 @@
 from setuptools import find_packages, setup
 
+with open('requirements.txt', 'r') as f:
+    requirements = f.readlines()
+
 setup(
     name='crystal_diffusion',
     version='0.0.1',
     packages=find_packages(include=['crystal_diffusion', 'crystal_diffusion.*']),
     python_requires='>=3.11',
-    install_requires=[
-        'flake8==4.0.1',
-        'flake8-docstrings==1.6.0',
-        'isort==5.13.2',
-        'gitpython==3.1.27',
-        'jupyter==1.0.0',
-        'jinja2==3.1.2',
-        'myst-parser==2.0.0',
-        'orion>=0.2.4.post1',
-        'pyyaml==6.0',
-        'pytest==7.1.2',
-        'pytest-cov==3.0.0',
-        'pytorch_lightning>=2.2.0',
-        'pytype==2024.2.13',
-        'sphinx==7.2.6',
-        'sphinx-autoapi==3.0.0',
-        'sphinx-rtd-theme==2.0.0',
-        'sphinxcontrib-napoleon==0.7',
-        'sphinxcontrib-katex==0.8.6',
-        'tensorboard==2.16.2',
-        'tqdm==4.64.0',
-        'torch==2.2.0',
-        'torchvision>=0.17.0',
-    ],
+    install_requires=requirements,
     entry_points={
         'console_scripts': [
             'cd-train=crystal_diffusion.train:main',

diff --git a/tests/data/test_parse_lammps_output.py b/tests/data/test_parse_lammps_output.py
@@ -0,0 +1,73 @@
+import os
+
+import numpy as np
+import pandas as pd
+import pytest
+import yaml
+
+from crystal_diffusion.data.parse_lammps_outputs import parse_lammps_output
+
+
+def generate_fake_yaml(filename, documents, multiple_docs=True):
+    # Write the YAML content
+    with open(filename, 'w') as yaml_file:
+        if multiple_docs:
+            yaml.dump_all(documents, yaml_file)
+        else:
+            yaml.dump(documents, yaml_file)
+
+
+@pytest.fixture
+def fake_lammps_yaml(tmpdir):
+    # fake LAMMPS output file with 4 MD steps in 1D for 3 atoms
+    yaml_content = [
+        {'keywords': ['id', 'type', 'x', 'fx'],
+         'data': [[0, 1, 0.1, 0.01], [1, 2, 0.2, 0.02], [2, 1, 0.3, 0.03]]},
+        {'keywords': ['id', 'type', 'x', 'fx'],
+         'data': [[0, 1, 1.1, 1.01], [1, 2, 1.2, 1.02], [2, 1, 1.3, 1.03]]},
+        {'keywords': ['id', 'type', 'x', 'fx'],
+         'data': [[0, 1, 2.1, 2.01], [1, 2, 2.2, 2.02], [2, 1, 2.3, 2.03]]},
+        {'keywords': ['id', 'type', 'x', 'fx'],
+         'data': [[0, 1, 3.1, 3.01], [1, 2, 3.2, 3.02], [2, 1, 3.3, 3.03]]},
+    ]
+    file = os.path.join(tmpdir, 'fake_lammps_dump.yaml')
+    generate_fake_yaml(file, yaml_content)
+    return file
+
+
+@pytest.fixture
+def fake_thermo_yaml(tmpdir):
+    # fake LAMMPS thermo file with 4 MD steps
+    yaml_content = {
+        'keywords': ['KinEng', 'PotEng'],
+        'data': [[0.4, 0.5], [1.4, 1.5], [2.4, 2.5], [3.4, 3.5]]
+    }
+    file = os.path.join(tmpdir, 'fake_lammps_thermo.yaml')
+    generate_fake_yaml(file, yaml_content, multiple_docs=False)
+    return file
+
+
+def test_parse_lammps_outputs(fake_lammps_yaml, fake_thermo_yaml, tmpdir):
+    output_name = os.path.join(tmpdir, 'test.parquet')
+    parse_lammps_output(fake_lammps_yaml, fake_thermo_yaml, output_name)
+    # check that a file exists
+    assert os.path.exists(output_name)
+
+    df = pd.read_parquet(output_name)
+    assert not df.empty
+
+    assert len(df) == 4
+
+    for i, v in enumerate(['id', 'type', 'x', 'fx', 'energy']):
+        assert v in df.keys()
+        for x in range(4):
+            if v == 'id':
+                assert np.array_equal(df[v][x], [0, 1, 2])
+            elif v == 'type':
+                assert np.array_equal(df[v][x], [1, 2, 1])
+            elif v == 'x':
+                assert np.allclose(df[v][x], [x + 0.1 * y for y in range(1, 4)])
+            elif v == 'fx':
+                assert np.allclose(df[v][x], [x + 0.01 * y for y in range(1, 4)])
+            else:  # v == 'energy'
+                assert np.allclose(df[v][x], [2 * x + 0.9])