From b492a4b2b54b2557678916c08981539a2c1128ec Mon Sep 17 00:00:00 2001
From: Simon Blackburn <simon.blackburn@mila.quebec>
Date: Thu, 22 Feb 2024 14:03:33 -0500
Subject: [PATCH 01/32] scripts to preprocess LAMMPS outputs

---
 crystal_diffusion/data/parse_lammps.sh        | 11 +++
 .../data/parse_lammps_outputs.py              | 68 +++++++++++++++++++
 2 files changed, 79 insertions(+)
 create mode 100755 crystal_diffusion/data/parse_lammps.sh
 create mode 100644 crystal_diffusion/data/parse_lammps_outputs.py

diff --git a/crystal_diffusion/data/parse_lammps.sh b/crystal_diffusion/data/parse_lammps.sh
new file mode 100755
index 00000000..bcbb2079
--- /dev/null
+++ b/crystal_diffusion/data/parse_lammps.sh
@@ -0,0 +1,11 @@
+#!/bin/bash
+
+EXP_DIR="lammps_scripts/Si/si-custom/"
+DUMP_FILENAME="dump.si-300-1.yaml"
+THERMO_FILENAME="thermo_log.yaml"
+OUTPUT_NAME="demo.parquet"
+
+python crystal_diffusion/data/parse_lammps_outputs.py \
+    --dump_file  ${EXP_DIR}/${DUMP_FILENAME} \
+    --thermo_file ${EXP_DIR}/${THERMO_FILENAME} \
+    --output_name ${EXP_DIR}/${OUTPUT_NAME}
diff --git a/crystal_diffusion/data/parse_lammps_outputs.py b/crystal_diffusion/data/parse_lammps_outputs.py
new file mode 100644
index 00000000..06c84bd5
--- /dev/null
+++ b/crystal_diffusion/data/parse_lammps_outputs.py
@@ -0,0 +1,68 @@
+import argparse
+import os
+
+import pandas as pd
+import yaml
+
+
+def parse_lammps_output(lammps_dump: str, lammps_thermo_log: str, output_name: str):
+    """Parse LAMMPS dump and thermo log outputs and save them in a .parquet file.
+
+    Args:
+        lammps_dump: LAMMPS output file
+        lammps_thermo_log: LAMMPS
+        output_name: name of parsed output written by the script
+    """
+    if not os.path.exists(lammps_dump):
+        raise ValueError(f'{lammps_dump} does not exist. Please provide a valid LAMMPS dump file as yaml.')
+
+    if not os.path.exists(lammps_thermo_log):
+        raise ValueError(f'{lammps_thermo_log} does not exist. Please provide a valid LAMMPS thermo log file as yaml.')
+
+    # get the atom information (positions and forces) from the LAMMPS 'dump' file
+    with open(lammps_dump, 'r') as f:
+        dump_yaml = yaml.safe_load_all(f)
+        # every MD iteration is saved as a separate document in the yaml file
+        # prepare a dataframe to get all the data
+        pd_data = {}
+        for doc in dump_yaml:  # loop over MD steps
+            if 'id' not in doc['keywords']:  # sanity check
+                raise ValueError('id should be in LAMMPS dump file')
+            atoms_info = {} # store information on atoms positions and forces here
+            for data in doc['data']:  # loop over the atoms to get their positions and forces
+                for key, v in zip(doc['keywords'], data):
+                    print(key, v)
+                    if key not in ['x', 'y', 'z', 'fx', 'fy', 'fz']:
+                        continue
+                    else:
+                        atoms_info[key] = atoms_info.get(key, []) + [v]  # get positions or forces
+            # add the information about that MD step to the dataframe
+            for k, v in atoms_info.items():  # k should be x, y, z, fx, fy, fz
+                pd_data[k] = pd_data.get(k, []) + [v]
+
+    # get the total energy (kinetic + potential) from the LAMMPS thermo log file
+    with open(lammps_thermo_log, 'r') as f:
+        log_yaml = yaml.safe_load(f)
+        kin_idx = log_yaml['keywords'].index('KinEng')
+        pot_idx = log_yaml['keywords'].index('PotEng')
+        pd_data['energy'] = [x[kin_idx] + x[pot_idx] for x in log_yaml['data']]
+
+    if not output_name.endswith('.parquet'):
+        output_name += '.parquet'
+
+    pd.DataFrame(pd_data).to_parquet(output_name, engine='pyarrow')
+
+
+def main():
+    """Main script to parse LAMMPS files and output a single parquet file."""
+    parser = argparse.ArgumentParser(description="Convert LAMMPS outputs in parquet file compatible with a dataloader.")
+    parser.add_argument("--dump_file", type=str, help="LAMMPS dump file in yaml format.")
+    parser.add_argument("--thermo_file", type=str, help="LAMMPS thermo output file in yaml format.")
+    parser.add_argument("--output_name", type=str, help="Output name")
+    args = parser.parse_args()
+
+    parse_lammps_output(args.dump_file, args.thermo_file, args.output_name)
+
+
+if __name__ == '__main__':
+    main()

From eda4bbd3a95480c3544d96c50e4c02463f80d7ac Mon Sep 17 00:00:00 2001
From: Simon Blackburn <simon.blackburn@mila.quebec>
Date: Thu, 22 Feb 2024 14:09:08 -0500
Subject: [PATCH 02/32] example scripts to run LAMMPS

---
 data/lammps_input_example.lammps | 31 +++++++++++++++++++++++++++++++
 data/run_lammps_example.sh       |  6 ++++++
 2 files changed, 37 insertions(+)
 create mode 100755 data/lammps_input_example.lammps
 create mode 100644 data/run_lammps_example.sh

diff --git a/data/lammps_input_example.lammps b/data/lammps_input_example.lammps
new file mode 100755
index 00000000..ba9d70a5
--- /dev/null
+++ b/data/lammps_input_example.lammps
@@ -0,0 +1,31 @@
+log log.si-${T}-${S}.lammps
+
+units           metal
+atom_style      atomic
+atom_modify     map array
+
+lattice         diamond 5.43
+region          simbox block    0 ${S} 0 ${S} 0 ${S}
+create_box      1 simbox
+create_atoms    1 region simbox
+
+#read_dump ${DUMP} ${STEP} x y z vx vy vz fx fy fz box yes replace no purge yes add yes
+
+mass 1 28.0855
+
+group Si type 1
+
+pair_style sw
+pair_coeff * * ../Si.sw Si
+
+velocity   all create ${T} 62177
+
+dump 1 all yaml 1 dump.si-${T}-${S}.yaml id type x y z fx fy fz
+
+thermo_style yaml
+thermo 1
+#==========================Output files========================
+
+fix 1 all nvt temp ${T} ${T} 0.01
+run ${STEP}
+unfix 1
diff --git a/data/run_lammps_example.sh b/data/run_lammps_example.sh
new file mode 100644
index 00000000..66ba52c6
--- /dev/null
+++ b/data/run_lammps_example.sh
@@ -0,0 +1,6 @@
+#!/bin/bash
+
+TEMPERATURE=300
+BOX_SIZE=1
+
+lmp < data/lammps_input_example -v STEP 10 -v T $TEMPERATURE -v S $BOX_SIZE
\ No newline at end of file

From ecae0825461b57e97a0458b6b78dc0a1ef6eded3 Mon Sep 17 00:00:00 2001
From: Simon Blackburn <simon.blackburn@mila.quebec>
Date: Thu, 22 Feb 2024 14:13:31 -0500
Subject: [PATCH 03/32] do not save index when writing parquet

---
 crystal_diffusion/data/parse_lammps_outputs.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/crystal_diffusion/data/parse_lammps_outputs.py b/crystal_diffusion/data/parse_lammps_outputs.py
index 06c84bd5..d0fca2fe 100644
--- a/crystal_diffusion/data/parse_lammps_outputs.py
+++ b/crystal_diffusion/data/parse_lammps_outputs.py
@@ -50,7 +50,7 @@ def parse_lammps_output(lammps_dump: str, lammps_thermo_log: str, output_name: s
     if not output_name.endswith('.parquet'):
         output_name += '.parquet'
 
-    pd.DataFrame(pd_data).to_parquet(output_name, engine='pyarrow')
+    pd.DataFrame(pd_data).to_parquet(output_name, engine='pyarrow', index=False)
 
 
 def main():

From f31573ce2f23ec8b783916bcf06164ebfcfcb11a Mon Sep 17 00:00:00 2001
From: Simon Blackburn <simon.blackburn@mila.quebec>
Date: Thu, 22 Feb 2024 14:28:50 -0500
Subject: [PATCH 04/32] save atoms id and type in parquet

---
 crystal_diffusion/data/parse_lammps_outputs.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/crystal_diffusion/data/parse_lammps_outputs.py b/crystal_diffusion/data/parse_lammps_outputs.py
index d0fca2fe..52033742 100644
--- a/crystal_diffusion/data/parse_lammps_outputs.py
+++ b/crystal_diffusion/data/parse_lammps_outputs.py
@@ -32,7 +32,7 @@ def parse_lammps_output(lammps_dump: str, lammps_thermo_log: str, output_name: s
             for data in doc['data']:  # loop over the atoms to get their positions and forces
                 for key, v in zip(doc['keywords'], data):
                     print(key, v)
-                    if key not in ['x', 'y', 'z', 'fx', 'fy', 'fz']:
+                    if key not in ['id', 'type', 'x', 'y', 'z', 'fx', 'fy', 'fz']:
                         continue
                     else:
                         atoms_info[key] = atoms_info.get(key, []) + [v]  # get positions or forces

From 94eb4d1da314387fceb6c0b664b88fe9add81662 Mon Sep 17 00:00:00 2001
From: Simon Blackburn <simon.blackburn@mila.quebec>
Date: Thu, 22 Feb 2024 15:04:11 -0500
Subject: [PATCH 05/32] unit test

---
 tests/data/test_parse_lammps_output.py | 72 ++++++++++++++++++++++++++
 1 file changed, 72 insertions(+)
 create mode 100644 tests/data/test_parse_lammps_output.py

diff --git a/tests/data/test_parse_lammps_output.py b/tests/data/test_parse_lammps_output.py
new file mode 100644
index 00000000..4cf6d6c9
--- /dev/null
+++ b/tests/data/test_parse_lammps_output.py
@@ -0,0 +1,72 @@
+import os
+
+import numpy as np
+import pandas as pd
+import pytest
+import yaml
+
+from crystal_diffusion.data.parse_lammps_outputs import parse_lammps_output
+
+
+def generate_fake_yaml(filename, documents, multiple_docs=True):
+    # Write the YAML content
+    with open(filename, 'w') as yaml_file:
+        if multiple_docs:
+            yaml.dump_all(documents, yaml_file)
+        else:
+            yaml.dump(documents, yaml_file)
+
+@pytest.fixture
+def fake_lammps_yaml(tmpdir):
+    # fake LAMMPS output file with 4 MD steps in 1D for 3 atoms
+    yaml_content = [
+        {'keywords': ['id', 'type', 'x', 'fx'],
+         'data': [[0, 1, 0.1, 0.01], [1, 2, 0.2, 0.02], [2, 1, 0.3, 0.03]]},
+        {'keywords': ['id', 'type', 'x', 'fx'],
+         'data': [[0, 1, 1.1, 1.01], [1, 2, 1.2, 1.02], [2, 1, 1.3, 1.03]]},
+        {'keywords': ['id', 'type', 'x', 'fx'],
+         'data': [[0, 1, 2.1, 2.01], [1, 2, 2.2, 2.02], [2, 1, 2.3, 2.03]]},
+        {'keywords': ['id', 'type', 'x', 'fx'],
+         'data': [[0, 1, 3.1, 3.01], [1, 2, 3.2, 3.02], [2, 1, 3.3, 3.03]]},
+    ]
+    file = os.path.join(tmpdir, 'fake_lammps_dump.yaml')
+    generate_fake_yaml(file, yaml_content)
+    return file
+
+
+@pytest.fixture
+def fake_thermo_yaml(tmpdir):
+    # fake LAMMPS thermo file with 4 MD steps
+    yaml_content = {
+        'keywords': ['KinEng', 'PotEng'],
+        'data': [[0.4, 0.5], [1.4, 1.5], [2.4, 2.5], [3.4, 3.5]]
+    }
+    file = os.path.join(tmpdir, 'fake_lammps_thermo.yaml')
+    generate_fake_yaml(file, yaml_content, multiple_docs=False)
+    return file
+
+
+def test_parse_lammps_outputs(fake_lammps_yaml, fake_thermo_yaml, tmpdir):
+    output_name = os.path.join(tmpdir, 'test.parquet')
+    parse_lammps_output(fake_lammps_yaml, fake_thermo_yaml, output_name)
+    # check that a file exists
+    assert os.path.exists(output_name)
+
+    df = pd.read_parquet(output_name)
+    assert not df.empty
+
+    assert len(df) == 4
+
+    for i, v in enumerate(['id', 'type', 'x', 'fx', 'energy']):
+        assert v in df.keys()
+        for x in range(4):
+            if v == 'id':
+                assert np.array_equal(df[v][x], [0, 1, 2])
+            elif v == 'type':
+                assert np.array_equal(df[v][x], [1, 2, 1])
+            elif v == 'x':
+                assert np.allclose(df[v][x], [x + 0.1 * y for y in range(1, 4)])
+            elif v == 'fx':
+                assert np.allclose(df[v][x], [x + 0.01 * y for y in range(1, 4)])
+            else:  # v == 'energy'
+                assert np.allclose(df[v][x], [2 * x + 0.9])
\ No newline at end of file

From 9a4f9e519230254594cb831afb539a5b8909e2f2 Mon Sep 17 00:00:00 2001
From: Simon Blackburn <simon.blackburn@mila.quebec>
Date: Thu, 22 Feb 2024 15:24:33 -0500
Subject: [PATCH 06/32] updated list of dependencies

---
 requirements.txt | 23 +++++++++++++++++++++++
 setup.py         | 28 +++++-----------------------
 2 files changed, 28 insertions(+), 23 deletions(-)
 create mode 100644 requirements.txt

diff --git a/requirements.txt b/requirements.txt
new file mode 100644
index 00000000..0ade435e
--- /dev/null
+++ b/requirements.txt
@@ -0,0 +1,23 @@
+datasets-2.17.1
+flake8==4.0.1
+flake8-docstrings==1.6.0
+gitpython==3.1.27
+jupyter==1.0.0
+jinja2==3.1.2
+myst-parser==2.0.0
+orion>=0.2.4.post1
+pyarrow-15.0.0
+pyyaml==6.0
+pytest==7.1.2
+pytest-cov==3.0.0
+pytorch_lightning>=2.2.0
+pytype==2024.2.13
+sphinx==7.2.6
+sphinx-autoapi==3.0.0
+sphinx-rtd-theme==2.0.0
+sphinxcontrib-napoleon==0.7
+sphinxcontrib-katex==0.8.6
+tensorboard==2.16.2
+tqdm==4.64.0
+torch==2.2.0
+torchvision>=0.17.0
diff --git a/setup.py b/setup.py
index 9f1017c0..fb11dfb6 100644
--- a/setup.py
+++ b/setup.py
@@ -1,33 +1,15 @@
 from setuptools import find_packages, setup
 
+
+with open('requirements.txt', 'r') as f:
+    requirements = f.readlines()
+
 setup(
     name='crystal_diffusion',
     version='0.0.1',
     packages=find_packages(include=['crystal_diffusion', 'crystal_diffusion.*']),
     python_requires='>=3.11',
-    install_requires=[
-        'flake8==4.0.1',
-        'flake8-docstrings==1.6.0',
-        'gitpython==3.1.27',
-        'jupyter==1.0.0',
-        'jinja2==3.1.2',
-        'myst-parser==2.0.0',
-        'orion>=0.2.4.post1',
-        'pyyaml==6.0',
-        'pytest==7.1.2',
-        'pytest-cov==3.0.0',
-        'pytorch_lightning>=2.2.0',
-        'pytype==2024.2.13',
-        'sphinx==7.2.6',
-        'sphinx-autoapi==3.0.0',
-        'sphinx-rtd-theme==2.0.0',
-        'sphinxcontrib-napoleon==0.7',
-        'sphinxcontrib-katex==0.8.6',
-        'tensorboard==2.16.2',
-        'tqdm==4.64.0',
-        'torch==2.2.0',
-        'torchvision>=0.17.0',
-    ],
+    install_requires=requirements,
     entry_points={
         'console_scripts': [
             'cd-train=crystal_diffusion.train:main',

From 3d9be3e7cf02b7868eef4092aa6c19bfae7be26c Mon Sep 17 00:00:00 2001
From: Simon Blackburn <simon.blackburn@mila.quebec>
Date: Thu, 22 Feb 2024 15:32:44 -0500
Subject: [PATCH 07/32] flake8 errors

---
 crystal_diffusion/data/parse_lammps_outputs.py | 3 +--
 tests/data/test_parse_lammps_output.py         | 3 ++-
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/crystal_diffusion/data/parse_lammps_outputs.py b/crystal_diffusion/data/parse_lammps_outputs.py
index 52033742..b23ddb5f 100644
--- a/crystal_diffusion/data/parse_lammps_outputs.py
+++ b/crystal_diffusion/data/parse_lammps_outputs.py
@@ -28,10 +28,9 @@ def parse_lammps_output(lammps_dump: str, lammps_thermo_log: str, output_name: s
         for doc in dump_yaml:  # loop over MD steps
             if 'id' not in doc['keywords']:  # sanity check
                 raise ValueError('id should be in LAMMPS dump file')
-            atoms_info = {} # store information on atoms positions and forces here
+            atoms_info = {}  # store information on atoms positions and forces here
             for data in doc['data']:  # loop over the atoms to get their positions and forces
                 for key, v in zip(doc['keywords'], data):
-                    print(key, v)
                     if key not in ['id', 'type', 'x', 'y', 'z', 'fx', 'fy', 'fz']:
                         continue
                     else:
diff --git a/tests/data/test_parse_lammps_output.py b/tests/data/test_parse_lammps_output.py
index 4cf6d6c9..bb82ae52 100644
--- a/tests/data/test_parse_lammps_output.py
+++ b/tests/data/test_parse_lammps_output.py
@@ -16,6 +16,7 @@ def generate_fake_yaml(filename, documents, multiple_docs=True):
         else:
             yaml.dump(documents, yaml_file)
 
+
 @pytest.fixture
 def fake_lammps_yaml(tmpdir):
     # fake LAMMPS output file with 4 MD steps in 1D for 3 atoms
@@ -69,4 +70,4 @@ def test_parse_lammps_outputs(fake_lammps_yaml, fake_thermo_yaml, tmpdir):
             elif v == 'fx':
                 assert np.allclose(df[v][x], [x + 0.01 * y for y in range(1, 4)])
             else:  # v == 'energy'
-                assert np.allclose(df[v][x], [2 * x + 0.9])
\ No newline at end of file
+                assert np.allclose(df[v][x], [2 * x + 0.9])

From 06d4b51d76c2f8ebea6c3eb5fdb65b188e96283f Mon Sep 17 00:00:00 2001
From: Simon Blackburn <simon.blackburn@mila.quebec>
Date: Thu, 22 Feb 2024 15:37:02 -0500
Subject: [PATCH 08/32] isort fix

---
 setup.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/setup.py b/setup.py
index fb11dfb6..2a9715ba 100644
--- a/setup.py
+++ b/setup.py
@@ -1,6 +1,5 @@
 from setuptools import find_packages, setup
 
-
 with open('requirements.txt', 'r') as f:
     requirements = f.readlines()
 

From 0a0ef8f361601ac2d4ad754f92f795301dfe6d8c Mon Sep 17 00:00:00 2001
From: Simon Blackburn <simon.blackburn@mila.quebec>
Date: Thu, 22 Feb 2024 15:45:02 -0500
Subject: [PATCH 09/32] typo in requirements

---
 requirements.txt | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/requirements.txt b/requirements.txt
index 0ade435e..3b402065 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,4 +1,4 @@
-datasets-2.17.1
+datasets==2.17.1
 flake8==4.0.1
 flake8-docstrings==1.6.0
 gitpython==3.1.27

From 8e6dc8498fbb161788fecb1975e44b696287d7ad Mon Sep 17 00:00:00 2001
From: Simon Blackburn <simon.blackburn@mila.quebec>
Date: Thu, 22 Feb 2024 15:48:05 -0500
Subject: [PATCH 10/32] another typo in requirements

---
 requirements.txt | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/requirements.txt b/requirements.txt
index 3b402065..c50c4cc3 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -6,7 +6,7 @@ jupyter==1.0.0
 jinja2==3.1.2
 myst-parser==2.0.0
 orion>=0.2.4.post1
-pyarrow-15.0.0
+pyarrow==15.0.0
 pyyaml==6.0
 pytest==7.1.2
 pytest-cov==3.0.0

From b4e0c39779856b42bf58a5d30034b2d741f6ddba Mon Sep 17 00:00:00 2001
From: Simon Blackburn <simon.blackburn@mila.quebec>
Date: Fri, 23 Feb 2024 13:40:28 -0500
Subject: [PATCH 11/32] code review fixes

---
 data/lammps_input_example.lammps |  2 +-
 data/run_lammps_example.sh       |  2 +-
 data/si.sw                       | 22 ++++++++++++++++++++++
 3 files changed, 24 insertions(+), 2 deletions(-)
 create mode 100755 data/si.sw

diff --git a/data/lammps_input_example.lammps b/data/lammps_input_example.lammps
index ba9d70a5..9531b57d 100755
--- a/data/lammps_input_example.lammps
+++ b/data/lammps_input_example.lammps
@@ -16,7 +16,7 @@ mass 1 28.0855
 group Si type 1
 
 pair_style sw
-pair_coeff * * ../Si.sw Si
+pair_coeff * * data/si.sw Si
 
 velocity   all create ${T} 62177
 
diff --git a/data/run_lammps_example.sh b/data/run_lammps_example.sh
index 66ba52c6..f0262196 100644
--- a/data/run_lammps_example.sh
+++ b/data/run_lammps_example.sh
@@ -3,4 +3,4 @@
 TEMPERATURE=300
 BOX_SIZE=1
 
-lmp < data/lammps_input_example -v STEP 10 -v T $TEMPERATURE -v S $BOX_SIZE
\ No newline at end of file
+lmp < data/lammps_input_example.lammps -v STEP 10 -v T $TEMPERATURE -v S $BOX_SIZE
\ No newline at end of file
diff --git a/data/si.sw b/data/si.sw
new file mode 100755
index 00000000..26cfc25b
--- /dev/null
+++ b/data/si.sw
@@ -0,0 +1,22 @@
+# Stillinger-Weber parameters for various elements and mixtures
+# multiple entries can be added to this file, LAMMPS reads the ones it needs
+# these entries are in LAMMPS "metal" units:
+#   epsilon = eV; sigma = Angstroms
+#   other quantities are unitless
+
+# format of a single entry (one or more lines):
+#   element 1, element 2, element 3, 
+#   epsilon, sigma, a, lambda, gamma, costheta0, A, B, p, q, tol
+
+# Here are the original parameters in metal units, for Silicon from:
+#
+# Stillinger and Weber,  Phys. Rev. B, v. 31, p. 5262, (1985)
+#
+# Parameters for 'dia' Si
+Si Si Si 2.1683  2.0951  1.80  21.0  1.20  -0.333333333333
+         7.049556277  0.6022245584  4.0  0.0 0.0
+#
+# Parameters for amorphous Si  with the modified SW potential
+#(R. L. C. Vink, G. T. Barkema, W. F. van der Weg et N. Mousseau, A semi-empirical potential for amorphous silicon, J. Non-Cryst. Sol. 282, 248-255 (2001))
+#Si Si Si 1.64833  2.0951  1.80  31.5  1.20  -0.333333333333
+#         7.049556277  0.6022245584  4.0  0.0 0.0 

From bf01d7bcd4d378c2aa9de28b92f3669e420d3fa7 Mon Sep 17 00:00:00 2001
From: Simon Blackburn <simon.blackburn@mila.quebec>
Date: Fri, 23 Feb 2024 15:08:22 -0500
Subject: [PATCH 12/32] code review part 2

---
 crystal_diffusion/data/parse_lammps_outputs.py | 2 +-
 data/lammps_input_example.lammps               | 2 +-
 data/run_lammps_example.sh                     | 2 +-
 3 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/crystal_diffusion/data/parse_lammps_outputs.py b/crystal_diffusion/data/parse_lammps_outputs.py
index b23ddb5f..0aecf232 100644
--- a/crystal_diffusion/data/parse_lammps_outputs.py
+++ b/crystal_diffusion/data/parse_lammps_outputs.py
@@ -10,7 +10,7 @@ def parse_lammps_output(lammps_dump: str, lammps_thermo_log: str, output_name: s
 
     Args:
         lammps_dump: LAMMPS output file
-        lammps_thermo_log: LAMMPS
+        lammps_thermo_log: LAMMPS thermodynamic variables output file
         output_name: name of parsed output written by the script
     """
     if not os.path.exists(lammps_dump):
diff --git a/data/lammps_input_example.lammps b/data/lammps_input_example.lammps
index 9531b57d..4aca590e 100755
--- a/data/lammps_input_example.lammps
+++ b/data/lammps_input_example.lammps
@@ -16,7 +16,7 @@ mass 1 28.0855
 group Si type 1
 
 pair_style sw
-pair_coeff * * data/si.sw Si
+pair_coeff * *si.sw Si
 
 velocity   all create ${T} 62177
 
diff --git a/data/run_lammps_example.sh b/data/run_lammps_example.sh
index f0262196..4e639cfe 100644
--- a/data/run_lammps_example.sh
+++ b/data/run_lammps_example.sh
@@ -3,4 +3,4 @@
 TEMPERATURE=300
 BOX_SIZE=1
 
-lmp < data/lammps_input_example.lammps -v STEP 10 -v T $TEMPERATURE -v S $BOX_SIZE
\ No newline at end of file
+lmp < lammps_input_example.lammps -v STEP 10 -v T $TEMPERATURE -v S $BOX_SIZE
\ No newline at end of file

From 5927511b163780d0be25d5db4a0eb8868b12f8b2 Mon Sep 17 00:00:00 2001
From: Simon Blackburn <simon.blackburn@mila.quebec>
Date: Fri, 23 Feb 2024 15:20:46 -0500
Subject: [PATCH 13/32] missing space

---
 data/lammps_input_example.lammps | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/data/lammps_input_example.lammps b/data/lammps_input_example.lammps
index 4aca590e..c2f77445 100755
--- a/data/lammps_input_example.lammps
+++ b/data/lammps_input_example.lammps
@@ -16,7 +16,7 @@ mass 1 28.0855
 group Si type 1
 
 pair_style sw
-pair_coeff * *si.sw Si
+pair_coeff * * si.sw Si
 
 velocity   all create ${T} 62177
 

From da2b08b9c079550a2b73fa9f9e64ac832094a3d1 Mon Sep 17 00:00:00 2001
From: Simon Blackburn <simon.blackburn@mila.quebec>
Date: Thu, 22 Feb 2024 14:03:33 -0500
Subject: [PATCH 14/32] scripts to preprocess LAMMPS outputs

---
 crystal_diffusion/data/parse_lammps.sh        | 11 +++
 .../data/parse_lammps_outputs.py              | 68 +++++++++++++++++++
 2 files changed, 79 insertions(+)
 create mode 100755 crystal_diffusion/data/parse_lammps.sh
 create mode 100644 crystal_diffusion/data/parse_lammps_outputs.py

diff --git a/crystal_diffusion/data/parse_lammps.sh b/crystal_diffusion/data/parse_lammps.sh
new file mode 100755
index 00000000..bcbb2079
--- /dev/null
+++ b/crystal_diffusion/data/parse_lammps.sh
@@ -0,0 +1,11 @@
+#!/bin/bash
+
+EXP_DIR="lammps_scripts/Si/si-custom/"
+DUMP_FILENAME="dump.si-300-1.yaml"
+THERMO_FILENAME="thermo_log.yaml"
+OUTPUT_NAME="demo.parquet"
+
+python crystal_diffusion/data/parse_lammps_outputs.py \
+    --dump_file  ${EXP_DIR}/${DUMP_FILENAME} \
+    --thermo_file ${EXP_DIR}/${THERMO_FILENAME} \
+    --output_name ${EXP_DIR}/${OUTPUT_NAME}
diff --git a/crystal_diffusion/data/parse_lammps_outputs.py b/crystal_diffusion/data/parse_lammps_outputs.py
new file mode 100644
index 00000000..06c84bd5
--- /dev/null
+++ b/crystal_diffusion/data/parse_lammps_outputs.py
@@ -0,0 +1,68 @@
+import argparse
+import os
+
+import pandas as pd
+import yaml
+
+
+def parse_lammps_output(lammps_dump: str, lammps_thermo_log: str, output_name: str):
+    """Parse LAMMPS dump and thermo log outputs and save them in a .parquet file.
+
+    Args:
+        lammps_dump: LAMMPS output file
+        lammps_thermo_log: LAMMPS
+        output_name: name of parsed output written by the script
+    """
+    if not os.path.exists(lammps_dump):
+        raise ValueError(f'{lammps_dump} does not exist. Please provide a valid LAMMPS dump file as yaml.')
+
+    if not os.path.exists(lammps_thermo_log):
+        raise ValueError(f'{lammps_thermo_log} does not exist. Please provide a valid LAMMPS thermo log file as yaml.')
+
+    # get the atom information (positions and forces) from the LAMMPS 'dump' file
+    with open(lammps_dump, 'r') as f:
+        dump_yaml = yaml.safe_load_all(f)
+        # every MD iteration is saved as a separate document in the yaml file
+        # prepare a dataframe to get all the data
+        pd_data = {}
+        for doc in dump_yaml:  # loop over MD steps
+            if 'id' not in doc['keywords']:  # sanity check
+                raise ValueError('id should be in LAMMPS dump file')
+            atoms_info = {} # store information on atoms positions and forces here
+            for data in doc['data']:  # loop over the atoms to get their positions and forces
+                for key, v in zip(doc['keywords'], data):
+                    print(key, v)
+                    if key not in ['x', 'y', 'z', 'fx', 'fy', 'fz']:
+                        continue
+                    else:
+                        atoms_info[key] = atoms_info.get(key, []) + [v]  # get positions or forces
+            # add the information about that MD step to the dataframe
+            for k, v in atoms_info.items():  # k should be x, y, z, fx, fy, fz
+                pd_data[k] = pd_data.get(k, []) + [v]
+
+    # get the total energy (kinetic + potential) from the LAMMPS thermo log file
+    with open(lammps_thermo_log, 'r') as f:
+        log_yaml = yaml.safe_load(f)
+        kin_idx = log_yaml['keywords'].index('KinEng')
+        pot_idx = log_yaml['keywords'].index('PotEng')
+        pd_data['energy'] = [x[kin_idx] + x[pot_idx] for x in log_yaml['data']]
+
+    if not output_name.endswith('.parquet'):
+        output_name += '.parquet'
+
+    pd.DataFrame(pd_data).to_parquet(output_name, engine='pyarrow')
+
+
+def main():
+    """Main script to parse LAMMPS files and output a single parquet file."""
+    parser = argparse.ArgumentParser(description="Convert LAMMPS outputs in parquet file compatible with a dataloader.")
+    parser.add_argument("--dump_file", type=str, help="LAMMPS dump file in yaml format.")
+    parser.add_argument("--thermo_file", type=str, help="LAMMPS thermo output file in yaml format.")
+    parser.add_argument("--output_name", type=str, help="Output name")
+    args = parser.parse_args()
+
+    parse_lammps_output(args.dump_file, args.thermo_file, args.output_name)
+
+
+if __name__ == '__main__':
+    main()

From 031c975ed11b9343a2d19c3ef8c90ce2e9d63342 Mon Sep 17 00:00:00 2001
From: Simon Blackburn <simon.blackburn@mila.quebec>
Date: Thu, 22 Feb 2024 14:09:08 -0500
Subject: [PATCH 15/32] example scripts to run LAMMPS

---
 data/lammps_input_example.lammps | 31 +++++++++++++++++++++++++++++++
 data/run_lammps_example.sh       |  6 ++++++
 2 files changed, 37 insertions(+)
 create mode 100755 data/lammps_input_example.lammps
 create mode 100644 data/run_lammps_example.sh

diff --git a/data/lammps_input_example.lammps b/data/lammps_input_example.lammps
new file mode 100755
index 00000000..ba9d70a5
--- /dev/null
+++ b/data/lammps_input_example.lammps
@@ -0,0 +1,31 @@
+log log.si-${T}-${S}.lammps
+
+units           metal
+atom_style      atomic
+atom_modify     map array
+
+lattice         diamond 5.43
+region          simbox block    0 ${S} 0 ${S} 0 ${S}
+create_box      1 simbox
+create_atoms    1 region simbox
+
+#read_dump ${DUMP} ${STEP} x y z vx vy vz fx fy fz box yes replace no purge yes add yes
+
+mass 1 28.0855
+
+group Si type 1
+
+pair_style sw
+pair_coeff * * ../Si.sw Si
+
+velocity   all create ${T} 62177
+
+dump 1 all yaml 1 dump.si-${T}-${S}.yaml id type x y z fx fy fz
+
+thermo_style yaml
+thermo 1
+#==========================Output files========================
+
+fix 1 all nvt temp ${T} ${T} 0.01
+run ${STEP}
+unfix 1
diff --git a/data/run_lammps_example.sh b/data/run_lammps_example.sh
new file mode 100644
index 00000000..66ba52c6
--- /dev/null
+++ b/data/run_lammps_example.sh
@@ -0,0 +1,6 @@
+#!/bin/bash
+
+TEMPERATURE=300
+BOX_SIZE=1
+
+lmp < data/lammps_input_example -v STEP 10 -v T $TEMPERATURE -v S $BOX_SIZE
\ No newline at end of file

From ba9baa0443b40672ed9cffd355da7b3ee43b19f3 Mon Sep 17 00:00:00 2001
From: Simon Blackburn <simon.blackburn@mila.quebec>
Date: Thu, 22 Feb 2024 14:13:31 -0500
Subject: [PATCH 16/32] do not save index when writing parquet

---
 crystal_diffusion/data/parse_lammps_outputs.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/crystal_diffusion/data/parse_lammps_outputs.py b/crystal_diffusion/data/parse_lammps_outputs.py
index 06c84bd5..d0fca2fe 100644
--- a/crystal_diffusion/data/parse_lammps_outputs.py
+++ b/crystal_diffusion/data/parse_lammps_outputs.py
@@ -50,7 +50,7 @@ def parse_lammps_output(lammps_dump: str, lammps_thermo_log: str, output_name: s
     if not output_name.endswith('.parquet'):
         output_name += '.parquet'
 
-    pd.DataFrame(pd_data).to_parquet(output_name, engine='pyarrow')
+    pd.DataFrame(pd_data).to_parquet(output_name, engine='pyarrow', index=False)
 
 
 def main():

From 04d76c42f59e6ea9f589ca39b6c257a59d2939c9 Mon Sep 17 00:00:00 2001
From: Simon Blackburn <simon.blackburn@mila.quebec>
Date: Thu, 22 Feb 2024 14:28:50 -0500
Subject: [PATCH 17/32] save atoms id and type in parquet

---
 crystal_diffusion/data/parse_lammps_outputs.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/crystal_diffusion/data/parse_lammps_outputs.py b/crystal_diffusion/data/parse_lammps_outputs.py
index d0fca2fe..52033742 100644
--- a/crystal_diffusion/data/parse_lammps_outputs.py
+++ b/crystal_diffusion/data/parse_lammps_outputs.py
@@ -32,7 +32,7 @@ def parse_lammps_output(lammps_dump: str, lammps_thermo_log: str, output_name: s
             for data in doc['data']:  # loop over the atoms to get their positions and forces
                 for key, v in zip(doc['keywords'], data):
                     print(key, v)
-                    if key not in ['x', 'y', 'z', 'fx', 'fy', 'fz']:
+                    if key not in ['id', 'type', 'x', 'y', 'z', 'fx', 'fy', 'fz']:
                         continue
                     else:
                         atoms_info[key] = atoms_info.get(key, []) + [v]  # get positions or forces

From c7186bda8eec17850d46930f728cd93247df0309 Mon Sep 17 00:00:00 2001
From: Simon Blackburn <simon.blackburn@mila.quebec>
Date: Thu, 22 Feb 2024 15:04:11 -0500
Subject: [PATCH 18/32] unit test

---
 tests/data/test_parse_lammps_output.py | 72 ++++++++++++++++++++++++++
 1 file changed, 72 insertions(+)
 create mode 100644 tests/data/test_parse_lammps_output.py

diff --git a/tests/data/test_parse_lammps_output.py b/tests/data/test_parse_lammps_output.py
new file mode 100644
index 00000000..4cf6d6c9
--- /dev/null
+++ b/tests/data/test_parse_lammps_output.py
@@ -0,0 +1,72 @@
+import os
+
+import numpy as np
+import pandas as pd
+import pytest
+import yaml
+
+from crystal_diffusion.data.parse_lammps_outputs import parse_lammps_output
+
+
+def generate_fake_yaml(filename, documents, multiple_docs=True):
+    # Write the YAML content
+    with open(filename, 'w') as yaml_file:
+        if multiple_docs:
+            yaml.dump_all(documents, yaml_file)
+        else:
+            yaml.dump(documents, yaml_file)
+
+@pytest.fixture
+def fake_lammps_yaml(tmpdir):
+    # fake LAMMPS output file with 4 MD steps in 1D for 3 atoms
+    yaml_content = [
+        {'keywords': ['id', 'type', 'x', 'fx'],
+         'data': [[0, 1, 0.1, 0.01], [1, 2, 0.2, 0.02], [2, 1, 0.3, 0.03]]},
+        {'keywords': ['id', 'type', 'x', 'fx'],
+         'data': [[0, 1, 1.1, 1.01], [1, 2, 1.2, 1.02], [2, 1, 1.3, 1.03]]},
+        {'keywords': ['id', 'type', 'x', 'fx'],
+         'data': [[0, 1, 2.1, 2.01], [1, 2, 2.2, 2.02], [2, 1, 2.3, 2.03]]},
+        {'keywords': ['id', 'type', 'x', 'fx'],
+         'data': [[0, 1, 3.1, 3.01], [1, 2, 3.2, 3.02], [2, 1, 3.3, 3.03]]},
+    ]
+    file = os.path.join(tmpdir, 'fake_lammps_dump.yaml')
+    generate_fake_yaml(file, yaml_content)
+    return file
+
+
+@pytest.fixture
+def fake_thermo_yaml(tmpdir):
+    # fake LAMMPS thermo file with 4 MD steps
+    yaml_content = {
+        'keywords': ['KinEng', 'PotEng'],
+        'data': [[0.4, 0.5], [1.4, 1.5], [2.4, 2.5], [3.4, 3.5]]
+    }
+    file = os.path.join(tmpdir, 'fake_lammps_thermo.yaml')
+    generate_fake_yaml(file, yaml_content, multiple_docs=False)
+    return file
+
+
+def test_parse_lammps_outputs(fake_lammps_yaml, fake_thermo_yaml, tmpdir):
+    output_name = os.path.join(tmpdir, 'test.parquet')
+    parse_lammps_output(fake_lammps_yaml, fake_thermo_yaml, output_name)
+    # check that a file exists
+    assert os.path.exists(output_name)
+
+    df = pd.read_parquet(output_name)
+    assert not df.empty
+
+    assert len(df) == 4
+
+    for i, v in enumerate(['id', 'type', 'x', 'fx', 'energy']):
+        assert v in df.keys()
+        for x in range(4):
+            if v == 'id':
+                assert np.array_equal(df[v][x], [0, 1, 2])
+            elif v == 'type':
+                assert np.array_equal(df[v][x], [1, 2, 1])
+            elif v == 'x':
+                assert np.allclose(df[v][x], [x + 0.1 * y for y in range(1, 4)])
+            elif v == 'fx':
+                assert np.allclose(df[v][x], [x + 0.01 * y for y in range(1, 4)])
+            else:  # v == 'energy'
+                assert np.allclose(df[v][x], [2 * x + 0.9])
\ No newline at end of file

From 4844f7c9e80f70dadd81c0e69d611f5f92f58255 Mon Sep 17 00:00:00 2001
From: Simon Blackburn <simon.blackburn@mila.quebec>
Date: Thu, 22 Feb 2024 15:24:33 -0500
Subject: [PATCH 19/32] updated list of dependencies

---
 requirements.txt | 23 +++++++++++++++++++++++
 setup.py         | 29 +++++------------------------
 2 files changed, 28 insertions(+), 24 deletions(-)
 create mode 100644 requirements.txt

diff --git a/requirements.txt b/requirements.txt
new file mode 100644
index 00000000..0ade435e
--- /dev/null
+++ b/requirements.txt
@@ -0,0 +1,23 @@
+datasets-2.17.1
+flake8==4.0.1
+flake8-docstrings==1.6.0
+gitpython==3.1.27
+jupyter==1.0.0
+jinja2==3.1.2
+myst-parser==2.0.0
+orion>=0.2.4.post1
+pyarrow-15.0.0
+pyyaml==6.0
+pytest==7.1.2
+pytest-cov==3.0.0
+pytorch_lightning>=2.2.0
+pytype==2024.2.13
+sphinx==7.2.6
+sphinx-autoapi==3.0.0
+sphinx-rtd-theme==2.0.0
+sphinxcontrib-napoleon==0.7
+sphinxcontrib-katex==0.8.6
+tensorboard==2.16.2
+tqdm==4.64.0
+torch==2.2.0
+torchvision>=0.17.0
diff --git a/setup.py b/setup.py
index 0db43ec0..fb11dfb6 100644
--- a/setup.py
+++ b/setup.py
@@ -1,34 +1,15 @@
 from setuptools import find_packages, setup
 
+
+with open('requirements.txt', 'r') as f:
+    requirements = f.readlines()
+
 setup(
     name='crystal_diffusion',
     version='0.0.1',
     packages=find_packages(include=['crystal_diffusion', 'crystal_diffusion.*']),
     python_requires='>=3.11',
-    install_requires=[
-        'flake8==4.0.1',
-        'flake8-docstrings==1.6.0',
-        'isort==5.13.2',
-        'gitpython==3.1.27',
-        'jupyter==1.0.0',
-        'jinja2==3.1.2',
-        'myst-parser==2.0.0',
-        'orion>=0.2.4.post1',
-        'pyyaml==6.0',
-        'pytest==7.1.2',
-        'pytest-cov==3.0.0',
-        'pytorch_lightning>=2.2.0',
-        'pytype==2024.2.13',
-        'sphinx==7.2.6',
-        'sphinx-autoapi==3.0.0',
-        'sphinx-rtd-theme==2.0.0',
-        'sphinxcontrib-napoleon==0.7',
-        'sphinxcontrib-katex==0.8.6',
-        'tensorboard==2.16.2',
-        'tqdm==4.64.0',
-        'torch==2.2.0',
-        'torchvision>=0.17.0',
-    ],
+    install_requires=requirements,
     entry_points={
         'console_scripts': [
             'cd-train=crystal_diffusion.train:main',

From a1f5bf58a20c803cd574a317577f37294faf514e Mon Sep 17 00:00:00 2001
From: Simon Blackburn <simon.blackburn@mila.quebec>
Date: Thu, 22 Feb 2024 15:32:44 -0500
Subject: [PATCH 20/32] flake8 errors

---
 crystal_diffusion/data/parse_lammps_outputs.py | 3 +--
 tests/data/test_parse_lammps_output.py         | 3 ++-
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/crystal_diffusion/data/parse_lammps_outputs.py b/crystal_diffusion/data/parse_lammps_outputs.py
index 52033742..b23ddb5f 100644
--- a/crystal_diffusion/data/parse_lammps_outputs.py
+++ b/crystal_diffusion/data/parse_lammps_outputs.py
@@ -28,10 +28,9 @@ def parse_lammps_output(lammps_dump: str, lammps_thermo_log: str, output_name: s
         for doc in dump_yaml:  # loop over MD steps
             if 'id' not in doc['keywords']:  # sanity check
                 raise ValueError('id should be in LAMMPS dump file')
-            atoms_info = {} # store information on atoms positions and forces here
+            atoms_info = {}  # store information on atoms positions and forces here
             for data in doc['data']:  # loop over the atoms to get their positions and forces
                 for key, v in zip(doc['keywords'], data):
-                    print(key, v)
                     if key not in ['id', 'type', 'x', 'y', 'z', 'fx', 'fy', 'fz']:
                         continue
                     else:
diff --git a/tests/data/test_parse_lammps_output.py b/tests/data/test_parse_lammps_output.py
index 4cf6d6c9..bb82ae52 100644
--- a/tests/data/test_parse_lammps_output.py
+++ b/tests/data/test_parse_lammps_output.py
@@ -16,6 +16,7 @@ def generate_fake_yaml(filename, documents, multiple_docs=True):
         else:
             yaml.dump(documents, yaml_file)
 
+
 @pytest.fixture
 def fake_lammps_yaml(tmpdir):
     # fake LAMMPS output file with 4 MD steps in 1D for 3 atoms
@@ -69,4 +70,4 @@ def test_parse_lammps_outputs(fake_lammps_yaml, fake_thermo_yaml, tmpdir):
             elif v == 'fx':
                 assert np.allclose(df[v][x], [x + 0.01 * y for y in range(1, 4)])
             else:  # v == 'energy'
-                assert np.allclose(df[v][x], [2 * x + 0.9])
\ No newline at end of file
+                assert np.allclose(df[v][x], [2 * x + 0.9])

From b9affd444cd81cf3e270a245c58b25bc71f839a8 Mon Sep 17 00:00:00 2001
From: Simon Blackburn <simon.blackburn@mila.quebec>
Date: Thu, 22 Feb 2024 15:37:02 -0500
Subject: [PATCH 21/32] isort fix

---
 setup.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/setup.py b/setup.py
index fb11dfb6..2a9715ba 100644
--- a/setup.py
+++ b/setup.py
@@ -1,6 +1,5 @@
 from setuptools import find_packages, setup
 
-
 with open('requirements.txt', 'r') as f:
     requirements = f.readlines()
 

From c93d82d24dd14573f08b37af714a561653361296 Mon Sep 17 00:00:00 2001
From: Simon Blackburn <simon.blackburn@mila.quebec>
Date: Thu, 22 Feb 2024 15:45:02 -0500
Subject: [PATCH 22/32] typo in requirements

---
 requirements.txt | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/requirements.txt b/requirements.txt
index 0ade435e..3b402065 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,4 +1,4 @@
-datasets-2.17.1
+datasets==2.17.1
 flake8==4.0.1
 flake8-docstrings==1.6.0
 gitpython==3.1.27

From fe06eba1ec0edb3ec02859c58e740598f6bc72c0 Mon Sep 17 00:00:00 2001
From: Simon Blackburn <simon.blackburn@mila.quebec>
Date: Thu, 22 Feb 2024 15:48:05 -0500
Subject: [PATCH 23/32] another typo in requirements

---
 requirements.txt | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/requirements.txt b/requirements.txt
index 3b402065..c50c4cc3 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -6,7 +6,7 @@ jupyter==1.0.0
 jinja2==3.1.2
 myst-parser==2.0.0
 orion>=0.2.4.post1
-pyarrow-15.0.0
+pyarrow==15.0.0
 pyyaml==6.0
 pytest==7.1.2
 pytest-cov==3.0.0

From cd78eb0ba50881507e965d114cd4706e73cddd45 Mon Sep 17 00:00:00 2001
From: Simon Blackburn <simon.blackburn@mila.quebec>
Date: Fri, 23 Feb 2024 13:40:28 -0500
Subject: [PATCH 24/32] code review fixes

---
 data/lammps_input_example.lammps |  2 +-
 data/run_lammps_example.sh       |  2 +-
 data/si.sw                       | 22 ++++++++++++++++++++++
 3 files changed, 24 insertions(+), 2 deletions(-)
 create mode 100755 data/si.sw

diff --git a/data/lammps_input_example.lammps b/data/lammps_input_example.lammps
index ba9d70a5..9531b57d 100755
--- a/data/lammps_input_example.lammps
+++ b/data/lammps_input_example.lammps
@@ -16,7 +16,7 @@ mass 1 28.0855
 group Si type 1
 
 pair_style sw
-pair_coeff * * ../Si.sw Si
+pair_coeff * * data/si.sw Si
 
 velocity   all create ${T} 62177
 
diff --git a/data/run_lammps_example.sh b/data/run_lammps_example.sh
index 66ba52c6..f0262196 100644
--- a/data/run_lammps_example.sh
+++ b/data/run_lammps_example.sh
@@ -3,4 +3,4 @@
 TEMPERATURE=300
 BOX_SIZE=1
 
-lmp < data/lammps_input_example -v STEP 10 -v T $TEMPERATURE -v S $BOX_SIZE
\ No newline at end of file
+lmp < data/lammps_input_example.lammps -v STEP 10 -v T $TEMPERATURE -v S $BOX_SIZE
\ No newline at end of file
diff --git a/data/si.sw b/data/si.sw
new file mode 100755
index 00000000..26cfc25b
--- /dev/null
+++ b/data/si.sw
@@ -0,0 +1,22 @@
+# Stillinger-Weber parameters for various elements and mixtures
+# multiple entries can be added to this file, LAMMPS reads the ones it needs
+# these entries are in LAMMPS "metal" units:
+#   epsilon = eV; sigma = Angstroms
+#   other quantities are unitless
+
+# format of a single entry (one or more lines):
+#   element 1, element 2, element 3, 
+#   epsilon, sigma, a, lambda, gamma, costheta0, A, B, p, q, tol
+
+# Here are the original parameters in metal units, for Silicon from:
+#
+# Stillinger and Weber,  Phys. Rev. B, v. 31, p. 5262, (1985)
+#
+# Parameters for 'dia' Si
+Si Si Si 2.1683  2.0951  1.80  21.0  1.20  -0.333333333333
+         7.049556277  0.6022245584  4.0  0.0 0.0
+#
+# Parameters for amorphous Si  with the modified SW potential
+# (R. L. C. Vink, G. T. Barkema, W. F. van der Weg and N. Mousseau, A semi-empirical potential for amorphous silicon, J. Non-Cryst. Solids 282, 248-255 (2001))
+#Si Si Si 1.64833  2.0951  1.80  31.5  1.20  -0.333333333333
+#         7.049556277  0.6022245584  4.0  0.0 0.0 

From 9a3e24342c93e10ffc21747517ef9ee95093bfa8 Mon Sep 17 00:00:00 2001
From: Simon Blackburn <simon.blackburn@mila.quebec>
Date: Fri, 23 Feb 2024 15:08:22 -0500
Subject: [PATCH 25/32] code review part 2

---
 crystal_diffusion/data/parse_lammps_outputs.py | 2 +-
 data/lammps_input_example.lammps               | 2 +-
 data/run_lammps_example.sh                     | 2 +-
 3 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/crystal_diffusion/data/parse_lammps_outputs.py b/crystal_diffusion/data/parse_lammps_outputs.py
index b23ddb5f..0aecf232 100644
--- a/crystal_diffusion/data/parse_lammps_outputs.py
+++ b/crystal_diffusion/data/parse_lammps_outputs.py
@@ -10,7 +10,7 @@ def parse_lammps_output(lammps_dump: str, lammps_thermo_log: str, output_name: s
 
     Args:
         lammps_dump: LAMMPS output file
-        lammps_thermo_log: LAMMPS
+        lammps_thermo_log: LAMMPS thermodynamic variables output file
         output_name: name of parsed output written by the script
     """
     if not os.path.exists(lammps_dump):
diff --git a/data/lammps_input_example.lammps b/data/lammps_input_example.lammps
index 9531b57d..4aca590e 100755
--- a/data/lammps_input_example.lammps
+++ b/data/lammps_input_example.lammps
@@ -16,7 +16,7 @@ mass 1 28.0855
 group Si type 1
 
 pair_style sw
-pair_coeff * * data/si.sw Si
+pair_coeff * *si.sw Si
 
 velocity   all create ${T} 62177
 
diff --git a/data/run_lammps_example.sh b/data/run_lammps_example.sh
index f0262196..4e639cfe 100644
--- a/data/run_lammps_example.sh
+++ b/data/run_lammps_example.sh
@@ -3,4 +3,4 @@
 TEMPERATURE=300
 BOX_SIZE=1
 
-lmp < data/lammps_input_example.lammps -v STEP 10 -v T $TEMPERATURE -v S $BOX_SIZE
\ No newline at end of file
+lmp < lammps_input_example.lammps -v STEP 10 -v T $TEMPERATURE -v S $BOX_SIZE
\ No newline at end of file

From 2df251888c4b25e135781d626f6ce758e80cd87b Mon Sep 17 00:00:00 2001
From: Simon Blackburn <simon.blackburn@mila.quebec>
Date: Fri, 23 Feb 2024 15:20:46 -0500
Subject: [PATCH 26/32] missing space

---
 data/lammps_input_example.lammps | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/data/lammps_input_example.lammps b/data/lammps_input_example.lammps
index 4aca590e..c2f77445 100755
--- a/data/lammps_input_example.lammps
+++ b/data/lammps_input_example.lammps
@@ -16,7 +16,7 @@ mass 1 28.0855
 group Si type 1
 
 pair_style sw
-pair_coeff * *si.sw Si
+pair_coeff * * si.sw Si
 
 velocity   all create ${T} 62177
 

From 7c20038fd59d74ddc800a27e64d8e621db7be637 Mon Sep 17 00:00:00 2001
From: Simon Blackburn <simon.blackburn@mila.quebec>
Date: Mon, 26 Feb 2024 08:14:01 -0500
Subject: [PATCH 27/32] isort in requirements

---
 requirements.txt | 1 +
 1 file changed, 1 insertion(+)

diff --git a/requirements.txt b/requirements.txt
index c50c4cc3..cc765920 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -2,6 +2,7 @@ datasets==2.17.1
 flake8==4.0.1
 flake8-docstrings==1.6.0
 gitpython==3.1.27
+isort==5.13.2
 jupyter==1.0.0
 jinja2==3.1.2
 myst-parser==2.0.0

From ea7664beef6fa0af70549578bf3716085141e2bc Mon Sep 17 00:00:00 2001
From: Simon Blackburn <simon.blackburn@mila.quebec>
Date: Mon, 26 Feb 2024 08:21:56 -0500
Subject: [PATCH 28/32] fixes from code review

---
 crystal_diffusion/data/parse_lammps_outputs.py   | 11 ++++++-----
 {crystal_diffusion/data => data}/parse_lammps.sh |  0
 data/run_lammps_example.sh                       |  5 ++++-
 3 files changed, 10 insertions(+), 6 deletions(-)
 rename {crystal_diffusion/data => data}/parse_lammps.sh (100%)

diff --git a/crystal_diffusion/data/parse_lammps_outputs.py b/crystal_diffusion/data/parse_lammps_outputs.py
index 0aecf232..f118067c 100644
--- a/crystal_diffusion/data/parse_lammps_outputs.py
+++ b/crystal_diffusion/data/parse_lammps_outputs.py
@@ -1,5 +1,6 @@
 import argparse
 import os
+from collections import defaultdict
 
 import pandas as pd
 import yaml
@@ -24,20 +25,20 @@ def parse_lammps_output(lammps_dump: str, lammps_thermo_log: str, output_name: s
         dump_yaml = yaml.safe_load_all(f)
         # every MD iteration is saved as a separate document in the yaml file
         # prepare a dataframe to get all the data
-        pd_data = {}
+        pd_data = defaultdict(list)
         for doc in dump_yaml:  # loop over MD steps
             if 'id' not in doc['keywords']:  # sanity check
                 raise ValueError('id should be in LAMMPS dump file')
-            atoms_info = {}  # store information on atoms positions and forces here
+            atoms_info = defaultdict(list) # store information on atoms positions and forces here
             for data in doc['data']:  # loop over the atoms to get their positions and forces
                 for key, v in zip(doc['keywords'], data):
                     if key not in ['id', 'type', 'x', 'y', 'z', 'fx', 'fy', 'fz']:
                         continue
                     else:
-                        atoms_info[key] = atoms_info.get(key, []) + [v]  # get positions or forces
+                        atoms_info[key].append(v)  # get positions or forces
             # add the information about that MD step to the dataframe
-            for k, v in atoms_info.items():  # k should be x, y, z, fx, fy, fz
-                pd_data[k] = pd_data.get(k, []) + [v]
+            for k, v in atoms_info.items():  # k should be id, type, x, y, z, fx, fy, fz
+                pd_data[k].append(v)
 
     # get the total energy from the LAMMPS second output
     with open(lammps_thermo_log, 'r') as f:
diff --git a/crystal_diffusion/data/parse_lammps.sh b/data/parse_lammps.sh
similarity index 100%
rename from crystal_diffusion/data/parse_lammps.sh
rename to data/parse_lammps.sh
diff --git a/data/run_lammps_example.sh b/data/run_lammps_example.sh
index 4e639cfe..7f17d2e1 100644
--- a/data/run_lammps_example.sh
+++ b/data/run_lammps_example.sh
@@ -3,4 +3,7 @@
 TEMPERATURE=300
 BOX_SIZE=1
 
-lmp < lammps_input_example.lammps -v STEP 10 -v T $TEMPERATURE -v S $BOX_SIZE
\ No newline at end of file
+lmp < lammps_input_example.lammps -v STEP 10 -v T $TEMPERATURE -v S $BOX_SIZE
+
+# extract the thermodynamic outputs in a yaml file
+egrep  '^(keywords:|data:$|---$|\.\.\.$|  - \[)' log.lammps > log.yaml
\ No newline at end of file

From 9550c930d8c89ea1d88a83c1498cc0fa6fa13e8b Mon Sep 17 00:00:00 2001
From: Simon Blackburn <simon.blackburn@mila.quebec>
Date: Mon, 26 Feb 2024 08:28:11 -0500
Subject: [PATCH 29/32] cleaning files from error in rebase

---
 crystal_diffusion/data/parse_lammps.sh | 11 -----------
 data/run_lammps_example.sh             |  3 +--
 2 files changed, 1 insertion(+), 13 deletions(-)
 delete mode 100755 crystal_diffusion/data/parse_lammps.sh

diff --git a/crystal_diffusion/data/parse_lammps.sh b/crystal_diffusion/data/parse_lammps.sh
deleted file mode 100755
index bcbb2079..00000000
--- a/crystal_diffusion/data/parse_lammps.sh
+++ /dev/null
@@ -1,11 +0,0 @@
-#!/bin/bash
-
-EXP_DIR="lammps_scripts/Si/si-custom/"
-DUMP_FILENAME="dump.si-300-1.yaml"
-THERMO_FILENAME="thermo_log.yaml"
-OUTPUT_NAME="demo.parquet"
-
-python crystal_diffusion/data/parse_lammps_outputs.py \
-    --dump_file  ${EXP_DIR}/${DUMP_FILENAME} \
-    --thermo_file ${EXP_DIR}/${THERMO_FILENAME} \
-    --output_name ${EXP_DIR}/${OUTPUT_NAME}
diff --git a/data/run_lammps_example.sh b/data/run_lammps_example.sh
index 526ab608..b521fc6b 100644
--- a/data/run_lammps_example.sh
+++ b/data/run_lammps_example.sh
@@ -8,6 +8,5 @@ lmp < lammps_input_example.lammps -v STEP 10 -v T $TEMPERATURE -v S $BOX_SIZE
 
 # extract the thermodynamic outputs in a yaml file
 egrep  '^(keywords:|data:$|---$|\.\.\.$|  - \[)' log.lammps > log.yaml
-=======
+
 lmp < lammps_input_example.lammps -v STEP 10 -v T $TEMPERATURE -v S $BOX_SIZE
->>>>>>> 5927511b163780d0be25d5db4a0eb8868b12f8b2

From fe1120c2224b14884eb6cfcf9e2c6f9b03a5c1a4 Mon Sep 17 00:00:00 2001
From: Simon Blackburn <simon.blackburn@mila.quebec>
Date: Mon, 26 Feb 2024 08:29:28 -0500
Subject: [PATCH 30/32] error in run_lammps

---
 data/run_lammps_example.sh | 2 --
 1 file changed, 2 deletions(-)

diff --git a/data/run_lammps_example.sh b/data/run_lammps_example.sh
index b521fc6b..ef6d9399 100644
--- a/data/run_lammps_example.sh
+++ b/data/run_lammps_example.sh
@@ -8,5 +8,3 @@ lmp < lammps_input_example.lammps -v STEP 10 -v T $TEMPERATURE -v S $BOX_SIZE
 
 # extract the thermodynamic outputs in a yaml file
 egrep  '^(keywords:|data:$|---$|\.\.\.$|  - \[)' log.lammps > log.yaml
-
-lmp < lammps_input_example.lammps -v STEP 10 -v T $TEMPERATURE -v S $BOX_SIZE

From 8d696dfb17b4bc0f852a8eadc62ab2413c66b4ae Mon Sep 17 00:00:00 2001
From: Simon Blackburn <simon.blackburn@mila.quebec>
Date: Mon, 26 Feb 2024 08:36:45 -0500
Subject: [PATCH 31/32] linting error

---
 crystal_diffusion/data/parse_lammps_outputs.py | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/crystal_diffusion/data/parse_lammps_outputs.py b/crystal_diffusion/data/parse_lammps_outputs.py
index db84fe99..ad6753ea 100644
--- a/crystal_diffusion/data/parse_lammps_outputs.py
+++ b/crystal_diffusion/data/parse_lammps_outputs.py
@@ -29,8 +29,7 @@ def parse_lammps_output(lammps_dump: str, lammps_thermo_log: str, output_name: s
         for doc in dump_yaml:  # loop over MD steps
             if 'id' not in doc['keywords']:  # sanity check
                 raise ValueError('id should be in LAMMPS dump file')
-            atoms_info = defaultdict(list) # store information on atoms positions and forces here
-
+            atoms_info = defaultdict(list)  # store information on atoms positions and forces here
             for data in doc['data']:  # loop over the atoms to get their positions and forces
                 for key, v in zip(doc['keywords'], data):
                     if key not in ['id', 'type', 'x', 'y', 'z', 'fx', 'fy', 'fz']:

From a75654def03d2e560840968e4e0c7e9bccdb32a7 Mon Sep 17 00:00:00 2001
From: Simon Blackburn <simon.blackburn@mila.quebec>
Date: Tue, 27 Feb 2024 08:11:22 -0500
Subject: [PATCH 32/32] git rebase error fix

---
 data/run_lammps_example.sh | 1 -
 1 file changed, 1 deletion(-)

diff --git a/data/run_lammps_example.sh b/data/run_lammps_example.sh
index ef6d9399..60ea1792 100644
--- a/data/run_lammps_example.sh
+++ b/data/run_lammps_example.sh
@@ -3,7 +3,6 @@
 TEMPERATURE=300
 BOX_SIZE=1
 
-<<<<<<< HEAD
 lmp < lammps_input_example.lammps -v STEP 10 -v T $TEMPERATURE -v S $BOX_SIZE
 
 # extract the thermodynamic outputs in a yaml file