From 4cf61367b6e7cd063b945c2f76318b28d034c972 Mon Sep 17 00:00:00 2001 From: mjwen Date: Fri, 8 Mar 2024 16:48:55 -0600 Subject: [PATCH] Install a specific version of torch_scatter --- README.md | 4 + notebooks/prepare_data.ipynb | 197 ++++++++++++++++++++++++++ scripts/README.md | 39 +++++ scripts/configs/materials_tensor.yaml | 2 +- src/matten/utils_wandb.py | 2 +- 5 files changed, 242 insertions(+), 2 deletions(-) create mode 100644 notebooks/prepare_data.ipynb diff --git a/README.md b/README.md index 7abc2ff..385070d 100644 --- a/README.md +++ b/README.md @@ -46,6 +46,10 @@ crystals, pass a list of structures to `predict`. - An example of 100 crystals is available in the [datasets](./datasets) directory. - The full dataset is available at: https://doi.org/10.5281/zenodo.8190849 +## Train the model (using your own data) + +See instructions [here](./scripts/README.md). + ## Reference ``` diff --git a/notebooks/prepare_data.ipynb b/notebooks/prepare_data.ipynb new file mode 100644 index 0000000..06eaa35 --- /dev/null +++ b/notebooks/prepare_data.ipynb @@ -0,0 +1,197 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "source": [ + "# Data Preparation\n", + "\n", + "This notebook gives an example of how to prepare your own data to train the model. " + ], + "metadata": { + "collapsed": false + }, + "id": "b5c6b7dd5f67094c" + }, + { + "cell_type": "code", + "outputs": [], + "source": [ + "import numpy as np\n", + "import pandas as pd\n", + "from pathlib import Path\n", + "from pymatgen.core import Structure" + ], + "metadata": { + "collapsed": false, + "ExecuteTime": { + "end_time": "2024-03-08T22:47:02.268364Z", + "start_time": "2024-03-08T22:47:01.432158Z" + } + }, + "id": "45862d0d3d88e62e", + "execution_count": 1 + }, + { + "cell_type": "code", + "outputs": [], + "source": [ + "def get_structures():\n", + " \"\"\"Create a pymatgen structure for Si.\"\"\"\n", + " return Structure(\n", + " lattice=np.array([[0, 2.73, 2.73], [2.73, 0, 2.73], [2.73, 2.73, 0]]),\n", + " species=[\"Si\", \"Si\"],\n", + " coords=[[0, 0, 0], [0.25, 0.25, 0.25]],\n", + " )\n", + "\n", + "\n", + "def get_tensor(seed: int = 35):\n", + " \"\"\"Generate random 3x3x3x3 elastic tensor.\n", + "\n", + " Note, this is by no means a physical tensor that satisfies the symmetry of any\n", + " crystal. It is just a random array to show the data preparation process.\n", + " \"\"\"\n", + " np.random.seed(seed)\n", + " t = np.random.rand(3, 3, 3, 3)\n", + "\n", + " return t" + ], + "metadata": { + "collapsed": false, + "ExecuteTime": { + "end_time": "2024-03-08T22:47:02.272659Z", + "start_time": "2024-03-08T22:47:02.270735Z" + } + }, + "id": "39bef8ecadee2005", + "execution_count": 2 + }, + { + "cell_type": "markdown", + "source": [ + "## Get data \n", + "\n", + "Here we simply make 10 copies of the Si structure and 10 copies of the elastic tensor. \n", + "You should replace this with your own data." + ], + "metadata": { + "collapsed": false + }, + "id": "a8bfc64bcfa05bd2" + }, + { + "cell_type": "code", + "outputs": [], + "source": [ + "Si = get_structures()\n", + "t = get_tensor()\n", + "\n", + "structures = [Si for _ in range(10)]\n", + "tensors = [t for _ in range(10)]" + ], + "metadata": { + "collapsed": false, + "ExecuteTime": { + "end_time": "2024-03-08T22:47:02.276072Z", + "start_time": "2024-03-08T22:47:02.272804Z" + } + }, + "id": "2edf1f8dd8c323ea", + "execution_count": 3 + }, + { + "cell_type": "markdown", + "source": [ + "## Write data to file" + ], + "metadata": { + "collapsed": false + }, + "id": "2cfecfec86d3444a" + }, + { + "cell_type": "code", + "outputs": [], + "source": [ + "def write_data(\n", + " structures: list[Structure],\n", + " tensors: list[np.ndarray],\n", + " path: Path = \"elasticity_tensors.json\",\n", + "):\n", + " \"\"\"Write structures and tensors to file.\n", + "\n", + " Args:\n", + " structures: list of pymatgen structures.\n", + " tensors: list of 3x3x3x3 elastic tensors.\n", + " path: path to write the data.\n", + " \"\"\"\n", + " data = {\n", + " \"structure\": [s.as_dict() for s in structures],\n", + " \"elastic_tensor_full\": [t.tolist() for t in tensors],\n", + " }\n", + " df = pd.DataFrame(data)\n", + "\n", + " df.to_json(path)" + ], + "metadata": { + "collapsed": false, + "ExecuteTime": { + "end_time": "2024-03-08T22:47:02.279873Z", + "start_time": "2024-03-08T22:47:02.276910Z" + } + }, + "id": "55b15d7d8af1a98d", + "execution_count": 4 + }, + { + "cell_type": "code", + "outputs": [], + "source": [ + "write_data(structures, tensors)" + ], + "metadata": { + "collapsed": false, + "ExecuteTime": { + "end_time": "2024-03-08T22:47:02.286930Z", + "start_time": "2024-03-08T22:47:02.281900Z" + } + }, + "id": "26057e33efd0eaf9", + "execution_count": 5 + }, + { + "cell_type": "code", + "outputs": [], + "source": [], + "metadata": { + "collapsed": false, + "ExecuteTime": { + "end_time": "2024-03-08T22:47:02.287405Z", + "start_time": "2024-03-08T22:47:02.285609Z" + } + }, + "id": "ba7de54ec1bf9ba3", + "execution_count": 5 + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 2 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython2", + "version": "2.7.6" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/scripts/README.md b/scripts/README.md index 889a1c6..6f53236 100644 --- a/scripts/README.md +++ b/scripts/README.md @@ -1,3 +1,42 @@ # scripts This directory contains scripts to train/evaluate/test the models. + +## First try + +To train an example model, run the below command in this directory: + +```bash +python train_materials_tensor.py +``` + +This will train a toy model on a very small example dataset for only 10 epochs and then stop. + +After training, you can see the metrics and the location of the best model. + +### Under the hood + +All training configurations (data, optimizer, model etc.) are defined in the [./configs/materials_tensor.yaml](./configs/materials_tensor.yaml) file. Particularly you can see from the `data` section (copied below) that the data file we are using is `example_crystal_elasticity_tensor_n100.json` and it is stored at the [../datasets/](../datasets). Here we use the same file for training, validation and testing. This is just for demonstration purposes. In a real-world scenario, you would have separate files for each of these. + +```yaml +data: + root: ../datasets/ + r_cut: 5.0 + trainset_filename: example_crystal_elasticity_tensor_n100.json + valset_filename: example_crystal_elasticity_tensor_n100.json + testset_filename: example_crystal_elasticity_tensor_n100.json + reuse: false + loader_kwargs: + batch_size: 32 + shuffle: true +``` + +Feel free to change other settings and see how the model behaves. You might find [this config](../pretrained/20230627/config_final.yaml) file useful for reference, which is the final configuration used to train the model in the paper. + +You might also want to uncomment the `logger` section to use [Weights and Biases](https://wandb.ai/site) for logging, which makes tracking the training process much easier. + +## Training with your own data + +All you need to do is to prepare your data as json file(s), similar to `example_crystal_elasticity_tensor_n100.json`. +It needs a list of pymatgen `structure`s and the corresponding `elastic_tensor_full` for the structures. +See this [notebook](../notebooks/prepare_data.ipynb) for more details on how to prepare the data. diff --git a/scripts/configs/materials_tensor.yaml b/scripts/configs/materials_tensor.yaml index 8743ec7..f41a54c 100644 --- a/scripts/configs/materials_tensor.yaml +++ b/scripts/configs/materials_tensor.yaml @@ -3,10 +3,10 @@ log_level: info data: root: ../datasets/ - r_cut: 5.0 trainset_filename: example_crystal_elasticity_tensor_n100.json valset_filename: example_crystal_elasticity_tensor_n100.json testset_filename: example_crystal_elasticity_tensor_n100.json + r_cut: 5.0 reuse: false loader_kwargs: batch_size: 32 diff --git a/src/matten/utils_wandb.py b/src/matten/utils_wandb.py index d82ddb7..50946ea 100644 --- a/src/matten/utils_wandb.py +++ b/src/matten/utils_wandb.py @@ -13,7 +13,7 @@ def get_git_repo_commit(repo_path: Path) -> str: """ - Get the latest git commit info of a github repository. + Get the latest git commit info of a git repository. Args: repo_path: path to the repo