From e2ffbea5274eac8180dfff50df60246cb59711df Mon Sep 17 00:00:00 2001 From: Marius Kurz Date: Wed, 22 May 2024 08:24:41 +0200 Subject: [PATCH 1/3] Update Key names in SmartSim data exchange to current FLEXI version --- src/relexi/env/flexiEnvSmartSim.py | 27 ++++++++++++++++----------- 1 file changed, 16 insertions(+), 11 deletions(-) diff --git a/src/relexi/env/flexiEnvSmartSim.py b/src/relexi/env/flexiEnvSmartSim.py index 7576b56..ee77d49 100644 --- a/src/relexi/env/flexiEnvSmartSim.py +++ b/src/relexi/env/flexiEnvSmartSim.py @@ -325,7 +325,7 @@ def _set_prediction(self,action): # TODO: make this a user parameter action_mod = action * 0.5 for i in range(self.n_envs): - dataset = self.client.put_tensor(self.tag[i]+"Cs",action_mod[i,::].astype(np.float64)) + dataset = self.client.put_tensor(self.tag[i]+"actions",action_mod[i,::].astype(np.float64)) def _get_current_state(self): @@ -334,13 +334,14 @@ def _get_current_state(self): ATTENTION: This is the routine the enviroment will idle until the necessary data becomes available """ do_init = True + key="state" for tag in self.tag: - self.client.poll_tensor(tag+"U",10,10000) + self.client.poll_tensor(tag+"state",10,10000) try: - data = self.client.get_tensor(tag+"U") + data = self.client.get_tensor(tag+key) except: - rlxout.printWarning("Did not get U in "+tag) - self.client.delete_tensor(tag+"U") + rlxout.printWarning("Did not get state from environment "+tag[:-1]) + self.client.delete_tensor(tag+key) # Account for Fortran/C memory layout and 32bit for TF data = np.transpose(data) data = np.expand_dims(data,axis=0) @@ -356,10 +357,12 @@ def _get_current_state(self): def _flexi_ended(self): """ Checks whether FLEXI has already ended.""" has_ended = np.empty((self.n_envs)) + + key="step_type" for i in range(self.n_envs): - self.client.poll_tensor(self.tag[i]+"step_type",10,1000) - step_type = self.client.get_tensor(self.tag[i]+"step_type") - self.client.delete_tensor(self.tag[i]+"step_type") + 
self.client.poll_tensor(self.tag[i]+key,10,1000) + step_type = self.client.get_tensor(self.tag[i]+key) + self.client.delete_tensor(self.tag[i]+key) if step_type > 0: has_ended[i] = False else: @@ -378,11 +381,13 @@ def _get_reward(self): """Compute the reward for the agent, based on the current flow state.""" reward = np.zeros( (self.n_envs,) ) self.E_LES = np.zeros( (self.n_envs,self.reward_kmax) ) + + key="Ekin" for i in range(self.n_envs): # Poll Tensor - self.client.poll_tensor(self.tag[i]+"Ekin",10,1000) - data = self.client.get_tensor(self.tag[i]+"Ekin") - self.client.delete_tensor(self.tag[i]+"Ekin") + self.client.poll_tensor(self.tag[i]+key,10,1000) + data = self.client.get_tensor(self.tag[i]+key) + self.client.delete_tensor(self.tag[i]+key) self.E_LES[i,:] = data[0:self.reward_kmax] # Compute Reward From f95eec4efc85be04a76fb2d612c9f50afb9bfaf8 Mon Sep 17 00:00:00 2001 From: Marius Kurz Date: Wed, 22 May 2024 08:25:58 +0200 Subject: [PATCH 2/3] Add .gitignore files to example cases to explicitly exclude created files from git tracking. --- examples/HIT_24_DOF/.gitignore | 1 + examples/HIT_32_DOF/.gitignore | 1 + 2 files changed, 2 insertions(+) create mode 100644 examples/HIT_24_DOF/.gitignore create mode 100644 examples/HIT_32_DOF/.gitignore diff --git a/examples/HIT_24_DOF/.gitignore b/examples/HIT_24_DOF/.gitignore new file mode 100644 index 0000000..72e8ffc --- /dev/null +++ b/examples/HIT_24_DOF/.gitignore @@ -0,0 +1 @@ +* diff --git a/examples/HIT_32_DOF/.gitignore b/examples/HIT_32_DOF/.gitignore new file mode 100644 index 0000000..72e8ffc --- /dev/null +++ b/examples/HIT_32_DOF/.gitignore @@ -0,0 +1 @@ +* From 245aefb942babf6c76ac0828d3a7e252dad0e507 Mon Sep 17 00:00:00 2001 From: Marius Kurz Date: Wed, 22 May 2024 08:24:11 +0200 Subject: [PATCH 3/3] Update Readme to current versions of SmartSim, TensorFlow and FLEXI. 
--- README.md | 229 +++++++++++++++++++-------------------------- docs/build_docs.sh | 0 2 files changed, 94 insertions(+), 135 deletions(-) mode change 100644 => 100755 docs/build_docs.sh diff --git a/README.md b/README.md index 6792b98..ea97ec5 100644 --- a/README.md +++ b/README.md @@ -4,175 +4,134 @@ [![doi](https://img.shields.io/badge/DOI-10.1016/j.simpa.2022.100422-blue "DOI")](https://doi.org/10.1016/j.simpa.2022.100422) # About Relexi - Relexi is a Reinforcement Learning (RL) framework developed for the high-order HPC flow solver [FLEXI][flexi]. However, Relexi is developed with modularity in mind and allows to used with other HPC solvers as well. Relexi builds upon TensorFlow and its RL extension TF-Agents. For the efficient communication, data handling and the managment of the simulations runs on HPC systems, Relexi uses the SmartSim package with its SmartRedis communication clients. For details on its scaling behavior, suitability for HPC and for use cases, please see -* [Kurz, M., Offenhäuser, P., Viola, D., Resch, M., & Beck, A. (2022). Relexi—A scalable open source reinforcement learning framework for high-performance computing. Software Impacts, 14, 100422.](https://www.sciencedirect.com/science/article/pii/S2665963822001063) -* [Kurz, M., Offenhäuser, P., Viola, D., Shcherbakov, O., Resch, M., & Beck, A. (2022). Deep Reinforcement Learning for Computational Fluid Dynamics on HPC Systems. Journal of Computational Science, 65, 101884.](https://www.sciencedirect.com/science/article/pii/S1877750322002435) -* [Kurz, M., Offenhäuser, P., & Beck, A. (2023). Deep reinforcement learning for turbulence modeling in large eddy simulations. International Journal of Heat and Fluid Flow, 99, 109094.](https://arxiv.org/pdf/2206.11038) +* [Kurz, M., Offenhäuser, P., Viola, D., Resch, M., & Beck, A. (2022). Relexi—A scalable open source reinforcement learning framework for high-performance computing. 
Software Impacts, 14, 100422.](https://doi.org/10.1016/j.simpa.2022.100422) +* [Kurz, M., Offenhäuser, P., Viola, D., Shcherbakov, O., Resch, M., & Beck, A. (2022). Deep Reinforcement Learning for Computational Fluid Dynamics on HPC Systems. Journal of Computational Science, 65, 101884.](https://doi.org/10.1016/j.jocs.2022.101884) +* [Kurz, M., Offenhäuser, P., & Beck, A. (2023). Deep reinforcement learning for turbulence modeling in large eddy simulations. International Journal of Heat and Fluid Flow, 99, 109094.](https://doi.org/10.1016/j.ijheatfluidflow.2022.109094) +* [Beck, A., & Kurz, M. (2023). Toward discretization-consistent closure schemes for large eddy simulation using reinforcement learning. Physics of Fluids, 35(12), 125122.](https://doi.org/10.1063/5.0176223) This is a scientific project. If you use Relexi or find it helpful, please cite the project using a suitable reference from the list above referring to either the general Relexi project, its HPC aspects or its application for scientific modeling tasks, respectively. # Documentation - -The documentation of Relexi is built via `pdoc`, which can be installed via `pip install pdoc`. +The documentation of Relexi is built via `pdoc`, which can be installed via +```bash +pip install pdoc +``` Next, change into the `docs` folder and build the documentation via - ``` - cd docs - bash build_docs.sh - ``` +```bash +cd docs +bash build_docs.sh +``` Open the resulting `relexi.html` with your browser. # Testing - -A suite of unit tests are implemented for Relexi using the `pytest` testing environment. -To run the tests, simply execute in the root directory - ``` - pytest - ``` - +A suite of unit tests is implemented for Relexi using the `pytest` testing environment. To run the tests, simply execute in the root directory +```bash +pytest +``` # Installation - The following quick start details a standard installation of the Relexi framework. ### Dependencies - Relexi has a variety of dependencies. 
The main dependencies of Relexi are listed in the following with their supported version. -| Package | Version | Note | -|:-----------------|--------------:|:---------| -| Python | ≥3.8 | | -| TensorFlow | ≥2.9 | | -| TF-Agents | ≥0.13 | | -| SmartSim | ≥0.3.2 | | -| SmartRedis | 0.2.0 | | -| Cmake | ≥3.0 | | -| Make | ≥4.0 | | -| gcc-fortran | ≥9.4 | GCC-10 not supported! (GCC-11 and higher is fine) | -| gcc | ≥9.4 | | -| gcc-c++ | ≥9.4 | | - -Please be ware that The major dependencies (SmartSim, TensorFlow, FLEXI) might have a more expansive dependency tree, for which we refer the user to the corresponding documentation for details. +| Package | Version | Note | +|:-----------------|----------------:|:----------------------------------------| +| Python | ≥3.9 | | +| TensorFlow | 2.9.0 - 2.15.1| | +| TF-Agents | ≥0.13 | | +| SmartSim | 0.4.0 - 0.6.2 | | +| SmartRedis | ≥0.4.1 | | +| CMake | ≥3.0 | | +| Make | ≥4.0 | | +| gcc-fortran | ≥9.4 | gcc 10 not supported! (gcc ≥11 is fine) | +| gcc | ≥9.4 | | +| gcc-c++ | ≥9.4 | | + +Be aware that the major dependencies (SmartSim, TensorFlow, FLEXI) might have a more expansive dependency tree, for which we refer the user to the corresponding documentation for details. ### Prerequisites -* Open a terminal -* Change into the directory where you want to install Relexi and its dependecies -* For convienience, save the current directory with - ``` - ROOTDIR=$(pwd) - ``` - -* It is highly recommended to use some form of virtual environment for the installation. 
You can use any tool you like, we use `virtualenv` which can be installed with - ``` - python3 -m pip install virtualenv - ``` - -* Then create and activate a new environment with - ``` - python3 -m virtualenv env_relexi - source env_relexi/bin/activate - ``` - -* Then install the necessary dependecies - ``` - python3 -m pip install smartredis cmake tensorflow tf-agents pyyaml matplotlib - ``` +Open a terminal and change into the directory where you want to install Relexi and its dependencies. +For convenience, save the current directory with +```bash +ROOTDIR=$(pwd) +``` +It is highly recommended to use some form of virtual environment for the installation. You can use any tool you like, we use `virtualenv` which can be installed with +```bash +python3 -m pip install virtualenv +``` +Create and activate a new environment with +```bash +python3 -m virtualenv env_relexi +source env_relexi/bin/activate +``` +Install the necessary dependencies +```bash +python3 -m pip install tensorflow smartredis cmake tf-agents pyyaml matplotlib +``` ### Install SmartSim -The installation commands were changed in SmartSim version `0.4.0`. So use the following commands depending on the version you want to use. - -#### SmartSim 0.3.2 -* Now, install SmartSim in version `0.3.2`. For this, first the package has to be installed via pip and then we can install it using the smart tool provided by SmartSim. - ``` - pip install smartsim==0.3.2 - smart --clobber - smart --clean - smart --no_tf --no_pt -v - SMARTSIM_DIR=$(smart --site) - export PATH=$PATH:$SMARTSIM_DIR/_core/bin - export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:${SMARTSIM_DIR}/_core/lib - ``` - -#### SmartSim >= 0.4.0 -* Now, install SmartSim in version you like. The following commands install the latest version. The individual tools of the `smart` command line tool are now not longer prefixed by a double dash. 
- ``` - pip install smartsim - smart clobber - smart clean - smart build --no_tf --no_pt -v - SMARTSIM_DIR=$(smart site) - export PATH=$PATH:$SMARTSIM_DIR/_core/bin - export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:${SMARTSIM_DIR}/_core/lib - ``` - -### Install SmartRedis -* Go back into the main directory - ``` - cd $ROOTDIR - ``` - -* Then, we install the SmartRedis clients for C/C++ and Fortran. For this we clone its repository and build version `0.2.0` - ``` - git clone https://github.com/CrayLabs/SmartRedis.git - cd SmartRedis - git checkout v0.2.0 - make lib -j - ``` - -* Export the build directory, so FLEXI finds the installation to link against, in order to build with support for SmartRedis. - ``` - export SMARTREDIS_DIR=$(pwd) - ``` +Install SmartSim in a supported version ranging from `0.4.0` to `0.6.2`. The following commands install the latest supported version. +```bash +pip install smartsim==0.6.2 +smart clobber +smart clean +smart build --no_tf --no_pt -v +SMARTSIM_DIR=$(smart site) +export PATH=$PATH:$SMARTSIM_DIR/_core/bin +export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:${SMARTSIM_DIR}/_core/lib +``` ### Install FLEXI -* Clone the required version of FLEXI from GitHub and build it with the standard compile flags - ``` - cd $ROOTDIR - git clone https://github.com/flexi-framework/flexi-extensions.git - cd flexi-extensions - git checkout smartsim - mkdir -p build && cd build - cmake .. -DLIBS_BUILD_HDF5=ON -DLIBS_USE_MPI=OFF -DLIBS_USE_SMARTREDIS=ON -DLIBS_USE_FFTW=ON -DPOSTI=OFF -DFLEXI_TESTCASE=hit -DFLEXI_NODETYPE=GAUSS-LOBATTO -DFLEXI_SPLIT_DG=ON -DFLEXI_EDDYVISCOSITY=ON - make -j - ``` -* This compiles FLEXI without MPI and thus in its serial version. To enable MPI or to change the configuration of FLEXI, please see the [official documentation][userguide] of FLEXI. 
+Clone the required version of FLEXI from GitHub and build it with the standard compile flags +```bash +cd $ROOTDIR +git clone https://github.com/flexi-framework/flexi-extensions.git +cd flexi-extensions +git checkout smartsim +mkdir -p build && cd build +cmake .. -DLIBS_BUILD_HDF5=ON -DLIBS_USE_MPI=OFF -DLIBS_BUILD_SMARTREDIS=ON -DLIBS_USE_SMARTREDIS=ON -DLIBS_USE_FFTW=ON -DPOSTI=OFF -DFLEXI_TESTCASE=hit -DFLEXI_NODETYPE=GAUSS-LOBATTO -DFLEXI_SPLIT_DG=ON -DFLEXI_EDDYVISCOSITY=ON +make -j +``` +This compiles FLEXI without MPI and thus in its serial version. To enable MPI or to change the configuration of FLEXI, please see the [official documentation][userguide] of FLEXI. ### Install Relexi -* Finally, we can clone the Relexi repository. - ``` - cd $ROOTDIR - git clone https://github.com/flexi-framework/relexi.git - ``` +Finally, clone the Relexi repository. No additional installation steps are required. +```bash +cd $ROOTDIR +git clone https://github.com/flexi-framework/relexi.git +``` # Running the Code -* Relexi comes with some example setups to test if the code runs. Enter the directory of the first test case with - ``` - cd relexi/examples/HIT_24_DOF/ - ``` -* Open the ``prm.yaml`` file in a text editor of your choice. If you have installed the ``flexi`` binary not in the default path, adapt the path of the executable under ``library_path`` accordingly. -* Then you can start the training process with - ``` - python3 ../../src/train.py prm.yaml - ``` -* You may also set the number of parallel environments by setting ``num_parallel_environments`` according to your local hardware resources. -* You can change the number of processors used for each FLEXI environment by setting ``num_procs_per_environment`` to the appropriate value. Please be aware that for using FLEXI in parallel, i.e. with more than 1 CPU core per environment, it has to be compiled with MPI. 
Please refer to the [FLEXI documentation](https://www.flexi-project.org/doc/userguide/userguide.pdf) for details. +Relexi comes with some example setups to test if the code runs. Enter the directory of the first test case with +```bash +cd relexi/examples/HIT_24_DOF/ +``` +Open the ``prm.yaml`` file in a text editor of your choice. If you have installed the ``flexi`` binary not in the default path, adapt the path of the executable under ``library_path`` accordingly. +Then you can start the training process with +```bash +python3 ../../src/train.py prm.yaml +``` +You may also set the number of parallel environments by setting ``num_parallel_environments`` according to your local hardware resources. +You can change the number of processors used for each FLEXI environment by setting ``num_procs_per_environment`` to the appropriate value. Please be aware that for using FLEXI in parallel, i.e. with more than 1 CPU core per environment, it has to be compiled with MPI. Please refer to the [FLEXI documentation](https://www.flexi-project.org/doc/userguide/userguide.pdf) for details. # Results -* To visualize the results, Relexi uses the TensorBoard suite. After running the code, Relexi should create a directory ``logs``, where the model, training checkpoints and the training metrics are saved. Open it with - ``` - tensorboard --logdir logs/ - ``` -* Tensorboard then provides a URL that can be opened in the Browser. -* If the training is performed on a remote server, the port where TensorBoard sends its data has to be redirected to your local machine. If you use `ssh` to connect to the server, you can redirect the standard TensorBoard port (6006) with - ``` - ssh -L 6006:127.0.0.1:6006 your_remote_server - ``` +To visualize the results, Relexi uses the TensorBoard suite. After running the code, Relexi should create a directory ``logs``, where the model, training checkpoints and the training metrics are saved. 
Open it with +```bash +tensorboard --logdir logs/ +``` +TensorBoard then provides a URL that can be opened in the browser. +If the training is performed on a remote server, the port where TensorBoard sends its data has to be redirected to your local machine. If you use `ssh` to connect to the server, you can redirect the standard TensorBoard port (6006) with +```bash +ssh -L 6006:127.0.0.1:6006 your_remote_server +``` [nrg]: https://numericsresearchgroup.org/index.html [flexi]: https://numericsresearchgroup.org/flexi_index.html diff --git a/docs/build_docs.sh b/docs/build_docs.sh old mode 100644 new mode 100755