From 16d3cab2e5d9ea02fbbc49ceaf9aa819e9ce538a Mon Sep 17 00:00:00 2001
From: Pramod Kumbhar
Date: Fri, 27 Nov 2020 21:16:30 +0100
Subject: [PATCH 001/331] Disable python bindings for faster build

- while working on NMODL + LLVM, we don't worry that much about Python bindings
- so let's disable them by default
---
 CMakeLists.txt | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index df16e23768..0f9cccd1d5 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -22,7 +22,7 @@ set(CMAKE_RUNTIME_OUTPUT_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}/bin)
 # =============================================================================
 # Build options for NMODL
 # =============================================================================
-option(NMODL_ENABLE_PYTHON_BINDINGS "Enable pybind11 based python bindings" ON)
+option(NMODL_ENABLE_PYTHON_BINDINGS "Enable pybind11 based python bindings" OFF)
 option(NMODL_ENABLE_LEGACY_UNITS "Use original faraday, R, etc. instead of 2019 nist constants" OFF)
 if(NMODL_ENABLE_LEGACY_UNITS)
   add_definitions(-DUSE_LEGACY_UNITS)
 endif()

From b1cfda6e60f7bc1e8e3dc9f94eb616dba430511b Mon Sep 17 00:00:00 2001
From: Pramod Kumbhar
Date: Sat, 28 Nov 2020 06:22:04 +0100
Subject: [PATCH 002/331] Integrate LLVM into CMake build system

* added NMODL_ENABLE_LLVM option to enable/disable llvm support in nmodl
* LLVMHelper.cmake added to help with linking LLVM libraries
  - clang might need to link with either libstdc++ or libc++
  - on BB5, using GCC with the LLVM libraries is fine, but using clang results
    in lots of link errors; adding -stdlib=libstdc++ solves the issue
  - use check_cxx_source_compiles to find out which cxx flag is needed
---
 CMakeLists.txt         |  9 +++++++++
 cmake/LLVMHelper.cmake | 45 ++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 54 insertions(+)
 create mode 100644 cmake/LLVMHelper.cmake

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 0f9cccd1d5..af7707785c 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -24,6 +24,8 @@ set(CMAKE_RUNTIME_OUTPUT_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}/bin)
 # =============================================================================
 option(NMODL_ENABLE_PYTHON_BINDINGS "Enable pybind11 based python bindings" OFF)
 option(NMODL_ENABLE_LEGACY_UNITS "Use original faraday, R, etc.
instead of 2019 nist constants" OFF) +option(NMODL_ENABLE_LLVM "Enable LLVM based code generation" ON) + if(NMODL_ENABLE_LEGACY_UNITS) add_definitions(-DUSE_LEGACY_UNITS) endif() @@ -140,6 +142,13 @@ find_python_module(sympy 1.2 REQUIRED) find_python_module(textwrap 0.9 REQUIRED) find_python_module(yaml 3.12 REQUIRED) +# ============================================================================= +# Find LLVM dependencies +# ============================================================================= +if(NMODL_ENABLE_LLVM) + include(LLVMHelper) +endif() + # ============================================================================= # Compiler specific flags for external submodules # ============================================================================= diff --git a/cmake/LLVMHelper.cmake b/cmake/LLVMHelper.cmake new file mode 100644 index 0000000000..a22cf4c835 --- /dev/null +++ b/cmake/LLVMHelper.cmake @@ -0,0 +1,45 @@ +# ============================================================================= +# LLVM/Clang needs to be linked with either libc++ or libstdc++ +# ============================================================================= +if(CMAKE_CXX_COMPILER_ID MATCHES "Clang" AND NMODL_ENABLE_LLVM) + find_package(LLVM REQUIRED CONFIG) + include(CheckCXXSourceCompiles) + + # test by including LLVM header and core library + llvm_map_components_to_libnames(LLVM_CORE_LIB core) + set(CMAKE_REQUIRED_INCLUDES ${LLVM_INCLUDE_DIRS}) + set(CMAKE_REQUIRED_LIBRARIES ${LLVM_CORE_LIB}) + + # simple code to test LLVM library linking + set(CODE_TO_TEST + " + #include + using namespace llvm; + int main(int argc, char* argv[]) { + std::unique_ptr> Builder; + }") + + # first compile without any flags + check_cxx_source_compiles("${CODE_TO_TEST}" LLVM_LIB_LINK_TEST) + + # if standard compilation fails + if(NOT LLVM_LIB_LINK_TEST) + # try libstdc++ first + set(CMAKE_REQUIRED_FLAGS "-stdlib=libstdc++") + check_cxx_source_compiles("${CODE_TO_TEST}" LLVM_LIBSTDCPP_TEST) + # on failure, try libc++ + if(NOT LLVM_LIBSTDCPP_TEST) + set(CMAKE_REQUIRED_FLAGS "-stdlib=libc++") + check_cxx_source_compiles("${CODE_TO_TEST}" LLVM_LIBCPP_TEST) + endif() + # if either library works then add it to CXX flags + if(LLVM_LIBSTDCPP_TEST OR LLVM_LIBCPP_TEST) + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${CMAKE_REQUIRED_FLAGS}") + message( + STATUS + "Adding ${CMAKE_REQUIRED_FLAGS} to CMAKE_CXX_FLAGS, required to link with LLVM libraries") + else() + message(STATUS "WARNING : -stdlib=libstdcx++ or -stdlib=libc++ didn't work to link with LLVM library") + endif() + endif() +endif() From 46d4779b480806a1536b5af06abaf42fd798d207 Mon Sep 17 00:00:00 2001 From: Pramod Kumbhar Date: Sun, 29 Nov 2020 00:50:16 +0100 Subject: [PATCH 003/331] Code infrastructure for LLVM code generation backend - added llvm dir under codegen where LLVM code generation work will live - llvm codegen visitor created that can be used as template for initial work - cmake adapted to enable llvm codegen based on CMake option - simple procedure.mod added that can be initial target for testing - new CLI option --llvm that runs LLVM codegen visitor - Enable CXX 14 because new LLVM versions require it --- CMakeLists.txt | 3 +- cmake/LLVMHelper.cmake | 4 +- src/CMakeLists.txt | 6 ++ src/codegen/CMakeLists.txt | 5 ++ src/codegen/llvm/CMakeLists.txt | 13 +++++ src/codegen/llvm/codegen_llvm_visitor.cpp | 46 ++++++++++++++++ src/codegen/llvm/codegen_llvm_visitor.hpp | 67 +++++++++++++++++++++++ src/main.cpp | 18 ++++++ test/integration/mod/procedure.mod | 15 
+++++ 9 files changed, 174 insertions(+), 3 deletions(-) create mode 100644 src/codegen/llvm/CMakeLists.txt create mode 100644 src/codegen/llvm/codegen_llvm_visitor.cpp create mode 100644 src/codegen/llvm/codegen_llvm_visitor.hpp create mode 100644 test/integration/mod/procedure.mod diff --git a/CMakeLists.txt b/CMakeLists.txt index af7707785c..b294fecad7 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -14,7 +14,7 @@ project( # ============================================================================= # CMake common project settings # ============================================================================= -set(CMAKE_CXX_STANDARD 11) +set(CMAKE_CXX_STANDARD 14) set(CMAKE_CXX_STANDARD_REQUIRED ON) set(CMAKE_CXX_EXTENSIONS OFF) set(CMAKE_RUNTIME_OUTPUT_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}/bin) @@ -147,6 +147,7 @@ find_python_module(yaml 3.12 REQUIRED) # ============================================================================= if(NMODL_ENABLE_LLVM) include(LLVMHelper) + add_definitions(-DNMODL_LLVM_BACKEND) endif() # ============================================================================= diff --git a/cmake/LLVMHelper.cmake b/cmake/LLVMHelper.cmake index a22cf4c835..de078be7b5 100644 --- a/cmake/LLVMHelper.cmake +++ b/cmake/LLVMHelper.cmake @@ -6,9 +6,9 @@ if(CMAKE_CXX_COMPILER_ID MATCHES "Clang" AND NMODL_ENABLE_LLVM) include(CheckCXXSourceCompiles) # test by including LLVM header and core library - llvm_map_components_to_libnames(LLVM_CORE_LIB core) + llvm_map_components_to_libnames(LLVM_CORE_LIBS core) set(CMAKE_REQUIRED_INCLUDES ${LLVM_INCLUDE_DIRS}) - set(CMAKE_REQUIRED_LIBRARIES ${LLVM_CORE_LIB}) + set(CMAKE_REQUIRED_LIBRARIES ${LLVM_CORE_LIBS}) # simple code to test LLVM library linking set(CODE_TO_TEST diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 7b5e67a66a..61e4f9f233 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -29,6 +29,12 @@ target_link_libraries( lexer ${NMODL_WRAPPER_LIBS}) +if(NMODL_ENABLE_LLVM) + # LLVM core libraries to link + llvm_map_components_to_libnames(LLVM_CORE_LIBS core) + target_link_libraries(nmodl llvm_codegen ${LLVM_CORE_LIBS}) +endif() + # ============================================================================= # Add dependency with nmodl pytnon module (for consumer projects) # ============================================================================= diff --git a/src/codegen/CMakeLists.txt b/src/codegen/CMakeLists.txt index 32ad4e1303..2d31e1b1d6 100644 --- a/src/codegen/CMakeLists.txt +++ b/src/codegen/CMakeLists.txt @@ -35,6 +35,11 @@ configure_file(${CMAKE_CURRENT_SOURCE_DIR}/fast_math.ispc configure_file(${CMAKE_CURRENT_SOURCE_DIR}/fast_math.hpp ${CMAKE_BINARY_DIR}/include/nmodl/fast_math.hpp COPYONLY) +# build llvm visitor if enabled +if(NMODL_ENABLE_LLVM) + add_subdirectory(llvm) +endif() + # ============================================================================= # Install include files # ============================================================================= diff --git a/src/codegen/llvm/CMakeLists.txt b/src/codegen/llvm/CMakeLists.txt new file mode 100644 index 0000000000..71ecca338c --- /dev/null +++ b/src/codegen/llvm/CMakeLists.txt @@ -0,0 +1,13 @@ +# ============================================================================= +# Codegen sources +# ============================================================================= +set(LLVM_CODEGEN_SOURCE_FILES ${CMAKE_CURRENT_SOURCE_DIR}/codegen_llvm_visitor.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/codegen_llvm_visitor.hpp) + +# 
============================================================================= +# LLVM codegen library +# ============================================================================= + +include_directories(${LLVM_INCLUDE_DIRS}) +add_library(llvm_codegen STATIC ${LLVM_CODEGEN_SOURCE_FILES}) +add_dependencies(llvm_codegen lexer util visitor) diff --git a/src/codegen/llvm/codegen_llvm_visitor.cpp b/src/codegen/llvm/codegen_llvm_visitor.cpp new file mode 100644 index 0000000000..3f4e319503 --- /dev/null +++ b/src/codegen/llvm/codegen_llvm_visitor.cpp @@ -0,0 +1,46 @@ +/************************************************************************* + * Copyright (C) 2018-2020 Blue Brain Project + * + * This file is part of NMODL distributed under the terms of the GNU + * Lesser General Public License. See top-level LICENSE file for details. + *************************************************************************/ + +#include "codegen/llvm/codegen_llvm_visitor.hpp" +#include "ast/all.hpp" + +#include "llvm/IR/BasicBlock.h" +#include "llvm/IR/Function.h" +#include "llvm/IR/IRBuilder.h" +#include "llvm/IR/LLVMContext.h" +#include "llvm/IR/Module.h" + +namespace nmodl { +namespace codegen { + + +// LLVM code generator objects +using namespace llvm; +static std::unique_ptr TheContext; +static std::unique_ptr TheModule; +static std::unique_ptr> Builder; +static std::map NamedValues; + + +void CodegenLLVMVisitor::visit_statement_block(const ast::StatementBlock& node) { + logger->info("CodegenLLVMVisitor : visiting statement block"); + node.visit_children(*this); + // TODO : code generation for new block scope +} + +void CodegenLLVMVisitor::visit_procedure_block(const ast::ProcedureBlock& node) { + logger->info("CodegenLLVMVisitor : visiting {} procedure", node.get_node_name()); + node.visit_children(*this); + // TODO : code generation for procedure block +} + +void CodegenLLVMVisitor::visit_program(const ast::Program& node) { + node.visit_children(*this); +} + +} // namespace codegen +} // namespace nmodl diff --git a/src/codegen/llvm/codegen_llvm_visitor.hpp b/src/codegen/llvm/codegen_llvm_visitor.hpp new file mode 100644 index 0000000000..2b77160cd5 --- /dev/null +++ b/src/codegen/llvm/codegen_llvm_visitor.hpp @@ -0,0 +1,67 @@ +/************************************************************************* + * Copyright (C) 2018-2020 Blue Brain Project + * + * This file is part of NMODL distributed under the terms of the GNU + * Lesser General Public License. See top-level LICENSE file for details. 
+ *************************************************************************/ + +#pragma once + +/** + * \dir + * \brief LLVM based code generation backend implementation for CoreNEURON + * + * \file + * \brief \copybrief nmodl::codegen::CodegenLLVMVisitor + */ + +#include +#include + +#include "utils/logger.hpp" +#include "visitors/ast_visitor.hpp" + +namespace nmodl { +namespace codegen { + +/** + * @defgroup llvm LLVM Based Code Generation Implementation + * @brief Implementations of LLVM based code generation + * + * @defgroup llvm_backends LLVM Codegen Backend + * @ingroup llvm + * @brief Code generation backends for NMODL AST to LLVM IR + * @{ + */ + +/** + * \class CodegenLLVMVisitor + * \brief %Visitor for transforming NMODL AST to LLVM IR + */ +class CodegenLLVMVisitor: public visitor::ConstAstVisitor { + // Name of mod file (without .mod suffix) + std::string mod_filename; + + // Output directory for code generation + std::string output_dir; + + public: + /** + * \brief Constructs the LLVM code generator visitor + * + * This constructor instantiates an NMODL LLVM code generator. This is + * just template to work with initial implementation. + */ + CodegenLLVMVisitor(const std::string& mod_filename, const std::string& output_dir) + : mod_filename(mod_filename) + , output_dir(output_dir) {} + + void visit_statement_block(const ast::StatementBlock& node) override; + void visit_procedure_block(const ast::ProcedureBlock& node) override; + void visit_program(const ast::Program& node) override; +}; + +/** \} */ // end of llvm_backends + +} // namespace codegen +} // namespace nmodl diff --git a/src/main.cpp b/src/main.cpp index 60e933f052..fcc813b74b 100644 --- a/src/main.cpp +++ b/src/main.cpp @@ -17,6 +17,9 @@ #include "codegen/codegen_cuda_visitor.hpp" #include "codegen/codegen_ispc_visitor.hpp" #include "codegen/codegen_omp_visitor.hpp" +#ifdef NMODL_LLVM_BACKEND +#include "codegen/llvm/codegen_llvm_visitor.hpp" +#endif #include "config/config.h" #include "parser/nmodl_driver.hpp" #include "pybind/pyembed.hpp" @@ -82,6 +85,9 @@ int main(int argc, const char* argv[]) { /// true if cuda code to be generated bool cuda_backend(false); + /// true if llvm code to be generated + bool llvm_backend(false); + /// true if sympy should be used for solving ODEs analytically bool sympy_analytic(false); @@ -162,6 +168,10 @@ int main(int argc, const char* argv[]) { ->ignore_case() ->check(CLI::IsMember({"trace", "debug", "info", "warning", "error", "critical", "off"})); +#ifdef NMODL_LLVM_BACKEND + app.add_flag("--llvm", llvm_backend, "Enable LLVM based code generation")->ignore_case(); +#endif + app.add_option("file", mod_files, "One or more MOD files to process") ->ignore_case() ->required() @@ -548,6 +558,14 @@ int main(int argc, const char* argv[]) { optimize_ionvar_copies_codegen); visitor.visit_program(*ast); } + +#ifdef NMODL_LLVM_BACKEND + if (llvm_backend) { + logger->info("Running LLVM backend code generator"); + CodegenLLVMVisitor visitor(modfile, output_dir); + visitor.visit_program(*ast); + } +#endif } } diff --git a/test/integration/mod/procedure.mod b/test/integration/mod/procedure.mod new file mode 100644 index 0000000000..3eb4817b3b --- /dev/null +++ b/test/integration/mod/procedure.mod @@ -0,0 +1,15 @@ +PROCEDURE state(x, y) { + LOCAL z + z = x + y +} + +PROCEDURE rates(v) { + LOCAL alpha, beta, sum + { + alpha = .1 * exp(-(v+40)) + beta = 4 * exp(-(v+65)/18) + } + { + sum = alpha + beta + } +} From 51987dc9567db2bd1abe5f18c8e1ddc6f9a038a4 Mon Sep 17 00:00:00 2001 From: Pramod 
Kumbhar
Date: Sun, 29 Nov 2020 00:58:20 +0100
Subject: [PATCH 004/331] Azure CI fixes for LLVM build and README update

- install llvm via brew
- set LLVM_DIR variable so that CMake can find llvm-config
---
 INSTALL.md          | 13 +++++++++++--
 azure-pipelines.yml |  6 +++---
 setup.py            |  2 +-
 3 files changed, 15 insertions(+), 6 deletions(-)

diff --git a/INSTALL.md b/INSTALL.md
index 335651c86c..32e9106669 100644
--- a/INSTALL.md
+++ b/INSTALL.md
@@ -31,7 +31,7 @@ Typically the versions of bison and flex provided by the system are outdated and
 To get recent version of all dependencies we recommend using [homebrew](https://brew.sh/):
 
 ```sh
-brew install flex bison cmake python3
+brew install flex bison cmake python3 llvm
 ```
 
 The necessary Python packages can then easily be added using the pip3 command.
@@ -57,7 +57,7 @@ export PATH=/opt/homebrew/opt/flex/bin:/opt/homebrew/opt/bison/bin:$PATH
 On Ubuntu (>=18.04) flex/bison versions are recent enough and are installed along with the system toolchain:
 
 ```sh
-apt-get install flex bison gcc python3 python3-pip
+apt-get install flex bison gcc python3 python3-pip llvm-dev llvm-runtime llvm clang-format clang
 ```
 
 The Python dependencies are installed using:
@@ -79,6 +79,15 @@ cmake .. -DCMAKE_INSTALL_PREFIX=$HOME/nmodl
 make -j && make install
 ```
 
+If `llvm-config` is not in PATH then set LLVM_DIR as:
+
+```sh
+cmake .. -DCMAKE_INSTALL_PREFIX=$HOME/nmodl -DLLVM_DIR=/path/to/llvm/install/lib/cmake/llvm
+
+# on OSX
+cmake .. -DCMAKE_INSTALL_PREFIX=$HOME/nmodl -DLLVM_DIR=`brew --prefix llvm`/lib/cmake/llvm
+```
+
 And set PYTHONPATH as:
 
 ```sh

diff --git a/azure-pipelines.yml b/azure-pipelines.yml
index f3a9d20722..ed123543c4 100644
--- a/azure-pipelines.yml
+++ b/azure-pipelines.yml
@@ -56,7 +56,7 @@ jobs:
       mkdir -p $(Build.Repository.LocalPath)/build
       cd $(Build.Repository.LocalPath)/build
       cmake --version
-      cmake .. -DPYTHON_EXECUTABLE=$(which python3.7) -DCMAKE_INSTALL_PREFIX=$HOME/nmodl -DCMAKE_BUILD_TYPE=Release
+      cmake .. -DPYTHON_EXECUTABLE=$(which python3.7) -DCMAKE_INSTALL_PREFIX=$HOME/nmodl -DCMAKE_BUILD_TYPE=Release -DNMODL_ENABLE_LLVM=OFF
       make -j 2
       if [ $? -ne 0 ]
       then
@@ -124,7 +124,7 @@ jobs:
     submodules: True
   - script: |
       brew install flex cmake python@3
-      brew install bison
+      brew install bison llvm
      python3 -m pip install -U pip setuptools
      python3 -m pip install --user 'Jinja2>=2.9.3' 'PyYAML>=3.13' pytest pytest-cov numpy 'sympy>=1.3'
    displayName: 'Install Dependencies'
  - script: |
      export PATH=/usr/local/opt/flex/bin:/usr/local/opt/bison/bin:$PATH;
      mkdir -p $(Build.Repository.LocalPath)/build
      cd $(Build.Repository.LocalPath)/build
-      cmake .. -DPYTHON_EXECUTABLE=$(which python3) -DCMAKE_INSTALL_PREFIX=$HOME/nmodl -DCMAKE_BUILD_TYPE=RelWithDebInfo -DNMODL_ENABLE_PYTHON_BINDINGS=OFF
+      cmake .. -DPYTHON_EXECUTABLE=$(which python3) -DCMAKE_INSTALL_PREFIX=$HOME/nmodl -DCMAKE_BUILD_TYPE=RelWithDebInfo -DNMODL_ENABLE_PYTHON_BINDINGS=OFF -DLLVM_DIR=`brew --prefix llvm`/lib/cmake/llvm -DNMODL_ENABLE_LLVM=ON
      make -j 2
      if [ $?
-ne 0 ] then diff --git a/setup.py b/setup.py index ec560c6c1e..26e7a0b92c 100644 --- a/setup.py +++ b/setup.py @@ -97,7 +97,7 @@ def _config_exe(exe_name): ] -cmake_args = ["-DPYTHON_EXECUTABLE=" + sys.executable] +cmake_args = ["-DPYTHON_EXECUTABLE=" + sys.executable, "-DNMODL_ENABLE_LLVM=OFF"] if "bdist_wheel" in sys.argv: cmake_args.append("-DLINK_AGAINST_PYTHON=FALSE") From ae07ce463fb69838a04201673ee295452cfbf513 Mon Sep 17 00:00:00 2001 From: Pramod Kumbhar Date: Sun, 29 Nov 2020 08:40:54 +0100 Subject: [PATCH 005/331] Print build status after cmake configure stage - print table with different build options, flags and paths used that can be helpful for debugging - fix git revision date for older git version - update INSTALL.md with correct brew paths for flex and bison --- CMakeLists.txt | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/CMakeLists.txt b/CMakeLists.txt index b294fecad7..8397a644f6 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -258,6 +258,12 @@ if(cmake_generator_tolower MATCHES "makefile") message(STATUS "Flex | ${FLEX_EXECUTABLE}") message(STATUS "Bison | ${BISON_EXECUTABLE}") message(STATUS "Python | ${PYTHON_EXECUTABLE}") + message(STATUS "LLVM Codegen | ${NMODL_ENABLE_LLVM}") + if(NMODL_ENABLE_LLVM) + message(STATUS " VERSION | ${LLVM_PACKAGE_VERSION}") + message(STATUS " INCLUDE | ${LLVM_INCLUDE_DIRS}") + message(STATUS " CMAKE | ${LLVM_CMAKE_DIR}") + endif() if(NMODL_CLANG_FORMAT) message(STATUS "Clang Format | ${ClangFormat_EXECUTABLE}") endif() From c540fb1ffc1094bc936b04e8ad80b34dec9bd4cd Mon Sep 17 00:00:00 2001 From: Pramod Kumbhar Date: Sun, 29 Nov 2020 16:46:29 +0100 Subject: [PATCH 006/331] Adding test template for LLVM codegen - test/unit/codegen/llvm.cpp added for unit testing LLVM code generation visitor - ./bin/testcodegen binary can be used to launch LLVM codegen specific tests - multiple llvm_map_components_to_libnames removed - update procedure.mod with simple examples for IR generation --- cmake/LLVMHelper.cmake | 8 ++-- src/CMakeLists.txt | 4 +- src/codegen/llvm/codegen_llvm_visitor.cpp | 10 +++++ src/codegen/llvm/codegen_llvm_visitor.hpp | 8 ++++ test/integration/mod/procedure.mod | 19 +++++++-- test/unit/CMakeLists.txt | 37 ++++++++++++---- test/unit/codegen/llvm.cpp | 51 +++++++++++++++++++++++ 7 files changed, 118 insertions(+), 19 deletions(-) create mode 100644 test/unit/codegen/llvm.cpp diff --git a/cmake/LLVMHelper.cmake b/cmake/LLVMHelper.cmake index de078be7b5..dbd29c92b6 100644 --- a/cmake/LLVMHelper.cmake +++ b/cmake/LLVMHelper.cmake @@ -6,9 +6,9 @@ if(CMAKE_CXX_COMPILER_ID MATCHES "Clang" AND NMODL_ENABLE_LLVM) include(CheckCXXSourceCompiles) # test by including LLVM header and core library - llvm_map_components_to_libnames(LLVM_CORE_LIBS core) + llvm_map_components_to_libnames(LLVM_LIBS_TO_LINK core) set(CMAKE_REQUIRED_INCLUDES ${LLVM_INCLUDE_DIRS}) - set(CMAKE_REQUIRED_LIBRARIES ${LLVM_CORE_LIBS}) + set(CMAKE_REQUIRED_LIBRARIES ${LLVM_LIBS_TO_LINK}) # simple code to test LLVM library linking set(CODE_TO_TEST @@ -39,7 +39,9 @@ if(CMAKE_CXX_COMPILER_ID MATCHES "Clang" AND NMODL_ENABLE_LLVM) STATUS "Adding ${CMAKE_REQUIRED_FLAGS} to CMAKE_CXX_FLAGS, required to link with LLVM libraries") else() - message(STATUS "WARNING : -stdlib=libstdcx++ or -stdlib=libc++ didn't work to link with LLVM library") + message( + STATUS + "WARNING : -stdlib=libstdcx++ or -stdlib=libc++ didn't work to link with LLVM library") endif() endif() endif() diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 61e4f9f233..cf4acc4de0 100644 --- 
a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -30,9 +30,7 @@ target_link_libraries( ${NMODL_WRAPPER_LIBS}) if(NMODL_ENABLE_LLVM) - # LLVM core libraries to link - llvm_map_components_to_libnames(LLVM_CORE_LIBS core) - target_link_libraries(nmodl llvm_codegen ${LLVM_CORE_LIBS}) + target_link_libraries(nmodl llvm_codegen ${LLVM_LIBS_TO_LINK}) endif() # ============================================================================= diff --git a/src/codegen/llvm/codegen_llvm_visitor.cpp b/src/codegen/llvm/codegen_llvm_visitor.cpp index 3f4e319503..494d5fd1f3 100644 --- a/src/codegen/llvm/codegen_llvm_visitor.cpp +++ b/src/codegen/llvm/codegen_llvm_visitor.cpp @@ -7,6 +7,7 @@ #include "codegen/llvm/codegen_llvm_visitor.hpp" #include "ast/all.hpp" +#include "visitors/visitor_utils.hpp" #include "llvm/IR/BasicBlock.h" #include "llvm/IR/Function.h" @@ -34,12 +35,21 @@ void CodegenLLVMVisitor::visit_statement_block(const ast::StatementBlock& node) void CodegenLLVMVisitor::visit_procedure_block(const ast::ProcedureBlock& node) { logger->info("CodegenLLVMVisitor : visiting {} procedure", node.get_node_name()); + + // print position, nmodl and json form as + /* + logger->info("Location {} \n NMODL {} \n JSON : {} \n", + node.get_token()->position(), + to_nmodl(node), + to_json(node)); + */ node.visit_children(*this); // TODO : code generation for procedure block } void CodegenLLVMVisitor::visit_program(const ast::Program& node) { node.visit_children(*this); + result_code = "Hello World"; } } // namespace codegen diff --git a/src/codegen/llvm/codegen_llvm_visitor.hpp b/src/codegen/llvm/codegen_llvm_visitor.hpp index 2b77160cd5..5b0ad3a968 100644 --- a/src/codegen/llvm/codegen_llvm_visitor.hpp +++ b/src/codegen/llvm/codegen_llvm_visitor.hpp @@ -45,6 +45,9 @@ class CodegenLLVMVisitor: public visitor::ConstAstVisitor { // Output directory for code generation std::string output_dir; + // result string for demo + std::string result_code; + public: /** * \brief Constructs the LLVM code generator visitor @@ -59,6 +62,11 @@ class CodegenLLVMVisitor: public visitor::ConstAstVisitor { void visit_statement_block(const ast::StatementBlock& node) override; void visit_procedure_block(const ast::ProcedureBlock& node) override; void visit_program(const ast::Program& node) override; + + // demo method + std::string get_code() const { + return result_code; + } }; /** \} */ // end of llvm_backends diff --git a/test/integration/mod/procedure.mod b/test/integration/mod/procedure.mod index 3eb4817b3b..ebbc39f15a 100644 --- a/test/integration/mod/procedure.mod +++ b/test/integration/mod/procedure.mod @@ -1,15 +1,26 @@ -PROCEDURE state(x, y) { +PROCEDURE hello_world() { + print("Hello World") +} + +PROCEDURE simple_sum(x, y) { LOCAL z z = x + y } -PROCEDURE rates(v) { +PROCEDURE complex_sum(v) { LOCAL alpha, beta, sum { alpha = .1 * exp(-(v+40)) beta = 4 * exp(-(v+65)/18) - } - { sum = alpha + beta } } + +PROCEDURE loop_function(v) { + LOCAL i + i = 0 + WHILE(i < 10) { + print("Hello World") + i = i + 1 + } +} diff --git a/test/unit/CMakeLists.txt b/test/unit/CMakeLists.txt index 04e33614cd..81ceb04a59 100644 --- a/test/unit/CMakeLists.txt +++ b/test/unit/CMakeLists.txt @@ -60,6 +60,11 @@ add_executable(testcodegen codegen/main.cpp codegen/codegen_ispc.cpp codegen/cod target_link_libraries(testmodtoken lexer util) target_link_libraries(testlexer lexer util) +target_link_libraries(testprinter printer util) +target_link_libraries(testsymtab symtab lexer util) +target_link_libraries(testunitlexer lexer util) 
+target_link_libraries(testunitparser lexer test_util config) + target_link_libraries( testparser visitor @@ -69,6 +74,7 @@ target_link_libraries( test_util printer ${NMODL_WRAPPER_LIBS}) + target_link_libraries( testvisitor visitor @@ -88,10 +94,22 @@ target_link_libraries( test_util printer ${NMODL_WRAPPER_LIBS}) -target_link_libraries(testprinter printer util) -target_link_libraries(testsymtab symtab lexer util) -target_link_libraries(testunitlexer lexer util) -target_link_libraries(testunitparser lexer test_util config) + +if(NMODL_ENABLE_LLVM) + add_executable(testcodegen visitor/main.cpp codegen/llvm.cpp) + target_link_libraries( + testcodegen + visitor + symtab + lexer + util + test_util + printer + llvm_codegen + ${NMODL_WRAPPER_LIBS} + ${LLVM_LIBS_TO_LINK}) + set(CODEGEN_TEST testcodegen) +endif() # ============================================================================= # Use catch_discover instead of add_test for granular test report if CMAKE ver is greater than 3.9, @@ -100,9 +118,10 @@ target_link_libraries(testunitparser lexer test_util config) set(testvisitor_env "PYTHONPATH=${PROJECT_BINARY_DIR}/lib:$ENV{PYTHONPATH}") if(NOT LINK_AGAINST_PYTHON) list(APPEND testvisitor_env "NMODL_PYLIB=$ENV{NMODL_PYLIB}") - list(APPEND testvisitor_env - "NMODL_WRAPLIB=${PROJECT_BINARY_DIR}/lib/nmodl/libpywrapper${CMAKE_SHARED_LIBRARY_SUFFIX}") - + list( + APPEND + testvisitor_env + "NMODL_WRAPLIB=${PROJECT_BINARY_DIR}/lib/nmodl/libpywrapper${CMAKE_SHARED_LIBRARY_SUFFIX}") endif() foreach( @@ -117,8 +136,8 @@ foreach( testnewton testfast_math testunitlexer - testunitparser) - + testunitparser + ${CODEGEN_TEST}) if(${CMAKE_VERSION} VERSION_GREATER "3.10") if(${test_name} STREQUAL "testvisitor") catch_discover_tests(${test_name} TEST_PREFIX "${test_name}/" PROPERTIES ENVIRONMENT diff --git a/test/unit/codegen/llvm.cpp b/test/unit/codegen/llvm.cpp new file mode 100644 index 0000000000..b6efe2f9ca --- /dev/null +++ b/test/unit/codegen/llvm.cpp @@ -0,0 +1,51 @@ +/************************************************************************* + * Copyright (C) 2018-2020 Blue Brain Project + * + * This file is part of NMODL distributed under the terms of the GNU + * Lesser General Public License. See top-level LICENSE file for details. 
+ *************************************************************************/
+
+#include <catch/catch.hpp>
+
+#include "ast/program.hpp"
+#include "parser/nmodl_driver.hpp"
+#include "visitors/checkparent_visitor.hpp"
+#include "visitors/inline_visitor.hpp"
+#include "visitors/symtab_visitor.hpp"
+#include "codegen/llvm/codegen_llvm_visitor.hpp"
+
+using namespace nmodl;
+using namespace visitor;
+using nmodl::parser::NmodlDriver;
+
+//=============================================================================
+// Sample LLVM codegen test
+//=============================================================================
+
+std::string run_llvm_visitor(const std::string& text) {
+    NmodlDriver driver;
+    const auto& ast = driver.parse_string(text);
+
+    SymtabVisitor().visit_program(*ast);
+    InlineVisitor().visit_program(*ast);
+
+    codegen::CodegenLLVMVisitor llvm_visitor("unknown", ".");
+    llvm_visitor.visit_program(*ast);
+    return llvm_visitor.get_code();
+}
+
+SCENARIO("Running LLVM Codegen", "[visitor][llvm]") {
+    GIVEN("Simple procedure with hello world message") {
+        std::string nmodl_text = R"(
+            PROCEDURE say_hello() {
+                print("Hello World")
+            }
+        )";
+
+        THEN("Hello world message is printed") {
+            std::string expected = "Hello World";
+            auto result = run_llvm_visitor(nmodl_text);
+            REQUIRE(result == expected);
+        }
+    }
+}
\ No newline at end of file

From 724605c89c65b91e74acdc801ba807f5dbcd5b98 Mon Sep 17 00:00:00 2001
From: George Mitenkov
Date: Tue, 22 Dec 2020 13:54:32 +0300
Subject: [PATCH 007/331] Initial LLVM codegen visitor routines (#457)

* Added LLVM code generation for `ProcedureBlock`.
* Added code generation routines for double, integer and boolean variable
  types.
* Added binary and unary operator code generation:
  - Supported binary operators: +, -, *, /.
  - Supported unary operators: -.
  - Assignment (=) is also supported.
* Added regex matching unit tests for LLVM code generation.
* Fixed Travis CI/builds.
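For illustration, a procedure such as `PROCEDURE add(a, b) { LOCAL i  i = a + b }`
now lowers to IR along these lines (a sketch reconstructed by hand from the regex
patterns in the unit tests below; value numbering is illustrative, and the
`ret void` terminator only arrives with the follow-up terminator patch):

```llvm
define void @add(double %a1, double %b1) {
  %a = alloca double                ; arguments are copied onto the local stack
  store double %a1, double* %a
  %b = alloca double
  store double %b1, double* %b
  %i = alloca double                ; LOCAL i
  %1 = load double, double* %b      ; rhs of the assignment is visited first
  %2 = load double, double* %a
  %3 = fadd double %2, %1
  store double %3, double* %i       ; i = a + b
}
```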
fixes #451, fixes #452, fixes #456 Co-authored-by: Pramod Kumbhar --- CMakeLists.txt | 1 + azure-pipelines.yml | 5 +- cmake/LLVMHelper.cmake | 14 +- setup.py | 2 +- src/codegen/llvm/codegen_llvm_visitor.cpp | 157 +++++++++++++++--- src/codegen/llvm/codegen_llvm_visitor.hpp | 40 ++++- test/integration/mod/procedure.mod | 9 +- test/unit/CMakeLists.txt | 7 +- test/unit/codegen/llvm.cpp | 188 ++++++++++++++++++++-- 9 files changed, 365 insertions(+), 58 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 8397a644f6..a280906edd 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -147,6 +147,7 @@ find_python_module(yaml 3.12 REQUIRED) # ============================================================================= if(NMODL_ENABLE_LLVM) include(LLVMHelper) + include_directories(${LLVM_INCLUDE_DIRS}) add_definitions(-DNMODL_LLVM_BACKEND) endif() diff --git a/azure-pipelines.yml b/azure-pipelines.yml index ed123543c4..f9d7d8ee80 100644 --- a/azure-pipelines.yml +++ b/azure-pipelines.yml @@ -123,8 +123,7 @@ jobs: - checkout: self submodules: True - script: | - brew install flex cmake python@3 - brew install bison llvm + brew install flex bison cmake python@3 llvm python3 -m pip install -U pip setuptools python3 -m pip install --user 'Jinja2>=2.9.3' 'PyYAML>=3.13' pytest pytest-cov numpy 'sympy>=1.3' displayName: 'Install Dependencies' @@ -171,6 +170,7 @@ jobs: displayName: 'Build Neuron and Run Integration Tests' - job: 'manylinux_wheels' timeoutInMinutes: 45 + condition: eq(1,2) pool: vmImage: 'ubuntu-18.04' strategy: @@ -220,6 +220,7 @@ jobs: - template: ci/upload-wheels.yml - job: 'macos_wheels' timeoutInMinutes: 45 + condition: eq(1,2) pool: vmImage: 'macOS-10.15' strategy: diff --git a/cmake/LLVMHelper.cmake b/cmake/LLVMHelper.cmake index dbd29c92b6..982af48660 100644 --- a/cmake/LLVMHelper.cmake +++ b/cmake/LLVMHelper.cmake @@ -1,15 +1,17 @@ # ============================================================================= # LLVM/Clang needs to be linked with either libc++ or libstdc++ # ============================================================================= + +find_package(LLVM REQUIRED CONFIG) + +# include LLVM header and core library +llvm_map_components_to_libnames(LLVM_LIBS_TO_LINK core) +set(CMAKE_REQUIRED_INCLUDES ${LLVM_INCLUDE_DIRS}) +set(CMAKE_REQUIRED_LIBRARIES ${LLVM_LIBS_TO_LINK}) + if(CMAKE_CXX_COMPILER_ID MATCHES "Clang" AND NMODL_ENABLE_LLVM) - find_package(LLVM REQUIRED CONFIG) include(CheckCXXSourceCompiles) - # test by including LLVM header and core library - llvm_map_components_to_libnames(LLVM_LIBS_TO_LINK core) - set(CMAKE_REQUIRED_INCLUDES ${LLVM_INCLUDE_DIRS}) - set(CMAKE_REQUIRED_LIBRARIES ${LLVM_LIBS_TO_LINK}) - # simple code to test LLVM library linking set(CODE_TO_TEST " diff --git a/setup.py b/setup.py index 26e7a0b92c..27539ab4ce 100644 --- a/setup.py +++ b/setup.py @@ -97,7 +97,7 @@ def _config_exe(exe_name): ] -cmake_args = ["-DPYTHON_EXECUTABLE=" + sys.executable, "-DNMODL_ENABLE_LLVM=OFF"] +cmake_args = ["-DPYTHON_EXECUTABLE=" + sys.executable, "-DNMODL_ENABLE_LLVM=OFF", "-DNMODL_ENABLE_PYTHON_BINDINGS=ON"] if "bdist_wheel" in sys.argv: cmake_args.append("-DLINK_AGAINST_PYTHON=FALSE") diff --git a/src/codegen/llvm/codegen_llvm_visitor.cpp b/src/codegen/llvm/codegen_llvm_visitor.cpp index 494d5fd1f3..b8b3778e86 100644 --- a/src/codegen/llvm/codegen_llvm_visitor.cpp +++ b/src/codegen/llvm/codegen_llvm_visitor.cpp @@ -10,46 +10,153 @@ #include "visitors/visitor_utils.hpp" #include "llvm/IR/BasicBlock.h" +#include "llvm/IR/Constants.h" #include 
"llvm/IR/Function.h" -#include "llvm/IR/IRBuilder.h" #include "llvm/IR/LLVMContext.h" -#include "llvm/IR/Module.h" +#include "llvm/IR/Type.h" +#include "llvm/IR/ValueSymbolTable.h" namespace nmodl { namespace codegen { -// LLVM code generator objects -using namespace llvm; -static std::unique_ptr TheContext; -static std::unique_ptr TheModule; -static std::unique_ptr> Builder; -static std::map NamedValues; +/****************************************************************************************/ +/* Overloaded visitor routines */ +/****************************************************************************************/ -void CodegenLLVMVisitor::visit_statement_block(const ast::StatementBlock& node) { - logger->info("CodegenLLVMVisitor : visiting statement block"); - node.visit_children(*this); - // TODO : code generation for new block scope +void CodegenLLVMVisitor::visit_binary_expression(const ast::BinaryExpression& node) { + const auto& op = node.get_op().get_value(); + + // Process rhs first, since lhs is handled differently for assignment and binary + // operators. + node.get_rhs()->accept(*this); + llvm::Value* rhs = values.back(); + values.pop_back(); + if (op == ast::BinaryOp::BOP_ASSIGN) { + auto var = dynamic_cast(node.get_lhs().get()); + if (!var) { + throw std::runtime_error("Error: only VarName assignment is currently supported.\n"); + } + llvm::Value* alloca = named_values[var->get_node_name()]; + builder.CreateStore(rhs, alloca); + return; + } + + node.get_lhs()->accept(*this); + llvm::Value* lhs = values.back(); + values.pop_back(); + llvm::Value* result; + + // \todo: Support other binary operators + switch (op) { +#define DISPATCH(binary_op, llvm_op) \ + case binary_op: \ + result = llvm_op(lhs, rhs); \ + values.push_back(result); \ + break; + + DISPATCH(ast::BinaryOp::BOP_ADDITION, builder.CreateFAdd); + DISPATCH(ast::BinaryOp::BOP_DIVISION, builder.CreateFDiv); + DISPATCH(ast::BinaryOp::BOP_MULTIPLICATION, builder.CreateFMul); + DISPATCH(ast::BinaryOp::BOP_SUBTRACTION, builder.CreateFSub); + +#undef DISPATCH + } } -void CodegenLLVMVisitor::visit_procedure_block(const ast::ProcedureBlock& node) { - logger->info("CodegenLLVMVisitor : visiting {} procedure", node.get_node_name()); - - // print position, nmodl and json form as - /* - logger->info("Location {} \n NMODL {} \n JSON : {} \n", - node.get_token()->position(), - to_nmodl(node), - to_json(node)); - */ - node.visit_children(*this); - // TODO : code generation for procedure block +void CodegenLLVMVisitor::visit_boolean(const ast::Boolean& node) { + const auto& constant = llvm::ConstantInt::get(llvm::Type::getInt1Ty(*context), + node.get_value()); + values.push_back(constant); +} + +void CodegenLLVMVisitor::visit_double(const ast::Double& node) { + const auto& constant = llvm::ConstantFP::get(llvm::Type::getDoubleTy(*context), + node.get_value()); + values.push_back(constant); +} + +void CodegenLLVMVisitor::visit_integer(const ast::Integer& node) { + const auto& constant = llvm::ConstantInt::get(llvm::Type::getInt32Ty(*context), + node.get_value()); + values.push_back(constant); +} + +void CodegenLLVMVisitor::visit_local_list_statement(const ast::LocalListStatement& node) { + for (const auto& variable: node.get_variables()) { + // LocalVar always stores a Name. 
+ auto name = variable->get_node_name(); + llvm::Type* var_type = llvm::Type::getDoubleTy(*context); + llvm::Value* alloca = builder.CreateAlloca(var_type, /*ArraySize=*/nullptr, name); + named_values[name] = alloca; + } } void CodegenLLVMVisitor::visit_program(const ast::Program& node) { node.visit_children(*this); - result_code = "Hello World"; + // Keep this for easier development (maybe move to debug mode later). + std::cout << print_module(); +} + +void CodegenLLVMVisitor::visit_procedure_block(const ast::ProcedureBlock& node) { + const auto& name = node.get_node_name(); + const auto& parameters = node.get_parameters(); + + // The procedure parameters are doubles by default. + std::vector arg_types; + for (size_t i = 0, e = parameters.size(); i < e; ++i) + arg_types.push_back(llvm::Type::getDoubleTy(*context)); + llvm::Type* return_type = llvm::Type::getVoidTy(*context); + + llvm::Function* proc = + llvm::Function::Create(llvm::FunctionType::get(return_type, arg_types, /*isVarArg=*/false), + llvm::Function::ExternalLinkage, + name, + *module); + + llvm::BasicBlock* body = llvm::BasicBlock::Create(*context, /*Name=*/"", proc); + builder.SetInsertPoint(body); + + // First, allocate parameters on the stack and add them to the symbol table. + unsigned i = 0; + for (auto& arg: proc->args()) { + std::string arg_name = parameters[i++].get()->get_node_name(); + llvm::Value* alloca = builder.CreateAlloca(arg.getType(), /*ArraySize=*/nullptr, arg_name); + arg.setName(arg_name); + builder.CreateStore(&arg, alloca); + named_values[arg_name] = alloca; + } + + const auto& statements = node.get_statement_block()->get_statements(); + for (const auto& statement: statements) { + // \todo: Support other statement types. + if (statement->is_local_list_statement() || statement->is_expression_statement()) + statement->accept(*this); + } + + values.clear(); + // \todo: Add proper support for the symbol table. + named_values.clear(); +} + +void CodegenLLVMVisitor::visit_unary_expression(const ast::UnaryExpression& node) { + ast::UnaryOp op = node.get_op().get_value(); + node.get_expression()->accept(*this); + llvm::Value* value = values.back(); + values.pop_back(); + if (op == ast::UOP_NEGATION) { + llvm::Value* result = builder.CreateFNeg(value); + values.push_back(result); + } else { + // Support only `double` operators for now. 
+ throw std::runtime_error("Error: unsupported unary operator\n"); + } +} + +void CodegenLLVMVisitor::visit_var_name(const ast::VarName& node) { + llvm::Value* var = builder.CreateLoad(named_values[node.get_node_name()]); + values.push_back(var); } } // namespace codegen diff --git a/src/codegen/llvm/codegen_llvm_visitor.hpp b/src/codegen/llvm/codegen_llvm_visitor.hpp index 5b0ad3a968..5a288d9836 100644 --- a/src/codegen/llvm/codegen_llvm_visitor.hpp +++ b/src/codegen/llvm/codegen_llvm_visitor.hpp @@ -21,6 +21,10 @@ #include "utils/logger.hpp" #include "visitors/ast_visitor.hpp" +#include "llvm/IR/IRBuilder.h" +#include "llvm/IR/LLVMContext.h" +#include "llvm/IR/Module.h" + namespace nmodl { namespace codegen { @@ -45,8 +49,18 @@ class CodegenLLVMVisitor: public visitor::ConstAstVisitor { // Output directory for code generation std::string output_dir; - // result string for demo - std::string result_code; + private: + std::unique_ptr context = std::make_unique(); + + std::unique_ptr module = std::make_unique(mod_filename, *context); + + llvm::IRBuilder<> builder; + + // Stack to hold visited values + std::vector values; + + // Mappings for named values for lookups + std::map named_values; public: /** @@ -57,15 +71,27 @@ class CodegenLLVMVisitor: public visitor::ConstAstVisitor { */ CodegenLLVMVisitor(const std::string& mod_filename, const std::string& output_dir) : mod_filename(mod_filename) - , output_dir(output_dir) {} + , output_dir(output_dir) + , builder(*context) {} - void visit_statement_block(const ast::StatementBlock& node) override; + // Visitors + void visit_binary_expression(const ast::BinaryExpression& node) override; + void visit_boolean(const ast::Boolean& node) override; + void visit_double(const ast::Double& node) override; + void visit_integer(const ast::Integer& node) override; + void visit_local_list_statement(const ast::LocalListStatement& node) override; void visit_procedure_block(const ast::ProcedureBlock& node) override; void visit_program(const ast::Program& node) override; + void visit_unary_expression(const ast::UnaryExpression& node) override; + void visit_var_name(const ast::VarName& node) override; - // demo method - std::string get_code() const { - return result_code; + // TODO: use custom printer here + std::string print_module() const { + std::string str; + llvm::raw_string_ostream os(str); + os << *module; + os.flush(); + return str; } }; diff --git a/test/integration/mod/procedure.mod b/test/integration/mod/procedure.mod index ebbc39f15a..4017b6a505 100644 --- a/test/integration/mod/procedure.mod +++ b/test/integration/mod/procedure.mod @@ -1,5 +1,10 @@ +NEURON { + SUFFIX procedure_test + THREADSAFE +} + PROCEDURE hello_world() { - print("Hello World") + printf("Hello World") } PROCEDURE simple_sum(x, y) { @@ -20,7 +25,7 @@ PROCEDURE loop_function(v) { LOCAL i i = 0 WHILE(i < 10) { - print("Hello World") + printf("Hello World") i = i + 1 } } diff --git a/test/unit/CMakeLists.txt b/test/unit/CMakeLists.txt index 81ceb04a59..c3a8dd104d 100644 --- a/test/unit/CMakeLists.txt +++ b/test/unit/CMakeLists.txt @@ -96,9 +96,10 @@ target_link_libraries( ${NMODL_WRAPPER_LIBS}) if(NMODL_ENABLE_LLVM) - add_executable(testcodegen visitor/main.cpp codegen/llvm.cpp) + include_directories(${LLVM_INCLUDE_DIRS}) + add_executable(testllvm visitor/main.cpp codegen/llvm.cpp) target_link_libraries( - testcodegen + testllvm visitor symtab lexer @@ -108,7 +109,7 @@ if(NMODL_ENABLE_LLVM) llvm_codegen ${NMODL_WRAPPER_LIBS} ${LLVM_LIBS_TO_LINK}) - set(CODEGEN_TEST testcodegen) + 
set(CODEGEN_TEST testllvm) endif() # ============================================================================= diff --git a/test/unit/codegen/llvm.cpp b/test/unit/codegen/llvm.cpp index b6efe2f9ca..270ce97ec0 100644 --- a/test/unit/codegen/llvm.cpp +++ b/test/unit/codegen/llvm.cpp @@ -6,20 +6,21 @@ *************************************************************************/ #include +#include #include "ast/program.hpp" +#include "codegen/llvm/codegen_llvm_visitor.hpp" #include "parser/nmodl_driver.hpp" #include "visitors/checkparent_visitor.hpp" #include "visitors/inline_visitor.hpp" #include "visitors/symtab_visitor.hpp" -#include "codegen/llvm/codegen_llvm_visitor.hpp" using namespace nmodl; using namespace visitor; using nmodl::parser::NmodlDriver; //============================================================================= -// Sample LLVM codegen test +// Utility to get LLVM module as a string //============================================================================= std::string run_llvm_visitor(const std::string& text) { @@ -31,21 +32,184 @@ std::string run_llvm_visitor(const std::string& text) { codegen::CodegenLLVMVisitor llvm_visitor("unknown", "."); llvm_visitor.visit_program(*ast); - return llvm_visitor.get_code(); + return llvm_visitor.print_module(); } -SCENARIO("Running LLVM Codegen", "[visitor][llvm]") { - GIVEN("Simple procedure with hello world message") { +//============================================================================= +// BinaryExpression and Double +//============================================================================= + +SCENARIO("Binary expression", "[visitor][llvm]") { + GIVEN("Procedure with addition of its arguments") { + std::string nmodl_text = R"( + PROCEDURE add(a, b) { + LOCAL i + i = a + b + } + )"; + + THEN("variables are loaded and add instruction is created") { + std::string module_string = run_llvm_visitor(nmodl_text); + std::smatch m; + + // Check the values are loaded correctly and added + std::regex rhs(R"(%1 = load double, double\* %b)"); + std::regex lhs(R"(%2 = load double, double\* %a)"); + std::regex res(R"(%3 = fadd double %2, %1)"); + REQUIRE(std::regex_search(module_string, m, rhs)); + REQUIRE(std::regex_search(module_string, m, lhs)); + REQUIRE(std::regex_search(module_string, m, res)); + } + } + + GIVEN("Procedure with multiple binary operators") { std::string nmodl_text = R"( - PROCEDURE say_hello() { - print("Hello World") + PROCEDURE multiple(a, b) { + LOCAL i + i = (a - b) / (a + b) } )"; - THEN("Hello world message is printed") { - std::string expected = "Hello World"; - auto result = run_llvm_visitor(nmodl_text); - REQUIRE(result == expected); + THEN("variables are processed from rhs first") { + std::string module_string = run_llvm_visitor(nmodl_text); + std::smatch m; + + // Check rhs + std::regex rr(R"(%1 = load double, double\* %b)"); + std::regex rl(R"(%2 = load double, double\* %a)"); + std::regex x(R"(%3 = fadd double %2, %1)"); + REQUIRE(std::regex_search(module_string, m, rr)); + REQUIRE(std::regex_search(module_string, m, rl)); + REQUIRE(std::regex_search(module_string, m, x)); + + // Check lhs + std::regex lr(R"(%4 = load double, double\* %b)"); + std::regex ll(R"(%5 = load double, double\* %a)"); + std::regex y(R"(%6 = fsub double %5, %4)"); + REQUIRE(std::regex_search(module_string, m, lr)); + REQUIRE(std::regex_search(module_string, m, ll)); + REQUIRE(std::regex_search(module_string, m, y)); + + // Check result + std::regex res(R"(%7 = fdiv double %6, %3)"); + 
REQUIRE(std::regex_search(module_string, m, res)); } } -} \ No newline at end of file + + GIVEN("Procedure with assignment") { + std::string nmodl_text = R"( + PROCEDURE assignment() { + LOCAL i + i = 2 + } + )"; + + THEN("double constant is stored into i") { + std::string module_string = run_llvm_visitor(nmodl_text); + std::smatch m; + + // Check store immediate is created + std::regex allocation(R"(%i = alloca double)"); + std::regex assignment(R"(store double 2.0*e\+00, double\* %i)"); + REQUIRE(std::regex_search(module_string, m, allocation)); + REQUIRE(std::regex_search(module_string, m, assignment)); + } + } +} + +//============================================================================= +// LocalList and LocalVar +//============================================================================= + +SCENARIO("Local variable", "[visitor][llvm]") { + GIVEN("Procedure with some local variables") { + std::string nmodl_text = R"( + PROCEDURE local() { + LOCAL i, j + } + )"; + + THEN("local variables are allocated on the stack") { + std::string module_string = run_llvm_visitor(nmodl_text); + std::smatch m; + + // Check stack allocations for i and j + std::regex i(R"(%i = alloca double)"); + std::regex j(R"(%j = alloca double)"); + REQUIRE(std::regex_search(module_string, m, i)); + REQUIRE(std::regex_search(module_string, m, j)); + } + } +} + +//============================================================================= +// ProcedureBlock +//============================================================================= + +SCENARIO("Procedure", "[visitor][llvm]") { + GIVEN("Empty procedure with no arguments") { + std::string nmodl_text = R"( + PROCEDURE empty() {} + )"; + + THEN("empty void function is produced") { + std::string module_string = run_llvm_visitor(nmodl_text); + std::smatch m; + + // Check procedure has empty body + std::regex procedure(R"(define void @empty\(\) \{\n\})"); + REQUIRE(std::regex_search(module_string, m, procedure)); + } + } + + GIVEN("Empty procedure with arguments") { + std::string nmodl_text = R"( + PROCEDURE with_argument(x) {} + )"; + + THEN("void function is produced with arguments allocated on stack") { + std::string module_string = run_llvm_visitor(nmodl_text); + std::smatch m; + + // Check procedure signature + std::regex function_signature(R"(define void @with_argument\(double %x1\) \{)"); + REQUIRE(std::regex_search(module_string, m, function_signature)); + + // Check that procedure arguments are allocated on the local stack + std::regex alloca_instr(R"(%x = alloca double)"); + std::regex store_instr(R"(store double %x1, double\* %x)"); + REQUIRE(std::regex_search(module_string, m, alloca_instr)); + REQUIRE(std::regex_search(module_string, m, store_instr)); + } + } +} + +//============================================================================= +// UnaryExpression +//============================================================================= + +SCENARIO("Unary expression", "[visitor][llvm]") { + GIVEN("Procedure with negation") { + std::string nmodl_text = R"( + PROCEDURE negation(a) { + LOCAL i + i = -a + } + )"; + + THEN("fneg instruction is created") { + std::string module_string = run_llvm_visitor(nmodl_text); + std::smatch m; + + std::regex allocation(R"(%1 = load double, double\* %a)"); + REQUIRE(std::regex_search(module_string, m, allocation)); + + // llvm v9 and llvm v11 implementation for negation + std::regex negation_v9(R"(%2 = fsub double -0.000000e\+00, %1)"); + std::regex negation_v11(R"(fneg double %1)"); + bool result = 
std::regex_search(module_string, m, negation_v9) || + std::regex_search(module_string, m, negation_v11); + REQUIRE(result == true); + } + } +} From b621d4e5b3df04a8189a97036433bbe824780b40 Mon Sep 17 00:00:00 2001 From: George Mitenkov Date: Fri, 25 Dec 2020 13:29:02 +0300 Subject: [PATCH 008/331] FunctionBlock code generation and terminator checks (#470) * LLVM code generation for `FunctionBlock` is now supported. * Terminators in function or procedure blocks are enforced: - Every procedure must have `ret void` instruction. - Every function returns a double, specified by `ret_`. * For local symbol table, code generation now uses LLVM's builtin `llvm::ValueSymbolTable`. fixes #454, fixes #469 --- src/codegen/llvm/codegen_llvm_visitor.cpp | 123 ++++++++++++++-------- src/codegen/llvm/codegen_llvm_visitor.hpp | 11 +- test/unit/codegen/llvm.cpp | 50 ++++++++- 3 files changed, 137 insertions(+), 47 deletions(-) diff --git a/src/codegen/llvm/codegen_llvm_visitor.cpp b/src/codegen/llvm/codegen_llvm_visitor.cpp index b8b3778e86..6e1177cbec 100644 --- a/src/codegen/llvm/codegen_llvm_visitor.cpp +++ b/src/codegen/llvm/codegen_llvm_visitor.cpp @@ -7,6 +7,7 @@ #include "codegen/llvm/codegen_llvm_visitor.hpp" #include "ast/all.hpp" +#include "visitors/rename_visitor.hpp" #include "visitors/visitor_utils.hpp" #include "llvm/IR/BasicBlock.h" @@ -20,6 +21,80 @@ namespace nmodl { namespace codegen { +/****************************************************************************************/ +/* Helper routines */ +/****************************************************************************************/ + + +void CodegenLLVMVisitor::visit_procedure_or_function(const ast::Block& node) { + const auto& name = node.get_node_name(); + const auto& parameters = node.get_parameters(); + + // Procedure or function parameters are doubles by default. + std::vector arg_types; + for (size_t i = 0; i < parameters.size(); ++i) + arg_types.push_back(llvm::Type::getDoubleTy(*context)); + + // If visiting a function, the return type is a double by default. + llvm::Type* return_type = node.is_function_block() ? llvm::Type::getDoubleTy(*context) + : llvm::Type::getVoidTy(*context); + + llvm::Function* func = + llvm::Function::Create(llvm::FunctionType::get(return_type, arg_types, /*isVarArg=*/false), + llvm::Function::ExternalLinkage, + name, + *module); + + // Create the entry basic block of the function/procedure and point the local named values table + // to the symbol table. + llvm::BasicBlock* body = llvm::BasicBlock::Create(*context, /*Name=*/"", func); + builder.SetInsertPoint(body); + local_named_values = func->getValueSymbolTable(); + + // When processing a function, it returns a value named in NMODL. Therefore, we + // first run RenameVisitor to rename it into ret_. This will aid in avoiding + // symbolic conflicts. Then, allocate the return variable on the local stack. + std::string return_var_name = "ret_" + name; + const auto& block = node.get_statement_block(); + if (node.is_function_block()) { + visitor::RenameVisitor v(name, return_var_name); + block->accept(v); + builder.CreateAlloca(llvm::Type::getDoubleTy(*context), + /*ArraySize=*/nullptr, + return_var_name); + } + + // Allocate parameters on the stack and add them to the symbol table. 
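+    // Each alloca below is created with the argument's name, so it is registered
+    // in the function's ValueSymbolTable and no separate name-to-value map is needed.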
+ unsigned i = 0; + for (auto& arg: func->args()) { + std::string arg_name = parameters[i++].get()->get_node_name(); + llvm::Value* alloca = builder.CreateAlloca(arg.getType(), /*ArraySize=*/nullptr, arg_name); + arg.setName(arg_name); + builder.CreateStore(&arg, alloca); + } + + // Process function or procedure body. + const auto& statements = block->get_statements(); + for (const auto& statement: statements) { + // \todo: Support other statement types. + if (statement->is_local_list_statement() || statement->is_expression_statement()) + statement->accept(*this); + } + + // Add the terminator. If visiting function, we need to return the value specified by + // ret_. + if (node.is_function_block()) { + llvm::Value* return_var = builder.CreateLoad(local_named_values->lookup(return_var_name)); + builder.CreateRet(return_var); + } else { + builder.CreateRetVoid(); + } + + // Clear local values stack and remove the pointer to the local symbol table. + values.clear(); + local_named_values = nullptr; +} + /****************************************************************************************/ /* Overloaded visitor routines */ /****************************************************************************************/ @@ -38,7 +113,7 @@ void CodegenLLVMVisitor::visit_binary_expression(const ast::BinaryExpression& no if (!var) { throw std::runtime_error("Error: only VarName assignment is currently supported.\n"); } - llvm::Value* alloca = named_values[var->get_node_name()]; + llvm::Value* alloca = local_named_values->lookup(var->get_node_name()); builder.CreateStore(rhs, alloca); return; } @@ -77,6 +152,10 @@ void CodegenLLVMVisitor::visit_double(const ast::Double& node) { values.push_back(constant); } +void CodegenLLVMVisitor::visit_function_block(const ast::FunctionBlock& node) { + visit_procedure_or_function(node); +} + void CodegenLLVMVisitor::visit_integer(const ast::Integer& node) { const auto& constant = llvm::ConstantInt::get(llvm::Type::getInt32Ty(*context), node.get_value()); @@ -89,7 +168,6 @@ void CodegenLLVMVisitor::visit_local_list_statement(const ast::LocalListStatemen auto name = variable->get_node_name(); llvm::Type* var_type = llvm::Type::getDoubleTy(*context); llvm::Value* alloca = builder.CreateAlloca(var_type, /*ArraySize=*/nullptr, name); - named_values[name] = alloca; } } @@ -100,44 +178,7 @@ void CodegenLLVMVisitor::visit_program(const ast::Program& node) { } void CodegenLLVMVisitor::visit_procedure_block(const ast::ProcedureBlock& node) { - const auto& name = node.get_node_name(); - const auto& parameters = node.get_parameters(); - - // The procedure parameters are doubles by default. - std::vector arg_types; - for (size_t i = 0, e = parameters.size(); i < e; ++i) - arg_types.push_back(llvm::Type::getDoubleTy(*context)); - llvm::Type* return_type = llvm::Type::getVoidTy(*context); - - llvm::Function* proc = - llvm::Function::Create(llvm::FunctionType::get(return_type, arg_types, /*isVarArg=*/false), - llvm::Function::ExternalLinkage, - name, - *module); - - llvm::BasicBlock* body = llvm::BasicBlock::Create(*context, /*Name=*/"", proc); - builder.SetInsertPoint(body); - - // First, allocate parameters on the stack and add them to the symbol table. 
- unsigned i = 0; - for (auto& arg: proc->args()) { - std::string arg_name = parameters[i++].get()->get_node_name(); - llvm::Value* alloca = builder.CreateAlloca(arg.getType(), /*ArraySize=*/nullptr, arg_name); - arg.setName(arg_name); - builder.CreateStore(&arg, alloca); - named_values[arg_name] = alloca; - } - - const auto& statements = node.get_statement_block()->get_statements(); - for (const auto& statement: statements) { - // \todo: Support other statement types. - if (statement->is_local_list_statement() || statement->is_expression_statement()) - statement->accept(*this); - } - - values.clear(); - // \todo: Add proper support for the symbol table. - named_values.clear(); + visit_procedure_or_function(node); } void CodegenLLVMVisitor::visit_unary_expression(const ast::UnaryExpression& node) { @@ -155,7 +196,7 @@ void CodegenLLVMVisitor::visit_unary_expression(const ast::UnaryExpression& node } void CodegenLLVMVisitor::visit_var_name(const ast::VarName& node) { - llvm::Value* var = builder.CreateLoad(named_values[node.get_node_name()]); + llvm::Value* var = builder.CreateLoad(local_named_values->lookup(node.get_node_name())); values.push_back(var); } diff --git a/src/codegen/llvm/codegen_llvm_visitor.hpp b/src/codegen/llvm/codegen_llvm_visitor.hpp index 5a288d9836..801922cdc1 100644 --- a/src/codegen/llvm/codegen_llvm_visitor.hpp +++ b/src/codegen/llvm/codegen_llvm_visitor.hpp @@ -59,8 +59,8 @@ class CodegenLLVMVisitor: public visitor::ConstAstVisitor { // Stack to hold visited values std::vector values; - // Mappings for named values for lookups - std::map named_values; + // Pointer to the local symbol table. + llvm::ValueSymbolTable* local_named_values = nullptr; public: /** @@ -74,10 +74,17 @@ class CodegenLLVMVisitor: public visitor::ConstAstVisitor { , output_dir(output_dir) , builder(*context) {} + /** + * Visit nmodl function or procedure + * \param node the AST node representing the function or procedure in NMODL + */ + void visit_procedure_or_function(const ast::Block& node); + // Visitors void visit_binary_expression(const ast::BinaryExpression& node) override; void visit_boolean(const ast::Boolean& node) override; void visit_double(const ast::Double& node) override; + void visit_function_block(const ast::FunctionBlock& node) override; void visit_integer(const ast::Integer& node) override; void visit_local_list_statement(const ast::LocalListStatement& node) override; void visit_procedure_block(const ast::ProcedureBlock& node) override; diff --git a/test/unit/codegen/llvm.cpp b/test/unit/codegen/llvm.cpp index 270ce97ec0..44ca18391b 100644 --- a/test/unit/codegen/llvm.cpp +++ b/test/unit/codegen/llvm.cpp @@ -117,6 +117,44 @@ SCENARIO("Binary expression", "[visitor][llvm]") { } } +//============================================================================= +// FunctionBlock +//============================================================================= + +SCENARIO("Function", "[visitor][llvm]") { + GIVEN("Simple function with arguments") { + std::string nmodl_text = R"( + FUNCTION foo(x) { + foo = x + } + )"; + + THEN("function is produced with arguments allocated on stack and a return instruction") { + std::string module_string = run_llvm_visitor(nmodl_text); + std::smatch m; + + // Check function signature. The return type should be the default double type. + std::regex function_signature(R"(define double @foo\(double %x1\) \{)"); + REQUIRE(std::regex_search(module_string, m, function_signature)); + + // Check that function arguments are allocated on the local stack. 
+            std::regex alloca_instr(R"(%x = alloca double)");
+            std::regex store_instr(R"(store double %x1, double\* %x)");
+            REQUIRE(std::regex_search(module_string, m, alloca_instr));
+            REQUIRE(std::regex_search(module_string, m, store_instr));
+
+            // Check the return variable has also been allocated.
+            std::regex ret_instr(R"(%ret_foo = alloca double)");
+            REQUIRE(std::regex_search(module_string, m, ret_instr));
+
+            // Check that the return value has been loaded and passed to the terminator.
+            std::regex loaded(R"(%2 = load double, double\* %ret_foo)");
+            std::regex terminator(R"(ret double %2)");
+            REQUIRE(std::regex_search(module_string, m, loaded));
+            REQUIRE(std::regex_search(module_string, m, terminator));
+        }
+    }
+}
+
 //=============================================================================
 // LocalList and LocalVar
 //=============================================================================
@@ -156,8 +194,8 @@ SCENARIO("Procedure", "[visitor][llvm]") {
             std::string module_string = run_llvm_visitor(nmodl_text);
             std::smatch m;

-            // Check procedure has empty body
-            std::regex procedure(R"(define void @empty\(\) \{\n\})");
+            // Check procedure has empty body with a void return.
+            std::regex procedure(R"(define void @empty\(\) \{\n(\s)*ret void\n\})");
             REQUIRE(std::regex_search(module_string, m, procedure));
         }
     }
@@ -171,15 +209,19 @@ SCENARIO("Procedure", "[visitor][llvm]") {
             std::string module_string = run_llvm_visitor(nmodl_text);
             std::smatch m;

-            // Check procedure signature
+            // Check procedure signature.
            std::regex function_signature(R"(define void @with_argument\(double %x1\) \{)");
             REQUIRE(std::regex_search(module_string, m, function_signature));

-            // Check that procedure arguments are allocated on the local stack
+            // Check that procedure arguments are allocated on the local stack.
             std::regex alloca_instr(R"(%x = alloca double)");
             std::regex store_instr(R"(store double %x1, double\* %x)");
             REQUIRE(std::regex_search(module_string, m, alloca_instr));
             REQUIRE(std::regex_search(module_string, m, store_instr));
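The `ret void` these tests now insist on is not cosmetic: LLVM requires every basic block to end in a terminator instruction, and the module verifier rejects functions without one. A small self-contained sketch of that rule, under the same LLVM-era assumptions as above:

    #include "llvm/IR/IRBuilder.h"
    #include "llvm/IR/LLVMContext.h"
    #include "llvm/IR/Module.h"
    #include "llvm/IR/Verifier.h"
    #include "llvm/Support/raw_ostream.h"

    int main() {
        llvm::LLVMContext context;
        llvm::Module module("sketch", context);
        llvm::IRBuilder<> builder(context);

        auto* fn = llvm::Function::Create(
            llvm::FunctionType::get(builder.getVoidTy(), /*isVarArg=*/false),
            llvm::Function::ExternalLinkage, "empty", module);
        builder.SetInsertPoint(llvm::BasicBlock::Create(context, "", fn));

        // Comment the next line out and verifyFunction flags the block as
        // missing its terminator.
        builder.CreateRetVoid();

        // verifyFunction returns true when the function is broken.
        bool broken = llvm::verifyFunction(*fn, &llvm::errs());
        llvm::outs() << (broken ? "broken\n" : "ok\n");
        return 0;
    }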
+
+            // Check terminator.
+            std::regex terminator(R"(ret void)");
+            REQUIRE(std::regex_search(module_string, m, terminator));
         }
     }
 }

From 917a7da38d8697a683fc63e759a6a9ac56e6d80f Mon Sep 17 00:00:00 2001
From: Pramod Kumbhar
Date: Mon, 28 Dec 2020 23:59:20 +0100
Subject: [PATCH 009/331] Add option to run LLVM optimisation passes (#471)

* Add option to run LLVM optimisation passes
  - update CLI argument from --llvm to llvm --ir --opt
  - --ir runs CodegenLLVMVisitor and emits LLVM IR
  - if --opt is passed, we run basic LLVM optimisation passes
  - update simple test to check optimisation passes
* Add function example in procedure.mod
* Add test for LLVM optimisation passes and dead code removal
---
 cmake/LLVMHelper.cmake                    |  2 +-
 src/codegen/llvm/codegen_llvm_visitor.cpp | 25 ++++++++++++++++++
 src/codegen/llvm/codegen_llvm_visitor.hpp | 26 ++++++++++++++++--
 src/main.cpp                              | 25 +++++++++++++-----
 test/integration/mod/procedure.mod        |  8 +++++-
 test/unit/codegen/llvm.cpp                | 32 ++++++++++++++++++++---
 6 files changed, 105 insertions(+), 13 deletions(-)

diff --git a/cmake/LLVMHelper.cmake b/cmake/LLVMHelper.cmake
index 982af48660..5d451697b9 100644
--- a/cmake/LLVMHelper.cmake
+++ b/cmake/LLVMHelper.cmake
@@ -5,7 +5,7 @@
 find_package(LLVM REQUIRED CONFIG)

 # include LLVM header and core library
-llvm_map_components_to_libnames(LLVM_LIBS_TO_LINK core)
+llvm_map_components_to_libnames(LLVM_LIBS_TO_LINK core native)
 set(CMAKE_REQUIRED_INCLUDES ${LLVM_INCLUDE_DIRS})
 set(CMAKE_REQUIRED_LIBRARIES ${LLVM_LIBS_TO_LINK})

diff --git a/src/codegen/llvm/codegen_llvm_visitor.cpp b/src/codegen/llvm/codegen_llvm_visitor.cpp
index 6e1177cbec..d99e519dca 100644
--- a/src/codegen/llvm/codegen_llvm_visitor.cpp
+++ b/src/codegen/llvm/codegen_llvm_visitor.cpp
@@ -25,6 +25,24 @@ namespace codegen {
 /*                                 Helper routines                                      */
 /****************************************************************************************/

+void CodegenLLVMVisitor::run_llvm_opt_passes() {
+    /// run some common optimisation passes that are typically suggested
+    fpm.add(llvm::createInstructionCombiningPass());
+    fpm.add(llvm::createReassociatePass());
+    fpm.add(llvm::createGVNPass());
+    fpm.add(llvm::createCFGSimplificationPass());
+
+    /// initialize pass manager
+    fpm.doInitialization();
+
+    /// iterate over all functions and run the optimisation passes
+    auto& functions = module->getFunctionList();
+    for (auto& function: functions) {
+        llvm::verifyFunction(function);
+        fpm.run(function);
+    }
+}
+
 void CodegenLLVMVisitor::visit_procedure_or_function(const ast::Block& node) {
     const auto& name = node.get_node_name();
@@ -95,6 +113,7 @@
     local_named_values = nullptr;
 }

+
 /****************************************************************************************/
 /*                            Overloaded visitor routines                               */
 /****************************************************************************************/
@@ -173,6 +192,12 @@ void CodegenLLVMVisitor::visit_local_list_statemen

 void CodegenLLVMVisitor::visit_program(const ast::Program& node) {
     node.visit_children(*this);
+
+    if (opt_passes) {
+        logger->info("Running LLVM optimisation passes");
+        run_llvm_opt_passes();
+    }
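For readers new to the legacy pass manager used here: it is per-function, must be initialised once, and is then run on each function individually. A self-contained sketch of the same wiring as run_llvm_opt_passes() above (the module is left empty for brevity, so the loop body simply never executes):

    #include <memory>

    #include "llvm/IR/LLVMContext.h"
    #include "llvm/IR/LegacyPassManager.h"
    #include "llvm/IR/Module.h"
    #include "llvm/IR/Verifier.h"
    #include "llvm/Transforms/InstCombine/InstCombine.h"
    #include "llvm/Transforms/Scalar.h"
    #include "llvm/Transforms/Scalar/GVN.h"

    int main() {
        llvm::LLVMContext context;
        auto module = std::make_unique<llvm::Module>("sketch", context);

        llvm::legacy::FunctionPassManager fpm(module.get());
        fpm.add(llvm::createInstructionCombiningPass());  // peephole combining
        fpm.add(llvm::createReassociatePass());           // canonicalise expressions
        fpm.add(llvm::createGVNPass());                   // global value numbering / CSE
        fpm.add(llvm::createCFGSimplificationPass());     // drop dead blocks, merge chains
        fpm.doInitialization();

        // Verify and then optimise each function, exactly as the visitor does.
        for (auto& function: module->getFunctionList()) {
            llvm::verifyFunction(function);
            fpm.run(function);
        }
        return 0;
    }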
+
+    // Keep this for easier development (maybe move to debug mode later).
     std::cout << print_module();
 }

diff --git a/src/codegen/llvm/codegen_llvm_visitor.hpp b/src/codegen/llvm/codegen_llvm_visitor.hpp
index 801922cdc1..6b94ecffbe 100644
--- a/src/codegen/llvm/codegen_llvm_visitor.hpp
+++ b/src/codegen/llvm/codegen_llvm_visitor.hpp
@@ -23,7 +23,12 @@

 #include "llvm/IR/IRBuilder.h"
 #include "llvm/IR/LLVMContext.h"
+#include "llvm/IR/LegacyPassManager.h"
 #include "llvm/IR/Module.h"
+#include "llvm/IR/Verifier.h"
+#include "llvm/Transforms/InstCombine/InstCombine.h"
+#include "llvm/Transforms/Scalar.h"
+#include "llvm/Transforms/Scalar/GVN.h"

 namespace nmodl {
 namespace codegen {
@@ -56,12 +61,25 @@ class CodegenLLVMVisitor: public visitor::ConstAstVisitor {

     llvm::IRBuilder<> builder;

+    llvm::legacy::FunctionPassManager fpm;
+
     // Stack to hold visited values
     std::vector<llvm::Value*> values;

     // Pointer to the local symbol table.
     llvm::ValueSymbolTable* local_named_values = nullptr;

+    // Run optimisation passes if true
+    bool opt_passes;
+
+    /**
+     * \brief Run LLVM optimisation passes on generated IR
+     *
+     * LLVM provides a number of optimisation passes that can be run on the generated IR.
+     * Here we run common LLVM optimisation passes that benefit the generated code.
+     */
+    void run_llvm_opt_passes();
+
   public:
     /**
      * \brief Constructs the LLVM code generator visitor
@@ -69,10 +87,14 @@
      * This constructor instantiates an NMODL LLVM code generator. This is
      * just template to work with initial implementation.
      */
-    CodegenLLVMVisitor(const std::string& mod_filename, const std::string& output_dir)
+    CodegenLLVMVisitor(const std::string& mod_filename,
+                       const std::string& output_dir,
+                       bool opt_passes)
         : mod_filename(mod_filename)
         , output_dir(output_dir)
+        , opt_passes(opt_passes)
         , builder(*context)
+        , fpm(module.get()) {}

     /**
      * Visit nmodl function or procedure
diff --git a/src/main.cpp b/src/main.cpp
index fcc813b74b..bc123f905a 100644
--- a/src/main.cpp
+++ b/src/main.cpp
@@ -161,6 +161,14 @@ int main(int argc, const char* argv[]) {
     /// floating point data type
     std::string data_type("double");

+#ifdef NMODL_LLVM_BACKEND
+    /// generate llvm IR
+    bool llvm_ir(false);
+
+    /// run llvm optimisation passes
+    bool llvm_opt_passes(false);
+#endif
+
     app.get_formatter()->column_width(40);
     app.set_help_all_flag("-H,--help-all", "Print this help message including all sub-commands");

     app.add_option("-v,--verbose", verbose, "Verbosity of logger output")
         ->ignore_case()
         ->check(CLI::IsMember({"trace", "debug", "info", "warning", "error", "critical", "off"}));

-#ifdef NMODL_LLVM_BACKEND
-    app.add_flag("--llvm", llvm_backend, "Enable LLVM based code generation")->ignore_case();
-#endif
-
     app.add_option("file", mod_files, "One or more MOD files to process")
         ->ignore_case()
         ->required()
@@ -268,6 +272,15 @@
         optimize_ionvar_copies_codegen,
         "Optimize copies of ion variables ({})"_format(optimize_ionvar_copies_codegen))->ignore_case();

+#ifdef NMODL_LLVM_BACKEND
+    auto llvm_opt = app.add_subcommand("llvm", "LLVM code generation option")->ignore_case();
+    llvm_opt->add_flag("--ir",
+        llvm_ir,
+        "Generate LLVM IR ({})"_format(llvm_ir))->ignore_case();
+    llvm_opt->add_flag("--opt",
+        llvm_opt_passes,
+        "Run LLVM optimisation passes ({})"_format(llvm_opt_passes))->ignore_case();
+#endif
     // clang-format on

     CLI11_PARSE(app, argc, argv);
@@ -560,9 +573,9 @@
         }

 #ifdef NMODL_LLVM_BACKEND
-        if (llvm_backend) {
+        if (llvm_ir) {
LLVM backend code generator"); - CodegenLLVMVisitor visitor(modfile, output_dir); + CodegenLLVMVisitor visitor(modfile, output_dir, llvm_opt_passes); visitor.visit_program(*ast); } #endif diff --git a/test/integration/mod/procedure.mod b/test/integration/mod/procedure.mod index 4017b6a505..4a45af7d1e 100644 --- a/test/integration/mod/procedure.mod +++ b/test/integration/mod/procedure.mod @@ -21,7 +21,7 @@ PROCEDURE complex_sum(v) { } } -PROCEDURE loop_function(v) { +PROCEDURE loop_proc(v) { LOCAL i i = 0 WHILE(i < 10) { @@ -29,3 +29,9 @@ PROCEDURE loop_function(v) { i = i + 1 } } + +FUNCTION square(x) { + LOCAL res + res = x * x + square = res +} diff --git a/test/unit/codegen/llvm.cpp b/test/unit/codegen/llvm.cpp index 44ca18391b..9c86e8c30a 100644 --- a/test/unit/codegen/llvm.cpp +++ b/test/unit/codegen/llvm.cpp @@ -23,14 +23,14 @@ using nmodl::parser::NmodlDriver; // Utility to get LLVM module as a string //============================================================================= -std::string run_llvm_visitor(const std::string& text) { +std::string run_llvm_visitor(const std::string& text, bool opt = false) { NmodlDriver driver; const auto& ast = driver.parse_string(text); SymtabVisitor().visit_program(*ast); InlineVisitor().visit_program(*ast); - codegen::CodegenLLVMVisitor llvm_visitor("unknown", "."); + codegen::CodegenLLVMVisitor llvm_visitor("unknown", ".", opt); llvm_visitor.visit_program(*ast); return llvm_visitor.print_module(); } @@ -52,10 +52,11 @@ SCENARIO("Binary expression", "[visitor][llvm]") { std::string module_string = run_llvm_visitor(nmodl_text); std::smatch m; - // Check the values are loaded correctly and added std::regex rhs(R"(%1 = load double, double\* %b)"); std::regex lhs(R"(%2 = load double, double\* %a)"); std::regex res(R"(%3 = fadd double %2, %1)"); + + // Check the values are loaded correctly and added REQUIRE(std::regex_search(module_string, m, rhs)); REQUIRE(std::regex_search(module_string, m, lhs)); REQUIRE(std::regex_search(module_string, m, res)); @@ -255,3 +256,28 @@ SCENARIO("Unary expression", "[visitor][llvm]") { } } } + +//============================================================================= +// Optimization : dead code removal +//============================================================================= + +SCENARIO("Dead code removal", "[visitor][llvm][opt]") { + GIVEN("Procedure using local variables, without any side effects") { + std::string nmodl_text = R"( + PROCEDURE add(a, b) { + LOCAL i + i = a + b + } + )"; + + THEN("with optimisation enabled, all ops are eliminated") { + std::string module_string = run_llvm_visitor(nmodl_text, true); + std::smatch m; + + // Check if the values are optimised out + std::regex empty_proc( + R"(define void @add\(double %a1, double %b2\) \{\n(\s)*ret void\n\})"); + REQUIRE(std::regex_search(module_string, m, empty_proc)); + } + } +} \ No newline at end of file From b261ba92d4484b45ffb9aa5237e7e91ec1a2e478 Mon Sep 17 00:00:00 2001 From: George Mitenkov Date: Wed, 30 Dec 2020 22:14:00 +0300 Subject: [PATCH 010/331] Add function call LLVM code generation (#477) This patch adds support for function call code generation, particularly: - User-defined procedures and functions can now lowered to LLVM IR. - A framework for external method calls (e.g. sin, exp, etc.) has been created, currently `exp` and `pow` are supported. - Corresponding tests added. 
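The key mechanism behind the first two points is sketched below: every NMODL function and procedure is declared in the LLVM module up front, so a call site can resolve a callee whose body has not been generated yet. This is a hedged illustration only; `emit_call` is a made-up helper name, not part of the patch:

    #include <stdexcept>
    #include <string>
    #include <vector>

    #include "llvm/IR/IRBuilder.h"
    #include "llvm/IR/Module.h"

    // Resolve a previously declared function and emit a call to it. Because
    // Function::Create registers the name in the module's symbol table at
    // declaration time, this works even if the callee's body is emitted later.
    llvm::Value* emit_call(llvm::Module& module, llvm::IRBuilder<>& builder,
                           const std::string& name, const std::vector<llvm::Value*>& args) {
        llvm::Function* callee = module.getFunction(name);
        if (!callee)
            throw std::runtime_error("unknown function: " + name);
        if (callee->arg_size() != args.size())
            throw std::runtime_error("incorrect number of arguments for: " + name);
        return builder.CreateCall(callee, args);
    }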
fixes #472
---
 src/codegen/llvm/codegen_llvm_visitor.cpp | 103 +++++++++++++++++++--
 src/codegen/llvm/codegen_llvm_visitor.hpp |  32 ++++++-
 test/unit/CMakeLists.txt                  |   3 +-
 test/unit/codegen/llvm.cpp                | 104 +++++++++++++++++++++-
 4 files changed, 231 insertions(+), 11 deletions(-)

diff --git a/src/codegen/llvm/codegen_llvm_visitor.cpp b/src/codegen/llvm/codegen_llvm_visitor.cpp
index d99e519dca..430f3d78de 100644
--- a/src/codegen/llvm/codegen_llvm_visitor.cpp
+++ b/src/codegen/llvm/codegen_llvm_visitor.cpp
@@ -7,8 +7,8 @@

 #include "codegen/llvm/codegen_llvm_visitor.hpp"
 #include "ast/all.hpp"
+#include "codegen/codegen_helper_visitor.hpp"
 #include "visitors/rename_visitor.hpp"
-#include "visitors/visitor_utils.hpp"

 #include "llvm/IR/BasicBlock.h"
 #include "llvm/IR/Constants.h"
@@ -44,7 +44,56 @@ void CodegenLLVMVisitor::run_llvm_opt_passes() {
     }
 }

-void CodegenLLVMVisitor::visit_procedure_or_function(const ast::Block& node) {
+void CodegenLLVMVisitor::create_external_method_call(const std::string& name,
+                                                     const ast::ExpressionVector& arguments) {
+    std::vector<llvm::Value*> argument_values;
+    std::vector<llvm::Type*> argument_types;
+    for (const auto& arg: arguments) {
+        arg->accept(*this);
+        llvm::Value* value = values.back();
+        llvm::Type* type = value->getType();
+        values.pop_back();
+        argument_types.push_back(type);
+        argument_values.push_back(value);
+    }
+
+#define DISPATCH(method_name, intrinsic)                                                          \
+    if (name == method_name) {                                                                    \
+        llvm::Value* result = builder.CreateIntrinsic(intrinsic, argument_types, argument_values); \
+        values.push_back(result);                                                                 \
+        return;                                                                                   \
+    }
+
+    DISPATCH("exp", llvm::Intrinsic::exp);
+    DISPATCH("pow", llvm::Intrinsic::pow);
+#undef DISPATCH
+
+    throw std::runtime_error("Error: External method " + name + " is not currently supported");
+}
+
+void CodegenLLVMVisitor::create_function_call(llvm::Function* func,
+                                              const std::string& name,
+                                              const ast::ExpressionVector& arguments) {
+    // Check that function is called with the expected number of arguments.
+    if (arguments.size() != func->arg_size()) {
+        throw std::runtime_error("Error: Incorrect number of arguments passed");
+    }
+
+    // Process each argument and add it to a vector to pass to the function call instruction. Note
+    // that type checks are not needed here as NMODL operates on doubles by default.
+    std::vector<llvm::Value*> argument_values;
+    for (const auto& arg: arguments) {
+        arg->accept(*this);
+        llvm::Value* value = values.back();
+        values.pop_back();
+        argument_values.push_back(value);
+    }
+
+    llvm::Value* call = builder.CreateCall(func, argument_values);
+    values.push_back(call);
+}
+
+void CodegenLLVMVisitor::emit_procedure_or_function_declaration(const ast::Block& node) {
     const auto& name = node.get_node_name();
     const auto& parameters = node.get_parameters();
@@ -57,11 +106,17 @@
     llvm::Type* return_type = node.is_function_block() ? llvm::Type::getDoubleTy(*context)
                                                        : llvm::Type::getVoidTy(*context);

-    llvm::Function* func =
-        llvm::Function::Create(llvm::FunctionType::get(return_type, arg_types, /*isVarArg=*/false),
-                               llvm::Function::ExternalLinkage,
-                               name,
-                               *module);
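The DISPATCH table above leans on IRBuilder::CreateIntrinsic, which both emits the call and adds the matching declaration to the module on first use (the tests below look for `declare double @llvm.exp.f64(double)`). A minimal sketch, assuming the LLVM 8+ overload that takes an intrinsic ID plus the overload types:

    #include "llvm/IR/IRBuilder.h"
    #include "llvm/IR/Intrinsics.h"

    // Emit llvm.exp.f64(x); the overload type (double) is deduced from x itself.
    llvm::Value* emit_exp(llvm::IRBuilder<>& builder, llvm::Value* x) {
        return builder.CreateIntrinsic(llvm::Intrinsic::exp, {x->getType()}, {x});
    }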
+    // Create a function that is automatically inserted into module's symbol table.
+    llvm::Function::Create(llvm::FunctionType::get(return_type, arg_types, /*isVarArg=*/false),
+                           llvm::Function::ExternalLinkage,
+                           name,
+                           *module);
+}
+
+void CodegenLLVMVisitor::visit_procedure_or_function(const ast::Block& node) {
+    const auto& name = node.get_node_name();
+    const auto& parameters = node.get_parameters();
+    llvm::Function* func = module->getFunction(name);

     // Create the entry basic block of the function/procedure and point the local named values table
     // to the symbol table.
@@ -175,6 +230,22 @@ void CodegenLLVMVisitor::visit_function_block(const ast::FunctionBlock& node) {
     visit_procedure_or_function(node);
 }

+void CodegenLLVMVisitor::visit_function_call(const ast::FunctionCall& node) {
+    const auto& name = node.get_node_name();
+    auto func = module->getFunction(name);
+    if (func) {
+        create_function_call(func, name, node.get_arguments());
+    } else {
+        auto symbol = sym_tab->lookup(name);
+        if (symbol && symbol->has_any_property(symtab::syminfo::NmodlType::extern_method)) {
+            create_external_method_call(name, node.get_arguments());
+        } else {
+            throw std::runtime_error("Error: Unknown function name: " + name +
+                                     ". (External function references are not supported)");
+        }
+    }
+}
+
 void CodegenLLVMVisitor::visit_integer(const ast::Integer& node) {
     const auto& constant = llvm::ConstantInt::get(llvm::Type::getInt32Ty(*context),
                                                   node.get_value());
@@ -191,6 +262,24 @@

 void CodegenLLVMVisitor::visit_program(const ast::Program& node) {
+    // Before generating LLVM, gather information about AST. For now, information about functions
+    // and procedures is used only.
+    CodegenHelperVisitor v;
+    CodegenInfo info = v.analyze(node);
+
+    // For every function and procedure, generate its declaration. Thus, we can look up
+    // `llvm::Function` in the symbol table in the module.
+    for (const auto& func: info.functions) {
+        emit_procedure_or_function_declaration(*func);
+    }
+    for (const auto& proc: info.procedures) {
+        emit_procedure_or_function_declaration(*proc);
+    }
+
+    // Set the AST symbol table.
+    sym_tab = node.get_symbol_table();
+
+    // Proceed with code generation.
     node.visit_children(*this);

     if (opt_passes) {
diff --git a/src/codegen/llvm/codegen_llvm_visitor.hpp b/src/codegen/llvm/codegen_llvm_visitor.hpp
index 6b94ecffbe..32347bdabd 100644
--- a/src/codegen/llvm/codegen_llvm_visitor.hpp
+++ b/src/codegen/llvm/codegen_llvm_visitor.hpp
@@ -18,6 +18,7 @@
 #include
 #include

+#include "symtab/symbol_table.hpp"
 #include "utils/logger.hpp"
 #include "visitors/ast_visitor.hpp"

@@ -69,7 +70,10 @@
     // Pointer to the local symbol table.
     llvm::ValueSymbolTable* local_named_values = nullptr;

+    // Pointer to AST symbol table.
+    symtab::SymbolTable* sym_tab;
+
-    // Run optimisation passes if true
+    // Run optimisation passes if true.
     bool opt_passes;

     /**
@@ -96,6 +100,31 @@
         , builder(*context)
         , fpm(module.get()) {}

+    /**
+     * Create a function call to an external method
+     * \param name external method name
+     * \param arguments expressions passed as arguments to the given external method
+     */
+    void create_external_method_call(const std::string& name,
+                                     const ast::ExpressionVector& arguments);
+
+    /**
+     * Create a function call to NMODL function or procedure in the same mod file
+     * \param func LLVM function corresponding to this call
+     * \param name function name
+     * \param arguments expressions passed as arguments to the function call
+     */
+    void create_function_call(llvm::Function* func,
+                              const std::string& name,
+                              const ast::ExpressionVector& arguments);
+
+    /**
+     * Emit function or procedure declaration in LLVM given the node
+     *
+     * \param node the AST node representing the function or procedure in NMODL
+     */
+    void emit_procedure_or_function_declaration(const ast::Block& node);
+
     /**
      * Visit nmodl function or procedure
      * \param node the AST node representing the function or procedure in NMODL
@@ -107,6 +136,7 @@
     void visit_boolean(const ast::Boolean& node) override;
     void visit_double(const ast::Double& node) override;
     void visit_function_block(const ast::FunctionBlock& node) override;
+    void visit_function_call(const ast::FunctionCall& node) override;
     void visit_integer(const ast::Integer& node) override;
     void visit_local_list_statement(const ast::LocalListStatement& node) override;
     void visit_procedure_block(const ast::ProcedureBlock& node) override;
diff --git a/test/unit/CMakeLists.txt b/test/unit/CMakeLists.txt
index c3a8dd104d..7131e4eba1 100644
--- a/test/unit/CMakeLists.txt
+++ b/test/unit/CMakeLists.txt
@@ -100,13 +100,14 @@ if(NMODL_ENABLE_LLVM)
   add_executable(testllvm visitor/main.cpp codegen/llvm.cpp)
   target_link_libraries(
     testllvm
+    llvm_codegen
+    codegen
     visitor
     symtab
     lexer
     util
     test_util
     printer
-    llvm_codegen
    ${NMODL_WRAPPER_LIBS}
    ${LLVM_LIBS_TO_LINK})
   set(CODEGEN_TEST testllvm)
diff --git a/test/unit/codegen/llvm.cpp b/test/unit/codegen/llvm.cpp
index 9c86e8c30a..d2c0a65e86 100644
--- a/test/unit/codegen/llvm.cpp
+++ b/test/unit/codegen/llvm.cpp
@@ -12,7 +12,6 @@
 #include "codegen/llvm/codegen_llvm_visitor.hpp"
 #include "parser/nmodl_driver.hpp"
 #include "visitors/checkparent_visitor.hpp"
-#include "visitors/inline_visitor.hpp"
 #include "visitors/symtab_visitor.hpp"

 using namespace nmodl;
@@ -28,7 +27,6 @@
     SymtabVisitor().visit_program(*ast);
-    InlineVisitor().visit_program(*ast);

     codegen::CodegenLLVMVisitor llvm_visitor("unknown", ".", opt);
     llvm_visitor.visit_program(*ast);
     return llvm_visitor.print_module();
 }
@@ -156,6 +154,108 @@
     }
 }

+//=============================================================================
+// FunctionCall
+//=============================================================================
+
+SCENARIO("Function call", "[visitor][llvm]") {
+    GIVEN("A call to procedure") {
+        std::string nmodl_text = R"(
+            PROCEDURE bar() {}
+            FUNCTION foo() {
+                bar()
+            }
+        )";
+
+        THEN("a void call instruction is created") {
+            std::string module_string = run_llvm_visitor(nmodl_text);
+            std::smatch m;

+            // Check for call instruction.
+            std::regex call(R"(call void @bar\(\))");
+            REQUIRE(std::regex_search(module_string, m, call));
+        }
+    }
+
+    GIVEN("A call to function declared below the caller") {
+        std::string nmodl_text = R"(
+            FUNCTION foo(x) {
+                foo = 4 * bar()
+            }
+            FUNCTION bar() {
+                bar = 5
+            }
+        )";
+
+        THEN("a correct call instruction is created") {
+            std::string module_string = run_llvm_visitor(nmodl_text);
+            std::smatch m;
+
+            // Check for call instruction.
+            std::regex call(R"(%[0-9]+ = call double @bar\(\))");
+            REQUIRE(std::regex_search(module_string, m, call));
+        }
+    }
+
+    GIVEN("A call to function with arguments") {
+        std::string nmodl_text = R"(
+            FUNCTION foo(x, y) {
+                foo = 4 * x - y
+            }
+            FUNCTION bar(i) {
+                bar = foo(i, 4)
+            }
+        )";
+
+        THEN("arguments are processed before the call and passed to call instruction") {
+            std::string module_string = run_llvm_visitor(nmodl_text);
+            std::smatch m;
+
+            // Check correct arguments.
+            std::regex i(R"(%1 = load double, double\* %i)");
+            std::regex call(R"(call double @foo\(double %1, double 4.000000e\+00\))");
+            REQUIRE(std::regex_search(module_string, m, i));
+            REQUIRE(std::regex_search(module_string, m, call));
+        }
+    }
+
+    GIVEN("A call to external method") {
+        std::string nmodl_text = R"(
+            FUNCTION bar(i) {
+                bar = exp(i)
+            }
+        )";
+
+        THEN("LLVM intrinsic corresponding to this method is created") {
+            std::string module_string = run_llvm_visitor(nmodl_text);
+            std::smatch m;
+
+            // Check for intrinsic declaration.
+            std::regex exp(R"(declare double @llvm\.exp\.f64\(double\))");
+            REQUIRE(std::regex_search(module_string, m, exp));
+
+            // Check the correct call is made.
+            std::regex call(R"(call double @llvm\.exp\.f64\(double %[0-9]+\))");
+            REQUIRE(std::regex_search(module_string, m, call));
+        }
+    }
+
+    GIVEN("A call to function with the wrong number of arguments") {
+        std::string nmodl_text = R"(
+            FUNCTION foo(x, y) {
+                foo = 4 * x - y
+            }
+            FUNCTION bar(i) {
+                bar = foo(i)
+            }
+        )";
+
+        THEN("a runtime error is thrown") {
+            REQUIRE_THROWS_AS(run_llvm_visitor(nmodl_text), std::runtime_error);
+        }
+    }
+}
+
 //=============================================================================
 // LocalList and LocalVar
 //=============================================================================

From 7884de81fb25ac7a94b256bb6230d5e55be450c1 Mon Sep 17 00:00:00 2001
From: George Mitenkov
Date: Thu, 31 Dec 2020 00:49:13 +0300
Subject: [PATCH 011/331] Support for IndexedName codegen (#478)

LLVM code generation for `IndexedName`s.
- Added code generation for initialising arrays in LOCAL blocks
  (with both integer constants and macros).
- Added support for indexing arrays.
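The array support summarised above boils down to two LLVM constructs, sketched here for orientation: an alloca of an `[N x double]` type, and a getelementptr whose first index (0) steps through the pointer to the whole array object while the second selects the element. The sketch assumes typed pointers (LLVM 14 and earlier); `emit_array_store` is an illustrative name, not part of the patch:

    #include "llvm/IR/Constants.h"
    #include "llvm/IR/IRBuilder.h"
    #include "llvm/IR/LLVMContext.h"

    // Equivalent of: LOCAL x[2]  followed by  x[1] = 3
    void emit_array_store(llvm::LLVMContext& context, llvm::IRBuilder<>& builder) {
        llvm::Type* elem = llvm::Type::getDoubleTy(context);
        llvm::Value* x = builder.CreateAlloca(llvm::ArrayType::get(elem, 2),
                                              /*ArraySize=*/nullptr, "x");

        llvm::Type* i32 = llvm::Type::getInt32Ty(context);
        llvm::Value* gep = builder.CreateInBoundsGEP(
            x, {llvm::ConstantInt::get(i32, 0), llvm::ConstantInt::get(i32, 1)});
        builder.CreateStore(llvm::ConstantFP::get(elem, 3.0), gep);
    }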
fixes #467
---
 src/codegen/llvm/codegen_llvm_visitor.cpp |  88 +++++++++++++++--
 src/codegen/llvm/codegen_llvm_visitor.hpp |  29 ++++++
 test/unit/codegen/llvm.cpp                | 111 ++++++++++++++++++++++
 3 files changed, 220 insertions(+), 8 deletions(-)

diff --git a/src/codegen/llvm/codegen_llvm_visitor.cpp b/src/codegen/llvm/codegen_llvm_visitor.cpp
index 430f3d78de..b2a09fdd96 100644
--- a/src/codegen/llvm/codegen_llvm_visitor.cpp
+++ b/src/codegen/llvm/codegen_llvm_visitor.cpp
@@ -25,6 +25,44 @@ namespace codegen {
 /*                                 Helper routines                                      */
 /****************************************************************************************/

+bool CodegenLLVMVisitor::check_array_bounds(const ast::IndexedName& node, unsigned index) {
+    llvm::Type* array_type =
+        local_named_values->lookup(node.get_node_name())->getType()->getPointerElementType();
+    unsigned length = array_type->getArrayNumElements();
+    return 0 <= index && index < length;
+}
+
+llvm::Value* CodegenLLVMVisitor::create_gep(const std::string& name, unsigned index) {
+    llvm::Type* index_type = llvm::Type::getInt32Ty(*context);
+    std::vector<llvm::Value*> indices;
+    indices.push_back(llvm::ConstantInt::get(index_type, 0));
+    indices.push_back(llvm::ConstantInt::get(index_type, index));
+
+    return builder.CreateInBoundsGEP(local_named_values->lookup(name), indices);
+}
+
+llvm::Value* CodegenLLVMVisitor::codegen_indexed_name(const ast::IndexedName& node) {
+    unsigned index = get_array_index_or_length(node);
+
+    // Check if index is within array bounds.
+    if (!check_array_bounds(node, index))
+        throw std::runtime_error("Error: Index is out of bounds");
+
+    return create_gep(node.get_node_name(), index);
+}
+
+unsigned CodegenLLVMVisitor::get_array_index_or_length(const ast::IndexedName& indexed_name) {
+    auto integer = std::dynamic_pointer_cast<ast::Integer>(indexed_name.get_length());
+    if (!integer)
+        throw std::runtime_error("Error: expecting integer index or length");
+
+    // Check if integer value is taken from a macro.
+    if (!integer->get_macro())
+        return integer->get_value();
+    const auto& macro = sym_tab->lookup(integer->get_macro()->get_node_name());
+    return static_cast<unsigned>(*macro->get_value());
+}
+
 void CodegenLLVMVisitor::run_llvm_opt_passes() {
     /// run some common optimisation passes that are typically suggested
     fpm.add(llvm::createInstructionCombiningPass());
@@ -43,7 +81,6 @@
     }
 }

-
 void CodegenLLVMVisitor::create_external_method_call(const std::string& name,
                                                      const ast::ExpressionVector& arguments) {
     std::vector<llvm::Value*> argument_values;
@@ -187,8 +224,17 @@ void CodegenLLVMVisitor::visit_binary_expression(const ast::BinaryExpression& no
         if (!var) {
             throw std::runtime_error("Error: only VarName assignment is currently supported.\n");
         }
-        llvm::Value* alloca = local_named_values->lookup(var->get_node_name());
-        builder.CreateStore(rhs, alloca);
+
+        const auto& identifier = var->get_name();
+        if (identifier->is_name()) {
+            llvm::Value* alloca = local_named_values->lookup(var->get_node_name());
+            builder.CreateStore(rhs, alloca);
+        } else if (identifier->is_indexed_name()) {
+            auto indexed_name = std::dynamic_pointer_cast<ast::IndexedName>(identifier);
+            builder.CreateStore(rhs, codegen_indexed_name(*indexed_name));
+        } else {
+            throw std::runtime_error("Error: Unsupported variable type");
+        }
         return;
     }
@@ -254,10 +300,22 @@ void CodegenLLVMVisitor::visit_integer(const ast::Integer& node) {

 void CodegenLLVMVisitor::visit_local_list_statement(const ast::LocalListStatement& node) {
     for (const auto& variable: node.get_variables()) {
-        // LocalVar always stores a Name.
-        auto name = variable->get_node_name();
-        llvm::Type* var_type = llvm::Type::getDoubleTy(*context);
-        llvm::Value* alloca = builder.CreateAlloca(var_type, /*ArraySize=*/nullptr, name);
+        std::string name = variable->get_node_name();
+        const auto& identifier = variable->get_name();
+        // Local variable can be a scalar (Node AST class) or an array (IndexedName AST class). For
+        // each case, create memory allocations with the corresponding LLVM type.
+        llvm::Type* var_type;
+        if (identifier->is_indexed_name()) {
+            auto indexed_name = std::dynamic_pointer_cast<ast::IndexedName>(identifier);
+            unsigned length = get_array_index_or_length(*indexed_name);
+            var_type = llvm::ArrayType::get(llvm::Type::getDoubleTy(*context), length);
+        } else if (identifier->is_name()) {
+            // This case corresponds to a scalar local variable. Its type is double by default.
+            var_type = llvm::Type::getDoubleTy(*context);
+        } else {
+            throw std::runtime_error("Error: Unsupported local variable type");
+        }
+        builder.CreateAlloca(var_type, /*ArraySize=*/nullptr, name);
     }
 }

@@
 void CodegenLLVMVisitor::visit_var_name(const ast::VarName& node) {
-    llvm::Value* var = builder.CreateLoad(local_named_values->lookup(node.get_node_name()));
+    const auto& identifier = node.get_name();
+    if (!identifier->is_name() && !identifier->is_indexed_name())
+        throw std::runtime_error("Error: Unsupported variable type");
+
+    llvm::Value* ptr;
+    if (identifier->is_name())
+        ptr = local_named_values->lookup(node.get_node_name());
+
+    if (identifier->is_indexed_name()) {
+        auto indexed_name = std::dynamic_pointer_cast<ast::IndexedName>(identifier);
+        ptr = codegen_indexed_name(*indexed_name);
+    }
+
+    // Finally, load the variable from the pointer value.
+    llvm::Value* var = builder.CreateLoad(ptr);
     values.push_back(var);
 }

diff --git a/src/codegen/llvm/codegen_llvm_visitor.hpp b/src/codegen/llvm/codegen_llvm_visitor.hpp
index 32347bdabd..be4eb04867 100644
--- a/src/codegen/llvm/codegen_llvm_visitor.hpp
+++ b/src/codegen/llvm/codegen_llvm_visitor.hpp
@@ -100,6 +100,35 @@ class CodegenLLVMVisitor: public visitor::ConstAstVisitor {
         , builder(*context)
         , fpm(module.get()) {}

+    /**
+     * Checks if array index specified by the given IndexedName is within bounds
+     * \param node IndexedName representing array
+     * \return \c true if the index is within bounds
+     */
+    bool check_array_bounds(const ast::IndexedName& node, unsigned index);
+
+    /**
+     * Generates LLVM code for the given IndexedName
+     * \param node IndexedName NMODL AST node
+     * \return LLVM code generated for this AST node
+     */
+    llvm::Value* codegen_indexed_name(const ast::IndexedName& node);
+
+    /**
+     * Returns GEP instruction to a 1D array
+     * \param name 1D array name
+     * \param index element index
+     * \return GEP instruction value
+     */
+    llvm::Value* create_gep(const std::string& name, unsigned index);
+
+    /**
+     * Returns array index or length from given IndexedName
+     * \param node IndexedName representing array
+     * \return array index or length
+     */
+    unsigned get_array_index_or_length(const ast::IndexedName& node);
+
     /**
      * Create a function call to an external method
      * \param name external method name
diff --git a/test/unit/codegen/llvm.cpp b/test/unit/codegen/llvm.cpp
index d2c0a65e86..0ceadbe6f1 100644
--- a/test/unit/codegen/llvm.cpp
+++ b/test/unit/codegen/llvm.cpp
@@ -116,6 +116,31 @@
     }
 }

+//=============================================================================
+// Define
+//=============================================================================
+
+SCENARIO("Define", "[visitor][llvm]") {
+    GIVEN("Procedure with array variable of length specified by DEFINE") {
+        std::string nmodl_text = R"(
+            DEFINE N 100
+
+            PROCEDURE foo() {
+                LOCAL x[N]
+            }
+        )";
+
+        THEN("macro is expanded and array is allocated") {
+            std::string module_string = run_llvm_visitor(nmodl_text);
+            std::smatch m;
+
+            // Check stack allocation of the array x.
+            std::regex array(R"(%x = alloca \[100 x double\])");
+            REQUIRE(std::regex_search(module_string, m, array));
+        }
+    }
+}
+
 //=============================================================================
 // FunctionBlock
 //=============================================================================
@@ -256,6 +281,92 @@
+//=============================================================================
+// IndexedName
+//=============================================================================
+
+SCENARIO("Indexed name", "[visitor][llvm]") {
+    GIVEN("Procedure with a local array variable") {
+        std::string nmodl_text = R"(
+            PROCEDURE foo() {
+                LOCAL x[2]
+            }
+        )";
+
+        THEN("array is allocated") {
+            std::string module_string = run_llvm_visitor(nmodl_text);
+            std::smatch m;
+
+            std::regex array(R"(%x = alloca \[2 x double\])");
+            REQUIRE(std::regex_search(module_string, m, array));
+        }
+    }
+
+    GIVEN("Procedure with a local array assignment") {
+        std::string nmodl_text = R"(
+            PROCEDURE foo() {
+                LOCAL x[2]
+                x[1] = 3
+            }
+        )";
+
+        THEN("element is stored to the array") {
+            std::string module_string = run_llvm_visitor(nmodl_text);
+            std::smatch m;
+
+            // Check GEP is created correctly to point at the array element.
+            std::regex GEP(
+                R"(%1 = getelementptr inbounds \[2 x double\], \[2 x double\]\* %x, i32 0, i32 1)");
+            REQUIRE(std::regex_search(module_string, m, GEP));
+
+            // Check the value is stored to the pointer.
+            std::regex store(R"(store double 3.000000e\+00, double\* %1)");
+            REQUIRE(std::regex_search(module_string, m, store));
+        }
+    }
+
+    GIVEN("Procedure with an assignment of array element") {
+        std::string nmodl_text = R"(
+            PROCEDURE foo() {
+                LOCAL x[2], y
+                x[1] = 3
+                y = x[1]
+            }
+        )";
+
+        THEN("array element is stored to the variable") {
+            std::string module_string = run_llvm_visitor(nmodl_text);
+            std::smatch m;
+
+            // Check GEP is created correctly to point at the array element.
+            std::regex GEP(
+                R"(%2 = getelementptr inbounds \[2 x double\], \[2 x double\]\* %x, i32 0, i32 1)");
+            REQUIRE(std::regex_search(module_string, m, GEP));
+
+            // Check the value is loaded from the pointer.
+            std::regex load(R"(%3 = load double, double\* %2)");
+            REQUIRE(std::regex_search(module_string, m, load));
+
+            // Check the value is stored to the variable.
+            std::regex store(R"(store double %3, double\* %y)");
+            REQUIRE(std::regex_search(module_string, m, store));
+        }
+    }
+
+    GIVEN("Array with out of bounds access") {
+        std::string nmodl_text = R"(
+            PROCEDURE foo() {
+                LOCAL x[2]
+                x[5] = 3
+            }
+        )";
+
+        THEN("error is thrown") {
+            REQUIRE_THROWS_AS(run_llvm_visitor(nmodl_text), std::runtime_error);
+        }
+    }
+}
+
 //=============================================================================
 // LocalList and LocalVar
 //=============================================================================

From dbda27148d19abffc64aa4d7808adaa3c9f7ac30 Mon Sep 17 00:00:00 2001
From: Pramod Kumbhar
Date: Wed, 6 Jan 2021 23:51:59 +0100
Subject: [PATCH 012/331] Improvements for code generation specific
 transformations (#483)

NMODL AST needs various transformations to generate C++ code or LLVM IR.
This PR is the beginning of AST transformations to simplify code generation
backends.

* New CodegenLLVMHelperVisitor to perform various AST transformations to
  simplify code generation for various backends and simulators.
* CodegenLLVMHelperVisitor is currently limited to LLVM backend to simplify
  initial implementation and keep C++ based backends working.
* CodegenLLVMHelperVisitor now handles FUNCTIONS and PROCEDURES blocks
  - Replace LocalListStatement with CodegenVarStatement
  - Added new AST types for code generation
    - CodegenVar to represent a variable used for code generation
    - CodegenVarType to represent the type of a codegen variable
    - CodegenVarListStatement to represent a list of CodegenVar
    - CodegenStruct will be used in the future to represent structs like
      NrnThread or the Mechanism class

See #474
---
 src/codegen/llvm/CMakeLists.txt               |   7 +-
 .../llvm/codegen_llvm_helper_visitor.cpp      | 113 ++++++++++++++++++
 .../llvm/codegen_llvm_helper_visitor.hpp      |  50 ++++++++
 src/codegen/llvm/codegen_llvm_visitor.cpp     |   8 ++
 src/language/code_generator.cmake             |   8 ++
 src/language/codegen.yaml                     | 108 ++++++++++++++++-
 src/language/nmodl.yaml                       |   2 +-
 src/language/node_info.py                     |   2 +
 src/language/nodes.py                         |   4 +
 src/language/templates/ast/ast_decl.hpp       |  10 ++
 .../templates/visitors/nmodl_visitor.cpp      |   3 +
 src/main.cpp                                  |   3 +
 test/integration/mod/procedure.mod            |   2 +-
 test/unit/codegen/llvm.cpp                    |   2 +-
 14 files changed, 316 insertions(+), 6 deletions(-)
 create mode 100644 src/codegen/llvm/codegen_llvm_helper_visitor.cpp
 create mode 100644 src/codegen/llvm/codegen_llvm_helper_visitor.hpp

diff --git a/src/codegen/llvm/CMakeLists.txt b/src/codegen/llvm/CMakeLists.txt
index 71ecca338c..db16d4072c 100644
--- a/src/codegen/llvm/CMakeLists.txt
+++ b/src/codegen/llvm/CMakeLists.txt
@@ -1,8 +1,11 @@
 # =============================================================================
 # Codegen sources
 # =============================================================================
-set(LLVM_CODEGEN_SOURCE_FILES ${CMAKE_CURRENT_SOURCE_DIR}/codegen_llvm_visitor.cpp
-                              ${CMAKE_CURRENT_SOURCE_DIR}/codegen_llvm_visitor.hpp)
+set(LLVM_CODEGEN_SOURCE_FILES
+    ${CMAKE_CURRENT_SOURCE_DIR}/codegen_llvm_visitor.cpp
+    ${CMAKE_CURRENT_SOURCE_DIR}/codegen_llvm_visitor.hpp
+    ${CMAKE_CURRENT_SOURCE_DIR}/codegen_llvm_helper_visitor.cpp
+    ${CMAKE_CURRENT_SOURCE_DIR}/codegen_llvm_helper_visitor.hpp)

 # =============================================================================
 # LLVM codegen library
diff --git a/src/codegen/llvm/codegen_llvm_helper_visitor.cpp b/src/codegen/llvm/codegen_llvm_helper_visitor.cpp
new file mode 100644
index 0000000000..c52cc92a3d
--- /dev/null
+++ b/src/codegen/llvm/codegen_llvm_helper_visitor.cpp
@@ -0,0 +1,113 @@
+
+/*************************************************************************
+ * Copyright (C) 2018-2019 Blue Brain Project
+ *
+ * This file is part of NMODL distributed under the terms of the GNU
+ * Lesser General Public License. See top-level LICENSE file for details.
+ *************************************************************************/
+
+#include "codegen_llvm_helper_visitor.hpp"
+
+#include "ast/all.hpp"
+#include "utils/logger.hpp"
+#include "visitors/visitor_utils.hpp"
+
+namespace nmodl {
+namespace codegen {
+
+using namespace fmt::literals;
+
+void CodegenLLVMHelperVisitor::visit_statement_block(ast::StatementBlock& node) {
+    node.visit_children(*this);
+
+    /// if a local list statement exists, we have to replace it
+    auto local_statement = visitor::get_local_list_statement(node);
+    if (local_statement) {
+        /// create codegen variables from local variables
+        ast::CodegenVarVector variables;
+        for (const auto& var: local_statement->get_variables()) {
+            variables.emplace_back(new ast::CodegenVar(0, var->get_name()->clone()));
+        }
+
+        /// remove local list statement now
+        const auto& statements = node.get_statements();
+        node.erase_statement(statements.begin());
+
+        /// create new codegen variable statement
+        auto type = new ast::CodegenVarType(ast::AstNodeType::DOUBLE);
+        auto statement = std::make_shared<ast::CodegenVarListStatement>(type, variables);
+
+        /// insert codegen variable statement
+        node.insert_statement(statements.begin(), statement);
+    }
+}
+
+void CodegenLLVMHelperVisitor::add_function_procedure_node(ast::Block& node) {
+    std::string function_name = node.get_node_name();
+
+    const auto& source_node_type = node.get_node_type();
+    auto name = new ast::Name(new ast::String(function_name));
+    auto return_var = new ast::Name(new ast::String("ret_" + function_name));
+    ast::CodegenVarType* var_type = nullptr;
+    ast::CodegenVarType* return_type = nullptr;
+
+    /// return type based on node type
+    bool is_function = source_node_type == ast::AstNodeType::FUNCTION_BLOCK;
+    if (is_function) {
+        var_type = new ast::CodegenVarType(ast::AstNodeType::DOUBLE);
+    } else {
+        var_type = new ast::CodegenVarType(ast::AstNodeType::INTEGER);
+    }
+
+    /// return type is same as variable type
+    return_type = var_type->clone();
+
+    /// function body and its statements
+    auto block = node.get_statement_block()->clone();
+    const auto& statements = block->get_statements();
+
+    /// insert return variable at the start of the block
+    ast::CodegenVarVector codegen_vars;
+    codegen_vars.emplace_back(new ast::CodegenVar(0, return_var->clone()));
+    auto statement = std::make_shared<ast::CodegenVarListStatement>(var_type, codegen_vars);
+    block->insert_statement(statements.begin(), statement);
+
+    /// add return statement
+    auto return_statement = new ast::CodegenReturnStatement(return_var);
+    block->emplace_back_statement(return_statement);
+
+    /// prepare arguments
+    ast::CodegenArgumentVector code_arguments;
+    const auto& arguments = node.get_parameters();
+    for (const auto& arg: arguments) {
+        auto type = new ast::CodegenVarType(ast::AstNodeType::DOUBLE);
+        auto var = arg->get_name()->clone();
+        code_arguments.emplace_back(new ast::CodegenArgument(type, var));
+    }
+
+    /// add new node to AST
+    auto function =
+        std::make_shared<ast::CodegenFunction>(return_type, name, code_arguments, block);
+    codegen_functions.push_back(function);
+}
+
+void CodegenLLVMHelperVisitor::visit_procedure_block(ast::ProcedureBlock& node) {
+    node.visit_children(*this);
+    add_function_procedure_node(node);
+}
+
+void CodegenLLVMHelperVisitor::visit_function_block(ast::FunctionBlock& node) {
+    node.visit_children(*this);
+    add_function_procedure_node(node);
+}
+
+void CodegenLLVMHelperVisitor::visit_program(ast::Program& node) {
+    logger->info("Running CodegenLLVMHelperVisitor");
+    node.visit_children(*this);
+    for (auto& fun: codegen_functions) {
+        node.emplace_back_node(fun);
+    }
+}
+
+}  // namespace codegen
+}  // namespace nmodl
diff --git a/src/codegen/llvm/codegen_llvm_helper_visitor.hpp b/src/codegen/llvm/codegen_llvm_helper_visitor.hpp
new file mode 100644
index 0000000000..b7ff57aec1
--- /dev/null
+++ b/src/codegen/llvm/codegen_llvm_helper_visitor.hpp
@@ -0,0 +1,50 @@
+/*************************************************************************
+ * Copyright (C) 2018-2019 Blue Brain Project
+ *
+ * This file is part of NMODL distributed under the terms of the GNU
+ * Lesser General Public License. See top-level LICENSE file for details.
+ *************************************************************************/
+
+#pragma once
+
+/**
+ * \file
+ * \brief \copybrief nmodl::codegen::CodegenLLVMHelperVisitor
+ */
+
+#include
+
+#include "codegen/codegen_info.hpp"
+#include "symtab/symbol_table.hpp"
+#include "visitors/ast_visitor.hpp"
+
+namespace nmodl {
+namespace codegen {
+
+/**
+ * @addtogroup llvm_codegen_details
+ * @{
+ */
+
+/**
+ * \class CodegenLLVMHelperVisitor
+ * \brief Helper visitor to gather AST information to help LLVM code generation
+ */
+class CodegenLLVMHelperVisitor: public visitor::AstVisitor {
+    std::vector<std::shared_ptr<ast::CodegenFunction>> codegen_functions;
+
+    void add_function_procedure_node(ast::Block& node);
+
+  public:
+    CodegenLLVMHelperVisitor() = default;
+
+    void visit_statement_block(ast::StatementBlock& node) override;
+    void visit_procedure_block(ast::ProcedureBlock& node) override;
+    void visit_function_block(ast::FunctionBlock& node) override;
+    void visit_program(ast::Program& node) override;
+};
+
+/** @} */  // end of llvm_codegen_details
+
+}  // namespace codegen
+}  // namespace nmodl
diff --git a/src/codegen/llvm/codegen_llvm_visitor.cpp b/src/codegen/llvm/codegen_llvm_visitor.cpp
index b2a09fdd96..0fa0864d9a 100644
--- a/src/codegen/llvm/codegen_llvm_visitor.cpp
+++ b/src/codegen/llvm/codegen_llvm_visitor.cpp
@@ -6,6 +6,8 @@
  *************************************************************************/

 #include "codegen/llvm/codegen_llvm_visitor.hpp"
+#include "codegen/llvm/codegen_llvm_helper_visitor.hpp"
+
 #include "ast/all.hpp"
 #include "codegen/codegen_helper_visitor.hpp"
 #include "visitors/rename_visitor.hpp"
@@ -347,6 +349,12 @@ void CodegenLLVMVisitor::visit_program(const ast::Program& node) {

     // Keep this for easier development (maybe move to debug mode later).
     std::cout << print_module();
+
+    // not used yet
+    {
+        CodegenLLVMHelperVisitor v;
+        v.visit_program(const_cast<ast::Program&>(node));
+    }
 }

 void CodegenLLVMVisitor::visit_procedure_block(const ast::ProcedureBlock& node) {
diff --git a/src/language/code_generator.cmake b/src/language/code_generator.cmake
index 400b969a23..585ac19917 100644
--- a/src/language/code_generator.cmake
+++ b/src/language/code_generator.cmake
@@ -65,6 +65,14 @@ set(AST_GENERATED_SOURCES
     ${PROJECT_BINARY_DIR}/src/ast/block_comment.hpp
     ${PROJECT_BINARY_DIR}/src/ast/boolean.hpp
     ${PROJECT_BINARY_DIR}/src/ast/breakpoint_block.hpp
+    ${PROJECT_BINARY_DIR}/src/ast/codegen_argument.hpp
+    ${PROJECT_BINARY_DIR}/src/ast/codegen_for_statement.hpp
+    ${PROJECT_BINARY_DIR}/src/ast/codegen_function.hpp
+    ${PROJECT_BINARY_DIR}/src/ast/codegen_return_statement.hpp
+    ${PROJECT_BINARY_DIR}/src/ast/codegen_struct.hpp
+    ${PROJECT_BINARY_DIR}/src/ast/codegen_var.hpp
+    ${PROJECT_BINARY_DIR}/src/ast/codegen_var_list_statement.hpp
+    ${PROJECT_BINARY_DIR}/src/ast/codegen_var_type.hpp
     ${PROJECT_BINARY_DIR}/src/ast/compartment.hpp
     ${PROJECT_BINARY_DIR}/src/ast/conductance_hint.hpp
     ${PROJECT_BINARY_DIR}/src/ast/conserve.hpp
diff --git a/src/language/codegen.yaml b/src/language/codegen.yaml
index 63762a9be0..7d5516c196 100644
--- a/src/language/codegen.yaml
+++ b/src/language/codegen.yaml
@@ -29,6 +29,34 @@
       children:
         - Number:
         - Identifier:
+            children:
+              - CodegenVarType:
+                  brief: "Represent type of the variable"
+                  members:
+                    - type:
+                        brief: "Type of the ast node"
+                        type: AstNodeType
+              - CodegenVar:
+                  brief: "Represent variable used for code generation"
+                  members:
+                    - pointer:
+                        brief: "If variable is pointer type"
+                        type: int
+                    - name:
+                        brief: "Name of the variable"
+                        type: Identifier
+                        node_name: true
+              - CodegenArgument:
+                  brief: "Represent argument to a function"
+                  members:
+                    - type:
+                        brief: "Type of the argument"
+                        type: CodegenVarType
+                        suffix: {value: " "}
+                    - name:
+                        brief: "Name of the argument"
+                        type: Identifier
+                        node_name: true
         - Block:
             children:
               - NrnStateBlock:
@@ -89,7 +117,29 @@
                     - finalize_block:
                         brief: "Statement block to be executed after calling linear solver"
-                        type: StatementBlock
+                        type: StatementBlock
+              - CodegenFunction:
+                  brief: "Function generated from FUNCTION or PROCEDURE block"
+                  members:
+                    - return_type:
+                        brief: "Return type of the function"
+                        type: CodegenVarType
+                        suffix: {value: " "}
+                    - name:
+                        brief: "Name of the function"
+                        type: Name
+                        node_name: true
+                    - arguments:
+                        brief: "Vector of the parameters to the function"
+                        type: CodegenArgument
+                        vector: true
+                        prefix: {value: "(", force: true}
+                        suffix: {value: ")", force: true}
+                        separator: ", "
+                    - statement_block:
+                        brief: "Body of the function"
+                        type: StatementBlock
+                        getter: {override: true}
         - WrappedExpression:
             brief: "Wrap any other expression type"
            members:
@@ -110,4 +160,60 @@
               - node_to_solve:
                   brief: "Block to be solved (callback node or solution node itself)"
                   type: Expression
+        - CodegenStruct:
+            brief: "Represent a struct or class for code generation"
+            members:
+              - variable_statements:
+                  brief: "member variables of the class/struct"
+                  type: CodegenVarListStatement
+                  vector: true
+              - functions:
+                  brief: "member functions of the class/struct"
+                  type: CodegenFunction
+                  vector: true
   - Statement:
+      children:
+        - CodegenForStatement:
+            brief: "Represent for loop used for code generation"
+            nmodl: "for("
+            members:
+              - initialization:
+                  brief: "initialization expression for the loop"
+                  type: Expression
+                  optional: true
+              - condition:
"condition expression for the loop" + type: Expression + optional: true + prefix: {value: ";"} + suffix: {value: "; "} + - increment: + brief: "increment or decrement expression for the loop" + type: Expression + optional: true + suffix: {value: ") "} + - statement_block: + brief: "body of the loop" + type: StatementBlock + getter: {override: true} + - CodegenReturnStatement: + brief: "Represent return statement for code generation" + nmodl: "return " + members: + - statement: + brief: "return statement" + type: Expression + optional: true + - CodegenVarListStatement: + brief: "Represent list of variables used for code generation" + members: + - var_type: + brief: "Type of the variables" + type: CodegenVarType + suffix: {value: " "} + - variables: + brief: "List of the variables to define" + type: CodegenVar + vector: true + separator: ", " + add: true \ No newline at end of file diff --git a/src/language/nmodl.yaml b/src/language/nmodl.yaml index 0724f81e29..54da340b7b 100644 --- a/src/language/nmodl.yaml +++ b/src/language/nmodl.yaml @@ -1374,7 +1374,7 @@ type: Double - Statement: - brief: "TODO" + brief: "Base class to represent a statement in the NMODL" children: - UnitState: brief: "TODO" diff --git a/src/language/node_info.py b/src/language/node_info.py index f4fb599347..bd81a0d14a 100644 --- a/src/language/node_info.py +++ b/src/language/node_info.py @@ -29,6 +29,7 @@ "QueueType", "BAType", "UnitStateType", + "AstNodeType", } BASE_TYPES = {"std::string" } | INTEGRAL_TYPES @@ -167,6 +168,7 @@ STATEMENT_BLOCK_NODE = "StatementBlock" STRING_NODE = "String" UNIT_BLOCK = "UnitBlock" +AST_NODETYPE_NODE= "AstNodeType" # name of variable in prime node which represent order of derivative ORDER_VAR_NAME = "order" diff --git a/src/language/nodes.py b/src/language/nodes.py index a539b55647..05f53f3b97 100644 --- a/src/language/nodes.py +++ b/src/language/nodes.py @@ -147,6 +147,10 @@ def is_boolean_node(self): def is_name_node(self): return self.class_name == node_info.NAME_NODE + @property + def is_ast_nodetype_node(self): + return self.class_name == node_info.AST_NODETYPE_NODE + @property def is_enum_node(self): data_type = node_info.DATA_TYPES[self.class_name] diff --git a/src/language/templates/ast/ast_decl.hpp b/src/language/templates/ast/ast_decl.hpp index cbca65e692..196dc9daf4 100644 --- a/src/language/templates/ast/ast_decl.hpp +++ b/src/language/templates/ast/ast_decl.hpp @@ -12,6 +12,7 @@ #pragma once #include +#include #include /// \file @@ -50,6 +51,15 @@ enum class AstNodeType { /** @} */ // end of ast_type +static inline std::string to_string(AstNodeType type) { + {% for node in nodes %} + if(type == AstNodeType::{{ node.class_name|snake_case|upper }}) { + return "{{ node.class_name|snake_case|upper }}"; + } + {% endfor %} + throw std::runtime_error("Unhandled type in to_string(AstNodeType type)!"); +} + /** * @defgroup ast_vec_type AST Vector Type Aliases * @ingroup ast diff --git a/src/language/templates/visitors/nmodl_visitor.cpp b/src/language/templates/visitors/nmodl_visitor.cpp index a69c3b0b26..f7bb8279ca 100644 --- a/src/language/templates/visitors/nmodl_visitor.cpp +++ b/src/language/templates/visitors/nmodl_visitor.cpp @@ -116,6 +116,9 @@ void NmodlPrintVisitor::visit_{{ node.class_name|snake_case}}(const {{ node.clas {% for child in node.children %} {% call guard(child.force_prefix, child.force_suffix) -%} {% if child.is_base_type_node %} + {% if child.is_ast_nodetype_node %} + printer->add_element(ast::to_string(node.get_{{child.varname}}())); + {% endif %} {% else %} 
     {% if child.optional or child.is_statement_block_node %}
     if(node.get_{{ child.varname }}()) {
diff --git a/src/main.cpp b/src/main.cpp
index bc123f905a..53ff1f0f47 100644
--- a/src/main.cpp
+++ b/src/main.cpp
@@ -17,9 +17,11 @@
 #include "codegen/codegen_cuda_visitor.hpp"
 #include "codegen/codegen_ispc_visitor.hpp"
 #include "codegen/codegen_omp_visitor.hpp"
+
 #ifdef NMODL_LLVM_BACKEND
 #include "codegen/llvm/codegen_llvm_visitor.hpp"
 #endif
+
 #include "config/config.h"
 #include "parser/nmodl_driver.hpp"
 #include "pybind/pyembed.hpp"
@@ -577,6 +579,7 @@
             logger->info("Running LLVM backend code generator");
             CodegenLLVMVisitor visitor(modfile, output_dir, llvm_opt_passes);
             visitor.visit_program(*ast);
+            ast_to_nmodl(*ast, filepath("llvm"));
         }
 #endif
     }
diff --git a/test/integration/mod/procedure.mod b/test/integration/mod/procedure.mod
index 4a45af7d1e..daa4ad33ad 100644
--- a/test/integration/mod/procedure.mod
+++ b/test/integration/mod/procedure.mod
@@ -21,7 +21,7 @@ PROCEDURE complex_sum(v) {
     }
 }

-PROCEDURE loop_proc(v) {
+PROCEDURE loop_proc(v, t) {
     LOCAL i
     i = 0
     WHILE(i < 10) {
diff --git a/test/unit/codegen/llvm.cpp b/test/unit/codegen/llvm.cpp
index 0ceadbe6f1..d644947e79 100644
--- a/test/unit/codegen/llvm.cpp
+++ b/test/unit/codegen/llvm.cpp
@@ -491,4 +491,4 @@
             REQUIRE(std::regex_search(module_string, m, empty_proc));
         }
     }
-}
\ No newline at end of file
+}

From 83abf60d01bce07cee72bfbe170d880bb38222eb Mon Sep 17 00:00:00 2001
From: Pramod Kumbhar
Date: Thu, 7 Jan 2021 00:50:36 +0100
Subject: [PATCH 013/331] nrn_state function generation in NMODL AST to help
 LLVM codegen (#484)

* Added new BinaryOp for += and -=
* Added string_to_binaryop function
* Added Void node type to represent void return type
* Added CodegenAtomicStatement for ion write statements
* llvm helper started handling visit_nrn_state_block
  - NrnStateBlock is being converted into CodegenFunction
  - for loop body with solution blocks created
  - voltage and node index initialization code added
  - read and write ion statements are handled
* Some of the functions are now moved into CodegenInfo

Co-authored-by: Ioannis Magkanaris
---
 src/ast/ast_common.hpp                        |  25 +-
 src/codegen/codegen_c_visitor.hpp             |  50 --
 src/codegen/codegen_info.cpp                  |  74 +++
 src/codegen/codegen_info.hpp                  |  94 ++++
 .../llvm/codegen_llvm_helper_visitor.cpp      | 500 ++++++++++++++++--
 .../llvm/codegen_llvm_helper_visitor.hpp      |  42 +-
 src/codegen/llvm/codegen_llvm_visitor.cpp     |   2 +-
 src/language/code_generator.cmake             |   2 +
 src/language/codegen.yaml                     |  39 +-
 test/unit/CMakeLists.txt                      |   6 +-
 10 files changed, 725 insertions(+), 109 deletions(-)

diff --git a/src/ast/ast_common.hpp b/src/ast/ast_common.hpp
index eb854bb5c5..733fc406f7 100644
--- a/src/ast/ast_common.hpp
+++ b/src/ast/ast_common.hpp
@@ -43,9 +43,12 @@ namespace ast {
  *
  * NMODL supports different binary operators and this
  * type is used to store their value in the AST.
+ *
+ * \note `+=` and `-=` are not supported by NMODL but they
+ *       are added for code generation nodes.
  */
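A short usage sketch for the new codegen-only operators and the `string_to_binaryop` helper defined further down in this diff (it assumes the nmodl headers are on the include path; not part of the patch itself):

    #include <cassert>

    #include "ast/ast_common.hpp"

    int main() {
        using namespace nmodl::ast;
        assert(string_to_binaryop("+") == BOP_ADDITION);
        assert(string_to_binaryop("+=") == BOP_ADD_ASSIGN);  // codegen-only operator
        // Any operator outside BinaryOpNames throws std::runtime_error.
        return 0;
    }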
 typedef enum {
-    BOP_ADDITION,        ///< \+
+    BOP_ADDITION = 0,    ///< \+
     BOP_SUBTRACTION,     ///< \-
     BOP_MULTIPLICATION,  ///< \c *
     BOP_DIVISION,        ///< \/
@@ -58,7 +61,9 @@ typedef enum {
     BOP_LESS_EQUAL,      ///< <=
     BOP_ASSIGN,          ///< =
     BOP_NOT_EQUAL,       ///< !=
-    BOP_EXACT_EQUAL      ///< ==
+    BOP_EXACT_EQUAL,     ///< ==
+    BOP_ADD_ASSIGN,      ///< \+=
+    BOP_SUB_ASSIGN       ///< \-=
 } BinaryOp;

 /**
@@ -68,7 +73,7 @@
  * is used to lookup the corresponding symbol for the operator.
  */
 static const std::string BinaryOpNames[] =
-    {"+", "-", "*", "/", "^", "&&", "||", ">", "<", ">=", "<=", "=", "!=", "=="};
+    {"+", "-", "*", "/", "^", "&&", "||", ">", "<", ">=", "<=", "=", "!=", "==", "+=", "-="};

 /// enum type for unary operators
 typedef enum { UOP_NOT, UOP_NEGATION } UnaryOp;
@@ -106,6 +111,20 @@ typedef enum { LTMINUSGT, LTLT, MINUSGT } ReactionOp;

 /// string representation of ast::ReactionOp
 static const std::string ReactionOpNames[] = {"<->", "<<", "->"};

+/**
+ * Get corresponding ast::BinaryOp for given string
+ * @param op Binary operator in string format
+ * @return ast::BinaryOp for given string
+ */
+static inline BinaryOp string_to_binaryop(const std::string& op) {
+    /// check if binary operator supported otherwise error
+    auto it = std::find(std::begin(BinaryOpNames), std::end(BinaryOpNames), op);
+    if (it == std::end(BinaryOpNames)) {
+        throw std::runtime_error("Error in string_to_binaryop, can't find " + op);
+    }
+    int pos = std::distance(std::begin(BinaryOpNames), it);
+    return static_cast<BinaryOp>(pos);
+}

 /** @} */  // end of ast_prop

 }  // namespace ast
diff --git a/src/codegen/codegen_c_visitor.hpp b/src/codegen/codegen_c_visitor.hpp
index 87dad2d3ef..7b3ad57e7f 100644
--- a/src/codegen/codegen_c_visitor.hpp
+++ b/src/codegen/codegen_c_visitor.hpp
@@ -46,40 +46,6 @@ namespace codegen {
  * @{
  */

-/**
- * \enum BlockType
- * \brief Helper to represent various block types
- *
- * Note: do not assign integers to these enums
- *
- */
-enum BlockType {
-    /// initial block
-    Initial,
-
-    /// destructor block
-    Destructor,
-
-    /// breakpoint block
-    Equation,
-
-    /// ode_* routines block (not used)
-    Ode,
-
-    /// derivative block
-    State,
-
-    /// watch block
-    Watch,
-
-    /// net_receive block
-    NetReceive,
-
-    /// fake ending block type for loops on the enums. Keep it at the end
-    BlockTypeEnd
-};
-
-
 /**
  * \enum MemberType
  * \brief Helper to represent various variables types
@@ -134,22 +100,6 @@ struct IndexVariableInfo {
         , is_integer(is_integer) {}
 };

-
-/**
- * \class ShadowUseStatement
- * \brief Represents ion write statement during code generation
- *
- * Ion update statement needs use of shadow vectors for certain backends
- * as atomics operations are not supported on cpu backend.
- * - * \todo If shadow_lhs is empty then we assume shadow statement not required - */ -struct ShadowUseStatement { - std::string lhs; - std::string op; - std::string rhs; -}; - /** @} */ // end of codegen_details diff --git a/src/codegen/codegen_info.cpp b/src/codegen/codegen_info.cpp index 8f6bd448f8..2219a18913 100644 --- a/src/codegen/codegen_info.cpp +++ b/src/codegen/codegen_info.cpp @@ -8,6 +8,7 @@ #include "codegen/codegen_info.hpp" #include "ast/all.hpp" +#include "utils/logger.hpp" #include "visitors/var_usage_visitor.hpp" #include "visitors/visitor_utils.hpp" @@ -15,6 +16,8 @@ namespace nmodl { namespace codegen { +using namespace fmt::literals; +using symtab::syminfo::NmodlType; using visitor::VarUsageVisitor; /// if any ion has write variable @@ -131,5 +134,76 @@ bool CodegenInfo::is_voltage_used_by_watch_statements() const { return false; } +bool CodegenInfo::state_variable(const std::string& name) const { + // clang-format off + auto result = std::find_if(state_vars.begin(), + state_vars.end(), + [&name](const SymbolType& sym) { + return name == sym->get_name(); + } + ); + // clang-format on + return result != state_vars.end(); +} + +std::pair CodegenInfo::read_ion_variable_name( + const std::string& name) const { + return {name, "ion_" + name}; +} + + +std::pair CodegenInfo::write_ion_variable_name( + const std::string& name) const { + return {"ion_" + name, name}; +} + + +/** + * \details Current variable used in breakpoint block could be local variable. + * In this case, neuron has already renamed the variable name by prepending + * "_l". In our implementation, the variable could have been renamed by + * one of the pass. And hence, we search all local variables and check if + * the variable is renamed. Note that we have to look into the symbol table + * of statement block and not breakpoint. 
+ */ +std::string CodegenInfo::breakpoint_current(std::string current) const { + auto& breakpoint = breakpoint_node; + if (breakpoint == nullptr) { + return current; + } + const auto& symtab = breakpoint->get_statement_block()->get_symbol_table(); + const auto& variables = symtab->get_variables_with_properties(NmodlType::local_var); + for (const auto& var: variables) { + std::string renamed_name = var->get_name(); + std::string original_name = var->get_original_name(); + if (current == original_name) { + current = renamed_name; + break; + } + } + return current; +} + + +bool CodegenInfo::is_an_instance_variable(const std::string& varname) const { + /// check if symbol of given name exist + auto check_symbol = [](const std::string& name, const std::vector& symbols) { + for (auto& symbol: symbols) { + if (symbol->get_name() == name) { + return true; + } + } + return false; + }; + + /// check if variable exist into all possible types + if (check_symbol(varname, assigned_vars) || check_symbol(varname, state_vars) || + check_symbol(varname, range_parameter_vars) || check_symbol(varname, range_assigned_vars) || + check_symbol(varname, range_state_vars)) { + return true; + } + return false; +} + } // namespace codegen } // namespace nmodl diff --git a/src/codegen/codegen_info.hpp b/src/codegen/codegen_info.hpp index 2df99d7c1c..3298391674 100644 --- a/src/codegen/codegen_info.hpp +++ b/src/codegen/codegen_info.hpp @@ -15,6 +15,7 @@ #include #include "ast/ast.hpp" +#include "codegen/codegen_naming.hpp" #include "symtab/symbol_table.hpp" namespace nmodl { @@ -126,6 +127,59 @@ struct IndexSemantics { , size(size) {} }; +/** + * \enum BlockType + * \brief Helper to represent various block types + * + * Note: do not assign integers to these enums + * + */ +enum BlockType { + /// initial block + Initial, + + /// destructor block + Destructor, + + /// breakpoint block + Equation, + + /// ode_* routines block (not used) + Ode, + + /// derivative block + State, + + /// watch block + Watch, + + /// net_receive block + NetReceive, + + /// fake ending block type for loops on the enums. Keep it at the end + BlockTypeEnd +}; + +/** + * \class ShadowUseStatement + * \brief Represents ion write statement during code generation + * + * Ion update statement needs use of shadow vectors for certain backends + * as atomics operations are not supported on cpu backend. + * + * \todo Currently `nrn_wrote_conc` is also added to shadow update statements + * list as it's corresponding to ion update statement in INITIAL block. This + * needs to be factored out. + * \todo This can be represented as AST node (like ast::CodegenAtomicStatement) + * but currently C backend use this same implementation. So we are using this + * same structure and then converting to ast::CodegenAtomicStatement for LLVM + * visitor. 
+ */
+struct ShadowUseStatement {
+    std::string lhs;
+    std::string op;
+    std::string rhs;
+};

 /**
  * \class CodegenInfo
@@ -398,6 +452,46 @@ struct CodegenInfo {
     /// true if WatchStatement uses voltage v variable
     bool is_voltage_used_by_watch_statements() const;

+    /**
+     * Checks if the given variable name belongs to a state variable
+     * \param name The variable name
+     * \return \c true if the variable is a state variable
+     */
+    bool state_variable(const std::string& name) const;
+
+    /**
+     * Return ion variable name and corresponding ion read variable name
+     * \param name The ion variable name
+     * \return The ion read variable name
+     */
+    std::pair<std::string, std::string> read_ion_variable_name(const std::string& name) const;
+
+    /**
+     * Return ion variable name and corresponding ion write variable name
+     * \param name The ion variable name
+     * \return The ion write variable name
+     */
+    std::pair<std::string, std::string> write_ion_variable_name(const std::string& name) const;
+
+    /**
+     * Determine the variable name for the "current" used in breakpoint block taking into account
+     * intermediate code transformations.
+     * \param current The variable name for the current used in the model
+     * \return The name for the current to be printed in C
+     */
+    std::string breakpoint_current(std::string current) const;
+
+    /**
+     * Check if variable with given name is an instance variable
+     *
+     * Instance variables are local to each mechanism instance and
+     * need to be accessed with an array index. Such variables are
+     * assigned, range, parameter+range etc.
+     * @param varname Name of the variable
+     * @return True if variable is per mechanism instance
+     */
+    bool is_an_instance_variable(const std::string& varname) const;
+
     /// if we need a call back to wrote_conc in neuron/coreneuron
     bool require_wrote_conc = false;
 };
diff --git a/src/codegen/llvm/codegen_llvm_helper_visitor.cpp b/src/codegen/llvm/codegen_llvm_helper_visitor.cpp
index c52cc92a3d..341ab03fb6 100644
--- a/src/codegen/llvm/codegen_llvm_helper_visitor.cpp
+++ b/src/codegen/llvm/codegen_llvm_helper_visitor.cpp
@@ -9,6 +9,7 @@
 #include "codegen_llvm_helper_visitor.hpp"

 #include "ast/all.hpp"
+#include "codegen/codegen_helper_visitor.hpp"
 #include "utils/logger.hpp"
 #include "visitors/visitor_utils.hpp"

@@ -17,91 +18,496 @@ namespace codegen {

 using namespace fmt::literals;

-void CodegenLLVMHelperVisitor::visit_statement_block(ast::StatementBlock& node) {
-    node.visit_children(*this);
-
-    /// if local list statement exist, we have to replace it
-    auto local_statement = visitor::get_local_list_statement(node);
-    if (local_statement) {
-        /// create codegen variables from local variables
-        ast::CodegenVarVector variables;
-        for (const auto& var: local_statement->get_variables()) {
-            variables.emplace_back(new ast::CodegenVar(0, var->get_name()->clone()));
-        }
-
-        /// remove local list statement now
-        const auto& statements = node.get_statements();
-        node.erase_statement(statements.begin());
+/**
+ * \brief Create variable definition statement
+ *
+ * `LOCAL` variables in NMODL don't have a type. These variables need
+ * to be defined with float type. Same for index, loop iteration and
+ * local variables. This helper function is used to create
+ * type-specific local variables.
+ * + * @param names Name of the variables to be defined + * @param type Type of the variables + * @return Statement defining variables + */ +static std::shared_ptr create_local_variable_statement( + const std::vector& names, + ast::AstNodeType type) { + /// create variables for the given name + ast::CodegenVarVector variables; + for (const auto& name: names) { + auto varname = new ast::Name(new ast::String(name)); + variables.emplace_back(new ast::CodegenVar(0, varname)); + } + auto var_type = new ast::CodegenVarType(type); + /// construct statement and return it + return std::make_shared(var_type, variables); +} - /// create new codegen variable statement - auto type = new ast::CodegenVarType(ast::AstNodeType::DOUBLE); - auto statement = std::make_shared(type, variables); +/** + * \brief Create expression for a given NMODL code statement + * @param code NMODL code statement + * @return Expression representing given NMODL code + */ +static std::shared_ptr create_statement_as_expression(const std::string& code) { + const auto& statement = visitor::create_statement(code); + auto expr_statement = std::dynamic_pointer_cast(statement); + auto expr = expr_statement->get_expression()->clone(); + return std::make_shared(expr); +} - /// insert codegen variable statement - node.insert_statement(statements.begin(), statement); - } +/** + * \brief Create expression for given NMODL code expression + * @param code NMODL code expression + * @return Expression representing NMODL code + */ +std::shared_ptr create_expression(const std::string& code) { + /// as provided code is only expression and not a full statement, create + /// a temporary assignment statement + const auto& wrapped_expr = create_statement_as_expression("some_var = " + code); + /// now extract RHS (representing original code) and return it as expression + auto expr = std::dynamic_pointer_cast(wrapped_expr)->get_expression(); + auto rhs = std::dynamic_pointer_cast(expr)->get_rhs(); + return std::make_shared(rhs->clone()); } -void CodegenLLVMHelperVisitor::add_function_procedure_node(ast::Block& node) { +/** + * \brief Add code generation function for FUNCTION or PROCEDURE block + * @param node AST node representing FUNCTION or PROCEDURE + * + * When we have a PROCEDURE or FUNCTION like + * + * \code{.mod} + * FUNCTION sum(x,y) { + * LOCAL res + * res = x + y + * sum = res + * } + * \endcode + * + * this gets typically converted to C/C++ code as: + * + * \code{.cpp} + * double sum(double x, double y) { + * double res; + * double ret_sum; + * res = x + y; + * ret_sum = res; + * return ret_sum; + * \endcode + * + * We perform following transformations so that code generation backends + * will have minimum logic: + * - Add return type + * - Add type for the function arguments + * - Define variables and return variable + * - Add return type (int for PROCEDURE and double for FUNCTION) + */ +void CodegenLLVMHelperVisitor::create_function_for_node(ast::Block& node) { + /// name of the function from the node std::string function_name = node.get_node_name(); - - const auto& source_node_type = node.get_node_type(); auto name = new ast::Name(new ast::String(function_name)); + + /// return variable name has "ret_" prefix auto return_var = new ast::Name(new ast::String("ret_" + function_name)); - ast::CodegenVarType* var_type = nullptr; - ast::CodegenVarType* return_type = nullptr; /// return type based on node type - bool is_function = source_node_type == ast::AstNodeType::FUNCTION_BLOCK; - if (is_function) { - var_type = new 
ast::CodegenVarType(ast::AstNodeType::DOUBLE); + ast::CodegenVarType* ret_var_type = nullptr; + if (node.get_node_type() == ast::AstNodeType::FUNCTION_BLOCK) { + ret_var_type = new ast::CodegenVarType(FLOAT_TYPE); } else { - var_type = new ast::CodegenVarType(ast::AstNodeType::INTEGER); + ret_var_type = new ast::CodegenVarType(INTEGER_TYPE); } - /// return type is same as variable type - return_type = var_type->clone(); - - /// function body and it's statement + /// function body and it's statement, copy original block auto block = node.get_statement_block()->clone(); const auto& statements = block->get_statements(); /// insert return variable at the start of the block ast::CodegenVarVector codegen_vars; codegen_vars.emplace_back(new ast::CodegenVar(0, return_var->clone())); - auto statement = std::make_shared(var_type, codegen_vars); + auto statement = std::make_shared(ret_var_type, codegen_vars); block->insert_statement(statements.begin(), statement); /// add return statement auto return_statement = new ast::CodegenReturnStatement(return_var); block->emplace_back_statement(return_statement); - /// prepare arguments - ast::CodegenArgumentVector code_arguments; - const auto& arguments = node.get_parameters(); - for (const auto& arg: arguments) { - auto type = new ast::CodegenVarType(ast::AstNodeType::DOUBLE); - auto var = arg->get_name()->clone(); - code_arguments.emplace_back(new ast::CodegenArgument(type, var)); + /// prepare function arguments based original node arguments + ast::CodegenArgumentVector arguments; + for (const auto& param: node.get_parameters()) { + /// create new type and name for creating new ast node + auto type = new ast::CodegenVarType(FLOAT_TYPE); + auto var = param->get_name()->clone(); + arguments.emplace_back(new ast::CodegenArgument(type, var)); } - /// add new node to AST - auto function = - std::make_shared(return_type, name, code_arguments, block); + /// return type of the function is same as return variable type + ast::CodegenVarType* fun_ret_type = ret_var_type->clone(); + + /// we have all information for code generation function, create a new node + /// which will be inserted later into AST + auto function = std::make_shared(fun_ret_type, name, arguments, block); codegen_functions.push_back(function); } +static void append_statements_from_block(ast::StatementVector& statements, + const std::shared_ptr& block) { + const auto& block_statements = block->get_statements(); + statements.insert(statements.end(), block_statements.begin(), block_statements.end()); +} + +static std::shared_ptr create_atomic_statement(std::string& lhs_str, + std::string& op_str, + std::string& rhs_str) { + auto lhs = std::make_shared(new ast::String(lhs_str)); + auto op = ast::BinaryOperator(ast::string_to_binaryop(op_str)); + auto rhs = create_expression(rhs_str); + return std::make_shared(lhs, op, rhs); +} + +/** + * For a given block type, add read ion statements + * + * Depending upon the block type, we have to update read ion variables + * during code generation. Depending on block/procedure being printed, + * this method adds necessary read ion variable statements and also + * corresponding index calculation statements. Note that index statements + * are added separately at the beginning for just readability purpose. 
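+ *
+ * For example, for an ion `na` whose `ena` is read, this roughly produces
+ * (statement shapes taken from the implementation below):
+ *
+ * \code
+ * ena_id = ion_ena_index[id]
+ * ena = ion_ena[ena_id]
+ * \endcode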
+ * + * @param type The type of code block being generated + * @param int_variables Index variables to be created + * @param double_variables Floating point variables to be created + * @param index_statements Statements for loading indexes (typically for ions) + * @param body_statements main compute/update statements + * + * \todo After looking into mod2c and neuron implementation, it seems like + * Ode block type is not used. Need to look into implementation details. + * + * \todo Ion copy optimization is not implemented yet. This is currently + * implemented in C backend using `ion_read_statements_optimized()`. + */ +void CodegenLLVMHelperVisitor::ion_read_statements(BlockType type, + std::vector& int_variables, + std::vector& double_variables, + ast::StatementVector& index_statements, + ast::StatementVector& body_statements) { + /// create read ion and corresponding index statements + auto create_read_statements = [&](std::pair variable_names) { + // variable in current mechanism instance + std::string& varname = variable_names.first; + // ion variable to be read + std::string& ion_varname = variable_names.second; + // index for reading ion variable + std::string index_varname = "{}_id"_format(varname); + // first load the index + std::string index_statement = "{} = {}_index[id]"_format(index_varname, ion_varname); + // now assign the value + std::string read_statement = "{} = {}[{}]"_format(varname, ion_varname, index_varname); + // push index definition, index statement and actual read statement + int_variables.push_back(index_varname); + index_statements.push_back(visitor::create_statement(index_statement)); + body_statements.push_back(visitor::create_statement(read_statement)); + }; + + /// iterate over all ions and create statements for given block type + for (const auto& ion: info.ions) { + const std::string& name = ion.name; + for (const auto& var: ion.reads) { + if (type == BlockType::Ode && ion.is_ionic_conc(var) && info.state_variable(var)) { + continue; + } + auto variable_names = info.read_ion_variable_name(var); + create_read_statements(variable_names); + } + for (const auto& var: ion.writes) { + if (type == BlockType::Ode && ion.is_ionic_conc(var) && info.state_variable(var)) { + continue; + } + if (ion.is_ionic_conc(var)) { + auto variable_names = info.read_ion_variable_name(var); + create_read_statements(variable_names); + } + } + } +} + +/** + * For a given block type, add write ion statements + * + * Depending upon the block type, we have to update write ion variables + * during code generation. Depending on block/procedure being printed, + * this method adds necessary write ion variable statements and also + * corresponding index calculation statements. Note that index statements + * are added separately at the beginning for just readability purpose. + * + * @param type The type of code block being generated + * @param int_variables Index variables to be created + * @param double_variables Floating point variables to be created + * @param index_statements Statements for loading indexes (typically for ions) + * @param body_statements main compute/update statements + * + * \todo If intra or extra cellular ionic concentration is written + * then it requires call to `nrn_wrote_conc`. In C backend this is + * implemented in `ion_write_statements()` itself but this is not + * handled yet. 
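+ *
+ * For example, accumulating the ionic current `ina` in the Equation
+ * (BREAKPOINT) case roughly produces (statement shapes taken from the
+ * implementation below):
+ *
+ * \code
+ * ion_ina_id = ion_ina_index[id]
+ * ion_ina[ion_ina_id] += ina
+ * \endcode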
+ */ +void CodegenLLVMHelperVisitor::ion_write_statements(BlockType type, + std::vector& int_variables, + std::vector& double_variables, + ast::StatementVector& index_statements, + ast::StatementVector& body_statements) { + /// create write ion and corresponding index statements + auto create_write_statements = [&](std::string ion_varname, std::string op, std::string rhs) { + // index for writing ion variable + std::string index_varname = "{}_id"_format(ion_varname); + // load index + std::string index_statement = "{} = {}_index[id]"_format(index_varname, ion_varname); + // ion variable to write (with index) + std::string ion_to_write = "{}[{}]"_format(ion_varname, index_varname); + // push index definition, index statement and actual write statement + int_variables.push_back(index_varname); + index_statements.push_back(visitor::create_statement(index_statement)); + body_statements.push_back(create_atomic_statement(ion_to_write, op, rhs)); + }; + + /// iterate over all ions and create write ion statements for given block type + for (const auto& ion: info.ions) { + std::string concentration; + std::string name = ion.name; + for (const auto& var: ion.writes) { + auto variable_names = info.write_ion_variable_name(var); + /// ionic currents are accumulated + if (ion.is_ionic_current(var)) { + if (type == BlockType::Equation) { + std::string current = info.breakpoint_current(var); + std::string lhs = variable_names.first; + std::string op = "+="; + std::string rhs = current; + // for synapse type + if (info.point_process) { + auto area = codegen::naming::NODE_AREA_VARIABLE; + rhs += "*(1.e2/{})"_format(area); + } + create_write_statements(lhs, op, rhs); + } + } else { + if (!ion.is_rev_potential(var)) { + concentration = var; + } + std::string lhs = variable_names.first; + std::string op = "="; + std::string rhs = variable_names.second; + create_write_statements(lhs, op, rhs); + } + } + + /// still need to handle, need to define easy to use API + if (type == BlockType::Initial && !concentration.empty()) { + int index = 0; + if (ion.is_intra_cell_conc(concentration)) { + index = 1; + } else if (ion.is_extra_cell_conc(concentration)) { + index = 2; + } else { + /// \todo Unhandled case also in neuron implementation + throw std::logic_error("codegen error for {} ion"_format(ion.name)); + } + std::string ion_type_name = "{}_type"_format(ion.name); + std::string lhs = "int {}"_format(ion_type_name); + std::string op = "="; + std::string rhs = ion_type_name; + create_write_statements(lhs, op, rhs); + logger->error("conc_write_statement() call is required but it's not supported"); + } + } +} + +/** + * Convert variables in given node to instance variables + * + * For code generation, variables of type range, assigned, state or parameter+range + * needs to be converted to instance variable i.e. they need to be accessed with + * loop index variable. For example, `h` variables needs to be converted to `h[id]`. 
+ * + * @param node Ast node under which variables to be converted to instance type + */ +void CodegenLLVMHelperVisitor::convert_to_instance_variable(ast::Node& node, + std::string& index_var) { + /// collect all variables in the node of type ast::VarName + auto variables = collect_nodes(node, {ast::AstNodeType::VAR_NAME}); + for (auto& v: variables) { + auto variable = std::dynamic_pointer_cast(v); + /// if variable is of type instance then convert it to index + if (info.is_an_instance_variable(variable->get_node_name())) { + auto name = variable->get_name()->clone(); + auto index = new ast::Name(new ast::String(index_var)); + auto indexed_name = std::make_shared(name, index); + variable->set_name(indexed_name); + } + } +} + +/** + * \brief Visit StatementBlock and convert Local statement for code generation + * @param node AST node representing Statement block + * + * Statement blocks can have LOCAL statement and if it exist it's typically + * first statement in the vector. We have to remove LOCAL statement and convert + * it to CodegenVarListStatement that will represent all variables as double. + */ +void CodegenLLVMHelperVisitor::visit_statement_block(ast::StatementBlock& node) { + /// first process all children blocks if any + node.visit_children(*this); + + /// check if block contains LOCAL statement + const auto& local_statement = visitor::get_local_list_statement(node); + if (local_statement) { + /// create codegen variables from local variables + /// clone variable to make new independent statement + ast::CodegenVarVector variables; + for (const auto& var: local_statement->get_variables()) { + variables.emplace_back(new ast::CodegenVar(0, var->get_name()->clone())); + } + + /// remove local list statement now + const auto& statements = node.get_statements(); + node.erase_statement(statements.begin()); + + /// create new codegen variable statement and insert at the beginning of the block + auto type = new ast::CodegenVarType(FLOAT_TYPE); + auto statement = std::make_shared(type, variables); + node.insert_statement(statements.begin(), statement); + } +} + void CodegenLLVMHelperVisitor::visit_procedure_block(ast::ProcedureBlock& node) { node.visit_children(*this); - add_function_procedure_node(node); + create_function_for_node(node); } void CodegenLLVMHelperVisitor::visit_function_block(ast::FunctionBlock& node) { node.visit_children(*this); - add_function_procedure_node(node); + create_function_for_node(node); +} + +/** + * \brief Convert ast::NrnStateBlock to corresponding code generation function nrn_state + * @param node AST node representing ast::NrnStateBlock + * + * Solver passes converts DERIVATIVE block from MOD into ast::NrnStateBlock node + * that represent `nrn_state` function in the generated CPP code. To help this + * code generation, we perform various transformation on ast::NrnStateBlock and + * create new code generation function. 
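+ *
+ * The generated function has roughly the following shape (a sketch only; the
+ * loop body additionally contains voltage and ion read/write statements):
+ *
+ * \code{.mod}
+ * VOID nrn_state_<suffix>() {
+ *     INTEGER id
+ *     for (id = 0; id < node_count; id = id + 1) {
+ *         ...
+ *     }
+ * }
+ * \endcode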
+ */ +void CodegenLLVMHelperVisitor::visit_nrn_state_block(ast::NrnStateBlock& node) { + /// statements for new function to be generated + ast::StatementVector function_statements; + + /// create variable definition for loop index and insert at the beginning + std::string loop_index_var = "id"; + std::vector int_variables{"id"}; + function_statements.push_back(create_local_variable_statement(int_variables, INTEGER_TYPE)); + + /// create now main compute part : for loop over channel instances + + /// loop constructs : initialization, condition and increment + const auto& initialization = create_statement_as_expression("id = 0"); + const auto& condition = create_expression("id < node_count"); + const auto& increment = create_statement_as_expression("id = id + 1"); + + /// loop body : initialization + solve blocks + ast::StatementVector loop_def_statements; + ast::StatementVector loop_index_statements; + ast::StatementVector loop_body_statements; + { + std::vector int_variables{"node_id"}; + std::vector double_variables{"v"}; + + /// access node index and corresponding voltage + loop_index_statements.push_back(visitor::create_statement("node_id = node_index[id]")); + loop_body_statements.push_back(visitor::create_statement("v = voltage[node_id]")); + + /// read ion variables + ion_read_statements(BlockType::State, + int_variables, + double_variables, + loop_index_statements, + loop_body_statements); + + /// main compute node : extract solution expressions from the derivative block + const auto& solutions = collect_nodes(node, {ast::AstNodeType::SOLUTION_EXPRESSION}); + for (const auto& statement: solutions) { + const auto& solution = std::dynamic_pointer_cast(statement); + const auto& block = std::dynamic_pointer_cast( + solution->get_node_to_solve()); + append_statements_from_block(loop_body_statements, block); + } + + /// add breakpoint block if no current + if (info.currents.empty() && info.breakpoint_node != nullptr) { + auto block = info.breakpoint_node->get_statement_block(); + append_statements_from_block(loop_body_statements, block); + } + + /// write ion statements + ion_write_statements(BlockType::State, + int_variables, + double_variables, + loop_index_statements, + loop_body_statements); + + loop_def_statements.push_back(create_local_variable_statement(int_variables, INTEGER_TYPE)); + loop_def_statements.push_back( + create_local_variable_statement(double_variables, FLOAT_TYPE)); + + // \todo handle process_shadow_update_statement and wrote_conc_call yet + } + + ast::StatementVector loop_body; + loop_body.insert(loop_body.end(), loop_def_statements.begin(), loop_def_statements.end()); + loop_body.insert(loop_body.end(), loop_index_statements.begin(), loop_index_statements.end()); + loop_body.insert(loop_body.end(), loop_body_statements.begin(), loop_body_statements.end()); + + /// now construct a new code block which will become the body of the loop + auto loop_block = std::make_shared(loop_body); + + /// convert all variables inside loop body to instance variables + convert_to_instance_variable(*loop_block, loop_index_var); + + /// create for loop node + auto for_loop_statement = std::make_shared(initialization, + condition, + increment, + loop_block); + + /// loop itself becomes one of the statement in the function + function_statements.push_back(for_loop_statement); + + /// new block for the function + auto function_block = new ast::StatementBlock(function_statements); + + /// name of the function and it's return type + std::string function_name = "nrn_state_" + 
stringutils::tolower(info.mod_suffix);
+    auto name = new ast::Name(new ast::String(function_name));
+    auto return_type = new ast::CodegenVarType(ast::AstNodeType::VOID);
+
+    /// \todo : currently there are no arguments
+    ast::CodegenArgumentVector code_arguments;
+
+    /// finally, create new function
+    auto function =
+        std::make_shared<ast::CodegenFunction>(return_type, name, code_arguments, function_block);
+    codegen_functions.push_back(function);
+
+    std::cout << nmodl::to_nmodl(function);
 }

 void CodegenLLVMHelperVisitor::visit_program(ast::Program& node) {
+    /// run codegen helper visitor to collect information
+    CodegenHelperVisitor v;
+    info = v.analyze(node);
+
     logger->info("Running CodegenLLVMHelperVisitor");
     node.visit_children(*this);
     for (auto& fun: codegen_functions) {
diff --git a/src/codegen/llvm/codegen_llvm_helper_visitor.hpp b/src/codegen/llvm/codegen_llvm_helper_visitor.hpp
index b7ff57aec1..6b1684e7d1 100644
--- a/src/codegen/llvm/codegen_llvm_helper_visitor.hpp
+++ b/src/codegen/llvm/codegen_llvm_helper_visitor.hpp
@@ -28,19 +28,57 @@ namespace codegen {

 /**
  * \class CodegenLLVMHelperVisitor
- * \brief Helper visitor to gather AST information to help LLVM code generation
+ * \brief Helper visitor for AST information to help code generation backends
+ *
+ * Code generation backends convert the NMODL AST to C++ code. During this
+ * C++ code generation various transformations happen, and the final generated
+ * code is quite different from (and larger than) the actual kernel represented
+ * in the MOD file or NMODL AST.
+ *
+ * Currently, these transformations are embedded into code generation backends
+ * like codegen::CodegenCVisitor. If we have to generate code for a new simulator,
+ * there will be duplication of these transformations. Also, for completely new
+ * backends like the NEURON simulator or a SIMD library, we will have code
+ * duplication.
+ *
+ * In order to avoid this, we perform as many transformations as possible in
+ * this visitor. Currently we focus on transformations that help the LLVM
+ * backend, but later these will be common across all backends.
*/ class CodegenLLVMHelperVisitor: public visitor::AstVisitor { + /// newly generated code generation specific functions std::vector> codegen_functions; - void add_function_procedure_node(ast::Block& node); + /// ast information for code generation + codegen::CodegenInfo info; + + /// default integer and float node type + const ast::AstNodeType INTEGER_TYPE = ast::AstNodeType::INTEGER; + const ast::AstNodeType FLOAT_TYPE = ast::AstNodeType::DOUBLE; + + /// create new function for FUNCTION or PROCEDURE block + void create_function_for_node(ast::Block& node); public: CodegenLLVMHelperVisitor() = default; + void ion_read_statements(BlockType type, + std::vector& int_variables, + std::vector& double_variables, + ast::StatementVector& index_statements, + ast::StatementVector& body_statements); + + void ion_write_statements(BlockType type, + std::vector& int_variables, + std::vector& double_variables, + ast::StatementVector& index_statements, + ast::StatementVector& body_statements); + + void convert_to_instance_variable(ast::Node& node, std::string& index_var); + void visit_statement_block(ast::StatementBlock& node) override; void visit_procedure_block(ast::ProcedureBlock& node) override; void visit_function_block(ast::FunctionBlock& node) override; + void visit_nrn_state_block(ast::NrnStateBlock& node) override; void visit_program(ast::Program& node) override; }; diff --git a/src/codegen/llvm/codegen_llvm_visitor.cpp b/src/codegen/llvm/codegen_llvm_visitor.cpp index 0fa0864d9a..6228b39d04 100644 --- a/src/codegen/llvm/codegen_llvm_visitor.cpp +++ b/src/codegen/llvm/codegen_llvm_visitor.cpp @@ -350,7 +350,7 @@ void CodegenLLVMVisitor::visit_program(const ast::Program& node) { // Keep this for easier development (maybe move to debug mode later). std::cout << print_module(); - // not used yet + // not used yet : this will be used at the beginning of this function { CodegenLLVMHelperVisitor v; v.visit_program(const_cast(node)); diff --git a/src/language/code_generator.cmake b/src/language/code_generator.cmake index 585ac19917..25fc5151c4 100644 --- a/src/language/code_generator.cmake +++ b/src/language/code_generator.cmake @@ -66,6 +66,7 @@ set(AST_GENERATED_SOURCES ${PROJECT_BINARY_DIR}/src/ast/boolean.hpp ${PROJECT_BINARY_DIR}/src/ast/breakpoint_block.hpp ${PROJECT_BINARY_DIR}/src/ast/codegen_argument.hpp + ${PROJECT_BINARY_DIR}/src/ast/codegen_atomic_statement.hpp ${PROJECT_BINARY_DIR}/src/ast/codegen_for_statement.hpp ${PROJECT_BINARY_DIR}/src/ast/codegen_function.hpp ${PROJECT_BINARY_DIR}/src/ast/codegen_return_statement.hpp @@ -193,6 +194,7 @@ set(AST_GENERATED_SOURCES ${PROJECT_BINARY_DIR}/src/ast/valence.hpp ${PROJECT_BINARY_DIR}/src/ast/var_name.hpp ${PROJECT_BINARY_DIR}/src/ast/verbatim.hpp + ${PROJECT_BINARY_DIR}/src/ast/void.hpp ${PROJECT_BINARY_DIR}/src/ast/watch.hpp ${PROJECT_BINARY_DIR}/src/ast/watch_statement.hpp ${PROJECT_BINARY_DIR}/src/ast/while_statement.hpp diff --git a/src/language/codegen.yaml b/src/language/codegen.yaml index 7d5516c196..5e26bc3f0f 100644 --- a/src/language/codegen.yaml +++ b/src/language/codegen.yaml @@ -28,6 +28,9 @@ - Expression: children: - Number: + - Void: + nmodl: "VOID" + brief: "Represent void type in code generation" - Identifier: children: - CodegenVarType: @@ -185,7 +188,7 @@ brief: "condition expression for the loop" type: Expression optional: true - prefix: {value: ";"} + prefix: {value: "; "} suffix: {value: "; "} - increment: brief: "increment or decrement expression for the loop" @@ -216,4 +219,36 @@ type: CodegenVar vector: true separator: 
", " - add: true \ No newline at end of file + add: true + - CodegenAtomicStatement: + brief: "Represent atomic operation" + description: | + During code generation certain operations like ion updates, vec_rhs or + vec_d updates (for synapse) needs to be atomic operations if executed by + multiple threads. In case of SIMD, there are conflicts for `vec_d` and + `vec_rhs` for synapse types. Here are some statements from C++ backend: + + \code{.cpp} + vec_d[node_id] += g + vec_rhs[node_id] -= rhs + ion_ina[indexes[some_index]] += ina[id] + ion_cai[indexes[some_index]] = cai[id] // cai here is state variable + \endcode + + These operations will be represented by atomic statement node type: + * `vec_d[node_id]` : lhs + * `+=` : atomic_op + * `g` : rhs + + members: + - lhs: + brief: "Variable to be updated atomically" + type: Identifier + - atomic_op: + brief: "Operator" + type: BinaryOperator + prefix: {value: " "} + suffix: {value: " "} + - rhs: + brief: "Expression for atomic operation" + type: Expression diff --git a/test/unit/CMakeLists.txt b/test/unit/CMakeLists.txt index 7131e4eba1..a3d28fa3e6 100644 --- a/test/unit/CMakeLists.txt +++ b/test/unit/CMakeLists.txt @@ -120,10 +120,8 @@ endif() set(testvisitor_env "PYTHONPATH=${PROJECT_BINARY_DIR}/lib:$ENV{PYTHONPATH}") if(NOT LINK_AGAINST_PYTHON) list(APPEND testvisitor_env "NMODL_PYLIB=$ENV{NMODL_PYLIB}") - list( - APPEND - testvisitor_env - "NMODL_WRAPLIB=${PROJECT_BINARY_DIR}/lib/nmodl/libpywrapper${CMAKE_SHARED_LIBRARY_SUFFIX}") + list(APPEND testvisitor_env + "NMODL_WRAPLIB=${PROJECT_BINARY_DIR}/lib/nmodl/libpywrapper${CMAKE_SHARED_LIBRARY_SUFFIX}") endif() foreach( From bcc091b34fa6c81dad4e4c3c3e47e61088013b77 Mon Sep 17 00:00:00 2001 From: George Mitenkov Date: Fri, 8 Jan 2021 10:58:58 +0300 Subject: [PATCH 014/331] Running functions from MOD files via LLVM JIT (#482) This commit introduces a functionality to execute functions from MOD file via LLVM jit. For that, there is now: - `JITDriver` class that, given a LLVM IR module, set ups the JIT compiler and is able to look up a function and execute it. - `Runner` class that wraps around JIT driver. It helps to initialise JIT with LLVM IR module only once, and then run multiple functions from it. To execute functions, `nmodl_llvm_runner` executable is used. It takes a single mod file and a specified entry-point function, and runs it via LLVM code generation pipeline and JIT driver. Only functions with double result types are supported at the moment. For example, for MOD file `foo.mod`: ``` FUNCTION one() { one = 1 } FUNCTION bar() { bar = one() + exp(1) } ``` running `nmodl_llvm_runner -f foo.mod -e bar` gives ``` Result: 3.718282 ``` Tests for execution of generated IR have been added as well. 
fixes #482 Co-authored-by: Pramod Kumbhar --- cmake/LLVMHelper.cmake | 2 +- src/codegen/llvm/CMakeLists.txt | 38 ++++- src/codegen/llvm/codegen_llvm_visitor.hpp | 10 +- src/codegen/llvm/jit_driver.cpp | 81 ++++++++++ src/codegen/llvm/jit_driver.hpp | 82 ++++++++++ src/codegen/llvm/main.cpp | 74 +++++++++ test/unit/CMakeLists.txt | 13 ++ test/unit/codegen/codegen_llvm_execution.cpp | 162 +++++++++++++++++++ 8 files changed, 457 insertions(+), 5 deletions(-) create mode 100644 src/codegen/llvm/jit_driver.cpp create mode 100644 src/codegen/llvm/jit_driver.hpp create mode 100644 src/codegen/llvm/main.cpp create mode 100644 test/unit/codegen/codegen_llvm_execution.cpp diff --git a/cmake/LLVMHelper.cmake b/cmake/LLVMHelper.cmake index 5d451697b9..a731fa0151 100644 --- a/cmake/LLVMHelper.cmake +++ b/cmake/LLVMHelper.cmake @@ -5,7 +5,7 @@ find_package(LLVM REQUIRED CONFIG) # include LLVM header and core library -llvm_map_components_to_libnames(LLVM_LIBS_TO_LINK core native) +llvm_map_components_to_libnames(LLVM_LIBS_TO_LINK core orcjit native) set(CMAKE_REQUIRED_INCLUDES ${LLVM_INCLUDE_DIRS}) set(CMAKE_REQUIRED_LIBRARIES ${LLVM_LIBS_TO_LINK}) diff --git a/src/codegen/llvm/CMakeLists.txt b/src/codegen/llvm/CMakeLists.txt index db16d4072c..bd54f4143d 100644 --- a/src/codegen/llvm/CMakeLists.txt +++ b/src/codegen/llvm/CMakeLists.txt @@ -5,12 +5,44 @@ set(LLVM_CODEGEN_SOURCE_FILES ${CMAKE_CURRENT_SOURCE_DIR}/codegen_llvm_visitor.cpp ${CMAKE_CURRENT_SOURCE_DIR}/codegen_llvm_visitor.hpp ${CMAKE_CURRENT_SOURCE_DIR}/codegen_llvm_helper_visitor.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/codegen_llvm_helper_visitor.hpp) + ${CMAKE_CURRENT_SOURCE_DIR}/codegen_llvm_helper_visitor.hpp + ${CMAKE_CURRENT_SOURCE_DIR}/jit_driver.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/jit_driver.hpp) # ============================================================================= -# LLVM codegen library +# LLVM codegen library and executable # ============================================================================= include_directories(${LLVM_INCLUDE_DIRS}) -add_library(llvm_codegen STATIC ${LLVM_CODEGEN_SOURCE_FILES}) +add_library(runner_obj OBJECT ${LLVM_CODEGEN_SOURCE_FILES}) +add_dependencies(runner_obj lexer_obj) +set_property(TARGET runner_obj PROPERTY POSITION_INDEPENDENT_CODE ON) + +add_library(llvm_codegen STATIC $) + add_dependencies(llvm_codegen lexer util visitor) + +if(NOT NMODL_AS_SUBPROJECT) + add_executable(nmodl_llvm_runner main.cpp) + + target_link_libraries( + nmodl_llvm_runner + llvm_codegen + codegen + visitor + symtab + lexer + util + test_util + printer + ${NMODL_WRAPPER_LIBS} + ${LLVM_LIBS_TO_LINK}) +endif() + +# ============================================================================= +# Install executable +# ============================================================================= + +if(NOT NMODL_AS_SUBPROJECT) + install(TARGETS nmodl_llvm_runner DESTINATION ${NMODL_INSTALL_DIR_SUFFIX}bin) +endif() diff --git a/src/codegen/llvm/codegen_llvm_visitor.hpp b/src/codegen/llvm/codegen_llvm_visitor.hpp index be4eb04867..599cfc7b58 100644 --- a/src/codegen/llvm/codegen_llvm_visitor.hpp +++ b/src/codegen/llvm/codegen_llvm_visitor.hpp @@ -154,6 +154,14 @@ class CodegenLLVMVisitor: public visitor::ConstAstVisitor { */ void emit_procedure_or_function_declaration(const ast::Block& node); + /** + * Return module pointer + * \return LLVM IR module pointer + */ + std::unique_ptr get_module() { + return std::move(module); + } + /** * Visit nmodl function or procedure * \param node the AST node representing the function or 
procedure in NMODL @@ -173,7 +181,7 @@ class CodegenLLVMVisitor: public visitor::ConstAstVisitor { void visit_unary_expression(const ast::UnaryExpression& node) override; void visit_var_name(const ast::VarName& node) override; - // TODO: use custom printer here + // \todo: move this to debug mode (e.g. -v option or --dump-ir) std::string print_module() const { std::string str; llvm::raw_string_ostream os(str); diff --git a/src/codegen/llvm/jit_driver.cpp b/src/codegen/llvm/jit_driver.cpp new file mode 100644 index 0000000000..a7673bb2ff --- /dev/null +++ b/src/codegen/llvm/jit_driver.cpp @@ -0,0 +1,81 @@ +/************************************************************************* + * Copyright (C) 2018-2020 Blue Brain Project + * + * This file is part of NMODL distributed under the terms of the GNU + * Lesser General Public License. See top-level LICENSE file for details. + *************************************************************************/ + +#include "jit_driver.hpp" +#include "codegen/llvm/codegen_llvm_visitor.hpp" + +#include "llvm/ExecutionEngine/JITEventListener.h" +#include "llvm/ExecutionEngine/ObjectCache.h" +#include "llvm/ExecutionEngine/Orc/CompileUtils.h" +#include "llvm/ExecutionEngine/Orc/IRCompileLayer.h" +#include "llvm/ExecutionEngine/Orc/JITTargetMachineBuilder.h" +#include "llvm/ExecutionEngine/Orc/LLJIT.h" +#include "llvm/ExecutionEngine/SectionMemoryManager.h" +#include "llvm/Support/Host.h" +#include "llvm/Support/TargetRegistry.h" +#include "llvm/Support/TargetSelect.h" + +namespace nmodl { +namespace runner { + +void JITDriver::init() { + llvm::InitializeNativeTarget(); + llvm::InitializeNativeTargetAsmPrinter(); + + set_target_triple(module.get()); + auto data_layout = module->getDataLayout(); + + // Create IR compile function callback. + auto compile_function_creator = [&](llvm::orc::JITTargetMachineBuilder tm_builder) + -> llvm::Expected> { + auto tm = tm_builder.createTargetMachine(); + if (!tm) + return tm.takeError(); + return std::make_unique(std::move(*tm)); + }; + + auto jit_instance = cantFail( + llvm::orc::LLJITBuilder().setCompileFunctionCreator(compile_function_creator).create()); + + // Add a ThreadSafeModule to the driver. + llvm::orc::ThreadSafeModule tsm(std::move(module), std::make_unique()); + cantFail(jit_instance->addIRModule(std::move(tsm))); + jit = std::move(jit_instance); + + // Resolve symbols. 
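+    // The generator registered below makes symbols of the current process
+    // visible to the JIT, so JIT-compiled MOD functions can call external
+    // routines such as exp() from the host math library.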
+    llvm::orc::JITDylib& sym_tab = jit->getMainJITDylib();
+    sym_tab.addGenerator(cantFail(llvm::orc::DynamicLibrarySearchGenerator::GetForCurrentProcess(
+        data_layout.getGlobalPrefix())));
+}
+
+void JITDriver::set_target_triple(llvm::Module* module) {
+    auto target_triple = llvm::sys::getDefaultTargetTriple();
+    std::string error;
+    auto target = llvm::TargetRegistry::lookupTarget(target_triple, error);
+    if (!target)
+        throw std::runtime_error("Error: " + error + "\n");
+
+    std::string cpu(llvm::sys::getHostCPUName());
+    llvm::SubtargetFeatures features;
+    llvm::StringMap<bool> host_features;
+
+    if (llvm::sys::getHostCPUFeatures(host_features)) {
+        for (auto& f: host_features)
+            features.AddFeature(f.first(), f.second);
+    }
+
+    std::unique_ptr<llvm::TargetMachine> machine(
+        target->createTargetMachine(target_triple, cpu, features.getString(), {}, {}));
+    if (!machine)
+        throw std::runtime_error("Error: failed to create a target machine\n");
+
+    module->setDataLayout(machine->createDataLayout());
+    module->setTargetTriple(target_triple);
+}
+
+} // namespace runner
+} // namespace nmodl
diff --git a/src/codegen/llvm/jit_driver.hpp b/src/codegen/llvm/jit_driver.hpp
new file mode 100644
index 0000000000..d1e9a9412f
--- /dev/null
+++ b/src/codegen/llvm/jit_driver.hpp
@@ -0,0 +1,82 @@
+/*************************************************************************
+ * Copyright (C) 2018-2020 Blue Brain Project
+ *
+ * This file is part of NMODL distributed under the terms of the GNU
+ * Lesser General Public License. See top-level LICENSE file for details.
+ *************************************************************************/
+
+#pragma once
+
+/**
+ * \dir
+ * \brief Implementation of LLVM's JIT-based execution engine to run functions from MOD files
+ *
+ * \file
+ * \brief \copybrief nmodl::runner::JITDriver
+ */
+
+#include "llvm/ExecutionEngine/Orc/LLJIT.h"
+
+namespace nmodl {
+namespace runner {
+
+/**
+ * \class JITDriver
+ * \brief Driver to execute MOD file function via LLVM IR backend
+ */
+class JITDriver {
+  private:
+    std::unique_ptr<llvm::LLVMContext> context = std::make_unique<llvm::LLVMContext>();
+
+    std::unique_ptr<llvm::orc::LLJIT> jit;
+
+    std::unique_ptr<llvm::Module> module;
+
+  public:
+    JITDriver(std::unique_ptr<llvm::Module> m)
+        : module(std::move(m)) {}
+
+    /// Initialize the JIT.
+    void init();
+
+    /// Lookup the entry-point in the JIT and execute it, returning the result.
+    template <typename T>
+    T execute(const std::string& entry_point) {
+        auto expected_symbol = jit->lookup(entry_point);
+        if (!expected_symbol)
+            throw std::runtime_error("Error: entry-point symbol not found in JIT\n");
+
+        auto (*res)() = (T(*)())(intptr_t) expected_symbol->getAddress();
+        T result = res();
+        return result;
+    }
+
+    /// Set the target triple on the module.
+    static void set_target_triple(llvm::Module* module);
+};
+
+/**
+ * \class Runner
+ * \brief A wrapper around JITDriver to execute an entry point in the LLVM IR module.
+ */
+class Runner {
+  private:
+    std::unique_ptr<llvm::Module> module;
+
+    std::unique_ptr<JITDriver> driver = std::make_unique<JITDriver>(std::move(module));
+
+  public:
+    Runner(std::unique_ptr<llvm::Module> m)
+        : module(std::move(m)) {
+        driver->init();
+    }
+
+    /// Run the entry-point function.
+ template + double run(const std::string& entry_point) { + return driver->execute(entry_point); + } +}; + +} // namespace runner +} // namespace nmodl diff --git a/src/codegen/llvm/main.cpp b/src/codegen/llvm/main.cpp new file mode 100644 index 0000000000..11ea178cb4 --- /dev/null +++ b/src/codegen/llvm/main.cpp @@ -0,0 +1,74 @@ +/************************************************************************* + * Copyright (C) 2018-2021 Blue Brain Project + * + * This file is part of NMODL distributed under the terms of the GNU + * Lesser General Public License. See top-level LICENSE file for details. + *************************************************************************/ + +#include + +#include "ast/program.hpp" +#include "codegen/llvm/codegen_llvm_visitor.hpp" +#include "jit_driver.hpp" +#include "parser/nmodl_driver.hpp" +#include "utils/logger.hpp" +#include "visitors/symtab_visitor.hpp" + +#include "llvm/Support/TargetRegistry.h" +#include "llvm/Support/TargetSelect.h" + +using namespace nmodl; +using namespace runner; + +int main(int argc, const char* argv[]) { + CLI::App app{ + "NMODL LLVM Runner : Executes functions from a MOD file via LLVM IR code generation"}; + + // Currently, only a single MOD file is supported, as well as an entry point with a double + // return type. While returning a double value is a general case in NMODL, it will be nice to + // have a more generic functionality. \todo: Add support for different return types (int, void). + + std::string filename; + std::string entry_point_name = "main"; + + app.add_option("-f,--file,file", filename, "A single MOD file source") + ->required() + ->check(CLI::ExistingFile); + app.add_option("-e,--entry-point,entry-point", + entry_point_name, + "An entry point function from the MOD file"); + + CLI11_PARSE(app, argc, argv); + + logger->info("Parsing MOD file to AST"); + parser::NmodlDriver driver; + const auto& ast = driver.parse_file(filename); + + logger->info("Running Symtab Visitor"); + visitor::SymtabVisitor().visit_program(*ast); + + logger->info("Running LLVM Visitor"); + codegen::CodegenLLVMVisitor llvm_visitor(filename, /*output_dir=*/".", /*opt_passes=*/false); + llvm_visitor.visit_program(*ast); + std::unique_ptr module = llvm_visitor.get_module(); + + // Check if the entry-point is valid for JIT driver to execute. + auto func = module->getFunction(entry_point_name); + if (!func) + throw std::runtime_error("Error: entry-point is not found\n"); + + if (func->getNumOperands() != 0) + throw std::runtime_error("Error: entry-point functions with arguments are not supported\n"); + + if (!func->getReturnType()->isDoubleTy()) + throw std::runtime_error( + "Error: entry-point functions with non-double return type are not supported\n"); + + Runner runner(std::move(module)); + + // Since only double type is supported, provide explicit double type to the running function. 
+ auto r = runner.run(entry_point_name); + fprintf(stderr, "Result: %f\n", r); + + return 0; +} diff --git a/test/unit/CMakeLists.txt b/test/unit/CMakeLists.txt index a3d28fa3e6..65adeab88e 100644 --- a/test/unit/CMakeLists.txt +++ b/test/unit/CMakeLists.txt @@ -98,6 +98,7 @@ target_link_libraries( if(NMODL_ENABLE_LLVM) include_directories(${LLVM_INCLUDE_DIRS}) add_executable(testllvm visitor/main.cpp codegen/llvm.cpp) + add_executable(test_llvm_runner visitor/main.cpp codegen/codegen_llvm_execution.cpp) target_link_libraries( testllvm llvm_codegen @@ -110,6 +111,18 @@ if(NMODL_ENABLE_LLVM) printer ${NMODL_WRAPPER_LIBS} ${LLVM_LIBS_TO_LINK}) + target_link_libraries( + test_llvm_runner + llvm_codegen + codegen + visitor + symtab + lexer + util + test_util + printer + ${NMODL_WRAPPER_LIBS} + ${LLVM_LIBS_TO_LINK}) set(CODEGEN_TEST testllvm) endif() diff --git a/test/unit/codegen/codegen_llvm_execution.cpp b/test/unit/codegen/codegen_llvm_execution.cpp new file mode 100644 index 0000000000..6f1bf7b8ca --- /dev/null +++ b/test/unit/codegen/codegen_llvm_execution.cpp @@ -0,0 +1,162 @@ +/************************************************************************* + * Copyright (C) 2018-2020 Blue Brain Project + * + * This file is part of NMODL distributed under the terms of the GNU + * Lesser General Public License. See top-level LICENSE file for details. + *************************************************************************/ + +#include +#include + +#include "ast/program.hpp" +#include "codegen/llvm/codegen_llvm_visitor.hpp" +#include "codegen/llvm/jit_driver.hpp" +#include "parser/nmodl_driver.hpp" +#include "visitors/checkparent_visitor.hpp" +#include "visitors/symtab_visitor.hpp" + +using namespace nmodl; +using namespace runner; +using namespace visitor; +using nmodl::parser::NmodlDriver; + +static double EPSILON = 1e-15; + +//============================================================================= +// No optimisations +//============================================================================= + +SCENARIO("Arithmetic expression", "[llvm][runner]") { + GIVEN("Functions with some arithmetic expressions") { + std::string nmodl_text = R"( + FUNCTION exponential() { + LOCAL i + i = 1 + exponential = exp(i) + } + + FUNCTION constant() { + constant = 10 + } + + FUNCTION arithmetic() { + LOCAL x, y + x = 3 + y = 7 + arithmetic = x * y / (x + y) + } + + FUNCTION bar() { + LOCAL i, j + i = 2 + j = i + 2 + bar = 2 * 3 + j + } + + FUNCTION function_call() { + foo() + function_call = bar() / constant() + } + + PROCEDURE foo() {} + )"; + + + NmodlDriver driver; + const auto& ast = driver.parse_string(nmodl_text); + + SymtabVisitor().visit_program(*ast); + codegen::CodegenLLVMVisitor llvm_visitor(/*mod_filename=*/"unknown", + /*output_dir=*/".", + /*opt_passes=*/false); + llvm_visitor.visit_program(*ast); + + std::unique_ptr m = llvm_visitor.get_module(); + Runner runner(std::move(m)); + + THEN("functions are evaluated correctly") { + auto exp_result = runner.run("exponential"); + REQUIRE(fabs(exp_result - 2.718281828459045) < EPSILON); + + auto constant_result = runner.run("constant"); + REQUIRE(fabs(constant_result - 10.0) < EPSILON); + + auto arithmetic_result = runner.run("arithmetic"); + REQUIRE(fabs(arithmetic_result - 2.1) < EPSILON); + + auto function_call_result = runner.run("function_call"); + REQUIRE(fabs(function_call_result - 1.0) < EPSILON); + } + } +} + +//============================================================================= +// With optimisations 
+//=============================================================================
+
+SCENARIO("Optimised arithmetic expression", "[llvm][runner]") {
+    GIVEN("Functions with some arithmetic expressions") {
+        std::string nmodl_text = R"(
+            FUNCTION exponential() {
+                LOCAL i
+                i = 1
+                exponential = exp(i)
+            }
+
+            FUNCTION constant() {
+                constant = 10 * 2 - 100 / 50 * 5
+            }
+
+            FUNCTION arithmetic() {
+                LOCAL x, y
+                x = 3
+                y = 7
+                arithmetic = x * y / (x + y)
+            }
+
+            FUNCTION bar() {
+                LOCAL i, j
+                i = 2
+                j = i + 2
+                bar = 2 * 3 + j
+            }
+
+            FUNCTION function_call() {
+                foo()
+                function_call = bar() / constant()
+            }
+
+            PROCEDURE foo() {}
+        )";
+
+
+        NmodlDriver driver;
+        const auto& ast = driver.parse_string(nmodl_text);
+
+        SymtabVisitor().visit_program(*ast);
+        codegen::CodegenLLVMVisitor llvm_visitor(/*mod_filename=*/"unknown",
+                                                 /*output_dir=*/".",
+                                                 /*opt_passes=*/true);
+        llvm_visitor.visit_program(*ast);
+
+        std::unique_ptr<llvm::Module> m = llvm_visitor.get_module();
+        Runner runner(std::move(m));
+
+        THEN("optimizations preserve function results") {
+            // Check exponential is turned into a constant.
+            auto exp_result = runner.run<double>("exponential");
+            REQUIRE(fabs(exp_result - 2.718281828459045) < EPSILON);
+
+            // Check constant folding.
+            auto constant_result = runner.run<double>("constant");
+            REQUIRE(fabs(constant_result - 10.0) < EPSILON);
+
+            // Check constant folding.
+            auto arithmetic_result = runner.run<double>("arithmetic");
+            REQUIRE(fabs(arithmetic_result - 2.1) < EPSILON);
+
+            auto function_call_result = runner.run<double>("function_call");
+            REQUIRE(fabs(function_call_result - 1.0) < EPSILON);
+        }
+    }
+}

From 838ed6fd225af620a4d73ecee627726847cf5eda Mon Sep 17 00:00:00 2001
From: George Mitenkov
Date: Tue, 12 Jan 2021 12:50:22 +0300
Subject: [PATCH 015/331] Extended support for binary ops and refactoring
 (#489)

* Added more bin ops and refactored code

- Now, there are code generation functions for all comparison and
  logical operators.
- Code generation functions are now split based on the expression
  "type" (assignment, arithmetic, comparison, logical). Moreover, the
  lhs and rhs expression results can be either double or integer. This
  is important for control flow code generation and for the new AST
  node CodegenVarType.

* Added support for NOT op
* Added default type flag to switch between float and double
* Added tests for single precision
* Renames LLVM test file to codegen_llvm_ir.cpp to follow convention.
* NOTE: Tests for new operators will be added when the first control
  flow node (most likely the FOR node) lands.
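A simplified sketch of the per-type dispatch these functions implement (an
illustrative helper, not the visitor's actual code, which dispatches over all
operators via a macro):

```
#include "llvm/IR/IRBuilder.h"

// Select the floating-point or integer flavour of an instruction based on the
// operand type, so one code path handles double, float and integer values.
llvm::Value* emit_add(llvm::IRBuilder<>& builder, llvm::Value* lhs, llvm::Value* rhs) {
    llvm::Type* type = lhs->getType();
    if (type->isDoubleTy() || type->isFloatTy())
        return builder.CreateFAdd(lhs, rhs);  // fadd for float/double operands
    return builder.CreateAdd(lhs, rhs);       // integer add otherwise
}
```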
fixes #453 --- src/codegen/llvm/codegen_llvm_visitor.cpp | 158 ++++++++++++++---- src/codegen/llvm/codegen_llvm_visitor.hpp | 47 +++++- src/main.cpp | 8 +- test/unit/CMakeLists.txt | 2 +- .../codegen/{llvm.cpp => codegen_llvm_ir.cpp} | 20 ++- 5 files changed, 188 insertions(+), 47 deletions(-) rename test/unit/codegen/{llvm.cpp => codegen_llvm_ir.cpp} (95%) diff --git a/src/codegen/llvm/codegen_llvm_visitor.cpp b/src/codegen/llvm/codegen_llvm_visitor.cpp index 6228b39d04..6f134149e3 100644 --- a/src/codegen/llvm/codegen_llvm_visitor.cpp +++ b/src/codegen/llvm/codegen_llvm_visitor.cpp @@ -65,6 +65,12 @@ unsigned CodegenLLVMVisitor::get_array_index_or_length(const ast::IndexedName& i return static_cast(*macro->get_value()); } +llvm::Type* CodegenLLVMVisitor::get_default_fp_type() { + if (use_single_precision) + return llvm::Type::getFloatTy(*context); + return llvm::Type::getDoubleTy(*context); +} + void CodegenLLVMVisitor::run_llvm_opt_passes() { /// run some common optimisation passes that are commonly suggested fpm.add(llvm::createInstructionCombiningPass()); @@ -139,10 +145,10 @@ void CodegenLLVMVisitor::emit_procedure_or_function_declaration(const ast::Block // Procedure or function parameters are doubles by default. std::vector arg_types; for (size_t i = 0; i < parameters.size(); ++i) - arg_types.push_back(llvm::Type::getDoubleTy(*context)); + arg_types.push_back(get_default_fp_type()); // If visiting a function, the return type is a double by default. - llvm::Type* return_type = node.is_function_block() ? llvm::Type::getDoubleTy(*context) + llvm::Type* return_type = node.is_function_block() ? get_default_fp_type() : llvm::Type::getVoidTy(*context); // Create a function that is automatically inserted into module's symbol table. @@ -152,6 +158,90 @@ void CodegenLLVMVisitor::emit_procedure_or_function_declaration(const ast::Block *module); } +llvm::Value* CodegenLLVMVisitor::visit_arithmetic_bin_op(llvm::Value* lhs, + llvm::Value* rhs, + unsigned op) { + const auto& bin_op = static_cast(op); + llvm::Type* lhs_type = lhs->getType(); + llvm::Value* result; + + switch (bin_op) { +#define DISPATCH(binary_op, llvm_fp_op, llvm_int_op) \ + case binary_op: \ + if (lhs_type->isDoubleTy() || lhs_type->isFloatTy()) \ + result = llvm_fp_op(lhs, rhs); \ + else \ + result = llvm_int_op(lhs, rhs); \ + return result; + + DISPATCH(ast::BinaryOp::BOP_ADDITION, builder.CreateFAdd, builder.CreateAdd); + DISPATCH(ast::BinaryOp::BOP_DIVISION, builder.CreateFDiv, builder.CreateSDiv); + DISPATCH(ast::BinaryOp::BOP_MULTIPLICATION, builder.CreateFMul, builder.CreateMul); + DISPATCH(ast::BinaryOp::BOP_SUBTRACTION, builder.CreateFSub, builder.CreateSub); + +#undef DISPATCH + + default: + return nullptr; + } +} + +void CodegenLLVMVisitor::visit_assign_op(const ast::BinaryExpression& node, llvm::Value* rhs) { + auto var = dynamic_cast(node.get_lhs().get()); + if (!var) { + throw std::runtime_error("Error: only VarName assignment is currently supported.\n"); + } + + const auto& identifier = var->get_name(); + if (identifier->is_name()) { + llvm::Value* alloca = local_named_values->lookup(var->get_node_name()); + builder.CreateStore(rhs, alloca); + } else if (identifier->is_indexed_name()) { + auto indexed_name = std::dynamic_pointer_cast(identifier); + builder.CreateStore(rhs, codegen_indexed_name(*indexed_name)); + } else { + throw std::runtime_error("Error: Unsupported variable type"); + } +} + +llvm::Value* CodegenLLVMVisitor::visit_logical_bin_op(llvm::Value* lhs, + llvm::Value* rhs, + unsigned op) { + const auto& 
bin_op = static_cast(op); + return bin_op == ast::BinaryOp::BOP_AND ? builder.CreateAnd(lhs, rhs) + : builder.CreateOr(lhs, rhs); +} + +llvm::Value* CodegenLLVMVisitor::visit_comparison_bin_op(llvm::Value* lhs, + llvm::Value* rhs, + unsigned op) { + const auto& bin_op = static_cast(op); + llvm::Type* lhs_type = lhs->getType(); + llvm::Value* result; + + switch (bin_op) { +#define DISPATCH(binary_op, f_llvm_op, i_llvm_op) \ + case binary_op: \ + if (lhs_type->isDoubleTy() || lhs_type->isFloatTy()) \ + result = f_llvm_op(lhs, rhs); \ + else \ + result = i_llvm_op(lhs, rhs); \ + return result; + + DISPATCH(ast::BinaryOp::BOP_EXACT_EQUAL, builder.CreateICmpEQ, builder.CreateFCmpOEQ); + DISPATCH(ast::BinaryOp::BOP_GREATER, builder.CreateICmpSGT, builder.CreateFCmpOGT); + DISPATCH(ast::BinaryOp::BOP_GREATER_EQUAL, builder.CreateICmpSGE, builder.CreateFCmpOGE); + DISPATCH(ast::BinaryOp::BOP_LESS, builder.CreateICmpSLT, builder.CreateFCmpOLT); + DISPATCH(ast::BinaryOp::BOP_LESS_EQUAL, builder.CreateICmpSLE, builder.CreateFCmpOLE); + DISPATCH(ast::BinaryOp::BOP_NOT_EQUAL, builder.CreateICmpNE, builder.CreateFCmpONE); + +#undef DISPATCH + + default: + return nullptr; + } +} + void CodegenLLVMVisitor::visit_procedure_or_function(const ast::Block& node) { const auto& name = node.get_node_name(); const auto& parameters = node.get_parameters(); @@ -222,44 +312,39 @@ void CodegenLLVMVisitor::visit_binary_expression(const ast::BinaryExpression& no llvm::Value* rhs = values.back(); values.pop_back(); if (op == ast::BinaryOp::BOP_ASSIGN) { - auto var = dynamic_cast(node.get_lhs().get()); - if (!var) { - throw std::runtime_error("Error: only VarName assignment is currently supported.\n"); - } - - const auto& identifier = var->get_name(); - if (identifier->is_name()) { - llvm::Value* alloca = local_named_values->lookup(var->get_node_name()); - builder.CreateStore(rhs, alloca); - } else if (identifier->is_indexed_name()) { - auto indexed_name = std::dynamic_pointer_cast(identifier); - builder.CreateStore(rhs, codegen_indexed_name(*indexed_name)); - } else { - throw std::runtime_error("Error: Unsupported variable type"); - } + visit_assign_op(node, rhs); return; } node.get_lhs()->accept(*this); llvm::Value* lhs = values.back(); values.pop_back(); - llvm::Value* result; - // \todo: Support other binary operators + llvm::Value* result; switch (op) { -#define DISPATCH(binary_op, llvm_op) \ - case binary_op: \ - result = llvm_op(lhs, rhs); \ - values.push_back(result); \ + case ast::BOP_ADDITION: + case ast::BOP_DIVISION: + case ast::BOP_MULTIPLICATION: + case ast::BOP_SUBTRACTION: + result = visit_arithmetic_bin_op(lhs, rhs, op); break; - - DISPATCH(ast::BinaryOp::BOP_ADDITION, builder.CreateFAdd); - DISPATCH(ast::BinaryOp::BOP_DIVISION, builder.CreateFDiv); - DISPATCH(ast::BinaryOp::BOP_MULTIPLICATION, builder.CreateFMul); - DISPATCH(ast::BinaryOp::BOP_SUBTRACTION, builder.CreateFSub); - -#undef DISPATCH + case ast::BOP_AND: + case ast::BOP_OR: + result = visit_logical_bin_op(lhs, rhs, op); + break; + case ast::BOP_EXACT_EQUAL: + case ast::BOP_GREATER: + case ast::BOP_GREATER_EQUAL: + case ast::BOP_LESS: + case ast::BOP_LESS_EQUAL: + case ast::BOP_NOT_EQUAL: + result = visit_comparison_bin_op(lhs, rhs, op); + break; + default: + throw std::runtime_error("Error: binary operator is not supported\n"); } + + values.push_back(result); } void CodegenLLVMVisitor::visit_boolean(const ast::Boolean& node) { @@ -269,8 +354,7 @@ void CodegenLLVMVisitor::visit_boolean(const ast::Boolean& node) { } void 
CodegenLLVMVisitor::visit_double(const ast::Double& node) {
-    const auto& constant = llvm::ConstantFP::get(llvm::Type::getDoubleTy(*context),
-                                                 node.get_value());
+    const auto& constant = llvm::ConstantFP::get(get_default_fp_type(), node.get_value());
     values.push_back(constant);
 }
 
@@ -310,10 +394,10 @@ void CodegenLLVMVisitor::visit_local_list_statement(const ast::LocalListStatemen
         if (identifier->is_indexed_name()) {
             auto indexed_name = std::dynamic_pointer_cast<ast::IndexedName>(identifier);
             unsigned length = get_array_index_or_length(*indexed_name);
-            var_type = llvm::ArrayType::get(llvm::Type::getDoubleTy(*context), length);
+            var_type = llvm::ArrayType::get(get_default_fp_type(), length);
         } else if (identifier->is_name()) {
             // This case corresponds to a scalar local variable. Its type is double by default.
-            var_type = llvm::Type::getDoubleTy(*context);
+            var_type = get_default_fp_type();
         } else {
             throw std::runtime_error("Error: Unsupported local variable type");
         }
@@ -367,10 +451,10 @@ void CodegenLLVMVisitor::visit_unary_expression(const ast::UnaryExpression& node
     llvm::Value* value = values.back();
     values.pop_back();
     if (op == ast::UOP_NEGATION) {
-        llvm::Value* result = builder.CreateFNeg(value);
-        values.push_back(result);
+        values.push_back(builder.CreateFNeg(value));
+    } else if (op == ast::UOP_NOT) {
+        values.push_back(builder.CreateNot(value));
     } else {
-        // Support only `double` operators for now.
         throw std::runtime_error("Error: unsupported unary operator\n");
     }
 }
diff --git a/src/codegen/llvm/codegen_llvm_visitor.hpp b/src/codegen/llvm/codegen_llvm_visitor.hpp
index 599cfc7b58..066bdf35e3 100644
--- a/src/codegen/llvm/codegen_llvm_visitor.hpp
+++ b/src/codegen/llvm/codegen_llvm_visitor.hpp
@@ -76,6 +76,9 @@ class CodegenLLVMVisitor: public visitor::ConstAstVisitor {
     // Run optimisation passes if true.
     bool opt_passes;
 
+    // Use 32-bit floating-point type if true. Otherwise, use default 64-bit.
+ bool use_single_precision; + /** *\brief Run LLVM optimisation passes on generated IR * @@ -93,10 +96,12 @@ class CodegenLLVMVisitor: public visitor::ConstAstVisitor { */ CodegenLLVMVisitor(const std::string& mod_filename, const std::string& output_dir, - bool opt_passes) + bool opt_passes, + bool use_single_precision = false) : mod_filename(mod_filename) , output_dir(output_dir) , opt_passes(opt_passes) + , use_single_precision(use_single_precision) , builder(*context) , fpm(module.get()) {} @@ -129,6 +134,12 @@ class CodegenLLVMVisitor: public visitor::ConstAstVisitor { */ unsigned get_array_index_or_length(const ast::IndexedName& node); + /** + * Returns 64-bit or 32-bit LLVM floating type + * \return \c LLVM floating point type according to `use_single_precision` flag + */ + llvm::Type* get_default_fp_type(); + /** * Create a function call to an external method * \param name external method name @@ -162,6 +173,40 @@ class CodegenLLVMVisitor: public visitor::ConstAstVisitor { return std::move(module); } + /** + * Visit nmodl arithmetic binary operator + * \param lhs LLVM value of evaluated lhs expression + * \param rhs LLVM value of evaluated rhs expression + * \param op the AST binary operator (ADD, DIV, MUL, SUB) + * \return LLVM IR value result + */ + llvm::Value* visit_arithmetic_bin_op(llvm::Value* lhs, llvm::Value* rhs, unsigned op); + + /** + * Visit nmodl assignment operator (ASSIGN) + * \param node the AST node representing the binary expression in NMODL + * \param rhs LLVM value of evaluated rhs expression + */ + void visit_assign_op(const ast::BinaryExpression& node, llvm::Value* rhs); + + /** + * Visit nmodl logical binary operator + * \param lhs LLVM value of evaluated lhs expression + * \param rhs LLVM value of evaluated rhs expression + * \param op the AST binary operator (AND, OR) + * \return LLVM IR value result + */ + llvm::Value* visit_logical_bin_op(llvm::Value* lhs, llvm::Value* rhs, unsigned op); + + /** + * Visit nmodl comparison binary operator + * \param lhs LLVM value of evaluated lhs expression + * \param rhs LLVM value of evaluated rhs expression + * \param op the AST binary operator (EXACT_EQUAL, GREATER, GREATER_EQUAL, LESS, LESS_EQUAL, + * NOT_EQUAL) \return LLVM IR value result + */ + llvm::Value* visit_comparison_bin_op(llvm::Value* lhs, llvm::Value* rhs, unsigned op); + /** * Visit nmodl function or procedure * \param node the AST node representing the function or procedure in NMODL diff --git a/src/main.cpp b/src/main.cpp index 53ff1f0f47..035189f4cc 100644 --- a/src/main.cpp +++ b/src/main.cpp @@ -167,6 +167,9 @@ int main(int argc, const char* argv[]) { /// generate llvm IR bool llvm_ir(false); + /// use single precision floating-point types + bool llvm_float_type(false); + /// run llvm optimisation passes bool llvm_opt_passes(false); #endif @@ -282,6 +285,9 @@ int main(int argc, const char* argv[]) { llvm_opt->add_flag("--opt", llvm_opt_passes, "Run LLVM optimisation passes ({})"_format(llvm_opt_passes))->ignore_case(); + llvm_opt->add_flag("--single-precision", + llvm_float_type, + "Use single precision floating-point types ({})"_format(llvm_float_type))->ignore_case(); #endif // clang-format on @@ -577,7 +583,7 @@ int main(int argc, const char* argv[]) { #ifdef NMODL_LLVM_BACKEND if (llvm_ir) { logger->info("Running LLVM backend code generator"); - CodegenLLVMVisitor visitor(modfile, output_dir, llvm_opt_passes); + CodegenLLVMVisitor visitor(modfile, output_dir, llvm_opt_passes, llvm_float_type); visitor.visit_program(*ast); ast_to_nmodl(*ast, 
filepath("llvm"));
     }
 
diff --git a/test/unit/CMakeLists.txt b/test/unit/CMakeLists.txt
index 65adeab88e..ef24242b69 100644
--- a/test/unit/CMakeLists.txt
+++ b/test/unit/CMakeLists.txt
@@ -97,7 +97,7 @@ target_link_libraries(
 
 if(NMODL_ENABLE_LLVM)
   include_directories(${LLVM_INCLUDE_DIRS})
-  add_executable(testllvm visitor/main.cpp codegen/llvm.cpp)
+  add_executable(testllvm visitor/main.cpp codegen/codegen_llvm_ir.cpp)
   add_executable(test_llvm_runner visitor/main.cpp codegen/codegen_llvm_execution.cpp)
   target_link_libraries(
     testllvm
diff --git a/test/unit/codegen/llvm.cpp b/test/unit/codegen/codegen_llvm_ir.cpp
similarity index 95%
rename from test/unit/codegen/llvm.cpp
rename to test/unit/codegen/codegen_llvm_ir.cpp
index d644947e79..e44b2b15cd 100644
--- a/test/unit/codegen/llvm.cpp
+++ b/test/unit/codegen/codegen_llvm_ir.cpp
@@ -22,13 +22,18 @@ using nmodl::parser::NmodlDriver;
 // Utility to get LLVM module as a string
 //=============================================================================
 
-std::string run_llvm_visitor(const std::string& text, bool opt = false) {
+std::string run_llvm_visitor(const std::string& text,
+                             bool opt = false,
+                             bool use_single_precision = false) {
     NmodlDriver driver;
     const auto& ast = driver.parse_string(text);
 
     SymtabVisitor().visit_program(*ast);
-    codegen::CodegenLLVMVisitor llvm_visitor("unknown", ".", opt);
+    codegen::CodegenLLVMVisitor llvm_visitor(/*mod_filename=*/"unknown",
+                                             /*output_dir=*/".",
+                                             opt,
+                                             use_single_precision);
     llvm_visitor.visit_program(*ast);
     return llvm_visitor.print_module();
 }
@@ -47,14 +52,15 @@ SCENARIO("Binary expression", "[visitor][llvm]") {
         )";
 
         THEN("variables are loaded and add instruction is created") {
-            std::string module_string = run_llvm_visitor(nmodl_text);
+            std::string module_string =
+                run_llvm_visitor(nmodl_text, /*opt=*/false, /*use_single_precision=*/true);
             std::smatch m;
 
-            std::regex rhs(R"(%1 = load double, double\* %b)");
-            std::regex lhs(R"(%2 = load double, double\* %a)");
-            std::regex res(R"(%3 = fadd double %2, %1)");
+            std::regex rhs(R"(%1 = load float, float\* %b)");
+            std::regex lhs(R"(%2 = load float, float\* %a)");
+            std::regex res(R"(%3 = fadd float %2, %1)");
 
-            // Check the values are loaded correctly and added
+            // Check the float values are loaded correctly and added
             REQUIRE(std::regex_search(module_string, m, rhs));
             REQUIRE(std::regex_search(module_string, m, lhs));
             REQUIRE(std::regex_search(module_string, m, res));

From eaeb7aa9bc13efc56d6db2aea92c33cefa09e617 Mon Sep 17 00:00:00 2001
From: Pramod Kumbhar
Date: Tue, 12 Jan 2021 10:55:23 +0100
Subject: [PATCH 016/331] Avoid converting LOCAL statement in all StatementBlocks (#492)

* visit_statement_block was called for every statement block of FUNCTION
  and PROCEDURE nodes, turning their LOCAL statements into DOUBLE
  statements
* As statement blocks don't need to be visited for this purpose, rename
  the function to convert_local_statement
* Call convert_local_statement only when required, i.e. at codegen
  function creation time.
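To make the intent concrete, the call pattern after this change is as
follows (condensed from the diff below; an illustrative sketch, not the
complete function body):

    void CodegenLLVMHelperVisitor::create_function_for_node(ast::Block& node) {
        auto block = node.get_statement_block()->clone();
        /// LOCAL a, b is rewritten into a typed CodegenVarListStatement
        /// (conceptually DOUBLE a, b), exactly once per generated function
        convert_local_statement(*block);
        // ... the rest of the codegen function is built from the converted block
    }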
fixes #491
---
 src/codegen/llvm/codegen_llvm_helper_visitor.cpp | 8 +++++++-
 src/codegen/llvm/codegen_llvm_helper_visitor.hpp | 3 ++-
 2 files changed, 9 insertions(+), 2 deletions(-)

diff --git a/src/codegen/llvm/codegen_llvm_helper_visitor.cpp b/src/codegen/llvm/codegen_llvm_helper_visitor.cpp
index 341ab03fb6..4dec93c52e 100644
--- a/src/codegen/llvm/codegen_llvm_helper_visitor.cpp
+++ b/src/codegen/llvm/codegen_llvm_helper_visitor.cpp
@@ -123,6 +123,9 @@ void CodegenLLVMHelperVisitor::create_function_for_node(ast::Block& node) {
     auto block = node.get_statement_block()->clone();
     const auto& statements = block->get_statements();
 
+    /// convert local statement to codegenvar statement
+    convert_local_statement(*block);
+
     /// insert return variable at the start of the block
     ast::CodegenVarVector codegen_vars;
     codegen_vars.emplace_back(new ast::CodegenVar(0, return_var->clone()));
@@ -356,7 +359,7 @@ void CodegenLLVMHelperVisitor::convert_to_instance_variable(ast::Node& node,
  * first statement in the vector. We have to remove LOCAL statement and convert
  * it to CodegenVarListStatement that will represent all variables as double.
  */
-void CodegenLLVMHelperVisitor::visit_statement_block(ast::StatementBlock& node) {
+void CodegenLLVMHelperVisitor::convert_local_statement(ast::StatementBlock& node) {
     /// first process all children blocks if any
     node.visit_children(*this);
@@ -475,6 +478,9 @@ void CodegenLLVMHelperVisitor::visit_nrn_state_block(ast::NrnStateBlock& node) {
     /// convert all variables inside loop body to instance variables
     convert_to_instance_variable(*loop_block, loop_index_var);
 
+    /// convert local statement to codegenvar statement
+    convert_local_statement(*loop_block);
+
     /// create for loop node
     auto for_loop_statement = std::make_shared<ast::CodegenForStatement>(initialization,
                                                                          condition,
diff --git a/src/codegen/llvm/codegen_llvm_helper_visitor.hpp b/src/codegen/llvm/codegen_llvm_helper_visitor.hpp
index 6b1684e7d1..1db659c1b4 100644
--- a/src/codegen/llvm/codegen_llvm_helper_visitor.hpp
+++ b/src/codegen/llvm/codegen_llvm_helper_visitor.hpp
@@ -75,7 +75,8 @@ class CodegenLLVMHelperVisitor: public visitor::AstVisitor {
     void convert_to_instance_variable(ast::Node& node, std::string& index_var);
 
-    void visit_statement_block(ast::StatementBlock& node) override;
+    void convert_local_statement(ast::StatementBlock& node);
+
     void visit_procedure_block(ast::ProcedureBlock& node) override;
     void visit_function_block(ast::FunctionBlock& node) override;
     void visit_nrn_state_block(ast::NrnStateBlock& node) override;

From 6d60ca954c5581e49f026c34f2c842ef8c926540 Mon Sep 17 00:00:00 2001
From: Pramod Kumbhar
Date: Wed, 13 Jan 2021 11:55:57 +0100
Subject: [PATCH 017/331] Handle CodegenVarType type in JSON printer (#494)

* Handle CodegenVarType type in JSON printer
  - As AstNodeType is both an enum type and an AST node itself, its
    value needs to be printed explicitly
* Indent json visitor jinja template
  - initially the template was not indented because the generated code
    did not look good
  - now all generated code is automatically clang-formatted, so this is
    less of a concern. Readability is important.
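For the CodegenVarType node, the template above should expand to roughly
the following visitor (a hand-written sketch of the expected generated
code, using only the printer calls visible in the template):

    void JSONVisitor::visit_codegen_var_type(const CodegenVarType& node) {
        printer->push_block(node.get_node_type_name());
        if (embed_nmodl) {
            printer->add_block_property("nmodl", to_nmodl(node));
        }
        node.visit_children(*this);
        // the enum payload is not a child node, so print it explicitly
        printer->add_node(ast::to_string(node.get_type()));
        printer->pop_block();
    }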
fixes #493 --- src/language/node_info.py | 1 + src/language/nodes.py | 4 ++ .../templates/visitors/json_visitor.cpp | 47 +++++++++++-------- 3 files changed, 32 insertions(+), 20 deletions(-) diff --git a/src/language/node_info.py b/src/language/node_info.py index bd81a0d14a..8b4e5fe0a2 100644 --- a/src/language/node_info.py +++ b/src/language/node_info.py @@ -169,6 +169,7 @@ STRING_NODE = "String" UNIT_BLOCK = "UnitBlock" AST_NODETYPE_NODE= "AstNodeType" +CODEGEN_VAR_TYPE_NODE = "CodegenVarType" # name of variable in prime node which represent order of derivative ORDER_VAR_NAME = "order" diff --git a/src/language/nodes.py b/src/language/nodes.py index 05f53f3b97..4f96659569 100644 --- a/src/language/nodes.py +++ b/src/language/nodes.py @@ -151,6 +151,10 @@ def is_name_node(self): def is_ast_nodetype_node(self): return self.class_name == node_info.AST_NODETYPE_NODE + @property + def is_codegen_var_type_node(self): + return self.class_name == node_info.CODEGEN_VAR_TYPE_NODE + @property def is_enum_node(self): data_type = node_info.DATA_TYPES[self.class_name] diff --git a/src/language/templates/visitors/json_visitor.cpp b/src/language/templates/visitors/json_visitor.cpp index e96bcbf10c..2a0c6d68a9 100644 --- a/src/language/templates/visitors/json_visitor.cpp +++ b/src/language/templates/visitors/json_visitor.cpp @@ -22,33 +22,40 @@ using namespace ast; {% for node in nodes %} void JSONVisitor::visit_{{ node.class_name|snake_case }}(const {{ node.class_name }}& node) { {% if node.has_children() %} - printer->push_block(node.get_node_type_name()); - if (embed_nmodl) { - printer->add_block_property("nmodl", to_nmodl(node)); - } - node.visit_children(*this); - {% if node.is_data_type_node %} + printer->push_block(node.get_node_type_name()); + if (embed_nmodl) { + printer->add_block_property("nmodl", to_nmodl(node)); + } + node.visit_children(*this); + {% if node.is_data_type_node %} {% if node.is_integer_node %} - if(!node.get_macro()) { - std::stringstream ss; - ss << node.eval(); - printer->add_node(ss.str()); - } + if(!node.get_macro()) { + std::stringstream ss; + ss << node.eval(); + printer->add_node(ss.str()); + } {% else %} - std::stringstream ss; - ss << node.eval(); - printer->add_node(ss.str()); + std::stringstream ss; + ss << node.eval(); + printer->add_node(ss.str()); {% endif %} {% endif %} - printer->pop_block(); + + {% if node.is_codegen_var_type_node %} + printer->add_node(ast::to_string(node.get_type())); + {% endif %} + + printer->pop_block(); + {% if node.is_program_node %} - if (node.get_parent() == nullptr) { - flush(); - } + if (node.get_parent() == nullptr) { + flush(); + } {% endif %} + {% else %} - (void)node; - printer->add_node("{{ node.class_name }}"); + (void)node; + printer->add_node("{{ node.class_name }}"); {% endif %} } From 5b32b31e699f707701b5a30955dc74d01ac63ada Mon Sep 17 00:00:00 2001 From: George Mitenkov Date: Mon, 25 Jan 2021 15:59:04 +0300 Subject: [PATCH 018/331] Integrating LLVM helper into LLVM visitor (#497) * LLVM Helper visitor now can return a vector of `CodegenFunction`s. * LLVM Helper visitor has been integrated into LLVM visitor: - The type of variables is still double by default, but can also be inferred from `CodegenVarType` node. - Procedure's return type changed to int (so that error codes can be returned in the future). - New visitor functions added: for `CodegenReturn`, `CodegenFunction`, `CodegenVarList` and `CodegenVarType`. 
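As a concrete example of the new int-returning procedures, an empty
PROCEDURE is now expected to lower to IR of roughly this shape
(assembled from the regexes in the updated unit tests below):

    define i32 @empty() {
      %ret_empty = alloca i32          ; implicit return variable
      store i32 0, i32* %ret_empty     ; initialised to 0 (the error code)
      %1 = load i32, i32* %ret_empty
      ret i32 %1
    }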
---
 .../llvm/codegen_llvm_helper_visitor.cpp      |   6 +-
 .../llvm/codegen_llvm_helper_visitor.hpp      |   8 +-
 src/codegen/llvm/codegen_llvm_visitor.cpp     | 234 ++++++++++--------
 src/codegen/llvm/codegen_llvm_visitor.hpp     |  17 +-
 test/unit/codegen/codegen_llvm_ir.cpp         |  39 ++-
 5 files changed, 177 insertions(+), 127 deletions(-)

diff --git a/src/codegen/llvm/codegen_llvm_helper_visitor.cpp b/src/codegen/llvm/codegen_llvm_helper_visitor.cpp
index 4dec93c52e..751fecfc81 100644
--- a/src/codegen/llvm/codegen_llvm_helper_visitor.cpp
+++ b/src/codegen/llvm/codegen_llvm_helper_visitor.cpp
@@ -71,6 +71,11 @@ std::shared_ptr<ast::Expression> create_expression(const std::string& code) {
     return std::make_shared<ast::WrappedExpression>(rhs->clone());
 }
 
+CodegenFunctionVector CodegenLLVMHelperVisitor::get_codegen_functions(const ast::Program& node) {
+    const_cast<ast::Program&>(node).accept(*this);
+    return codegen_functions;
+}
+
 /**
  * \brief Add code generation function for FUNCTION or PROCEDURE block
  * @param node AST node representing FUNCTION or PROCEDURE
  *
@@ -98,7 +103,6 @@ std::shared_ptr<ast::Expression> create_expression(const std::string& code) {
  *
  * We perform following transformations so that code generation backends
  * will have minimum logic:
- *  - Add return type
  *  - Add type for the function arguments
  *  - Define variables and return variable
  *  - Add return type (int for PROCEDURE and double for FUNCTION)
diff --git a/src/codegen/llvm/codegen_llvm_helper_visitor.hpp b/src/codegen/llvm/codegen_llvm_helper_visitor.hpp
index 1db659c1b4..0ec3792b9d 100644
--- a/src/codegen/llvm/codegen_llvm_helper_visitor.hpp
+++ b/src/codegen/llvm/codegen_llvm_helper_visitor.hpp
@@ -21,6 +21,9 @@
 namespace nmodl {
 namespace codegen {
+
+typedef std::vector<std::shared_ptr<ast::CodegenFunction>> CodegenFunctionVector;
+
 /**
  * @addtogroup llvm_codegen_details
  * @{
@@ -46,7 +49,7 @@ namespace codegen {
  */
 class CodegenLLVMHelperVisitor: public visitor::AstVisitor {
     /// newly generated code generation specific functions
-    std::vector<std::shared_ptr<ast::CodegenFunction>> codegen_functions;
+    CodegenFunctionVector codegen_functions;
 
     /// ast information for code generation
     codegen::CodegenInfo info;
@@ -61,6 +64,9 @@ class CodegenLLVMHelperVisitor: public visitor::AstVisitor {
   public:
     CodegenLLVMHelperVisitor() = default;
 
+    /// run visitor and return code generation functions
+    CodegenFunctionVector get_codegen_functions(const ast::Program& node);
+
     void ion_read_statements(BlockType type,
                              std::vector<std::string>& int_variables,
                              std::vector<std::string>& double_variables,
diff --git a/src/codegen/llvm/codegen_llvm_visitor.cpp b/src/codegen/llvm/codegen_llvm_visitor.cpp
index 6f134149e3..2d762c0e92 100644
--- a/src/codegen/llvm/codegen_llvm_visitor.cpp
+++ b/src/codegen/llvm/codegen_llvm_visitor.cpp
@@ -9,7 +9,6 @@
 #include "codegen/llvm/codegen_llvm_helper_visitor.hpp"
 
 #include "ast/all.hpp"
-#include "codegen/codegen_helper_visitor.hpp"
 #include "visitors/rename_visitor.hpp"
 
 #include "llvm/IR/BasicBlock.h"
@@ -28,8 +27,10 @@ namespace codegen {
 /****************************************************************************************/
 
 bool CodegenLLVMVisitor::check_array_bounds(const ast::IndexedName& node, unsigned index) {
-    llvm::Type* array_type =
-        local_named_values->lookup(node.get_node_name())->getType()->getPointerElementType();
+    llvm::Type* array_type = current_func->getValueSymbolTable()
+                                 ->lookup(node.get_node_name())
+                                 ->getType()
+                                 ->getPointerElementType();
     unsigned length = array_type->getArrayNumElements();
     return 0 <= index && index < length;
 }
@@ -40,7 +41,7 @@ llvm::Value* CodegenLLVMVisitor::create_gep(const std::string& name, unsigned in
indices.push_back(llvm::ConstantInt::get(index_type, 0)); indices.push_back(llvm::ConstantInt::get(index_type, index)); - return builder.CreateInBoundsGEP(local_named_values->lookup(name), indices); + return builder.CreateInBoundsGEP(current_func->getValueSymbolTable()->lookup(name), indices); } llvm::Value* CodegenLLVMVisitor::codegen_indexed_name(const ast::IndexedName& node) { @@ -65,6 +66,21 @@ unsigned CodegenLLVMVisitor::get_array_index_or_length(const ast::IndexedName& i return static_cast(*macro->get_value()); } +llvm::Type* CodegenLLVMVisitor::get_codegen_var_type(const ast::CodegenVarType& node) { + switch (node.get_type()) { + case ast::AstNodeType::BOOLEAN: + return llvm::Type::getInt1Ty(*context); + case ast::AstNodeType::DOUBLE: + return get_default_fp_type(); + case ast::AstNodeType::INTEGER: + return llvm::Type::getInt32Ty(*context); + case ast::AstNodeType::VOID: + return llvm::Type::getVoidTy(*context); + default: + throw std::runtime_error("Error: expecting a type in CodegenVarType node\n"); + } +} + llvm::Type* CodegenLLVMVisitor::get_default_fp_type() { if (use_single_precision) return llvm::Type::getFloatTy(*context); @@ -138,18 +154,16 @@ void CodegenLLVMVisitor::create_function_call(llvm::Function* func, values.push_back(call); } -void CodegenLLVMVisitor::emit_procedure_or_function_declaration(const ast::Block& node) { +void CodegenLLVMVisitor::emit_procedure_or_function_declaration(const ast::CodegenFunction& node) { const auto& name = node.get_node_name(); - const auto& parameters = node.get_parameters(); + const auto& arguments = node.get_arguments(); // Procedure or function parameters are doubles by default. std::vector arg_types; - for (size_t i = 0; i < parameters.size(); ++i) - arg_types.push_back(get_default_fp_type()); + for (size_t i = 0; i < arguments.size(); ++i) + arg_types.push_back(get_codegen_var_type(*arguments[i]->get_type())); - // If visiting a function, the return type is a double by default. - llvm::Type* return_type = node.is_function_block() ? get_default_fp_type() - : llvm::Type::getVoidTy(*context); + llvm::Type* return_type = get_codegen_var_type(*node.get_return_type()); // Create a function that is automatically inserted into module's symbol table. llvm::Function::Create(llvm::FunctionType::get(return_type, arg_types, /*isVarArg=*/false), @@ -194,7 +208,7 @@ void CodegenLLVMVisitor::visit_assign_op(const ast::BinaryExpression& node, llvm const auto& identifier = var->get_name(); if (identifier->is_name()) { - llvm::Value* alloca = local_named_values->lookup(var->get_node_name()); + llvm::Value* alloca = current_func->getValueSymbolTable()->lookup(var->get_node_name()); builder.CreateStore(rhs, alloca); } else if (identifier->is_indexed_name()) { auto indexed_name = std::dynamic_pointer_cast(identifier); @@ -242,62 +256,6 @@ llvm::Value* CodegenLLVMVisitor::visit_comparison_bin_op(llvm::Value* lhs, } } -void CodegenLLVMVisitor::visit_procedure_or_function(const ast::Block& node) { - const auto& name = node.get_node_name(); - const auto& parameters = node.get_parameters(); - llvm::Function* func = module->getFunction(name); - - // Create the entry basic block of the function/procedure and point the local named values table - // to the symbol table. - llvm::BasicBlock* body = llvm::BasicBlock::Create(*context, /*Name=*/"", func); - builder.SetInsertPoint(body); - local_named_values = func->getValueSymbolTable(); - - // When processing a function, it returns a value named in NMODL. 
Therefore, we - // first run RenameVisitor to rename it into ret_. This will aid in avoiding - // symbolic conflicts. Then, allocate the return variable on the local stack. - std::string return_var_name = "ret_" + name; - const auto& block = node.get_statement_block(); - if (node.is_function_block()) { - visitor::RenameVisitor v(name, return_var_name); - block->accept(v); - builder.CreateAlloca(llvm::Type::getDoubleTy(*context), - /*ArraySize=*/nullptr, - return_var_name); - } - - // Allocate parameters on the stack and add them to the symbol table. - unsigned i = 0; - for (auto& arg: func->args()) { - std::string arg_name = parameters[i++].get()->get_node_name(); - llvm::Value* alloca = builder.CreateAlloca(arg.getType(), /*ArraySize=*/nullptr, arg_name); - arg.setName(arg_name); - builder.CreateStore(&arg, alloca); - } - - // Process function or procedure body. - const auto& statements = block->get_statements(); - for (const auto& statement: statements) { - // \todo: Support other statement types. - if (statement->is_local_list_statement() || statement->is_expression_statement()) - statement->accept(*this); - } - - // Add the terminator. If visiting function, we need to return the value specified by - // ret_. - if (node.is_function_block()) { - llvm::Value* return_var = builder.CreateLoad(local_named_values->lookup(return_var_name)); - builder.CreateRet(return_var); - } else { - builder.CreateRetVoid(); - } - - // Clear local values stack and remove the pointer to the local symbol table. - values.clear(); - local_named_values = nullptr; -} - - /****************************************************************************************/ /* Overloaded visitor routines */ /****************************************************************************************/ @@ -353,13 +311,101 @@ void CodegenLLVMVisitor::visit_boolean(const ast::Boolean& node) { values.push_back(constant); } +void CodegenLLVMVisitor::visit_codegen_function(const ast::CodegenFunction& node) { + const auto& name = node.get_node_name(); + const auto& arguments = node.get_arguments(); + llvm::Function* func = module->getFunction(name); + current_func = func; + + // Create the entry basic block of the function/procedure and point the local named values table + // to the symbol table. + llvm::BasicBlock* body = llvm::BasicBlock::Create(*context, /*Name=*/"", func); + builder.SetInsertPoint(body); + + // When processing a function, it returns a value named in NMODL. Therefore, we + // first run RenameVisitor to rename it into ret_. This will aid in avoiding + // symbolic conflicts. + std::string return_var_name = "ret_" + name; + const auto& block = node.get_statement_block(); + visitor::RenameVisitor v(name, return_var_name); + block->accept(v); + + + // Allocate parameters on the stack and add them to the symbol table. + unsigned i = 0; + for (auto& arg: func->args()) { + std::string arg_name = arguments[i++].get()->get_node_name(); + llvm::Value* alloca = builder.CreateAlloca(arg.getType(), /*ArraySize=*/nullptr, arg_name); + arg.setName(arg_name); + builder.CreateStore(&arg, alloca); + } + + // Process function or procedure body. The return statement is handled in a separate visitor. + const auto& statements = block->get_statements(); + for (const auto& statement: statements) { + // \todo: Support other statement types. 
+ if (statement->is_codegen_var_list_statement() || statement->is_expression_statement() || + statement->is_codegen_return_statement()) + statement->accept(*this); + } + + // If function has a void return type, add a terminator not handled by CodegenReturnVar. + if (node.is_void()) + builder.CreateRetVoid(); + + // Clear local values stack and remove the pointer to the local symbol table. + values.clear(); + current_func = nullptr; +} + +void CodegenLLVMVisitor::visit_codegen_return_statement(const ast::CodegenReturnStatement& node) { + if (!node.get_statement()->is_name()) + throw std::runtime_error("Error: CodegenReturnStatement must contain a name node\n"); + + std::string ret = "ret_" + current_func->getName().str(); + llvm::Value* ret_value = builder.CreateLoad(current_func->getValueSymbolTable()->lookup(ret)); + builder.CreateRet(ret_value); +} + +void CodegenLLVMVisitor::visit_codegen_var_list_statement( + const ast::CodegenVarListStatement& node) { + llvm::Type* scalar_var_type = get_codegen_var_type(*node.get_var_type()); + for (const auto& variable: node.get_variables()) { + std::string name = variable->get_node_name(); + const auto& identifier = variable->get_name(); + // Local variable can be a scalar (Node AST class) or an array (IndexedName AST class). For + // each case, create memory allocations with the corresponding LLVM type. + llvm::Type* var_type; + if (identifier->is_indexed_name()) { + auto indexed_name = std::dynamic_pointer_cast(identifier); + unsigned length = get_array_index_or_length(*indexed_name); + var_type = llvm::ArrayType::get(scalar_var_type, length); + } else if (identifier->is_name()) { + // This case corresponds to a scalar local variable. Its type is double by default. + var_type = scalar_var_type; + } else { + throw std::runtime_error("Error: Unsupported local variable type"); + } + llvm::Value* alloca = builder.CreateAlloca(var_type, /*ArraySize=*/nullptr, name); + + // Check if the variable we process is a procedure return variable (i.e. it has a name + // "ret_" and the function return type is integer). If so, initialise + // it to 0. + std::string ret_val_name = "ret_" + current_func->getName().str(); + if (name == ret_val_name && current_func->getReturnType()->isIntegerTy()) { + llvm::Value* zero = llvm::ConstantInt::get(llvm::Type::getInt32Ty(*context), 0); + builder.CreateStore(zero, alloca); + } + } +} + void CodegenLLVMVisitor::visit_double(const ast::Double& node) { const auto& constant = llvm::ConstantFP::get(get_default_fp_type(), node.get_value()); values.push_back(constant); } void CodegenLLVMVisitor::visit_function_block(const ast::FunctionBlock& node) { - visit_procedure_or_function(node); + // do nothing. \todo: remove old function blocks from ast. } void CodegenLLVMVisitor::visit_function_call(const ast::FunctionCall& node) { @@ -384,41 +430,19 @@ void CodegenLLVMVisitor::visit_integer(const ast::Integer& node) { values.push_back(constant); } -void CodegenLLVMVisitor::visit_local_list_statement(const ast::LocalListStatement& node) { - for (const auto& variable: node.get_variables()) { - std::string name = variable->get_node_name(); - const auto& identifier = variable->get_name(); - // Local variable can be a scalar (Node AST class) or an array (IndexedName AST class). For - // each case, create memory allocations with the corresponding LLVM type. 
- llvm::Type* var_type; - if (identifier->is_indexed_name()) { - auto indexed_name = std::dynamic_pointer_cast(identifier); - unsigned length = get_array_index_or_length(*indexed_name); - var_type = llvm::ArrayType::get(get_default_fp_type(), length); - } else if (identifier->is_name()) { - // This case corresponds to a scalar local variable. Its type is double by default. - var_type = get_default_fp_type(); - } else { - throw std::runtime_error("Error: Unsupported local variable type"); - } - builder.CreateAlloca(var_type, /*ArraySize=*/nullptr, name); - } -} - void CodegenLLVMVisitor::visit_program(const ast::Program& node) { - // Before generating LLVM, gather information about AST. For now, information about functions - // and procedures is used only. - CodegenHelperVisitor v; - CodegenInfo info = v.analyze(node); - - // For every function and procedure, generate its declaration. Thus, we can look up + // Before generating LLVM: + // - convert function and procedure blocks into CodegenFunctions + // - gather information about AST. For now, information about functions + // and procedures is used only. + CodegenLLVMHelperVisitor v; + const auto& functions = v.get_codegen_functions(node); + + // For every function, generate its declaration. Thus, we can look up // `llvm::Function` in the symbol table in the module. - for (const auto& func: info.functions) { + for (const auto& func: functions) { emit_procedure_or_function_declaration(*func); } - for (const auto& proc: info.procedures) { - emit_procedure_or_function_declaration(*proc); - } // Set the AST symbol table. sym_tab = node.get_symbol_table(); @@ -433,16 +457,10 @@ void CodegenLLVMVisitor::visit_program(const ast::Program& node) { // Keep this for easier development (maybe move to debug mode later). std::cout << print_module(); - - // not used yet : this will be used at the beginning of this function - { - CodegenLLVMHelperVisitor v; - v.visit_program(const_cast(node)); - } } void CodegenLLVMVisitor::visit_procedure_block(const ast::ProcedureBlock& node) { - visit_procedure_or_function(node); + // do nothing. \todo: remove old procedures from ast. } void CodegenLLVMVisitor::visit_unary_expression(const ast::UnaryExpression& node) { @@ -466,7 +484,7 @@ void CodegenLLVMVisitor::visit_var_name(const ast::VarName& node) { llvm::Value* ptr; if (identifier->is_name()) - ptr = local_named_values->lookup(node.get_node_name()); + ptr = current_func->getValueSymbolTable()->lookup(node.get_node_name()); if (identifier->is_indexed_name()) { auto indexed_name = std::dynamic_pointer_cast(identifier); diff --git a/src/codegen/llvm/codegen_llvm_visitor.hpp b/src/codegen/llvm/codegen_llvm_visitor.hpp index 066bdf35e3..c6123a040d 100644 --- a/src/codegen/llvm/codegen_llvm_visitor.hpp +++ b/src/codegen/llvm/codegen_llvm_visitor.hpp @@ -67,8 +67,8 @@ class CodegenLLVMVisitor: public visitor::ConstAstVisitor { // Stack to hold visited values std::vector values; - // Pointer to the local symbol table. - llvm::ValueSymbolTable* local_named_values = nullptr; + // Pointer to the current function. + llvm::Function* current_func = nullptr; // Pointer to AST symbol table. 
symtab::SymbolTable* sym_tab; @@ -134,6 +134,13 @@ class CodegenLLVMVisitor: public visitor::ConstAstVisitor { */ unsigned get_array_index_or_length(const ast::IndexedName& node); + /** + * Returns LLVM type for the given CodegenVarType node + * \param node CodegenVarType + * \return LLVM type + */ + llvm::Type* get_codegen_var_type(const ast::CodegenVarType& node); + /** * Returns 64-bit or 32-bit LLVM floating type * \return \c LLVM floating point type according to `use_single_precision` flag @@ -163,7 +170,7 @@ class CodegenLLVMVisitor: public visitor::ConstAstVisitor { * * \param node the AST node representing the function or procedure in NMODL */ - void emit_procedure_or_function_declaration(const ast::Block& node); + void emit_procedure_or_function_declaration(const ast::CodegenFunction& node); /** * Return module pointer @@ -216,11 +223,13 @@ class CodegenLLVMVisitor: public visitor::ConstAstVisitor { // Visitors void visit_binary_expression(const ast::BinaryExpression& node) override; void visit_boolean(const ast::Boolean& node) override; + void visit_codegen_function(const ast::CodegenFunction& node) override; + void visit_codegen_return_statement(const ast::CodegenReturnStatement& node) override; + void visit_codegen_var_list_statement(const ast::CodegenVarListStatement& node) override; void visit_double(const ast::Double& node) override; void visit_function_block(const ast::FunctionBlock& node) override; void visit_function_call(const ast::FunctionCall& node) override; void visit_integer(const ast::Integer& node) override; - void visit_local_list_statement(const ast::LocalListStatement& node) override; void visit_procedure_block(const ast::ProcedureBlock& node) override; void visit_program(const ast::Program& node) override; void visit_unary_expression(const ast::UnaryExpression& node) override; diff --git a/test/unit/codegen/codegen_llvm_ir.cpp b/test/unit/codegen/codegen_llvm_ir.cpp index e44b2b15cd..c328113f93 100644 --- a/test/unit/codegen/codegen_llvm_ir.cpp +++ b/test/unit/codegen/codegen_llvm_ir.cpp @@ -198,12 +198,12 @@ SCENARIO("Function call", "[visitor][llvm]") { } )"; - THEN("a void call instruction is created") { + THEN("an int call instruction is created") { std::string module_string = run_llvm_visitor(nmodl_text); std::smatch m; // Check for call instruction. - std::regex call(R"(call void @bar\(\))"); + std::regex call(R"(call i32 @bar\(\))"); REQUIRE(std::regex_search(module_string, m, call)); } } @@ -408,13 +408,20 @@ SCENARIO("Procedure", "[visitor][llvm]") { PROCEDURE empty() {} )"; - THEN("empty void function is produced") { + THEN("a function returning 0 integer is produced") { std::string module_string = run_llvm_visitor(nmodl_text); std::smatch m; - // Check procedure has empty body with a void return. - std::regex procedure(R"(define void @empty\(\) \{\n(\s)*ret void\n\})"); - REQUIRE(std::regex_search(module_string, m, procedure)); + // Check procedure has empty body with a dummy 0 allocation. 
+ std::regex signature(R"(define i32 @empty)"); + std::regex alloc(R"(%ret_empty = alloca i32)"); + std::regex store(R"(store i32 0, i32\* %ret_empty)"); + std::regex load(R"(%1 = load i32, i32\* %ret_empty)"); + std::regex ret(R"(ret i32 %1)"); + REQUIRE(std::regex_search(module_string, m, signature)); + REQUIRE(std::regex_search(module_string, m, alloc)); + REQUIRE(std::regex_search(module_string, m, store)); + REQUIRE(std::regex_search(module_string, m, ret)); } } @@ -423,23 +430,29 @@ SCENARIO("Procedure", "[visitor][llvm]") { PROCEDURE with_argument(x) {} )"; - THEN("void function is produced with arguments allocated on stack") { + THEN("int function is produced with arguments allocated on stack") { std::string module_string = run_llvm_visitor(nmodl_text); std::smatch m; // Check procedure signature. - std::regex function_signature(R"(define void @with_argument\(double %x1\) \{)"); + std::regex function_signature(R"(define i32 @with_argument\(double %x1\) \{)"); REQUIRE(std::regex_search(module_string, m, function_signature)); + // Check dummy return. + std::regex dummy_alloca(R"(%ret_with_argument = alloca i32)"); + std::regex dummy_store(R"(store i32 0, i32\* %ret_with_argument)"); + std::regex dummy_load(R"(%1 = load i32, i32\* %ret_with_argument)"); + std::regex ret(R"(ret i32 %1)"); + REQUIRE(std::regex_search(module_string, m, dummy_alloca)); + REQUIRE(std::regex_search(module_string, m, dummy_store)); + REQUIRE(std::regex_search(module_string, m, dummy_load)); + REQUIRE(std::regex_search(module_string, m, ret)); + // Check that procedure arguments are allocated on the local stack. std::regex alloca_instr(R"(%x = alloca double)"); std::regex store_instr(R"(store double %x1, double\* %x)"); REQUIRE(std::regex_search(module_string, m, alloca_instr)); REQUIRE(std::regex_search(module_string, m, store_instr)); - - // Check terminator. - std::regex terminator(R"(ret void)"); - REQUIRE(std::regex_search(module_string, m, terminator)); } } } @@ -493,7 +506,7 @@ SCENARIO("Dead code removal", "[visitor][llvm][opt]") { // Check if the values are optimised out std::regex empty_proc( - R"(define void @add\(double %a1, double %b2\) \{\n(\s)*ret void\n\})"); + R"(define i32 @add\(double %a1, double %b2\) \{\n(\s)*ret i32 0\n\})"); REQUIRE(std::regex_search(module_string, m, empty_proc)); } } From bc305bac69cd1b70af13ff168f8ce264b88dd46a Mon Sep 17 00:00:00 2001 From: George Mitenkov Date: Mon, 25 Jan 2021 17:06:22 +0300 Subject: [PATCH 019/331] LLVM code generation for if/else statements (#499) * Added a new code generation function for conditional statements (`if`, `else if`, `else` and their nested variations). * Added tests for the new code generation: - IR unit tests. - Execution tests. * Fixed FP and integer comparison ordering in macros. 
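The comparison ordering bug was subtle: the DISPATCH macro declared its
parameters as (f_llvm_op, i_llvm_op), while every call site passed the
integer builder first (e.g. CreateICmpEQ before CreateFCmpOEQ), so
floating-point operands were silently compared with integer
instructions. After the swap in this patch, the macro matches its call
sites:

    #define DISPATCH(binary_op, i_llvm_op, f_llvm_op)            \
        case binary_op:                                          \
            if (lhs_type->isDoubleTy() || lhs_type->isFloatTy()) \
                result = f_llvm_op(lhs, rhs);                    \
            else                                                 \
                result = i_llvm_op(lhs, rhs);                    \
            return result;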
fixes #468 --- src/codegen/llvm/codegen_llvm_visitor.cpp | 86 +++++++- src/codegen/llvm/codegen_llvm_visitor.hpp | 1 + test/unit/codegen/codegen_llvm_execution.cpp | 28 +++ test/unit/codegen/codegen_llvm_ir.cpp | 203 +++++++++++++++++++ 4 files changed, 314 insertions(+), 4 deletions(-) diff --git a/src/codegen/llvm/codegen_llvm_visitor.cpp b/src/codegen/llvm/codegen_llvm_visitor.cpp index 2d762c0e92..bde36f3dd4 100644 --- a/src/codegen/llvm/codegen_llvm_visitor.cpp +++ b/src/codegen/llvm/codegen_llvm_visitor.cpp @@ -26,6 +26,11 @@ namespace codegen { /* Helper routines */ /****************************************************************************************/ +static bool is_supported_statement(const ast::Statement& statement) { + return statement.is_codegen_var_list_statement() || statement.is_expression_statement() || + statement.is_codegen_return_statement() || statement.is_if_statement(); +} + bool CodegenLLVMVisitor::check_array_bounds(const ast::IndexedName& node, unsigned index) { llvm::Type* array_type = current_func->getValueSymbolTable() ->lookup(node.get_node_name()) @@ -234,7 +239,7 @@ llvm::Value* CodegenLLVMVisitor::visit_comparison_bin_op(llvm::Value* lhs, llvm::Value* result; switch (bin_op) { -#define DISPATCH(binary_op, f_llvm_op, i_llvm_op) \ +#define DISPATCH(binary_op, i_llvm_op, f_llvm_op) \ case binary_op: \ if (lhs_type->isDoubleTy() || lhs_type->isFloatTy()) \ result = f_llvm_op(lhs, rhs); \ @@ -343,9 +348,7 @@ void CodegenLLVMVisitor::visit_codegen_function(const ast::CodegenFunction& node // Process function or procedure body. The return statement is handled in a separate visitor. const auto& statements = block->get_statements(); for (const auto& statement: statements) { - // \todo: Support other statement types. - if (statement->is_codegen_var_list_statement() || statement->is_expression_statement() || - statement->is_codegen_return_statement()) + if (is_supported_statement(*statement)) statement->accept(*this); } @@ -424,6 +427,81 @@ void CodegenLLVMVisitor::visit_function_call(const ast::FunctionCall& node) { } } +void CodegenLLVMVisitor::visit_if_statement(const ast::IfStatement& node) { + // Get the current and the next blocks within the function. + llvm::BasicBlock* curr_block = builder.GetInsertBlock(); + llvm::BasicBlock* next = curr_block->getNextNode(); + llvm::Function* func = curr_block->getParent(); + + // Add a true block and a merge block where the control flow merges. + llvm::BasicBlock* true_block = llvm::BasicBlock::Create(*context, /*Name=*/"", func, next); + llvm::BasicBlock* merge_block = llvm::BasicBlock::Create(*context, /*Name=*/"", func, next); + + // Add condition to the current block. + node.get_condition()->accept(*this); + llvm::Value* cond = values.back(); + values.pop_back(); + + // Process the true block. + builder.SetInsertPoint(true_block); + for (const auto& statement: node.get_statement_block()->get_statements()) { + if (is_supported_statement(*statement)) + statement->accept(*this); + } + builder.CreateBr(merge_block); + + // Save the merge block and proceed with codegen for `else if` statements. + llvm::BasicBlock* exit = merge_block; + for (const auto& else_if: node.get_elseifs()) { + // Link the current block to the true and else blocks. + llvm::BasicBlock* else_block = + llvm::BasicBlock::Create(*context, /*Name=*/"", func, merge_block); + builder.SetInsertPoint(curr_block); + builder.CreateCondBr(cond, true_block, else_block); + + // Process else block. 
+ builder.SetInsertPoint(else_block); + else_if->get_condition()->accept(*this); + cond = values.back(); + values.pop_back(); + + // Reassign true and merge blocks respectively. Note that the new merge block has to be + // connected to the old merge block (tmp). + true_block = llvm::BasicBlock::Create(*context, /*Name=*/"", func, merge_block); + llvm::BasicBlock* tmp = merge_block; + merge_block = llvm::BasicBlock::Create(*context, /*Name=*/"", func, merge_block); + builder.SetInsertPoint(merge_block); + builder.CreateBr(tmp); + + // Process true block. + builder.SetInsertPoint(true_block); + for (const auto& statement: else_if->get_statement_block()->get_statements()) { + if (is_supported_statement(*statement)) + statement->accept(*this); + } + builder.CreateBr(merge_block); + curr_block = else_block; + } + + // Finally, generate code for `else` statement if it exists. + const auto& elses = node.get_elses(); + llvm::BasicBlock* else_block; + if (elses) { + else_block = llvm::BasicBlock::Create(*context, /*Name=*/"", func, merge_block); + builder.SetInsertPoint(else_block); + for (const auto& statement: elses->get_statement_block()->get_statements()) { + if (is_supported_statement(*statement)) + statement->accept(*this); + } + builder.CreateBr(merge_block); + } else { + else_block = merge_block; + } + builder.SetInsertPoint(curr_block); + builder.CreateCondBr(cond, true_block, else_block); + builder.SetInsertPoint(exit); +} + void CodegenLLVMVisitor::visit_integer(const ast::Integer& node) { const auto& constant = llvm::ConstantInt::get(llvm::Type::getInt32Ty(*context), node.get_value()); diff --git a/src/codegen/llvm/codegen_llvm_visitor.hpp b/src/codegen/llvm/codegen_llvm_visitor.hpp index c6123a040d..28129b2fb8 100644 --- a/src/codegen/llvm/codegen_llvm_visitor.hpp +++ b/src/codegen/llvm/codegen_llvm_visitor.hpp @@ -229,6 +229,7 @@ class CodegenLLVMVisitor: public visitor::ConstAstVisitor { void visit_double(const ast::Double& node) override; void visit_function_block(const ast::FunctionBlock& node) override; void visit_function_call(const ast::FunctionCall& node) override; + void visit_if_statement(const ast::IfStatement& node) override; void visit_integer(const ast::Integer& node) override; void visit_procedure_block(const ast::ProcedureBlock& node) override; void visit_program(const ast::Program& node) override; diff --git a/test/unit/codegen/codegen_llvm_execution.cpp b/test/unit/codegen/codegen_llvm_execution.cpp index 6f1bf7b8ca..34311bf2c3 100644 --- a/test/unit/codegen/codegen_llvm_execution.cpp +++ b/test/unit/codegen/codegen_llvm_execution.cpp @@ -114,6 +114,30 @@ SCENARIO("Optimised arithmetic expression", "[llvm][runner]") { arithmetic = x * y / (x + y) } + FUNCTION conditionals() { + LOCAL x, y, z + x = 100 + y = -100 + z = 0 + if (x == 200) { + conditionals = 1 + } else if (x == 400) { + conditionals = 2 + } else if (x == 100) { + if (y == -100 && z != 0) { + conditionals = 3 + } else { + if (y < -99 && z == 0) { + conditionals = 4 + } else { + conditionals = 5 + } + } + } else { + conditionals = 6 + } + } + FUNCTION bar() { LOCAL i, j i = 2 @@ -151,6 +175,10 @@ SCENARIO("Optimised arithmetic expression", "[llvm][runner]") { auto constant_result = runner.run("constant"); REQUIRE(fabs(constant_result - 10.0) < EPSILON); + // Check nested conditionals + auto conditionals_result = runner.run("conditionals"); + REQUIRE(fabs(conditionals_result - 4.0) < EPSILON); + // Check constant folding. 
auto arithmetic_result = runner.run("arithmetic"); REQUIRE(fabs(arithmetic_result - 2.1) < EPSILON); diff --git a/test/unit/codegen/codegen_llvm_ir.cpp b/test/unit/codegen/codegen_llvm_ir.cpp index c328113f93..292256193c 100644 --- a/test/unit/codegen/codegen_llvm_ir.cpp +++ b/test/unit/codegen/codegen_llvm_ir.cpp @@ -147,6 +147,209 @@ SCENARIO("Define", "[visitor][llvm]") { } } +//============================================================================= +// If/Else statements and comparison operators +//============================================================================= + +SCENARIO("Comparison", "[visitor][llvm]") { + GIVEN("Procedure with comparison operators") { + std::string nmodl_text = R"( + PROCEDURE foo(x) { + if (x < 10) { + + } else if (x >= 10 && x <= 100) { + + } else if (x == 120) { + + } else if (!(x != 200)) { + + } + } + )"; + + THEN("correct LLVM instructions are produced") { + std::string module_string = run_llvm_visitor(nmodl_text); + std::smatch m; + + // Check less than. + std::regex lt(R"(fcmp olt double %(.+), 1\.000000e\+01)"); + REQUIRE(std::regex_search(module_string, m, lt)); + + // Check greater or equal than and logical and. + std::regex ge(R"(fcmp ole double %(.+), 1\.000000e\+02)"); + std::regex logical_and(R"(and i1 %(.+), %(.+))"); + REQUIRE(std::regex_search(module_string, m, ge)); + REQUIRE(std::regex_search(module_string, m, logical_and)); + + // Check equals. + std::regex eq(R"(fcmp oeq double %(.+), 1\.200000e\+02)"); + REQUIRE(std::regex_search(module_string, m, eq)); + + // Check not equals. + std::regex ne(R"(fcmp one double %(.+), 2\.000000e\+02)"); + REQUIRE(std::regex_search(module_string, m, ne)); + } + } +} + +SCENARIO("If/Else", "[visitor][llvm]") { + GIVEN("Function with only if statement") { + std::string nmodl_text = R"( + FUNCTION foo(y) { + LOCAL x + x = 100 + if (y == 20) { + x = 20 + } + foo = x + y + } + )"; + + THEN("correct LLVM instructions are produced") { + std::string module_string = run_llvm_visitor(nmodl_text); + std::smatch m; + + std::regex cond_br( + "br i1 %2, label %3, label %4\n" + "\n" + "3:.*\n" + " store double 2\\.000000e\\+01, double\\* %x.*\n" + " br label %4\n" + "\n" + "4:"); + REQUIRE(std::regex_search(module_string, m, cond_br)); + } + } + + GIVEN("Function with both if and else statements") { + std::string nmodl_text = R"( + FUNCTION sign(x) { + LOCAL s + if (x < 0) { + s = -1 + } else { + s = 1 + } + sign = s + } + )"; + + THEN("correct LLVM instructions are produced") { + std::string module_string = run_llvm_visitor(nmodl_text); + std::smatch m; + + std::regex if_else_br( + "br i1 %2, label %3, label %4\n" + "\n" + "3:.*\n" + " store double -1\\.000000e\\+00, double\\* %s.*\n" + " br label %5\n" + "\n" + "4:.*\n" + " store double 1\\.000000e\\+00, double\\* %s.*\n" + " br label %5\n" + "\n" + "5:"); + REQUIRE(std::regex_search(module_string, m, if_else_br)); + } + } + + GIVEN("Function with both if and else if statements") { + std::string nmodl_text = R"( + FUNCTION bar(x) { + LOCAL s + s = -1 + if (x <= 0) { + s = 0 + } else if (0 < x && x <= 1) { + s = 1 + } + bar = s + } + )"; + + THEN("correct LLVM instructions are produced") { + std::string module_string = run_llvm_visitor(nmodl_text); + std::smatch m; + + std::regex if_else_if( + "br i1 %2, label %3, label %4\n" + "\n" + "3:.*\n" + " .*\n" + " br label %12\n" + "\n" + "4:.*\n" + " .*\n" + " .*\n" + " .*\n" + " .*\n" + " %.+ = and i1 %.+, %.+\n" + " br i1 %.+, label %10, label %11\n" + "\n" + "10:.*\n" + " .*\n" + " br label %11\n" + "\n" + 
"11:.*\n" + " br label %12\n" + "\n" + "12:"); + REQUIRE(std::regex_search(module_string, m, if_else_if)); + } + } + + GIVEN("Function with if, else if anf else statements") { + std::string nmodl_text = R"( + FUNCTION bar(x) { + LOCAL s + if (x <= 0) { + s = 0 + } else if (0 < x && x <= 1) { + s = 1 + } else { + s = 100 + } + bar = s + } + )"; + + THEN("correct LLVM instructions are produced") { + std::string module_string = run_llvm_visitor(nmodl_text); + std::smatch m; + + std::regex if_else_if_else( + "br i1 %2, label %3, label %4\n" + "\n" + "3:.*\n" + " .*\n" + " br label %13\n" + "\n" + "4:.*\n" + " .*\n" + " .*\n" + " .*\n" + " .*\n" + " %9 = and i1 %.+, %.+\n" + " br i1 %9, label %10, label %11\n" + "\n" + "10:.*\n" + " .*\n" + " br label %12\n" + "\n" + "11:.*\n" + " .*\n" + " br label %12\n" + "\n" + "12:.*\n" + " br label %13\n" + "\n" + "13:"); + REQUIRE(std::regex_search(module_string, m, if_else_if_else)); + } + } +} + //============================================================================= // FunctionBlock //============================================================================= From c8ea99494f3b10356680465d7389e1deab55eb7d Mon Sep 17 00:00:00 2001 From: George Mitenkov Date: Tue, 26 Jan 2021 09:27:31 +0300 Subject: [PATCH 020/331] Added error handling for values not in scope (#502) Added error handling when a non-scope value is looked up. Before, such a lookup would yield a nullptr, therefore leading to a segmentation fault. This PR adds a lookup function that wraps around value symbol lookup, and throws an error with a message if nullptr is returned. --- src/codegen/llvm/codegen_llvm_visitor.cpp | 18 +++++++++++------- src/codegen/llvm/codegen_llvm_visitor.hpp | 6 ++++++ 2 files changed, 17 insertions(+), 7 deletions(-) diff --git a/src/codegen/llvm/codegen_llvm_visitor.cpp b/src/codegen/llvm/codegen_llvm_visitor.cpp index bde36f3dd4..86619b899e 100644 --- a/src/codegen/llvm/codegen_llvm_visitor.cpp +++ b/src/codegen/llvm/codegen_llvm_visitor.cpp @@ -32,10 +32,7 @@ static bool is_supported_statement(const ast::Statement& statement) { } bool CodegenLLVMVisitor::check_array_bounds(const ast::IndexedName& node, unsigned index) { - llvm::Type* array_type = current_func->getValueSymbolTable() - ->lookup(node.get_node_name()) - ->getType() - ->getPointerElementType(); + llvm::Type* array_type = lookup(node.get_node_name())->getType()->getPointerElementType(); unsigned length = array_type->getArrayNumElements(); return 0 <= index && index < length; } @@ -46,7 +43,7 @@ llvm::Value* CodegenLLVMVisitor::create_gep(const std::string& name, unsigned in indices.push_back(llvm::ConstantInt::get(index_type, 0)); indices.push_back(llvm::ConstantInt::get(index_type, index)); - return builder.CreateInBoundsGEP(current_func->getValueSymbolTable()->lookup(name), indices); + return builder.CreateInBoundsGEP(lookup(name), indices); } llvm::Value* CodegenLLVMVisitor::codegen_indexed_name(const ast::IndexedName& node) { @@ -177,6 +174,13 @@ void CodegenLLVMVisitor::emit_procedure_or_function_declaration(const ast::Codeg *module); } +llvm::Value* CodegenLLVMVisitor::lookup(const std::string& name) { + auto val = current_func->getValueSymbolTable()->lookup(name); + if (!val) + throw std::runtime_error("Error: variable " + name + " is not in scope\n"); + return val; +} + llvm::Value* CodegenLLVMVisitor::visit_arithmetic_bin_op(llvm::Value* lhs, llvm::Value* rhs, unsigned op) { @@ -213,7 +217,7 @@ void CodegenLLVMVisitor::visit_assign_op(const ast::BinaryExpression& node, llvm const 
auto& identifier = var->get_name(); if (identifier->is_name()) { - llvm::Value* alloca = current_func->getValueSymbolTable()->lookup(var->get_node_name()); + llvm::Value* alloca = lookup(var->get_node_name()); builder.CreateStore(rhs, alloca); } else if (identifier->is_indexed_name()) { auto indexed_name = std::dynamic_pointer_cast(identifier); @@ -562,7 +566,7 @@ void CodegenLLVMVisitor::visit_var_name(const ast::VarName& node) { llvm::Value* ptr; if (identifier->is_name()) - ptr = current_func->getValueSymbolTable()->lookup(node.get_node_name()); + ptr = lookup(node.get_node_name()); if (identifier->is_indexed_name()) { auto indexed_name = std::dynamic_pointer_cast(identifier); diff --git a/src/codegen/llvm/codegen_llvm_visitor.hpp b/src/codegen/llvm/codegen_llvm_visitor.hpp index 28129b2fb8..82c0c038ca 100644 --- a/src/codegen/llvm/codegen_llvm_visitor.hpp +++ b/src/codegen/llvm/codegen_llvm_visitor.hpp @@ -180,6 +180,12 @@ class CodegenLLVMVisitor: public visitor::ConstAstVisitor { return std::move(module); } + /** + * Lookup the given name in the current function's symbol table + * \return LLVM value + */ + llvm::Value* lookup(const std::string& name); + /** * Visit nmodl arithmetic binary operator * \param lhs LLVM value of evaluated lhs expression From a32f76b2cb7dc361e094ab4787642e665f3997fa Mon Sep 17 00:00:00 2001 From: George Mitenkov Date: Tue, 26 Jan 2021 11:19:06 +0300 Subject: [PATCH 021/331] Added support for WHILE statement (#501) Added support for WHILE statement code generation. Corresponding tests for IR generation and execution were also added. Additional visitor for StatementBlock was added to reduce code duplication. fixes #500 --- src/codegen/llvm/codegen_llvm_visitor.cpp | 59 ++++++++++++++------ src/codegen/llvm/codegen_llvm_visitor.hpp | 2 + test/unit/codegen/codegen_llvm_execution.cpp | 21 +++++++ test/unit/codegen/codegen_llvm_ir.cpp | 44 +++++++++++++++ 4 files changed, 108 insertions(+), 18 deletions(-) diff --git a/src/codegen/llvm/codegen_llvm_visitor.cpp b/src/codegen/llvm/codegen_llvm_visitor.cpp index 86619b899e..831c43317a 100644 --- a/src/codegen/llvm/codegen_llvm_visitor.cpp +++ b/src/codegen/llvm/codegen_llvm_visitor.cpp @@ -28,7 +28,8 @@ namespace codegen { static bool is_supported_statement(const ast::Statement& statement) { return statement.is_codegen_var_list_statement() || statement.is_expression_statement() || - statement.is_codegen_return_statement() || statement.is_if_statement(); + statement.is_codegen_return_statement() || statement.is_if_statement() || + statement.is_while_statement(); } bool CodegenLLVMVisitor::check_array_bounds(const ast::IndexedName& node, unsigned index) { @@ -314,6 +315,14 @@ void CodegenLLVMVisitor::visit_binary_expression(const ast::BinaryExpression& no values.push_back(result); } +void CodegenLLVMVisitor::visit_statement_block(const ast::StatementBlock& node) { + const auto& statements = node.get_statements(); + for (const auto& statement: statements) { + if (is_supported_statement(*statement)) + statement->accept(*this); + } +} + void CodegenLLVMVisitor::visit_boolean(const ast::Boolean& node) { const auto& constant = llvm::ConstantInt::get(llvm::Type::getInt1Ty(*context), node.get_value()); @@ -350,11 +359,7 @@ void CodegenLLVMVisitor::visit_codegen_function(const ast::CodegenFunction& node } // Process function or procedure body. The return statement is handled in a separate visitor. 
- const auto& statements = block->get_statements(); - for (const auto& statement: statements) { - if (is_supported_statement(*statement)) - statement->accept(*this); - } + block->accept(*this); // If function has a void return type, add a terminator not handled by CodegenReturnVar. if (node.is_void()) @@ -448,10 +453,7 @@ void CodegenLLVMVisitor::visit_if_statement(const ast::IfStatement& node) { // Process the true block. builder.SetInsertPoint(true_block); - for (const auto& statement: node.get_statement_block()->get_statements()) { - if (is_supported_statement(*statement)) - statement->accept(*this); - } + node.get_statement_block()->accept(*this); builder.CreateBr(merge_block); // Save the merge block and proceed with codegen for `else if` statements. @@ -479,10 +481,7 @@ void CodegenLLVMVisitor::visit_if_statement(const ast::IfStatement& node) { // Process true block. builder.SetInsertPoint(true_block); - for (const auto& statement: else_if->get_statement_block()->get_statements()) { - if (is_supported_statement(*statement)) - statement->accept(*this); - } + else_if->get_statement_block()->accept(*this); builder.CreateBr(merge_block); curr_block = else_block; } @@ -493,10 +492,7 @@ void CodegenLLVMVisitor::visit_if_statement(const ast::IfStatement& node) { if (elses) { else_block = llvm::BasicBlock::Create(*context, /*Name=*/"", func, merge_block); builder.SetInsertPoint(else_block); - for (const auto& statement: elses->get_statement_block()->get_statements()) { - if (is_supported_statement(*statement)) - statement->accept(*this); - } + elses->get_statement_block()->accept(*this); builder.CreateBr(merge_block); } else { else_block = merge_block; @@ -578,5 +574,32 @@ void CodegenLLVMVisitor::visit_var_name(const ast::VarName& node) { values.push_back(var); } +void CodegenLLVMVisitor::visit_while_statement(const ast::WhileStatement& node) { + // Get the current and the next blocks within the function. + llvm::BasicBlock* curr_block = builder.GetInsertBlock(); + llvm::BasicBlock* next = curr_block->getNextNode(); + llvm::Function* func = curr_block->getParent(); + + // Add a header and the body blocks. + llvm::BasicBlock* header = llvm::BasicBlock::Create(*context, /*Name=*/"", func, next); + llvm::BasicBlock* body = llvm::BasicBlock::Create(*context, /*Name=*/"", func, next); + llvm::BasicBlock* exit = llvm::BasicBlock::Create(*context, /*Name=*/"", func, next); + + builder.CreateBr(header); + builder.SetInsertPoint(header); + + // Generate code for condition and create branch to the body block. 
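+    // The emitted control flow is, roughly: the current block branches to the
+    // header; the header branches to either the body or the exit block; the
+    // body branches back to the header, so the condition is re-evaluated on
+    // every iteration.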
+ node.get_condition()->accept(*this); + llvm::Value* condition = values.back(); + values.pop_back(); + builder.CreateCondBr(condition, body, exit); + + builder.SetInsertPoint(body); + node.get_statement_block()->accept(*this); + builder.CreateBr(header); + + builder.SetInsertPoint(exit); +} + } // namespace codegen } // namespace nmodl diff --git a/src/codegen/llvm/codegen_llvm_visitor.hpp b/src/codegen/llvm/codegen_llvm_visitor.hpp index 82c0c038ca..3003a119b5 100644 --- a/src/codegen/llvm/codegen_llvm_visitor.hpp +++ b/src/codegen/llvm/codegen_llvm_visitor.hpp @@ -229,6 +229,7 @@ class CodegenLLVMVisitor: public visitor::ConstAstVisitor { // Visitors void visit_binary_expression(const ast::BinaryExpression& node) override; void visit_boolean(const ast::Boolean& node) override; + void visit_statement_block(const ast::StatementBlock& node) override; void visit_codegen_function(const ast::CodegenFunction& node) override; void visit_codegen_return_statement(const ast::CodegenReturnStatement& node) override; void visit_codegen_var_list_statement(const ast::CodegenVarListStatement& node) override; @@ -241,6 +242,7 @@ class CodegenLLVMVisitor: public visitor::ConstAstVisitor { void visit_program(const ast::Program& node) override; void visit_unary_expression(const ast::UnaryExpression& node) override; void visit_var_name(const ast::VarName& node) override; + void visit_while_statement(const ast::WhileStatement& node) override; // \todo: move this to debug mode (e.g. -v option or --dump-ir) std::string print_module() const { diff --git a/test/unit/codegen/codegen_llvm_execution.cpp b/test/unit/codegen/codegen_llvm_execution.cpp index 34311bf2c3..90e8fb3cc2 100644 --- a/test/unit/codegen/codegen_llvm_execution.cpp +++ b/test/unit/codegen/codegen_llvm_execution.cpp @@ -59,6 +59,23 @@ SCENARIO("Arithmetic expression", "[llvm][runner]") { } PROCEDURE foo() {} + + FUNCTION loop() { + LOCAL i, j, sum, result + result = 0 + j = 0 + WHILE (j < 2) { + i = 0 + sum = 0 + WHILE (i < 10) { + sum = sum + i + i = i + 1 + } + j = j + 1 + result = result + sum + } + loop = result + } )"; @@ -86,6 +103,9 @@ SCENARIO("Arithmetic expression", "[llvm][runner]") { auto function_call_result = runner.run("function_call"); REQUIRE(fabs(function_call_result - 1.0) < EPSILON); + + auto loop_result = runner.run("loop"); + REQUIRE(fabs(loop_result - 90.0) < EPSILON); } } } @@ -151,6 +171,7 @@ SCENARIO("Optimised arithmetic expression", "[llvm][runner]") { } PROCEDURE foo() {} + )"; diff --git a/test/unit/codegen/codegen_llvm_ir.cpp b/test/unit/codegen/codegen_llvm_ir.cpp index 292256193c..d16b02b2f5 100644 --- a/test/unit/codegen/codegen_llvm_ir.cpp +++ b/test/unit/codegen/codegen_llvm_ir.cpp @@ -690,6 +690,50 @@ SCENARIO("Unary expression", "[visitor][llvm]") { } } +//============================================================================= +// WhileStatement +//============================================================================= + +SCENARIO("While", "[visitor][llvm]") { + GIVEN("Procedure with a simple while loop") { + std::string nmodl_text = R"( + FUNCTION loop() { + LOCAL i + i = 0 + WHILE (i < 10) { + i = i + 1 + } + loop = 0 + } + )"; + + THEN("correct loop is created") { + std::string module_string = run_llvm_visitor(nmodl_text); + std::smatch m; + + std::regex loop( + " br label %1\n" + "\n" + "1:.*\n" + " %2 = load double, double\\* %i.*\n" + " %3 = fcmp olt double %2, 1\\.000000e\\+01\n" + " br i1 %3, label %4, label %7\n" + "\n" + "4:.*\n" + " %5 = load double, double\\* %i.*\n" + " %6 = fadd 
double %5, 1\\.000000e\\+00\n"
+                "  store double %6, double\\* %i.*\n"
+                "  br label %1\n"
+                "\n"
+                "7:.*\n"
+                "  store double 0\\.000000e\\+00, double\\* %ret_loop.*\n");
+            // Check that 3 blocks are created: header, body and exit blocks. Also, there must be
+            // a backedge from the body to the header.
+            REQUIRE(std::regex_search(module_string, m, loop));
+        }
+    }
+}
+
 //=============================================================================
 // Optimization : dead code removal
 //=============================================================================

From bfaff728445c684af21f3e7d245cceee97320025 Mon Sep 17 00:00:00 2001
From: Ioannis Magkanaris
Date: Mon, 1 Feb 2021 22:01:33 +0100
Subject: [PATCH 022/331] Create mechanism instance struct in LLVM IR (#507)

* Moved info-related functions to codegen_info

- Moved get_float_variables, codegen_int_variables, codegen_global_variables,
  codegen_shadow_variables into CodegenHelper
- Moved small utility functions from CodegenCVisitor to codegen_utils

* Added proper variables to the mech_Instance

* Added LLVMStructBlock

* Added test and visitor

* Fixed llvm codegen tests with x[0-9].*
---
 src/codegen/codegen_c_visitor.cpp             | 292 ++----------------
 src/codegen/codegen_c_visitor.hpp             | 123 --------
 src/codegen/codegen_helper_visitor.cpp        |   5 +
 src/codegen/codegen_helper_visitor.hpp        |  10 +
 src/codegen/codegen_info.cpp                  | 197 ++++++++++++
 src/codegen/codegen_info.hpp                  | 135 ++++++++
 src/codegen/codegen_ispc_visitor.cpp          |  18 +-
 .../llvm/codegen_llvm_helper_visitor.cpp      |  21 ++
 .../llvm/codegen_llvm_helper_visitor.hpp      |   3 +
 src/codegen/llvm/codegen_llvm_visitor.cpp     |  17 +
 src/codegen/llvm/codegen_llvm_visitor.hpp     |  10 +
 src/language/code_generator.cmake             |   1 +
 src/language/nmodl.yaml                       |  12 +
 test/unit/codegen/codegen_llvm_ir.cpp         |  46 ++-
 14 files changed, 494 insertions(+), 396 deletions(-)

diff --git a/src/codegen/codegen_c_visitor.cpp b/src/codegen/codegen_c_visitor.cpp
index 63de87807f..49ec6436de 100644
--- a/src/codegen/codegen_c_visitor.cpp
+++ b/src/codegen/codegen_c_visitor.cpp
@@ -348,49 +348,6 @@ bool CodegenCVisitor::statement_to_skip(const Statement& node) const {
 }


-bool CodegenCVisitor::net_send_buffer_required() const noexcept {
-    if (net_receive_required() && !info.artificial_cell) {
-        if (info.net_event_used || info.net_send_used || info.is_watch_used()) {
-            return true;
-        }
-    }
-    return false;
-}
-
-
-bool CodegenCVisitor::net_receive_buffering_required() const noexcept {
-    return info.point_process && !info.artificial_cell && info.net_receive_node != nullptr;
-}
-
-
-bool CodegenCVisitor::nrn_state_required() const noexcept {
-    if (info.artificial_cell) {
-        return false;
-    }
-    return info.nrn_state_block != nullptr || info.currents.empty();
-}
-
-
-bool CodegenCVisitor::nrn_cur_required() const noexcept {
-    return info.breakpoint_node != nullptr && !info.currents.empty();
-}
-
-
-bool CodegenCVisitor::net_receive_exist() const noexcept {
-    return info.net_receive_node != nullptr;
-}
-
-
-bool CodegenCVisitor::breakpoint_exist() const noexcept {
-    return info.breakpoint_node != nullptr;
-}
-
-
-bool CodegenCVisitor::net_receive_required() const noexcept {
-    return net_receive_exist();
-}
-
-
 /**
  * \details When floating point data type is not default (i.e. double) then we
 * have to copy old array to new type (for range variables). 
@@ -415,7 +372,7 @@ bool CodegenCVisitor::state_variable(const std::string& name) const { int CodegenCVisitor::position_of_float_var(const std::string& name) const { int index = 0; - for (const auto& var: codegen_float_variables) { + for (const auto& var: info.codegen_float_variables) { if (var->get_name() == name) { return index; } @@ -427,7 +384,7 @@ int CodegenCVisitor::position_of_float_var(const std::string& name) const { int CodegenCVisitor::position_of_int_var(const std::string& name) const { int index = 0; - for (const auto& var: codegen_int_variables) { + for (const auto& var: info.codegen_int_variables) { if (var.symbol->get_name() == name) { return index; } @@ -546,11 +503,11 @@ int CodegenCVisitor::float_variables_size() const { float_size++; } /// for g_unused variable - if (breakpoint_exist()) { + if (info.breakpoint_exist()) { float_size++; } /// for tsave variable - if (net_receive_exist()) { + if (info.net_receive_exist()) { float_size++; } return float_size; @@ -810,186 +767,6 @@ void CodegenCVisitor::update_index_semantics() { } -std::vector CodegenCVisitor::get_float_variables() { - // sort with definition order - auto comparator = [](const SymbolType& first, const SymbolType& second) -> bool { - return first->get_definition_order() < second->get_definition_order(); - }; - - auto assigned = info.assigned_vars; - auto states = info.state_vars; - - // each state variable has corresponding Dstate variable - for (auto& state: states) { - auto name = "D" + state->get_name(); - auto symbol = make_symbol(name); - if (state->is_array()) { - symbol->set_as_array(state->get_length()); - } - symbol->set_definition_order(state->get_definition_order()); - assigned.push_back(symbol); - } - std::sort(assigned.begin(), assigned.end(), comparator); - - auto variables = info.range_parameter_vars; - variables.insert(variables.end(), - info.range_assigned_vars.begin(), - info.range_assigned_vars.end()); - variables.insert(variables.end(), info.range_state_vars.begin(), info.range_state_vars.end()); - variables.insert(variables.end(), assigned.begin(), assigned.end()); - - if (info.vectorize) { - variables.push_back(make_symbol(naming::VOLTAGE_UNUSED_VARIABLE)); - } - if (breakpoint_exist()) { - std::string name = info.vectorize ? naming::CONDUCTANCE_UNUSED_VARIABLE - : naming::CONDUCTANCE_VARIABLE; - variables.push_back(make_symbol(name)); - } - if (net_receive_exist()) { - variables.push_back(make_symbol(naming::T_SAVE_VARIABLE)); - } - return variables; -} - - -/** - * IndexVariableInfo has following constructor arguments: - * - symbol - * - is_vdata (false) - * - is_index (false - * - is_integer (false) - * - * Which variables are constant qualified? - * - * - node area is read only - * - read ion variables are read only - * - style_ionname is index / offset - */ -std::vector CodegenCVisitor::get_int_variables() { - std::vector variables; - if (info.point_process) { - variables.emplace_back(make_symbol(naming::NODE_AREA_VARIABLE)); - variables.back().is_constant = true; - /// note that this variable is not printed in neuron implementation - if (info.artificial_cell) { - variables.emplace_back(make_symbol(naming::POINT_PROCESS_VARIABLE), true); - } else { - variables.emplace_back(make_symbol(naming::POINT_PROCESS_VARIABLE), false, false, true); - variables.back().is_constant = true; - } - } - - for (const auto& ion: info.ions) { - bool need_style = false; - std::unordered_map ion_vars; // used to keep track of the variables to - // not have doubles between read/write. 
Same - // name variables are allowed - for (const auto& var: ion.reads) { - const std::string name = "ion_" + var; - variables.emplace_back(make_symbol(name)); - variables.back().is_constant = true; - ion_vars[name] = variables.size() - 1; - } - - /// symbol for di_ion_dv var - std::shared_ptr ion_di_dv_var = nullptr; - - for (const auto& var: ion.writes) { - const std::string name = "ion_" + var; - - const auto ion_vars_it = ion_vars.find(name); - if (ion_vars_it != ion_vars.end()) { - variables[ion_vars_it->second].is_constant = false; - } else { - variables.emplace_back(make_symbol("ion_" + var)); - } - if (ion.is_ionic_current(var)) { - ion_di_dv_var = make_symbol("ion_di" + ion.name + "dv"); - } - if (ion.is_intra_cell_conc(var) || ion.is_extra_cell_conc(var)) { - need_style = true; - } - } - - /// insert after read/write variables but before style ion variable - if (ion_di_dv_var != nullptr) { - variables.emplace_back(ion_di_dv_var); - } - - if (need_style) { - variables.emplace_back(make_symbol("style_" + ion.name), false, true); - variables.back().is_constant = true; - } - } - - for (const auto& var: info.pointer_variables) { - auto name = var->get_name(); - if (var->has_any_property(NmodlType::pointer_var)) { - variables.emplace_back(make_symbol(name)); - } else { - variables.emplace_back(make_symbol(name), true); - } - } - - if (info.diam_used) { - variables.emplace_back(make_symbol(naming::DIAM_VARIABLE)); - } - - if (info.area_used) { - variables.emplace_back(make_symbol(naming::AREA_VARIABLE)); - } - - // for non-artificial cell, when net_receive buffering is enabled - // then tqitem is an offset - if (info.net_send_used) { - if (info.artificial_cell) { - variables.emplace_back(make_symbol(naming::TQITEM_VARIABLE), true); - } else { - variables.emplace_back(make_symbol(naming::TQITEM_VARIABLE), false, false, true); - variables.back().is_constant = true; - } - info.tqitem_index = variables.size() - 1; - } - - /** - * \note Variables for watch statements : there is one extra variable - * used in coreneuron compared to actual watch statements for compatibility - * with neuron (which uses one extra Datum variable) - */ - if (!info.watch_statements.empty()) { - for (int i = 0; i < info.watch_statements.size() + 1; i++) { - variables.emplace_back(make_symbol("watch{}"_format(i)), false, false, true); - } - } - return variables; -} - - -/** - * \details When we enable fine level parallelism at channel level, we have do updates - * to ion variables in atomic way. As cpus don't have atomic instructions in - * simd loop, we have to use shadow vectors for every ion variables. Here - * we return list of all such variables. 
- * - * \todo If conductances are specified, we don't need all below variables - */ -std::vector CodegenCVisitor::get_shadow_variables() { - std::vector variables; - for (const auto& ion: info.ions) { - for (const auto& var: ion.writes) { - variables.push_back({make_symbol(shadow_varname("ion_" + var))}); - if (ion.is_ionic_current(var)) { - variables.push_back({make_symbol(shadow_varname("ion_di" + ion.name + "dv"))}); - } - } - } - variables.push_back({make_symbol("ml_rhs")}); - variables.push_back({make_symbol("ml_d")}); - return variables; -} - - /****************************************************************************************/ /* Routines must be overloaded in backend */ /****************************************************************************************/ @@ -1078,7 +855,7 @@ bool CodegenCVisitor::nrn_cur_reduction_loop_required() { bool CodegenCVisitor::shadow_vector_setup_required() { - return (channel_task_dependency_enabled() && !codegen_shadow_variables.empty()); + return (channel_task_dependency_enabled() && !info.codegen_shadow_variables.empty()); } @@ -1933,8 +1710,8 @@ std::string CodegenCVisitor::process_verbatim_text(std::string text) { std::string CodegenCVisitor::register_mechanism_arguments() const { - auto nrn_cur = nrn_cur_required() ? method_name(naming::NRN_CUR_METHOD) : "NULL"; - auto nrn_state = nrn_state_required() ? method_name(naming::NRN_STATE_METHOD) : "NULL"; + auto nrn_cur = info.nrn_cur_required() ? method_name(naming::NRN_CUR_METHOD) : "NULL"; + auto nrn_state = info.nrn_state_required() ? method_name(naming::NRN_STATE_METHOD) : "NULL"; auto nrn_alloc = method_name(naming::NRN_ALLOC_METHOD); auto nrn_init = method_name(naming::NRN_INIT_METHOD); return "mechanism, {}, {}, NULL, {}, {}, first_pointer_var_index()" @@ -2052,7 +1829,7 @@ void CodegenCVisitor::print_num_variable_getter() { void CodegenCVisitor::print_net_receive_arg_size_getter() { - if (!net_receive_exist()) { + if (!info.net_receive_exist()) { return; } printer->add_newline(2); @@ -2245,17 +2022,18 @@ std::string CodegenCVisitor::get_variable_name(const std::string& name, bool use // clang-format on // float variable - auto f = std::find_if(codegen_float_variables.begin(), - codegen_float_variables.end(), + auto f = std::find_if(info.codegen_float_variables.begin(), + info.codegen_float_variables.end(), symbol_comparator); - if (f != codegen_float_variables.end()) { + if (f != info.codegen_float_variables.end()) { return float_variable_name(*f, use_instance); } // integer variable - auto i = - std::find_if(codegen_int_variables.begin(), codegen_int_variables.end(), index_comparator); - if (i != codegen_int_variables.end()) { + auto i = std::find_if(info.codegen_int_variables.begin(), + info.codegen_int_variables.end(), + index_comparator); + if (i != info.codegen_int_variables.end()) { return int_variable_name(*i, varname, use_instance); } @@ -2268,10 +2046,10 @@ std::string CodegenCVisitor::get_variable_name(const std::string& name, bool use } // shadow variable - auto s = std::find_if(codegen_shadow_variables.begin(), - codegen_shadow_variables.end(), + auto s = std::find_if(info.codegen_shadow_variables.begin(), + info.codegen_shadow_variables.end(), symbol_comparator); - if (s != codegen_shadow_variables.end()) { + if (s != info.codegen_shadow_variables.end()) { return ion_shadow_variable_name(*s); } @@ -2700,7 +2478,7 @@ void CodegenCVisitor::print_mechanism_register() { if (info.artificial_cell) { printer->add_line("add_nrn_artcell(mech_type, 
{});"_format(info.tqitem_index)); } - if (net_receive_buffering_required()) { + if (info.net_receive_buffering_required()) { printer->add_line("hoc_register_net_receive_buffering({}, mech_type);"_format( method_name("net_buf_receive"))); } @@ -2801,13 +2579,13 @@ void CodegenCVisitor::print_mechanism_range_var_structure() { printer->add_newline(2); printer->add_line("/** all mechanism instance variables */"); printer->start_block("struct {} "_format(instance_struct())); - for (auto& var: codegen_float_variables) { + for (auto& var: info.codegen_float_variables) { auto name = var->get_name(); auto type = get_range_var_float_type(var); auto qualifier = is_constant_variable(name) ? k_const() : ""; printer->add_line("{}{}* {}{};"_format(qualifier, type, ptr_type_qualifier(), name)); } - for (auto& var: codegen_int_variables) { + for (auto& var: info.codegen_int_variables) { auto name = var.symbol->get_name(); if (var.is_index || var.is_integer) { auto qualifier = var.is_constant ? k_const() : ""; @@ -2820,7 +2598,7 @@ void CodegenCVisitor::print_mechanism_range_var_structure() { } } if (channel_task_dependency_enabled()) { - for (auto& var: codegen_shadow_variables) { + for (auto& var: info.codegen_shadow_variables) { auto name = var->get_name(); printer->add_line("{}* {}{};"_format(float_type, ptr_type_qualifier(), name)); } @@ -3029,7 +2807,7 @@ void CodegenCVisitor::print_shadow_vector_setup() { printer->start_block("static inline void setup_shadow_vectors({}) "_format(args)); if (channel_task_dependency_enabled()) { printer->add_line("int nodecount = ml->nodecount;"); - for (auto& var: codegen_shadow_variables) { + for (auto& var: info.codegen_shadow_variables) { auto name = var->get_name(); auto type = default_float_data_type(); auto allocation = "({0}*) mem_alloc(nodecount, sizeof({0}))"_format(type); @@ -3042,7 +2820,7 @@ void CodegenCVisitor::print_shadow_vector_setup() { args = "{}* inst"_format(instance_struct()); printer->start_block("static inline void free_shadow_vectors({}) "_format(args)); if (channel_task_dependency_enabled()) { - for (auto& var: codegen_shadow_variables) { + for (auto& var: info.codegen_shadow_variables) { auto name = var->get_name(); printer->add_line("mem_free(inst->{});"_format(name)); } @@ -3109,7 +2887,7 @@ void CodegenCVisitor::print_instance_variable_setup() { printer->add_line("/** initialize mechanism instance variables */"); printer->start_block("static inline void setup_instance(NrnThread* nt, Memb_list* ml) "); printer->add_line("{0}* inst = ({0}*) mem_alloc(1, sizeof({0}));"_format(instance_struct())); - if (channel_task_dependency_enabled() && !codegen_shadow_variables.empty()) { + if (channel_task_dependency_enabled() && !info.codegen_shadow_variables.empty()) { printer->add_line("setup_shadow_vectors(inst, ml);"); } @@ -3127,7 +2905,7 @@ void CodegenCVisitor::print_instance_variable_setup() { int id = 0; std::vector variables_to_free; - for (auto& var: codegen_float_variables) { + for (auto& var: info.codegen_float_variables) { auto name = var->get_name(); auto range_var_type = get_range_var_float_type(var); if (float_type == range_var_type) { @@ -3142,7 +2920,7 @@ void CodegenCVisitor::print_instance_variable_setup() { id += var->get_length(); } - for (auto& var: codegen_int_variables) { + for (auto& var: info.codegen_int_variables) { auto name = var.symbol->get_name(); std::string variable = name; std::string type = ""; @@ -3681,7 +3459,7 @@ void CodegenCVisitor::print_net_receive_loop_end() { void 
CodegenCVisitor::print_net_receive_buffering(bool need_mech_inst) { - if (!net_receive_required() || info.artificial_cell) { + if (!info.net_receive_required() || info.artificial_cell) { return; } printer->add_newline(2); @@ -3730,7 +3508,7 @@ void CodegenCVisitor::print_net_send_buffering_grow() { } void CodegenCVisitor::print_net_send_buffering() { - if (!net_send_buffer_required()) { + if (!info.net_send_buffer_required()) { return; } @@ -3796,7 +3574,7 @@ void CodegenCVisitor::visit_for_netcon(const ast::ForNetcon& node) { } void CodegenCVisitor::print_net_receive_kernel() { - if (!net_receive_required()) { + if (!info.net_receive_required()) { return; } codegen = true; @@ -3859,7 +3637,7 @@ void CodegenCVisitor::print_net_receive_kernel() { void CodegenCVisitor::print_net_receive() { - if (!net_receive_required()) { + if (!info.net_receive_required()) { return; } codegen = true; @@ -4007,7 +3785,7 @@ void CodegenCVisitor::visit_solution_expression(const SolutionExpression& node) void CodegenCVisitor::print_nrn_state() { - if (!nrn_state_required()) { + if (!info.nrn_state_required()) { return; } codegen = true; @@ -4217,7 +3995,7 @@ void CodegenCVisitor::print_fast_imem_calculation() { } void CodegenCVisitor::print_nrn_cur() { - if (!nrn_cur_required()) { + if (!info.nrn_cur_required()) { return; } @@ -4365,10 +4143,6 @@ void CodegenCVisitor::setup(const Program& node) { logger->warn("CodegenCVisitor : MOD file uses non-thread safe constructs of NMODL"); } - codegen_float_variables = get_float_variables(); - codegen_int_variables = get_int_variables(); - codegen_shadow_variables = get_shadow_variables(); - update_index_semantics(); rename_function_arguments(); } diff --git a/src/codegen/codegen_c_visitor.hpp b/src/codegen/codegen_c_visitor.hpp index 7b3ad57e7f..64f4477eeb 100644 --- a/src/codegen/codegen_c_visitor.hpp +++ b/src/codegen/codegen_c_visitor.hpp @@ -65,41 +65,6 @@ enum class MemberType { thread }; - -/** - * \class IndexVariableInfo - * \brief Helper to represent information about index/int variables - * - */ -struct IndexVariableInfo { - /// symbol for the variable - const std::shared_ptr symbol; - - /// if variable reside in vdata field of NrnThread - /// typically true for bbcore pointer - bool is_vdata = false; - - /// if this is pure index (e.g. style_ion) variables is directly - /// index and shouldn't be printed with data/vdata - bool is_index = false; - - /// if this is an integer (e.g. 
tqitem, point_process) variable which - /// is printed as array accesses - bool is_integer = false; - - /// if the variable is qualified as constant (this is property of IndexVariable) - bool is_constant = false; - - IndexVariableInfo(std::shared_ptr symbol, - bool is_vdata = false, - bool is_index = false, - bool is_integer = false) - : symbol(std::move(symbol)) - , is_vdata(is_vdata) - , is_index(is_index) - , is_integer(is_integer) {} -}; - /** @} */ // end of codegen_details @@ -163,11 +128,6 @@ class CodegenCVisitor: public visitor::ConstAstVisitor { */ symtab::SymbolTable* program_symtab = nullptr; - /** - * All float variables for the model - */ - std::vector codegen_float_variables; - /** * All int variables for the model */ @@ -356,26 +316,6 @@ class CodegenCVisitor: public visitor::ConstAstVisitor { } - /** - * Constructs a shadow variable name - * \param name The name of the variable - * \return The name of the variable prefixed with \c shadow_ - */ - std::string shadow_varname(const std::string& name) const { - return "shadow_" + name; - } - - - /** - * Creates a temporary symbol - * \param name The name of the symbol - * \return A symbol based on the given name - */ - SymbolType make_symbol(const std::string& name) const { - return std::make_shared(name, ModToken()); - } - - /** * Checks if the given variable name belongs to a state variable * \param name The variable name @@ -384,36 +324,6 @@ class CodegenCVisitor: public visitor::ConstAstVisitor { bool state_variable(const std::string& name) const; - /** - * Check if net receive/send buffering kernels required - */ - bool net_receive_buffering_required() const noexcept; - - - /** - * Check if nrn_state function is required - */ - bool nrn_state_required() const noexcept; - - - /** - * Check if nrn_cur function is required - */ - bool nrn_cur_required() const noexcept; - - - /** - * Check if net_receive function is required - */ - bool net_receive_required() const noexcept; - - - /** - * Check if net_send_buffer is required - */ - bool net_send_buffer_required() const noexcept; - - /** * Check if setup_range_variable function is required * \return @@ -421,18 +331,6 @@ class CodegenCVisitor: public visitor::ConstAstVisitor { bool range_variable_setup_required() const noexcept; - /** - * Check if net_receive node exist - */ - bool net_receive_exist() const noexcept; - - - /** - * Check if breakpoint node exist - */ - bool breakpoint_exist() const noexcept; - - /** * Check if given method is defined in this model * \param name The name of the method to check @@ -598,27 +496,6 @@ class CodegenCVisitor: public visitor::ConstAstVisitor { void update_index_semantics(); - /** - * Determine all \c float variables required during code generation - * \return A \c vector of \c float variables - */ - std::vector get_float_variables(); - - - /** - * Determine all \c int variables required during code generation - * \return A \c vector of \c int variables - */ - std::vector get_int_variables(); - - - /** - * Determine all ion write variables that require shadow vectors during code generation - * \return A \c vector of ion variables - */ - std::vector get_shadow_variables(); - - /** * Print the items in a vector as a list * diff --git a/src/codegen/codegen_helper_visitor.cpp b/src/codegen/codegen_helper_visitor.cpp index 38e5c3c1e0..9c4944a23e 100644 --- a/src/codegen/codegen_helper_visitor.cpp +++ b/src/codegen/codegen_helper_visitor.cpp @@ -22,6 +22,7 @@ using namespace ast; using symtab::syminfo::NmodlType; using symtab::syminfo::Status; 
+ /** * How symbols are stored in NEURON? See notes written in markdown file. * @@ -273,6 +274,7 @@ void CodegenHelperVisitor::find_non_range_variables() { // clang-format on } + /** * Find range variables i.e. ones that are belong to per instance allocation * @@ -664,6 +666,9 @@ void CodegenHelperVisitor::visit_program(const ast::Program& node) { find_range_variables(); find_non_range_variables(); find_table_variables(); + info.get_int_variables(); + info.get_shadow_variables(); + info.get_float_variables(); } diff --git a/src/codegen/codegen_helper_visitor.hpp b/src/codegen/codegen_helper_visitor.hpp index 4f32d1cef8..a6fd10a16b 100644 --- a/src/codegen/codegen_helper_visitor.hpp +++ b/src/codegen/codegen_helper_visitor.hpp @@ -75,6 +75,16 @@ class CodegenHelperVisitor: public visitor::ConstAstVisitor { void find_non_range_variables(); void sort_with_mod2c_symbol_order(std::vector& symbols) const; + /** + * Check if breakpoint node exist + */ + bool breakpoint_exist() const noexcept; + + /** + * Check if net_receive node exist + */ + bool net_receive_exist() const noexcept; + public: CodegenHelperVisitor() = default; diff --git a/src/codegen/codegen_info.cpp b/src/codegen/codegen_info.cpp index 2219a18913..26696fbc18 100644 --- a/src/codegen/codegen_info.cpp +++ b/src/codegen/codegen_info.cpp @@ -20,6 +20,16 @@ using namespace fmt::literals; using symtab::syminfo::NmodlType; using visitor::VarUsageVisitor; +SymbolType make_symbol(const std::string& name) { + return std::make_shared(name, ModToken()); +} + + +std::string shadow_varname(const std::string& name) { + return "shadow_" + name; +} + + /// if any ion has write variable bool CodegenInfo::ion_has_write_variable() const { for (const auto& ion: ions) { @@ -205,5 +215,192 @@ bool CodegenInfo::is_an_instance_variable(const std::string& varname) const { return false; } + +/** + * IndexVariableInfo has following constructor arguments: + * - symbol + * - is_vdata (false) + * - is_index (false + * - is_integer (false) + * + * Which variables are constant qualified? + * + * - node area is read only + * - read ion variables are read only + * - style_ionname is index / offset + */ +void CodegenInfo::get_int_variables() { + if (point_process) { + codegen_int_variables.emplace_back(make_symbol(naming::NODE_AREA_VARIABLE)); + codegen_int_variables.back().is_constant = true; + /// note that this variable is not printed in neuron implementation + if (artificial_cell) { + codegen_int_variables.emplace_back(make_symbol(naming::POINT_PROCESS_VARIABLE), true); + } else { + codegen_int_variables.emplace_back(make_symbol(naming::POINT_PROCESS_VARIABLE), + false, + false, + true); + codegen_int_variables.back().is_constant = true; + } + } + + for (const auto& ion: ions) { + bool need_style = false; + std::unordered_map ion_vars; // used to keep track of the variables to + // not have doubles between read/write. 
Same + // name variables are allowed + for (const auto& var: ion.reads) { + const std::string name = "ion_" + var; + codegen_int_variables.emplace_back(make_symbol(name)); + codegen_int_variables.back().is_constant = true; + ion_vars[name] = codegen_int_variables.size() - 1; + } + + /// symbol for di_ion_dv var + std::shared_ptr ion_di_dv_var = nullptr; + + for (const auto& var: ion.writes) { + const std::string name = "ion_" + var; + + const auto ion_vars_it = ion_vars.find(name); + if (ion_vars_it != ion_vars.end()) { + codegen_int_variables[ion_vars_it->second].is_constant = false; + } else { + codegen_int_variables.emplace_back(make_symbol("ion_" + var)); + } + if (ion.is_ionic_current(var)) { + ion_di_dv_var = make_symbol("ion_di" + ion.name + "dv"); + } + if (ion.is_intra_cell_conc(var) || ion.is_extra_cell_conc(var)) { + need_style = true; + } + } + + /// insert after read/write variables but before style ion variable + if (ion_di_dv_var != nullptr) { + codegen_int_variables.emplace_back(ion_di_dv_var); + } + + if (need_style) { + codegen_int_variables.emplace_back(make_symbol("style_" + ion.name), false, true); + codegen_int_variables.back().is_constant = true; + } + } + + for (const auto& var: pointer_variables) { + auto name = var->get_name(); + if (var->has_any_property(NmodlType::pointer_var)) { + codegen_int_variables.emplace_back(make_symbol(name)); + } else { + codegen_int_variables.emplace_back(make_symbol(name), true); + } + } + + if (diam_used) { + codegen_int_variables.emplace_back(make_symbol(naming::DIAM_VARIABLE)); + } + + if (area_used) { + codegen_int_variables.emplace_back(make_symbol(naming::AREA_VARIABLE)); + } + + // for non-artificial cell, when net_receive buffering is enabled + // then tqitem is an offset + if (net_send_used) { + if (artificial_cell) { + codegen_int_variables.emplace_back(make_symbol(naming::TQITEM_VARIABLE), true); + } else { + codegen_int_variables.emplace_back(make_symbol(naming::TQITEM_VARIABLE), + false, + false, + true); + codegen_int_variables.back().is_constant = true; + } + tqitem_index = codegen_int_variables.size() - 1; + } + + /** + * \note Variables for watch statements : there is one extra variable + * used in coreneuron compared to actual watch statements for compatibility + * with neuron (which uses one extra Datum variable) + */ + if (!watch_statements.empty()) { + for (int i = 0; i < watch_statements.size() + 1; i++) { + codegen_int_variables.emplace_back(make_symbol("watch{}"_format(i)), + false, + false, + true); + } + } +} + + +/** + * \details When we enable fine level parallelism at channel level, we have do updates + * to ion variables in atomic way. As cpus don't have atomic instructions in + * simd loop, we have to use shadow vectors for every ion variables. Here + * we return list of all such variables. 
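+ * (For example, two mechanism instances in the same compartment may both
+ * write ina; accumulating into shadow vectors avoids that write conflict.)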
+ * + * \todo If conductances are specified, we don't need all below variables + */ +void CodegenInfo::get_shadow_variables() { + for (const auto& ion: ions) { + for (const auto& var: ion.writes) { + codegen_shadow_variables.push_back({make_symbol(shadow_varname("ion_" + var))}); + if (ion.is_ionic_current(var)) { + codegen_shadow_variables.push_back( + {make_symbol(shadow_varname("ion_di" + ion.name + "dv"))}); + } + } + } + codegen_shadow_variables.push_back({make_symbol("ml_rhs")}); + codegen_shadow_variables.push_back({make_symbol("ml_d")}); +} + + +void CodegenInfo::get_float_variables() { + // sort with definition order + auto comparator = [](const SymbolType& first, const SymbolType& second) -> bool { + return first->get_definition_order() < second->get_definition_order(); + }; + + auto assigned = assigned_vars; + auto states = state_vars; + + // each state variable has corresponding Dstate variable + for (auto& state: states) { + auto name = "D" + state->get_name(); + auto symbol = make_symbol(name); + if (state->is_array()) { + symbol->set_as_array(state->get_length()); + } + symbol->set_definition_order(state->get_definition_order()); + assigned.push_back(symbol); + } + std::sort(assigned.begin(), assigned.end(), comparator); + + codegen_float_variables = range_parameter_vars; + codegen_float_variables.insert(codegen_float_variables.end(), + range_assigned_vars.begin(), + range_assigned_vars.end()); + codegen_float_variables.insert(codegen_float_variables.end(), + range_state_vars.begin(), + range_state_vars.end()); + codegen_float_variables.insert(codegen_float_variables.end(), assigned.begin(), assigned.end()); + + if (vectorize) { + codegen_float_variables.push_back(make_symbol(naming::VOLTAGE_UNUSED_VARIABLE)); + } + if (breakpoint_exist()) { + std::string name = vectorize ? naming::CONDUCTANCE_UNUSED_VARIABLE + : naming::CONDUCTANCE_VARIABLE; + codegen_float_variables.push_back(make_symbol(name)); + } + if (net_receive_exist()) { + codegen_float_variables.push_back(make_symbol(naming::T_SAVE_VARIABLE)); + } +} + } // namespace codegen } // namespace nmodl diff --git a/src/codegen/codegen_info.hpp b/src/codegen/codegen_info.hpp index 3298391674..b0a41583b5 100644 --- a/src/codegen/codegen_info.hpp +++ b/src/codegen/codegen_info.hpp @@ -21,6 +21,56 @@ namespace nmodl { namespace codegen { +using SymbolType = std::shared_ptr; + +/** + * Creates a temporary symbol + * \param name The name of the symbol + * \return A symbol based on the given name + */ +SymbolType make_symbol(const std::string& name); + +/** + * Constructs a shadow variable name + * \param name The name of the variable + * \return The name of the variable prefixed with \c shadow_ + */ +std::string shadow_varname(const std::string& name); + +/** + * \class IndexVariableInfo + * \brief Helper to represent information about index/int variables + * + */ +struct IndexVariableInfo { + /// symbol for the variable + const std::shared_ptr symbol; + + /// if variable reside in vdata field of NrnThread + /// typically true for bbcore pointer + bool is_vdata = false; + + /// if this is pure index (e.g. style_ion) variables is directly + /// index and shouldn't be printed with data/vdata + bool is_index = false; + + /// if this is an integer (e.g. 
tqitem, point_process) variable which + /// is printed as array accesses + bool is_integer = false; + + /// if the variable is qualified as constant (this is property of IndexVariable) + bool is_constant = false; + + IndexVariableInfo(std::shared_ptr symbol, + bool is_vdata = false, + bool is_index = false, + bool is_integer = false) + : symbol(std::move(symbol)) + , is_vdata(is_vdata) + , is_index(is_index) + , is_integer(is_integer) {} +}; + /** * @addtogroup codegen_details * @{ @@ -374,6 +424,15 @@ struct CodegenInfo { /// new one used in print_ion_types std::vector use_ion_variables; + /// all int variables for the model + std::vector codegen_int_variables; + + /// all ion variables that could be possibly written + std::vector codegen_shadow_variables; + + /// all float variables for the model + std::vector codegen_float_variables; + /// this is the order in which they appear in derivative block /// this is required while printing them in initlist function std::vector prime_variables_by_order; @@ -452,6 +511,64 @@ struct CodegenInfo { /// true if WatchStatement uses voltage v variable bool is_voltage_used_by_watch_statements() const; + /** + * Check if net_send_buffer is required + */ + bool net_send_buffer_required() const noexcept { + if (net_receive_required() && !artificial_cell) { + if (net_event_used || net_send_used || is_watch_used()) { + return true; + } + } + return false; + } + + /** + * Check if net receive/send buffering kernels required + */ + bool net_receive_buffering_required() const noexcept { + return point_process && !artificial_cell && net_receive_node != nullptr; + } + + /** + * Check if nrn_state function is required + */ + bool nrn_state_required() const noexcept { + if (artificial_cell) { + return false; + } + return nrn_state_block != nullptr || currents.empty(); + } + + /** + * Check if nrn_cur function is required + */ + bool nrn_cur_required() const noexcept { + return breakpoint_node != nullptr && !currents.empty(); + } + + /** + * Check if net_receive node exist + */ + bool net_receive_exist() const noexcept { + return net_receive_node != nullptr; + } + + /** + * Check if breakpoint node exist + */ + bool breakpoint_exist() const noexcept { + return breakpoint_node != nullptr; + } + + + /** + * Check if net_receive function is required + */ + bool net_receive_required() const noexcept { + return net_receive_exist(); + } + /** * Checks if the given variable name belongs to a state variable * \param name The variable name @@ -494,6 +611,24 @@ struct CodegenInfo { /// if we need a call back to wrote_conc in neuron/coreneuron bool require_wrote_conc = false; + + /** + * Determine all \c int variables required during code generation + * \return A \c vector of \c int variables + */ + void get_int_variables(); + + /** + * Determine all ion write variables that require shadow vectors during code generation + * \return A \c vector of ion variables + */ + void get_shadow_variables(); + + /** + * Determine all \c float variables required during code generation + * \return A \c vector of \c float variables + */ + void get_float_variables(); }; /** @} */ // end of codegen_backends diff --git a/src/codegen/codegen_ispc_visitor.cpp b/src/codegen/codegen_ispc_visitor.cpp index b2822f1078..808aad1690 100644 --- a/src/codegen/codegen_ispc_visitor.cpp +++ b/src/codegen/codegen_ispc_visitor.cpp @@ -437,7 +437,7 @@ void CodegenIspcVisitor::print_ion_variable() { /****************************************************************************************/ void 
CodegenIspcVisitor::print_net_receive_buffering_wrapper() { - if (!net_receive_required() || info.artificial_cell) { + if (!info.net_receive_required() || info.artificial_cell) { return; } printer->add_newline(2); @@ -515,19 +515,19 @@ void CodegenIspcVisitor::print_backend_compute_routine_decl() { "extern \"C\" void {}({});"_format(compute_function, get_parameter_str(params))); } - if (nrn_cur_required() && !emit_fallback[BlockType::Equation]) { + if (info.nrn_cur_required() && !emit_fallback[BlockType::Equation]) { compute_function = compute_method_name(BlockType::Equation); printer->add_line( "extern \"C\" void {}({});"_format(compute_function, get_parameter_str(params))); } - if (nrn_state_required() && !emit_fallback[BlockType::State]) { + if (info.nrn_state_required() && !emit_fallback[BlockType::State]) { compute_function = compute_method_name(BlockType::State); printer->add_line( "extern \"C\" void {}({});"_format(compute_function, get_parameter_str(params))); } - if (net_receive_required()) { + if (info.net_receive_required()) { auto net_recv_params = ParamVector(); net_recv_params.emplace_back("", "{}*"_format(instance_struct()), "", "inst"); net_recv_params.emplace_back("", "NrnThread*", "", "nt"); @@ -547,7 +547,7 @@ bool CodegenIspcVisitor::check_incompatibilities() { }; // instance vars - if (check_incompatible_var_name(codegen_float_variables, + if (check_incompatible_var_name(info.codegen_float_variables, get_name_from_symbol_type_vector)) { return true; } @@ -613,11 +613,11 @@ bool CodegenIspcVisitor::check_incompatibilities() { visitor::calls_function(*info.net_receive_node, "net_send"))); emit_fallback[BlockType::Equation] = emit_fallback[BlockType::Equation] || - (nrn_cur_required() && info.breakpoint_node && + (info.nrn_cur_required() && info.breakpoint_node && has_incompatible_nodes(*info.breakpoint_node)); emit_fallback[BlockType::State] = emit_fallback[BlockType::State] || - (nrn_state_required() && info.nrn_state_block && + (info.nrn_state_required() && info.nrn_state_block && has_incompatible_nodes(*info.nrn_state_block)); @@ -674,7 +674,7 @@ void CodegenIspcVisitor::print_block_wrappers_initial_equation_state() { print_wrapper_routine(naming::NRN_INIT_METHOD, BlockType::Initial); } - if (nrn_cur_required()) { + if (info.nrn_cur_required()) { if (emit_fallback[BlockType::Equation]) { logger->warn("Falling back to C backend for emitting breakpoint block"); fallback_codegen.print_nrn_cur(); @@ -683,7 +683,7 @@ void CodegenIspcVisitor::print_block_wrappers_initial_equation_state() { } } - if (nrn_state_required()) { + if (info.nrn_state_required()) { if (emit_fallback[BlockType::State]) { logger->warn("Falling back to C backend for emitting state block"); fallback_codegen.print_nrn_state(); diff --git a/src/codegen/llvm/codegen_llvm_helper_visitor.cpp b/src/codegen/llvm/codegen_llvm_helper_visitor.cpp index 751fecfc81..fc8fda3d04 100644 --- a/src/codegen/llvm/codegen_llvm_helper_visitor.cpp +++ b/src/codegen/llvm/codegen_llvm_helper_visitor.cpp @@ -158,6 +158,23 @@ void CodegenLLVMHelperVisitor::create_function_for_node(ast::Block& node) { codegen_functions.push_back(function); } +std::shared_ptr CodegenLLVMHelperVisitor::create_instance_struct() { + ast::CodegenVarVector codegen_vars; + /// float variables are standard pointers to float vectors + for (auto& float_var: info.codegen_float_variables) { + auto name = new ast::Name(new ast::String(float_var->get_name())); + auto codegen_var = new ast::CodegenVar(1, name); + codegen_vars.emplace_back(codegen_var); + } + 
/// int variables are pointers to indexes for other vectors
+    for (auto& int_var: info.codegen_int_variables) {
+        auto name = new ast::Name(new ast::String(int_var.symbol->get_name()));
+        auto codegen_var = new ast::CodegenVar(1, name);
+        codegen_vars.emplace_back(codegen_var);
+    }
+    return std::make_shared<ast::InstanceStruct>(codegen_vars);
+}
+
 static void append_statements_from_block(ast::StatementVector& statements,
                                          const std::shared_ptr<ast::StatementBlock>& block) {
     const auto& block_statements = block->get_statements();
@@ -523,7 +540,11 @@ void CodegenLLVMHelperVisitor::visit_program(ast::Program& node) {
     for (auto& fun: codegen_functions) {
         node.emplace_back_node(fun);
     }
+
+    auto llvm_instance_struct = create_instance_struct();
+    node.emplace_back_node(llvm_instance_struct);
 }
+
 }  // namespace codegen
 }  // namespace nmodl

diff --git a/src/codegen/llvm/codegen_llvm_helper_visitor.hpp b/src/codegen/llvm/codegen_llvm_helper_visitor.hpp
index 0ec3792b9d..5634d39bd8 100644
--- a/src/codegen/llvm/codegen_llvm_helper_visitor.hpp
+++ b/src/codegen/llvm/codegen_llvm_helper_visitor.hpp
@@ -61,6 +61,9 @@ class CodegenLLVMHelperVisitor: public visitor::AstVisitor {
     /// create new function for FUNCTION or PROCEDURE block
     void create_function_for_node(ast::Block& node);

+    /// create new InstanceStruct
+    std::shared_ptr<ast::InstanceStruct> create_instance_struct();
+
   public:
     CodegenLLVMHelperVisitor() = default;

diff --git a/src/codegen/llvm/codegen_llvm_visitor.cpp b/src/codegen/llvm/codegen_llvm_visitor.cpp
index 831c43317a..1433b5a648 100644
--- a/src/codegen/llvm/codegen_llvm_visitor.cpp
+++ b/src/codegen/llvm/codegen_llvm_visitor.cpp
@@ -90,6 +90,12 @@ llvm::Type* CodegenLLVMVisitor::get_default_fp_type() {
     return llvm::Type::getDoubleTy(*context);
 }

+llvm::Type* CodegenLLVMVisitor::get_default_fp_ptr_type() {
+    if (use_single_precision)
+        return llvm::Type::getFloatPtrTy(*context);
+    return llvm::Type::getDoublePtrTy(*context);
+}
+
 void CodegenLLVMVisitor::run_llvm_opt_passes() {
     /// run some common optimisation passes that are commonly suggested
     fpm.add(llvm::createInstructionCombiningPass());
@@ -574,6 +580,17 @@ void CodegenLLVMVisitor::visit_var_name(const ast::VarName& node) {
     values.push_back(var);
 }

+void CodegenLLVMVisitor::visit_instance_struct(const ast::InstanceStruct& node) {
+    std::vector<llvm::Type*> members;
+    for (const auto& variable: node.get_codegen_vars()) {
+        members.push_back(get_default_fp_ptr_type());
+    }
+
+    llvm_struct = llvm::StructType::create(*context, mod_filename + "_Instance");
+    llvm_struct->setBody(members);
+    module->getOrInsertGlobal("inst", llvm_struct);
+}
+
 void CodegenLLVMVisitor::visit_while_statement(const ast::WhileStatement& node) {
     // Get the current and the next blocks within the function.
     llvm::BasicBlock* curr_block = builder.GetInsertBlock();

diff --git a/src/codegen/llvm/codegen_llvm_visitor.hpp b/src/codegen/llvm/codegen_llvm_visitor.hpp
index 3003a119b5..7a5488de43 100644
--- a/src/codegen/llvm/codegen_llvm_visitor.hpp
+++ b/src/codegen/llvm/codegen_llvm_visitor.hpp
@@ -79,6 +79,9 @@ class CodegenLLVMVisitor: public visitor::ConstAstVisitor {
     // Use 32-bit floating-point type if true. Otherwise, use default 64-bit.
bool use_single_precision; + // LLVM mechanism struct + llvm::StructType* llvm_struct; + /** *\brief Run LLVM optimisation passes on generated IR * @@ -147,6 +150,12 @@ class CodegenLLVMVisitor: public visitor::ConstAstVisitor { */ llvm::Type* get_default_fp_type(); + /** + * Returns pointer to 64-bit or 32-bit LLVM floating type + * \return \c LLVM pointer to floating point type according to `use_single_precision` flag + */ + llvm::Type* get_default_fp_ptr_type(); + /** * Create a function call to an external method * \param name external method name @@ -242,6 +251,7 @@ class CodegenLLVMVisitor: public visitor::ConstAstVisitor { void visit_program(const ast::Program& node) override; void visit_unary_expression(const ast::UnaryExpression& node) override; void visit_var_name(const ast::VarName& node) override; + void visit_instance_struct(const ast::InstanceStruct& node) override; void visit_while_statement(const ast::WhileStatement& node) override; // \todo: move this to debug mode (e.g. -v option or --dump-ir) diff --git a/src/language/code_generator.cmake b/src/language/code_generator.cmake index 25fc5151c4..24cc5a6ddf 100644 --- a/src/language/code_generator.cmake +++ b/src/language/code_generator.cmake @@ -117,6 +117,7 @@ set(AST_GENERATED_SOURCES ${PROJECT_BINARY_DIR}/src/ast/independent_definition.hpp ${PROJECT_BINARY_DIR}/src/ast/indexed_name.hpp ${PROJECT_BINARY_DIR}/src/ast/initial_block.hpp + ${PROJECT_BINARY_DIR}/src/ast/instance_struct.hpp ${PROJECT_BINARY_DIR}/src/ast/integer.hpp ${PROJECT_BINARY_DIR}/src/ast/kinetic_block.hpp ${PROJECT_BINARY_DIR}/src/ast/lag_statement.hpp diff --git a/src/language/nmodl.yaml b/src/language/nmodl.yaml index 54da340b7b..765aeeda37 100644 --- a/src/language/nmodl.yaml +++ b/src/language/nmodl.yaml @@ -438,6 +438,18 @@ is base class and defines common interface for these nodes. children: + - InstanceStruct: + nmodl: "INSTANCE_STRUCT " + members: + - codegen_vars: + brief: "Vector of CodegenVars" + type: CodegenVar + vector: true + add: true + separator: "\\n " + prefix: {value: "{\\n ", force: true} + suffix: {value: "\\n}", force: true} + brief: "LLVM IR Struct that holds the mechanism instance's variables" - ParamBlock: nmodl: "PARAMETER " diff --git a/test/unit/codegen/codegen_llvm_ir.cpp b/test/unit/codegen/codegen_llvm_ir.cpp index d16b02b2f5..2d5ca7ef2a 100644 --- a/test/unit/codegen/codegen_llvm_ir.cpp +++ b/test/unit/codegen/codegen_llvm_ir.cpp @@ -367,12 +367,12 @@ SCENARIO("Function", "[visitor][llvm]") { std::smatch m; // Check function signature. The return type should be the default double type. - std::regex function_signature(R"(define double @foo\(double %x1\) \{)"); + std::regex function_signature(R"(define double @foo\(double %x[0-9].*\) \{)"); REQUIRE(std::regex_search(module_string, m, function_signature)); // Check that function arguments are allocated on the local stack. std::regex alloca_instr(R"(%x = alloca double)"); - std::regex store_instr(R"(store double %x1, double\* %x)"); + std::regex store_instr(R"(store double %x[0-9].*, double\* %x)"); REQUIRE(std::regex_search(module_string, m, alloca_instr)); REQUIRE(std::regex_search(module_string, m, store_instr)); @@ -638,7 +638,7 @@ SCENARIO("Procedure", "[visitor][llvm]") { std::smatch m; // Check procedure signature. - std::regex function_signature(R"(define i32 @with_argument\(double %x1\) \{)"); + std::regex function_signature(R"(define i32 @with_argument\(double %x[0-9].*\) \{)"); REQUIRE(std::regex_search(module_string, m, function_signature)); // Check dummy return. 
@@ -653,7 +653,7 @@ SCENARIO("Procedure", "[visitor][llvm]") { // Check that procedure arguments are allocated on the local stack. std::regex alloca_instr(R"(%x = alloca double)"); - std::regex store_instr(R"(store double %x1, double\* %x)"); + std::regex store_instr(R"(store double %x[0-9].*, double\* %x)"); REQUIRE(std::regex_search(module_string, m, alloca_instr)); REQUIRE(std::regex_search(module_string, m, store_instr)); } @@ -753,8 +753,44 @@ SCENARIO("Dead code removal", "[visitor][llvm][opt]") { // Check if the values are optimised out std::regex empty_proc( - R"(define i32 @add\(double %a1, double %b2\) \{\n(\s)*ret i32 0\n\})"); + R"(define i32 @add\(double %a[0-9].*, double %b[0-9].*\) \{\n(\s)*ret i32 0\n\})"); REQUIRE(std::regex_search(module_string, m, empty_proc)); } } } + +//============================================================================= +// Create Instance Struct +//============================================================================= + +SCENARIO("Creation of Instance Struct", "[visitor][llvm][instance_struct]") { + GIVEN("NEURON block with RANGE variables and IONS") { + std::string nmodl_text = R"( + NEURON { + USEION na READ ena WRITE ina + NONSPECIFIC_CURRENT il + RANGE minf, hinf + } + + STATE { + m + } + + ASSIGNED { + v (mV) + celsius (degC) + minf + hinf + } + )"; + + THEN("create struct with the declared variables") { + std::string module_string = run_llvm_visitor(nmodl_text, true); + std::smatch m; + + std::regex instance_struct_declaration( + R"(%unknown_Instance = type \{ double\*, double\*, double\*, double\*, double\*, double\*, double\*, double\*, double\*, double\* \})"); + REQUIRE(std::regex_search(module_string, m, instance_struct_declaration)); + } + } +} From b69096119780d66d914c77341c4149469efa7919 Mon Sep 17 00:00:00 2001 From: George Mitenkov Date: Wed, 3 Feb 2021 22:45:41 +0300 Subject: [PATCH 023/331] Printf support in LLVM IR codegen (#510) - Added support for string function arguments. These are converted into global `i8` array values. - Added support for `printf` function call with variable number of arguments. - Refactored function/procedure call argument processing into a separate function. fixes #510 --- src/codegen/llvm/codegen_llvm_visitor.cpp | 57 +++++++++++++++++++---- src/codegen/llvm/codegen_llvm_visitor.hpp | 18 +++++-- test/unit/codegen/codegen_llvm_ir.cpp | 36 ++++++++++++++ 3 files changed, 96 insertions(+), 15 deletions(-) diff --git a/src/codegen/llvm/codegen_llvm_visitor.cpp b/src/codegen/llvm/codegen_llvm_visitor.cpp index 1433b5a648..3bb3b38dfc 100644 --- a/src/codegen/llvm/codegen_llvm_visitor.cpp +++ b/src/codegen/llvm/codegen_llvm_visitor.cpp @@ -116,6 +116,11 @@ void CodegenLLVMVisitor::run_llvm_opt_passes() { void CodegenLLVMVisitor::create_external_method_call(const std::string& name, const ast::ExpressionVector& arguments) { + if (name == "printf") { + create_printf_call(arguments); + return; + } + std::vector argument_values; std::vector argument_types; for (const auto& arg: arguments) { @@ -145,24 +150,39 @@ void CodegenLLVMVisitor::create_function_call(llvm::Function* func, const std::string& name, const ast::ExpressionVector& arguments) { // Check that function is called with the expected number of arguments. - if (arguments.size() != func->arg_size()) { + if (!func->isVarArg() && arguments.size() != func->arg_size()) { throw std::runtime_error("Error: Incorrect number of arguments passed"); } - // Process each argument and add it to a vector to pass to the function call instruction. 
Note
-    // that type checks are not needed here as NMODL operates on doubles by default.
+    // Pack function call arguments to vector and create a call instruction.
     std::vector<llvm::Value*> argument_values;
-    for (const auto& arg: arguments) {
-        arg->accept(*this);
-        llvm::Value* value = values.back();
-        values.pop_back();
-        argument_values.push_back(value);
-    }
-
+    argument_values.reserve(arguments.size());
+    pack_function_call_arguments(arguments, argument_values);
     llvm::Value* call = builder.CreateCall(func, argument_values);
     values.push_back(call);
 }

+void CodegenLLVMVisitor::create_printf_call(const ast::ExpressionVector& arguments) {
+    // First, create printf declaration or insert it if it does not exist.
+    std::string name = "printf";
+    llvm::Function* printf = module->getFunction(name);
+    if (!printf) {
+        llvm::Type* ptr_type = llvm::Type::getInt8PtrTy(*context);
+        llvm::Type* i32_type = llvm::Type::getInt32Ty(*context);
+        llvm::FunctionType* printf_type =
+            llvm::FunctionType::get(i32_type, ptr_type, /*isVarArg=*/true);
+
+        printf =
+            llvm::Function::Create(printf_type, llvm::Function::ExternalLinkage, name, *module);
+    }
+
+    // Create a call instruction.
+    std::vector<llvm::Value*> argument_values;
+    argument_values.reserve(arguments.size());
+    pack_function_call_arguments(arguments, argument_values);
+    builder.CreateCall(printf, argument_values);
+}
+
 void CodegenLLVMVisitor::emit_procedure_or_function_declaration(const ast::CodegenFunction& node) {
     const auto& name = node.get_node_name();
     const auto& arguments = node.get_arguments();
@@ -188,6 +208,23 @@ llvm::Value* CodegenLLVMVisitor::lookup(const std::string& name) {
     return val;
 }

+void CodegenLLVMVisitor::pack_function_call_arguments(const ast::ExpressionVector& arguments,
+                                                      std::vector<llvm::Value*>& arg_values) {
+    for (const auto& arg: arguments) {
+        if (arg->is_string()) {
+            // If the argument is a string, create a global i8* variable with it. 
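+            // (CreateGlobalStringPtr emits a private null-terminated [N x i8]
+            // constant and returns an i8* pointing at its first element.)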
+ auto string_arg = std::dynamic_pointer_cast(arg); + llvm::Value* str = builder.CreateGlobalStringPtr(string_arg->get_value()); + arg_values.push_back(str); + } else { + arg->accept(*this); + llvm::Value* value = values.back(); + values.pop_back(); + arg_values.push_back(value); + } + } +} + llvm::Value* CodegenLLVMVisitor::visit_arithmetic_bin_op(llvm::Value* lhs, llvm::Value* rhs, unsigned op) { diff --git a/src/codegen/llvm/codegen_llvm_visitor.hpp b/src/codegen/llvm/codegen_llvm_visitor.hpp index 7a5488de43..9bdbdef7e9 100644 --- a/src/codegen/llvm/codegen_llvm_visitor.hpp +++ b/src/codegen/llvm/codegen_llvm_visitor.hpp @@ -173,6 +173,11 @@ class CodegenLLVMVisitor: public visitor::ConstAstVisitor { void create_function_call(llvm::Function* func, const std::string& name, const ast::ExpressionVector& arguments); + /** + * Create a function call to printf function + * \param arguments expressions passed as arguments to the printf call + */ + void create_printf_call(const ast::ExpressionVector& arguments); /** * Emit function or procedure declaration in LLVM given the node @@ -195,6 +200,14 @@ class CodegenLLVMVisitor: public visitor::ConstAstVisitor { */ llvm::Value* lookup(const std::string& name); + /** + * Fills values vector with processed NMODL function call arguments + * \param arguments expression vector + * \param arg_values vector of LLVM IR values to fill + */ + void pack_function_call_arguments(const ast::ExpressionVector& arguments, + std::vector& arg_values); + /** * Visit nmodl arithmetic binary operator * \param lhs LLVM value of evaluated lhs expression @@ -229,11 +242,6 @@ class CodegenLLVMVisitor: public visitor::ConstAstVisitor { */ llvm::Value* visit_comparison_bin_op(llvm::Value* lhs, llvm::Value* rhs, unsigned op); - /** - * Visit nmodl function or procedure - * \param node the AST node representing the function or procedure in NMODL - */ - void visit_procedure_or_function(const ast::Block& node); // Visitors void visit_binary_expression(const ast::BinaryExpression& node) override; diff --git a/test/unit/codegen/codegen_llvm_ir.cpp b/test/unit/codegen/codegen_llvm_ir.cpp index 2d5ca7ef2a..ba0c725c0c 100644 --- a/test/unit/codegen/codegen_llvm_ir.cpp +++ b/test/unit/codegen/codegen_llvm_ir.cpp @@ -474,6 +474,42 @@ SCENARIO("Function call", "[visitor][llvm]") { } } + GIVEN("A call to printf") { + std::string nmodl_text = R"( + PROCEDURE bar() { + LOCAL i + i = 0 + printf("foo") + printf("bar %d", i) + } + )"; + + THEN("printf is declared and global string values are created") { + std::string module_string = run_llvm_visitor(nmodl_text); + std::smatch m; + + // Check for global string values. + std::regex str1( + R"(@[0-9]+ = private unnamed_addr constant \[6 x i8\] c\"\\22foo\\22\\00\")"); + std::regex str2( + R"(@[0-9]+ = private unnamed_addr constant \[9 x i8\] c\"\\22bar %d\\22\\00\")"); + REQUIRE(std::regex_search(module_string, m, str1)); + REQUIRE(std::regex_search(module_string, m, str2)); + + // Check for printf declaration. + std::regex declaration(R"(declare i32 @printf\(i8\*, \.\.\.\))"); + REQUIRE(std::regex_search(module_string, m, declaration)); + + // Check the correct calls are made. 
+ std::regex call1( + R"(call i32 \(i8\*, \.\.\.\) @printf\(i8\* getelementptr inbounds \(\[6 x i8\], \[6 x i8\]\* @[0-9]+, i32 0, i32 0\)\))"); + std::regex call2( + R"(call i32 \(i8\*, \.\.\.\) @printf\(i8\* getelementptr inbounds \(\[9 x i8\], \[9 x i8\]\* @[0-9]+, i32 0, i32 0\), double %[0-9]+\))"); + REQUIRE(std::regex_search(module_string, m, call1)); + REQUIRE(std::regex_search(module_string, m, call2)); + } + } + GIVEN("A call to function with the wrong number of arguments") { std::string nmodl_text = R"( FUNCTION foo(x, y) { From a561c97b88e830fc09d0553249f9b779e0221633 Mon Sep 17 00:00:00 2001 From: Ioannis Magkanaris Date: Mon, 15 Feb 2021 11:21:58 +0100 Subject: [PATCH 024/331] =?UTF-8?q?Fix=20issue=20error:=20=E2=80=98runtime?= =?UTF-8?q?=5Ferror=E2=80=99=20is=20not=20a=20member=20of=20=E2=80=98std?= =?UTF-8?q?=E2=80=99=20(#512)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- src/language/templates/ast/ast_decl.hpp | 1 + 1 file changed, 1 insertion(+) diff --git a/src/language/templates/ast/ast_decl.hpp b/src/language/templates/ast/ast_decl.hpp index 196dc9daf4..546c7dcb40 100644 --- a/src/language/templates/ast/ast_decl.hpp +++ b/src/language/templates/ast/ast_decl.hpp @@ -14,6 +14,7 @@ #include #include #include +#include /// \file /// \brief Auto generated AST node types and aliases declaration From 5c80684c498132a2827f674e42c63ac9c090cad6 Mon Sep 17 00:00:00 2001 From: Pramod Kumbhar Date: Fri, 5 Mar 2021 19:56:29 +0100 Subject: [PATCH 025/331] Move code gen specific InstanceStruct node to codegen.yaml (#526) * Move code gen specific InstanceStruct node to codegen.yaml - nmodl.yaml file is more for language constructs - InstanceStruct is specific for code generation and hence move it to codegen.yaml * Update CI scripts * fix cmake-format with v==0.6.13 --- src/language/codegen.yaml | 12 ++++++++++++ src/language/nmodl.yaml | 12 ------------ 2 files changed, 12 insertions(+), 12 deletions(-) diff --git a/src/language/codegen.yaml b/src/language/codegen.yaml index 5e26bc3f0f..aeb596b490 100644 --- a/src/language/codegen.yaml +++ b/src/language/codegen.yaml @@ -143,6 +143,18 @@ brief: "Body of the function" type: StatementBlock getter: {override: true} + - InstanceStruct: + nmodl: "INSTANCE_STRUCT " + members: + - codegen_vars: + brief: "Vector of CodegenVars" + type: CodegenVar + vector: true + add: true + separator: "\\n " + prefix: {value: "{\\n ", force: true} + suffix: {value: "\\n}", force: true} + brief: "LLVM IR Struct that holds the mechanism instance's variables" - WrappedExpression: brief: "Wrap any other expression type" members: diff --git a/src/language/nmodl.yaml b/src/language/nmodl.yaml index 765aeeda37..54da340b7b 100644 --- a/src/language/nmodl.yaml +++ b/src/language/nmodl.yaml @@ -438,18 +438,6 @@ is base class and defines common interface for these nodes. 
children: - - InstanceStruct: - nmodl: "INSTANCE_STRUCT " - members: - - codegen_vars: - brief: "Vector of CodegenVars" - type: CodegenVar - vector: true - add: true - separator: "\\n " - prefix: {value: "{\\n ", force: true} - suffix: {value: "\\n}", force: true} - brief: "LLVM IR Struct that holds the mechanism instance's variables" - ParamBlock: nmodl: "PARAMETER " From 5b72bc395b3750982b5fdaa88a6490d8350ceac1 Mon Sep 17 00:00:00 2001 From: Pramod S Kumbhar Date: Sat, 27 Feb 2021 13:15:09 +0100 Subject: [PATCH 026/331] * Improvements to codegen helper (Part I) - instance structure now contains all global variables - instance structure now contains index variables for ions - nrn_state kernel now has all variables converted to instance - InstanceVarHelper added to query variable and it's location * Support for codegen variable with type * Add nmodl_to_json helper added in main.cpp * Added --vector-width CLI option * Add instance struct argument to nrn_state_hh * Add comments as TODOs to support LLVM IR generation Note that this commit and next commit (Part II) are required to make LLVM IR code generation working. Vector IR generation is working except indirect indexes. See comment in #531. --- src/codegen/codegen_naming.hpp | 6 ++ .../llvm/codegen_llvm_helper_visitor.cpp | 78 ++++++++++++++----- .../llvm/codegen_llvm_helper_visitor.hpp | 71 ++++++++++++++++- src/codegen/llvm/codegen_llvm_visitor.cpp | 39 +++++++++- src/codegen/llvm/codegen_llvm_visitor.hpp | 8 ++ src/language/code_generator.cmake | 3 +- src/language/codegen.yaml | 25 ++++-- src/language/node_info.py | 1 + src/language/nodes.py | 4 + .../templates/visitors/nmodl_visitor.cpp | 7 +- src/main.cpp | 66 +++++++++------- 11 files changed, 250 insertions(+), 58 deletions(-) diff --git a/src/codegen/codegen_naming.hpp b/src/codegen/codegen_naming.hpp index 6d8875a000..e1cbfaf6f0 100644 --- a/src/codegen/codegen_naming.hpp +++ b/src/codegen/codegen_naming.hpp @@ -80,6 +80,12 @@ static constexpr char VOLTAGE_UNUSED_VARIABLE[] = "v_unused"; /// variable t indicating last execution time of net receive block static constexpr char T_SAVE_VARIABLE[] = "tsave"; +/// global variable celsius +static constexpr char CELSIUS_VARIABLE[] = "celsius"; + +/// global variable second_order +static constexpr char SECOND_ORDER_VARIABLE[] = "secondorder"; + /// shadow rhs variable in neuron thread structure static constexpr char NTHREAD_RHS_SHADOW[] = "_shadow_rhs"; diff --git a/src/codegen/llvm/codegen_llvm_helper_visitor.cpp b/src/codegen/llvm/codegen_llvm_helper_visitor.cpp index fc8fda3d04..b3f75b9372 100644 --- a/src/codegen/llvm/codegen_llvm_helper_visitor.cpp +++ b/src/codegen/llvm/codegen_llvm_helper_visitor.cpp @@ -141,12 +141,12 @@ void CodegenLLVMHelperVisitor::create_function_for_node(ast::Block& node) { block->emplace_back_statement(return_statement); /// prepare function arguments based original node arguments - ast::CodegenArgumentVector arguments; + ast::CodegenVarWithTypeVector arguments; for (const auto& param: node.get_parameters()) { /// create new type and name for creating new ast node auto type = new ast::CodegenVarType(FLOAT_TYPE); auto var = param->get_name()->clone(); - arguments.emplace_back(new ast::CodegenArgument(type, var)); + arguments.emplace_back(new ast::CodegenVarWithType(type, 0, var)); } /// return type of the function is same as return variable type @@ -159,19 +159,43 @@ void CodegenLLVMHelperVisitor::create_function_for_node(ast::Block& node) { } std::shared_ptr CodegenLLVMHelperVisitor::create_instance_struct() 
{ - ast::CodegenVarVector codegen_vars; + ast::CodegenVarWithTypeVector codegen_vars; + + auto add_var_with_type = + [&](const std::string& name, const ast::AstNodeType type, int is_pointer) { + auto var_name = new ast::Name(new ast::String(name)); + auto var_type = new ast::CodegenVarType(type); + auto codegen_var = new ast::CodegenVarWithType(var_type, is_pointer, var_name); + codegen_vars.emplace_back(codegen_var); + }; + /// float variables are standard pointers to float vectors for (auto& float_var: info.codegen_float_variables) { - auto name = new ast::Name(new ast::String(float_var->get_name())); - auto codegen_var = new ast::CodegenVar(1, name); - codegen_vars.emplace_back(codegen_var); + add_var_with_type(float_var->get_name(), FLOAT_TYPE, 1); } + /// int variables are pointers to indexes for other vectors for (auto& int_var: info.codegen_int_variables) { - auto name = new ast::Name(new ast::String(int_var.symbol->get_name())); - auto codegen_var = new ast::CodegenVar(1, name); - codegen_vars.emplace_back(codegen_var); + add_var_with_type(int_var.symbol->get_name(), FLOAT_TYPE, 1); + } + + // for integer variables, there should be index + for (auto& int_var: info.codegen_int_variables) { + std::string var_name = int_var.symbol->get_name() + "_index"; + add_var_with_type(var_name, INTEGER_TYPE, 1); } + + // add voltage and node index + add_var_with_type("voltage", FLOAT_TYPE, 1); + add_var_with_type("node_index", INTEGER_TYPE, 1); + + // add dt, t, celsius + add_var_with_type(naming::NTHREAD_T_VARIABLE, FLOAT_TYPE, 0); + add_var_with_type(naming::NTHREAD_DT_VARIABLE, FLOAT_TYPE, 0); + add_var_with_type(naming::CELSIUS_VARIABLE, FLOAT_TYPE, 0); + add_var_with_type(naming::SECOND_ORDER_VARIABLE, INTEGER_TYPE, 0); + add_var_with_type(MECH_NODECOUNT_VAR, INTEGER_TYPE, 0); + return std::make_shared(codegen_vars); } @@ -362,13 +386,24 @@ void CodegenLLVMHelperVisitor::convert_to_instance_variable(ast::Node& node, auto variables = collect_nodes(node, {ast::AstNodeType::VAR_NAME}); for (auto& v: variables) { auto variable = std::dynamic_pointer_cast(v); - /// if variable is of type instance then convert it to index - if (info.is_an_instance_variable(variable->get_node_name())) { + auto variable_name = variable->get_node_name(); + + /// all instance variables defined in the mod file should be converted to + /// indexed variables based on the loop iteration variable + if (info.is_an_instance_variable(variable_name)) { auto name = variable->get_name()->clone(); auto index = new ast::Name(new ast::String(index_var)); auto indexed_name = std::make_shared(name, index); variable->set_name(indexed_name); } + + /// instance_var_helper check of instance variables from mod file as well + /// as extra variables like ion index variables added for code generation + if (instance_var_helper.is_an_instance_variable(variable_name)) { + auto name = new ast::Name(new ast::String(MECH_INSTANCE_VAR)); + auto var = std::make_shared(name, variable->clone()); + variable->set_name(var); + } } } @@ -438,7 +473,7 @@ void CodegenLLVMHelperVisitor::visit_nrn_state_block(ast::NrnStateBlock& node) { /// loop constructs : initialization, condition and increment const auto& initialization = create_statement_as_expression("id = 0"); const auto& condition = create_expression("id < node_count"); - const auto& increment = create_statement_as_expression("id = id + 1"); + const auto& increment = create_statement_as_expression("id = id + {}"_format(vector_width)); /// loop body : initialization + solve blocks ast::StatementVector 
loop_def_statements; @@ -496,9 +531,6 @@ void CodegenLLVMHelperVisitor::visit_nrn_state_block(ast::NrnStateBlock& node) { /// now construct a new code block which will become the body of the loop auto loop_block = std::make_shared(loop_body); - /// convert all variables inside loop body to instance variables - convert_to_instance_variable(*loop_block, loop_index_var); - /// convert local statement to codegenvar statement convert_local_statement(*loop_block); @@ -508,6 +540,9 @@ void CodegenLLVMHelperVisitor::visit_nrn_state_block(ast::NrnStateBlock& node) { increment, loop_block); + /// convert all variables inside loop body to instance variables + convert_to_instance_variable(*for_loop_statement, loop_index_var); + /// loop itself becomes one of the statement in the function function_statements.push_back(for_loop_statement); @@ -520,7 +555,12 @@ void CodegenLLVMHelperVisitor::visit_nrn_state_block(ast::NrnStateBlock& node) { auto return_type = new ast::CodegenVarType(ast::AstNodeType::VOID); /// \todo : currently there are no arguments - ast::CodegenArgumentVector code_arguments; + ast::CodegenVarWithTypeVector code_arguments; + + auto instance_var_type = new ast::CodegenVarType(ast::AstNodeType::INSTANCE_STRUCT); + auto instance_var_name = new ast::Name(new ast::String("mech")); + auto instance_var = new ast::CodegenVarWithType(instance_var_type, 1, instance_var_name); + code_arguments.emplace_back(instance_var); /// finally, create new function auto function = @@ -535,14 +575,16 @@ void CodegenLLVMHelperVisitor::visit_program(ast::Program& node) { CodegenHelperVisitor v; info = v.analyze(node); + instance_var_helper.instance = create_instance_struct(); + node.emplace_back_node(instance_var_helper.instance); + logger->info("Running CodegenLLVMHelperVisitor"); node.visit_children(*this); for (auto& fun: codegen_functions) { node.emplace_back_node(fun); } - auto llvm_instance_struct = create_instance_struct(); - node.emplace_back_node(llvm_instance_struct); + std::cout << nmodl::to_nmodl(node); } diff --git a/src/codegen/llvm/codegen_llvm_helper_visitor.hpp b/src/codegen/llvm/codegen_llvm_helper_visitor.hpp index 5634d39bd8..981372b4d5 100644 --- a/src/codegen/llvm/codegen_llvm_helper_visitor.hpp +++ b/src/codegen/llvm/codegen_llvm_helper_visitor.hpp @@ -14,6 +14,7 @@ #include +#include "ast/instance_struct.hpp" #include "codegen/codegen_info.hpp" #include "symtab/symbol_table.hpp" #include "visitors/ast_visitor.hpp" @@ -21,7 +22,7 @@ namespace nmodl { namespace codegen { - +using namespace fmt::literals; typedef std::vector> CodegenFunctionVector; /** @@ -29,6 +30,57 @@ typedef std::vector> CodegenFunctionVector * @{ */ +/** + * \class InstanceVarHelper + * \brief Helper to query instance variables information + * + * For LLVM IR generation we need to know the variable, it's type and + * location in the instance structure. This helper provides convenient + * functions to query this information. 
+ */ +struct InstanceVarHelper { + /// pointer to instance node in the AST + std::shared_ptr instance; + + /// find variable with given name and return the iterator + ast::CodegenVarWithTypeVector::const_iterator find_variable( + const ast::CodegenVarWithTypeVector& vars, + const std::string& name) { + return find_if(vars.begin(), + vars.end(), + [&](const std::shared_ptr& v) { + return v->get_node_name() == name; + }); + } + + /// check if given variable is instance variable + bool is_an_instance_variable(const std::string& name) { + const auto& vars = instance->get_codegen_vars(); + return find_variable(vars, name) != vars.end(); + } + + /// return codegen variable with a given name + const std::shared_ptr& get_variable(const std::string& name) { + const auto& vars = instance->get_codegen_vars(); + auto it = find_variable(vars, name); + if (it == vars.end()) { + throw std::runtime_error("Can not find variable with name {}"_format(name)); + } + return *it; + } + + /// return position of the variable in the instance structure + int get_variable_index(const std::string& name) { + const auto& vars = instance->get_codegen_vars(); + auto it = find_variable(vars, name); + if (it == vars.end()) { + throw std::runtime_error("Can not find codegen variable with name {}"_format(name)); + } + return (it - vars.begin()); + } +}; + + /** * \class CodegenLLVMHelperVisitor * \brief Helper visitor for AST information to help code generation backends @@ -48,16 +100,26 @@ typedef std::vector> CodegenFunctionVector * these will be common across all backends. */ class CodegenLLVMHelperVisitor: public visitor::AstVisitor { + // explicit vectorisation width + int vector_width; + /// newly generated code generation specific functions CodegenFunctionVector codegen_functions; /// ast information for code generation codegen::CodegenInfo info; + /// mechanism data helper + InstanceVarHelper instance_var_helper; + /// default integer and float node type const ast::AstNodeType INTEGER_TYPE = ast::AstNodeType::INTEGER; const ast::AstNodeType FLOAT_TYPE = ast::AstNodeType::DOUBLE; + /// name of the mechanism instance parameter + const std::string MECH_INSTANCE_VAR = "mech"; + const std::string MECH_NODECOUNT_VAR = "node_count"; + /// create new function for FUNCTION or PROCEDURE block void create_function_for_node(ast::Block& node); @@ -65,7 +127,12 @@ class CodegenLLVMHelperVisitor: public visitor::AstVisitor { std::shared_ptr create_instance_struct(); public: - CodegenLLVMHelperVisitor() = default; + CodegenLLVMHelperVisitor(int vector_width) + : vector_width(vector_width){}; + + const InstanceVarHelper& get_instance_var_helper() { + return instance_var_helper; + } /// run visitor and return code generation functions CodegenFunctionVector get_codegen_functions(const ast::Program& node); diff --git a/src/codegen/llvm/codegen_llvm_visitor.cpp b/src/codegen/llvm/codegen_llvm_visitor.cpp index 3bb3b38dfc..80bdfd20e3 100644 --- a/src/codegen/llvm/codegen_llvm_visitor.cpp +++ b/src/codegen/llvm/codegen_llvm_visitor.cpp @@ -6,7 +6,6 @@ *************************************************************************/ #include "codegen/llvm/codegen_llvm_visitor.hpp" -#include "codegen/llvm/codegen_llvm_helper_visitor.hpp" #include "ast/all.hpp" #include "visitors/rename_visitor.hpp" @@ -79,6 +78,8 @@ llvm::Type* CodegenLLVMVisitor::get_codegen_var_type(const ast::CodegenVarType& return llvm::Type::getInt32Ty(*context); case ast::AstNodeType::VOID: return llvm::Type::getVoidTy(*context); + // TODO :: George/Ioannis : Here we have to 
also return INSTANCE_STRUCT type + // as it is used as an argument to nrn_state function default: throw std::runtime_error("Error: expecting a type in CodegenVarType node\n"); } @@ -556,8 +557,13 @@ void CodegenLLVMVisitor::visit_program(const ast::Program& node) { // - convert function and procedure blocks into CodegenFunctions // - gather information about AST. For now, information about functions // and procedures is used only. - CodegenLLVMHelperVisitor v; + CodegenLLVMHelperVisitor v{vector_width}; const auto& functions = v.get_codegen_functions(node); + instance_var_helper = v.get_instance_var_helper(); + + // TODO :: George / Ioannis :: before emitting procedures, we have + // to emmit INSTANCE_STRUCT type as it's used as an argument. + // Currently it's done in node.visit_children which is late. // For every function, generate its declaration. Thus, we can look up // `llvm::Function` in the symbol table in the module. @@ -603,6 +609,16 @@ void CodegenLLVMVisitor::visit_var_name(const ast::VarName& node) { if (!identifier->is_name() && !identifier->is_indexed_name()) throw std::runtime_error("Error: Unsupported variable type"); + // TODO :: George :: here instance_var_helper can be used to query + // variable type and it's index into structure + auto name = node.get_node_name(); + + auto codegen_var_with_type = instance_var_helper.get_variable(name); + auto codegen_var_index = instance_var_helper.get_variable_index(name); + // this will be INTEGER or DOUBLE + auto var_type = codegen_var_with_type->get_type()->get_type(); + auto is_pointer = codegen_var_with_type->get_is_pointer(); + llvm::Value* ptr; if (identifier->is_name()) ptr = lookup(node.get_node_name()); @@ -620,7 +636,24 @@ void CodegenLLVMVisitor::visit_var_name(const ast::VarName& node) { void CodegenLLVMVisitor::visit_instance_struct(const ast::InstanceStruct& node) { std::vector members; for (const auto& variable: node.get_codegen_vars()) { - members.push_back(get_default_fp_ptr_type()); + // TODO :: Ioannis / George :: we have now double*, int*, double and int + // variables in the instance structure. Each variable is of type + // ast::CodegenVarWithType. So we can query variable type and if + // it's pointer. + auto is_pointer = variable->get_is_pointer(); + auto type = variable->get_type()->get_type(); + + // todo : clean up ? + if (type == ast::AstNodeType::DOUBLE) { + auto llvm_type = is_pointer ? get_default_fp_ptr_type() : get_default_fp_type(); + members.push_back(llvm_type); + } else { + if (is_pointer) { + members.push_back(llvm::Type::getInt32PtrTy(*context)); + } else { + members.push_back(llvm::Type::getInt32Ty(*context)); + } + } } llvm_struct = llvm::StructType::create(*context, mod_filename + "_Instance"); diff --git a/src/codegen/llvm/codegen_llvm_visitor.hpp b/src/codegen/llvm/codegen_llvm_visitor.hpp index 9bdbdef7e9..b20a19bac7 100644 --- a/src/codegen/llvm/codegen_llvm_visitor.hpp +++ b/src/codegen/llvm/codegen_llvm_visitor.hpp @@ -18,6 +18,7 @@ #include #include +#include "codegen/llvm/codegen_llvm_helper_visitor.hpp" #include "symtab/symbol_table.hpp" #include "utils/logger.hpp" #include "visitors/ast_visitor.hpp" @@ -56,6 +57,8 @@ class CodegenLLVMVisitor: public visitor::ConstAstVisitor { std::string output_dir; private: + InstanceVarHelper instance_var_helper; + std::unique_ptr context = std::make_unique(); std::unique_ptr module = std::make_unique(mod_filename, *context); @@ -79,6 +82,9 @@ class CodegenLLVMVisitor: public visitor::ConstAstVisitor { // Use 32-bit floating-point type if true. 
Otherwise, use deafult 64-bit. bool use_single_precision; + // explicit vectorisation width + int vector_width; + // LLVM mechanism struct llvm::StructType* llvm_struct; @@ -100,11 +106,13 @@ class CodegenLLVMVisitor: public visitor::ConstAstVisitor { CodegenLLVMVisitor(const std::string& mod_filename, const std::string& output_dir, bool opt_passes, + int vector_width = 1, bool use_single_precision = false) : mod_filename(mod_filename) , output_dir(output_dir) , opt_passes(opt_passes) , use_single_precision(use_single_precision) + , vector_width(vector_width) , builder(*context) , fpm(module.get()) {} diff --git a/src/language/code_generator.cmake b/src/language/code_generator.cmake index 24cc5a6ddf..46dc01ea9f 100644 --- a/src/language/code_generator.cmake +++ b/src/language/code_generator.cmake @@ -65,15 +65,16 @@ set(AST_GENERATED_SOURCES ${PROJECT_BINARY_DIR}/src/ast/block_comment.hpp ${PROJECT_BINARY_DIR}/src/ast/boolean.hpp ${PROJECT_BINARY_DIR}/src/ast/breakpoint_block.hpp - ${PROJECT_BINARY_DIR}/src/ast/codegen_argument.hpp ${PROJECT_BINARY_DIR}/src/ast/codegen_atomic_statement.hpp ${PROJECT_BINARY_DIR}/src/ast/codegen_for_statement.hpp ${PROJECT_BINARY_DIR}/src/ast/codegen_function.hpp + ${PROJECT_BINARY_DIR}/src/ast/codegen_instance_var.hpp ${PROJECT_BINARY_DIR}/src/ast/codegen_return_statement.hpp ${PROJECT_BINARY_DIR}/src/ast/codegen_struct.hpp ${PROJECT_BINARY_DIR}/src/ast/codegen_var.hpp ${PROJECT_BINARY_DIR}/src/ast/codegen_var_list_statement.hpp ${PROJECT_BINARY_DIR}/src/ast/codegen_var_type.hpp + ${PROJECT_BINARY_DIR}/src/ast/codegen_var_with_type.hpp ${PROJECT_BINARY_DIR}/src/ast/compartment.hpp ${PROJECT_BINARY_DIR}/src/ast/conductance_hint.hpp ${PROJECT_BINARY_DIR}/src/ast/conserve.hpp diff --git a/src/language/codegen.yaml b/src/language/codegen.yaml index aeb596b490..3dc802c982 100644 --- a/src/language/codegen.yaml +++ b/src/language/codegen.yaml @@ -49,17 +49,30 @@ brief: "Name of the variable" type: Identifier node_name: true - - CodegenArgument: - brief: "Represent argument to a function" + - CodegenVarWithType: + brief: "Represent variable used for code generation" members: - type: - brief: "Type of the argument" + brief: "Type of the variable" type: CodegenVarType suffix: {value: " "} + - is_pointer: + brief: "If variable is pointer type" + type: int - name: - brief: "Name of the argument" + brief: "Name of the variable" type: Identifier node_name: true + - CodegenInstanceVar: + brief: "Represent instance variable" + members: + - instance_var: + brief: "Instance variable" + type: Name + suffix: {value: "->"} + - member_var: + brief: "Member variable within instance" + type: Identifier - Block: children: - NrnStateBlock: @@ -134,7 +147,7 @@ node_name: true - arguments: brief: "Vector of the parameters to the function" - type: CodegenArgument + type: CodegenVarWithType vector: true prefix: {value: "(", force: true} suffix: {value: ")", force: true} @@ -148,7 +161,7 @@ members: - codegen_vars: brief: "Vector of CodegenVars" - type: CodegenVar + type: CodegenVarWithType vector: true add: true separator: "\\n " diff --git a/src/language/node_info.py b/src/language/node_info.py index 8b4e5fe0a2..57833af229 100644 --- a/src/language/node_info.py +++ b/src/language/node_info.py @@ -170,6 +170,7 @@ UNIT_BLOCK = "UnitBlock" AST_NODETYPE_NODE= "AstNodeType" CODEGEN_VAR_TYPE_NODE = "CodegenVarType" +CODEGEN_VAR_WITH_TYPE_NODE = "CodegenVarWithType" # name of variable in prime node which represent order of derivative ORDER_VAR_NAME = "order" diff --git 
a/src/language/nodes.py b/src/language/nodes.py index 4f96659569..cf7aa1f30b 100644 --- a/src/language/nodes.py +++ b/src/language/nodes.py @@ -155,6 +155,10 @@ def is_ast_nodetype_node(self): def is_codegen_var_type_node(self): return self.class_name == node_info.CODEGEN_VAR_TYPE_NODE + @property + def is_codegen_var_with_type_node(self): + return self.class_name == node_info.CODEGEN_VAR_WITH_TYPE_NODE + @property def is_enum_node(self): data_type = node_info.DATA_TYPES[self.class_name] diff --git a/src/language/templates/visitors/nmodl_visitor.cpp b/src/language/templates/visitors/nmodl_visitor.cpp index f7bb8279ca..01b470e70d 100644 --- a/src/language/templates/visitors/nmodl_visitor.cpp +++ b/src/language/templates/visitors/nmodl_visitor.cpp @@ -115,7 +115,12 @@ void NmodlPrintVisitor::visit_{{ node.class_name|snake_case}}(const {{ node.clas {% endif %} {% for child in node.children %} {% call guard(child.force_prefix, child.force_suffix) -%} - {% if child.is_base_type_node %} + + {% if node.is_codegen_var_with_type_node and child.varname == "is_pointer" %} + if(node.get_{{ child.varname }}()) { + printer->add_element("*"); + } + {% elif child.is_base_type_node %} {% if child.is_ast_nodetype_node %} printer->add_element(ast::to_string(node.get_{{child.varname}}())); {% endif %} diff --git a/src/main.cpp b/src/main.cpp index 035189f4cc..f9e083f930 100644 --- a/src/main.cpp +++ b/src/main.cpp @@ -172,6 +172,9 @@ int main(int argc, const char* argv[]) { /// run llvm optimisation passes bool llvm_opt_passes(false); + + /// llvm vector width; + int llvm_vec_width = 1; #endif app.get_formatter()->column_width(40); @@ -288,6 +291,9 @@ int main(int argc, const char* argv[]) { llvm_opt->add_flag("--single-precision", llvm_float_type, "Use single precision floating-point types ({})"_format(llvm_float_type))->ignore_case(); + llvm_opt->add_option("--vector-width", + llvm_vec_width, + "LLVM explicit vectorisation width ({})"_format(llvm_vec_width))->ignore_case(); #endif // clang-format on @@ -317,15 +323,24 @@ int main(int argc, const char* argv[]) { } }; + /// write ast to nmodl + const auto ast_to_json = [json_ast](ast::Program& ast, const std::string& filepath) { + if (json_ast) { + JSONVisitor(filepath).write(ast); + logger->info("AST to JSON transformation written to {}", filepath); + } + }; + for (const auto& file: mod_files) { logger->info("Processing {}", file); const auto modfile = utils::remove_extension(utils::base_name(file)); /// create file path for nmodl file - auto filepath = [scratch_dir, modfile](const std::string& suffix) { + auto filepath = [scratch_dir, modfile](const std::string& suffix, const std::string& ext) { static int count = 0; - return "{}/{}.{}.{}.mod"_format(scratch_dir, modfile, std::to_string(count++), suffix); + return "{}/{}.{}.{}.{}"_format( + scratch_dir, modfile, std::to_string(count++), suffix, ext); }; /// driver object creates lexer and parser, just call parser method @@ -351,7 +366,7 @@ int main(int argc, const char* argv[]) { { logger->info("Running CVode to cnexp visitor"); AfterCVodeToCnexpVisitor().visit_program(*ast); - ast_to_nmodl(*ast, filepath("after_cvode_to_cnexp")); + ast_to_nmodl(*ast, filepath("after_cvode_to_cnexp", "mod")); } /// Rename variables that match ISPC compiler double constants @@ -359,7 +374,7 @@ int main(int argc, const char* argv[]) { logger->info("Running ISPC variables rename visitor"); IspcRenameVisitor(ast).visit_program(*ast); SymtabVisitor(update_symtab).visit_program(*ast); - ast_to_nmodl(*ast, 
filepath("ispc_double_rename")); + ast_to_nmodl(*ast, filepath("ispc_double_rename", "mod")); } /// GLOBAL to RANGE rename visitor @@ -372,7 +387,7 @@ int main(int argc, const char* argv[]) { logger->info("Running GlobalToRange visitor"); GlobalToRangeVisitor(ast).visit_program(*ast); SymtabVisitor(update_symtab).visit_program(*ast); - ast_to_nmodl(*ast, filepath("global_to_range")); + ast_to_nmodl(*ast, filepath("global_to_range", "mod")); } /// LOCAL to ASSIGNED visitor @@ -381,7 +396,7 @@ int main(int argc, const char* argv[]) { PerfVisitor().visit_program(*ast); LocalToAssignedVisitor().visit_program(*ast); SymtabVisitor(update_symtab).visit_program(*ast); - ast_to_nmodl(*ast, filepath("local_to_assigned")); + ast_to_nmodl(*ast, filepath("local_to_assigned", "mod")); } { @@ -407,31 +422,26 @@ int main(int argc, const char* argv[]) { symtab->print(std::cout); } - ast_to_nmodl(*ast, filepath("ast")); - - if (json_ast) { - auto file = scratch_dir + "/" + modfile + ".ast.json"; - logger->info("Writing AST into {}", file); - JSONVisitor(file).write(*ast); - } + ast_to_nmodl(*ast, filepath("ast", "mod")); + ast_to_json(*ast, filepath("ast", "json")); if (verbatim_rename) { logger->info("Running verbatim rename visitor"); VerbatimVarRenameVisitor().visit_program(*ast); - ast_to_nmodl(*ast, filepath("verbatim_rename")); + ast_to_nmodl(*ast, filepath("verbatim_rename", "mod")); } if (nmodl_const_folding) { logger->info("Running nmodl constant folding visitor"); ConstantFolderVisitor().visit_program(*ast); - ast_to_nmodl(*ast, filepath("constfold")); + ast_to_nmodl(*ast, filepath("constfold", "mod")); } if (nmodl_unroll) { logger->info("Running nmodl loop unroll visitor"); LoopUnrollVisitor().visit_program(*ast); ConstantFolderVisitor().visit_program(*ast); - ast_to_nmodl(*ast, filepath("unroll")); + ast_to_nmodl(*ast, filepath("unroll", "mod")); SymtabVisitor(update_symtab).visit_program(*ast); } @@ -443,7 +453,7 @@ int main(int argc, const char* argv[]) { auto kineticBlockVisitor = KineticBlockVisitor(); kineticBlockVisitor.visit_program(*ast); SymtabVisitor(update_symtab).visit_program(*ast); - const auto filename = filepath("kinetic"); + const auto filename = filepath("kinetic", "mod"); ast_to_nmodl(*ast, filename); if (nmodl_ast && kineticBlockVisitor.get_conserve_statement_count()) { logger->warn( @@ -456,7 +466,7 @@ int main(int argc, const char* argv[]) { logger->info("Running STEADYSTATE visitor"); SteadystateVisitor().visit_program(*ast); SymtabVisitor(update_symtab).visit_program(*ast); - ast_to_nmodl(*ast, filepath("steadystate")); + ast_to_nmodl(*ast, filepath("steadystate", "mod")); } /// Parsing units fron "nrnunits.lib" and mod files @@ -473,14 +483,14 @@ int main(int argc, const char* argv[]) { if (nmodl_inline) { logger->info("Running nmodl inline visitor"); InlineVisitor().visit_program(*ast); - ast_to_nmodl(*ast, filepath("inline")); + ast_to_nmodl(*ast, filepath("inline", "mod")); } if (local_rename) { logger->info("Running local variable rename visitor"); LocalVarRenameVisitor().visit_program(*ast); SymtabVisitor(update_symtab).visit_program(*ast); - ast_to_nmodl(*ast, filepath("local_rename")); + ast_to_nmodl(*ast, filepath("local_rename", "mod")); } if (nmodl_localize) { @@ -489,14 +499,14 @@ int main(int argc, const char* argv[]) { LocalizeVisitor(localize_verbatim).visit_program(*ast); LocalVarRenameVisitor().visit_program(*ast); SymtabVisitor(update_symtab).visit_program(*ast); - ast_to_nmodl(*ast, filepath("localize")); + ast_to_nmodl(*ast, filepath("localize", 
"mod")); } if (sympy_conductance) { logger->info("Running sympy conductance visitor"); SympyConductanceVisitor().visit_program(*ast); SymtabVisitor(update_symtab).visit_program(*ast); - ast_to_nmodl(*ast, filepath("sympy_conductance")); + ast_to_nmodl(*ast, filepath("sympy_conductance", "mod")); } if (sympy_analytic || sparse_solver_exists(*ast)) { @@ -507,19 +517,19 @@ int main(int argc, const char* argv[]) { logger->info("Running sympy solve visitor"); SympySolverVisitor(sympy_pade, sympy_cse).visit_program(*ast); SymtabVisitor(update_symtab).visit_program(*ast); - ast_to_nmodl(*ast, filepath("sympy_solve")); + ast_to_nmodl(*ast, filepath("sympy_solve", "mod")); } { logger->info("Running cnexp visitor"); NeuronSolveVisitor().visit_program(*ast); - ast_to_nmodl(*ast, filepath("cnexp")); + ast_to_nmodl(*ast, filepath("cnexp", "mod")); } { SolveBlockVisitor().visit_program(*ast); SymtabVisitor(update_symtab).visit_program(*ast); - ast_to_nmodl(*ast, filepath("solveblock")); + ast_to_nmodl(*ast, filepath("solveblock", "mod")); } if (json_perfstat) { @@ -583,9 +593,11 @@ int main(int argc, const char* argv[]) { #ifdef NMODL_LLVM_BACKEND if (llvm_ir) { logger->info("Running LLVM backend code generator"); - CodegenLLVMVisitor visitor(modfile, output_dir, llvm_opt_passes, llvm_float_type); + CodegenLLVMVisitor visitor( + modfile, output_dir, llvm_opt_passes, llvm_vec_width, llvm_float_type); visitor.visit_program(*ast); - ast_to_nmodl(*ast, filepath("llvm")); + ast_to_nmodl(*ast, filepath("llvm", "mod")); + ast_to_json(*ast, filepath("llvm", "json")); } #endif } From 4b3e2fc423cedbbe9f6c84d77d4fc70d5072f08c Mon Sep 17 00:00:00 2001 From: George Mitenkov Date: Sun, 7 Mar 2021 01:10:30 +0300 Subject: [PATCH 027/331] Addressing TODOs for Instance struct (#533) Part II - remove undefined visit_codegen_instance_var - Improved member creation for instance struct - Instance struct type generation for kernel arguments - Proper integration of instance struct - Added scalar code generation for the kernel - Removed instance test since it is not created explicitly anymore - Fixed ordering for precision and width in LLVM Visitor - Added vector induction variable - Vectorised code for compute with direct loads fully functional - Instance naming fixed - (LLVM IR) Fixed compute vector code generation types - refactoring : improve coversion of double to int for the loop expressions --- .../llvm/codegen_llvm_helper_visitor.cpp | 83 ++-- .../llvm/codegen_llvm_helper_visitor.hpp | 7 + src/codegen/llvm/codegen_llvm_visitor.cpp | 387 ++++++++++++++---- src/codegen/llvm/codegen_llvm_visitor.hpp | 48 ++- src/main.cpp | 2 +- test/unit/codegen/codegen_llvm_ir.cpp | 36 -- 6 files changed, 419 insertions(+), 144 deletions(-) diff --git a/src/codegen/llvm/codegen_llvm_helper_visitor.cpp b/src/codegen/llvm/codegen_llvm_helper_visitor.cpp index b3f75b9372..c34ae2c873 100644 --- a/src/codegen/llvm/codegen_llvm_helper_visitor.cpp +++ b/src/codegen/llvm/codegen_llvm_helper_visitor.cpp @@ -146,7 +146,7 @@ void CodegenLLVMHelperVisitor::create_function_for_node(ast::Block& node) { /// create new type and name for creating new ast node auto type = new ast::CodegenVarType(FLOAT_TYPE); auto var = param->get_name()->clone(); - arguments.emplace_back(new ast::CodegenVarWithType(type, 0, var)); + arguments.emplace_back(new ast::CodegenVarWithType(type, /*is_pointer=*/0, var)); } /// return type of the function is same as return variable type @@ -170,31 +170,31 @@ std::shared_ptr CodegenLLVMHelperVisitor::create_instance_s }; /// float 
variables are standard pointers to float vectors
-    for (auto& float_var: info.codegen_float_variables) {
-        add_var_with_type(float_var->get_name(), FLOAT_TYPE, 1);
+    for (const auto& float_var: info.codegen_float_variables) {
+        add_var_with_type(float_var->get_name(), FLOAT_TYPE, /*is_pointer=*/1);
     }
 
     /// int variables are pointers to indexes for other vectors
-    for (auto& int_var: info.codegen_int_variables) {
-        add_var_with_type(int_var.symbol->get_name(), FLOAT_TYPE, 1);
+    for (const auto& int_var: info.codegen_int_variables) {
+        add_var_with_type(int_var.symbol->get_name(), FLOAT_TYPE, /*is_pointer=*/1);
     }
 
     // for integer variables, there should be index
-    for (auto& int_var: info.codegen_int_variables) {
+    for (const auto& int_var: info.codegen_int_variables) {
         std::string var_name = int_var.symbol->get_name() + "_index";
-        add_var_with_type(var_name, INTEGER_TYPE, 1);
+        add_var_with_type(var_name, INTEGER_TYPE, /*is_pointer=*/1);
     }
 
     // add voltage and node index
-    add_var_with_type("voltage", FLOAT_TYPE, 1);
-    add_var_with_type("node_index", INTEGER_TYPE, 1);
+    add_var_with_type("voltage", FLOAT_TYPE, /*is_pointer=*/1);
+    add_var_with_type("node_index", INTEGER_TYPE, /*is_pointer=*/1);
 
     // add dt, t, celsius
-    add_var_with_type(naming::NTHREAD_T_VARIABLE, FLOAT_TYPE, 0);
-    add_var_with_type(naming::NTHREAD_DT_VARIABLE, FLOAT_TYPE, 0);
-    add_var_with_type(naming::CELSIUS_VARIABLE, FLOAT_TYPE, 0);
-    add_var_with_type(naming::SECOND_ORDER_VARIABLE, INTEGER_TYPE, 0);
-    add_var_with_type(MECH_NODECOUNT_VAR, INTEGER_TYPE, 0);
+    add_var_with_type(naming::NTHREAD_T_VARIABLE, FLOAT_TYPE, /*is_pointer=*/0);
+    add_var_with_type(naming::NTHREAD_DT_VARIABLE, FLOAT_TYPE, /*is_pointer=*/0);
+    add_var_with_type(naming::CELSIUS_VARIABLE, FLOAT_TYPE, /*is_pointer=*/0);
+    add_var_with_type(naming::SECOND_ORDER_VARIABLE, INTEGER_TYPE, /*is_pointer=*/0);
+    add_var_with_type(MECH_NODECOUNT_VAR, INTEGER_TYPE, /*is_pointer=*/0);
 
     return std::make_shared(codegen_vars);
 }
@@ -384,7 +384,7 @@ void CodegenLLVMHelperVisitor::convert_to_instance_variable(ast::Node& node,
                                                             std::string& index_var) {
     /// collect all variables in the node of type ast::VarName
     auto variables = collect_nodes(node, {ast::AstNodeType::VAR_NAME});
-    for (auto& v: variables) {
+    for (const auto& v: variables) {
         auto variable = std::dynamic_pointer_cast(v);
         auto variable_name = variable->get_node_name();
 
@@ -450,6 +450,44 @@ void CodegenLLVMHelperVisitor::visit_function_block(ast::FunctionBlock& node) {
     create_function_for_node(node);
 }
 
+/// Create ast::VarName node with a given variable name
+static ast::VarName* create_varname(const std::string& varname) {
+    return new ast::VarName(new ast::Name(new ast::String(varname)), nullptr, nullptr);
+}
+
+/**
+ * Create for loop initialization expression
+ * @param induction_var name of the induction variable (generates "id = 0")
+ * @return Expression representing the initialization
+ * \todo : we cannot use `create_statement_as_expression` function because
+ *         NMODL parser is using `ast::Double` type to represent all variables
+ *         including Integer. See #542.
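+ *         For example, parsing "id = 0" would type the literal 0 as an
+ *         ast::Double, while the loop induction variable must stay integral,
+ *         so the expression is assembled by hand from ast::Integer instead.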
+ */ +static std::shared_ptr loop_initialization_expression( + const std::string& induction_var) { + // create id = 0 + const auto& id = create_varname(induction_var); + const auto& zero = new ast::Integer(0, nullptr); + return std::make_shared(id, ast::BinaryOperator(ast::BOP_ASSIGN), zero); +} + +/** + * Create loop increment expression `id = id + width` + * \todo : same as loop_initialization_expression() + */ +static std::shared_ptr loop_increment_expression(const std::string& induction_var, + int vector_width) { + // first create id + x + const auto& id = create_varname(induction_var); + const auto& inc = new ast::Integer(vector_width, nullptr); + const auto& inc_expr = + new ast::BinaryExpression(id, ast::BinaryOperator(ast::BOP_ADDITION), inc); + // now create id = id + x + return std::make_shared(id->clone(), + ast::BinaryOperator(ast::BOP_ASSIGN), + inc_expr); +} + /** * \brief Convert ast::NrnStateBlock to corresponding code generation function nrn_state * @param node AST node representing ast::NrnStateBlock @@ -471,9 +509,9 @@ void CodegenLLVMHelperVisitor::visit_nrn_state_block(ast::NrnStateBlock& node) { /// create now main compute part : for loop over channel instances /// loop constructs : initialization, condition and increment - const auto& initialization = create_statement_as_expression("id = 0"); - const auto& condition = create_expression("id < node_count"); - const auto& increment = create_statement_as_expression("id = id + {}"_format(vector_width)); + const auto& initialization = loop_initialization_expression(INDUCTION_VAR); + const auto& condition = create_expression("{} < {}"_format(INDUCTION_VAR, MECH_NODECOUNT_VAR)); + const auto& increment = loop_increment_expression(INDUCTION_VAR, vector_width); /// loop body : initialization + solve blocks ast::StatementVector loop_def_statements; @@ -484,7 +522,8 @@ void CodegenLLVMHelperVisitor::visit_nrn_state_block(ast::NrnStateBlock& node) { std::vector double_variables{"v"}; /// access node index and corresponding voltage - loop_index_statements.push_back(visitor::create_statement("node_id = node_index[id]")); + loop_index_statements.push_back( + visitor::create_statement("node_id = node_index[{}]"_format(INDUCTION_VAR))); loop_body_statements.push_back(visitor::create_statement("v = voltage[node_id]")); /// read ion variables @@ -558,7 +597,7 @@ void CodegenLLVMHelperVisitor::visit_nrn_state_block(ast::NrnStateBlock& node) { ast::CodegenVarWithTypeVector code_arguments; auto instance_var_type = new ast::CodegenVarType(ast::AstNodeType::INSTANCE_STRUCT); - auto instance_var_name = new ast::Name(new ast::String("mech")); + auto instance_var_name = new ast::Name(new ast::String(MECH_INSTANCE_VAR)); auto instance_var = new ast::CodegenVarWithType(instance_var_type, 1, instance_var_name); code_arguments.emplace_back(instance_var); @@ -567,7 +606,7 @@ void CodegenLLVMHelperVisitor::visit_nrn_state_block(ast::NrnStateBlock& node) { std::make_shared(return_type, name, code_arguments, function_block); codegen_functions.push_back(function); - std::cout << nmodl::to_nmodl(function); + std::cout << nmodl::to_nmodl(function) << std::endl; } void CodegenLLVMHelperVisitor::visit_program(ast::Program& node) { @@ -583,8 +622,6 @@ void CodegenLLVMHelperVisitor::visit_program(ast::Program& node) { for (auto& fun: codegen_functions) { node.emplace_back_node(fun); } - - std::cout << nmodl::to_nmodl(node); } diff --git a/src/codegen/llvm/codegen_llvm_helper_visitor.hpp b/src/codegen/llvm/codegen_llvm_helper_visitor.hpp index 
981372b4d5..b67aa7ee09 100644 --- a/src/codegen/llvm/codegen_llvm_helper_visitor.hpp +++ b/src/codegen/llvm/codegen_llvm_helper_visitor.hpp @@ -120,6 +120,9 @@ class CodegenLLVMHelperVisitor: public visitor::AstVisitor { const std::string MECH_INSTANCE_VAR = "mech"; const std::string MECH_NODECOUNT_VAR = "node_count"; + /// name of induction variable used in the kernel. + const std::string INDUCTION_VAR = "id"; + /// create new function for FUNCTION or PROCEDURE block void create_function_for_node(ast::Block& node); @@ -134,6 +137,10 @@ class CodegenLLVMHelperVisitor: public visitor::AstVisitor { return instance_var_helper; } + std::string get_kernel_id() { + return INDUCTION_VAR; + } + /// run visitor and return code generation functions CodegenFunctionVector get_codegen_functions(const ast::Program& node); diff --git a/src/codegen/llvm/codegen_llvm_visitor.cpp b/src/codegen/llvm/codegen_llvm_visitor.cpp index 80bdfd20e3..62e69449b7 100644 --- a/src/codegen/llvm/codegen_llvm_visitor.cpp +++ b/src/codegen/llvm/codegen_llvm_visitor.cpp @@ -21,14 +21,22 @@ namespace nmodl { namespace codegen { +static constexpr const char instance_struct_type_name[] = "__instance_var__type"; + +// The prefix is used to create a vectorised id that can be used as index to GEPs. However, for +// simple aligned vector loads and stores vector id is not needed. This is because we can bitcast +// the pointer to the vector pointer! \todo: Consider removing this. +static constexpr const char kernel_id_prefix[] = "__vec_"; + + /****************************************************************************************/ /* Helper routines */ /****************************************************************************************/ static bool is_supported_statement(const ast::Statement& statement) { return statement.is_codegen_var_list_statement() || statement.is_expression_statement() || - statement.is_codegen_return_statement() || statement.is_if_statement() || - statement.is_while_statement(); + statement.is_codegen_for_statement() || statement.is_codegen_return_statement() || + statement.is_if_statement() || statement.is_while_statement(); } bool CodegenLLVMVisitor::check_array_bounds(const ast::IndexedName& node, unsigned index) { @@ -56,10 +64,82 @@ llvm::Value* CodegenLLVMVisitor::codegen_indexed_name(const ast::IndexedName& no return create_gep(node.get_node_name(), index); } +llvm::Value* CodegenLLVMVisitor::codegen_instance_var(const ast::CodegenInstanceVar& node) { + const auto& member_node = node.get_member_var(); + const auto& instance_name = node.get_instance_var()->get_node_name(); + const auto& member_name = member_node->get_node_name(); + + if (!instance_var_helper.is_an_instance_variable(member_name)) + throw std::runtime_error("Error: " + member_name + " is not a member of the instance!"); + + // Load the instance struct given its name from the ValueSymbolTable. + llvm::Value* instance_ptr = builder.CreateLoad(lookup(instance_name)); + + // Create a GEP instruction to get a pointer to the member. + int member_index = instance_var_helper.get_variable_index(member_name); + llvm::Type* index_type = llvm::Type::getInt32Ty(*context); + + std::vector indices; + indices.push_back(llvm::ConstantInt::get(index_type, 0)); + indices.push_back(llvm::ConstantInt::get(index_type, member_index)); + llvm::Value* member_ptr = builder.CreateInBoundsGEP(instance_ptr, indices); + + // Get the member AST node from the instance AST node, for which we proceed with the code + // generation. 
If the member is scalar, return the pointer to it straight away. + auto codegen_var_with_type = instance_var_helper.get_variable(member_name); + if (!codegen_var_with_type->get_is_pointer()) { + return member_ptr; + } + + // Otherwise, the codegen variable is a pointer, and the member AST node must be an IndexedName. + auto member_var_name = std::dynamic_pointer_cast(member_node); + if (!member_var_name->get_name()->is_indexed_name()) + throw std::runtime_error("Error: " + member_name + " is not an IndexedName!"); + + // Proceed to creating a GEP instruction to get the pointer to the member's element. While LLVM + // Helper set the indices to be Name nodes, a sanity check is added here. Note that this step + // can be avoided if using `get_array_index_or_length()`. However, it does not support indexing + // with Name/Expression at the moment. \todo: Reuse `get_array_index_or_length()` here. + auto member_indexed_name = std::dynamic_pointer_cast( + member_var_name->get_name()); + if (!member_indexed_name->get_length()->is_name()) + throw std::runtime_error("Error: " + member_name + " has a non-Name index!"); + + // Load the index variable that will be used to access the member's element. Since we index a + // pointer variable, we need to extend the 32-bit integer index variable to 64-bit. + llvm::Value* i32_index = builder.CreateLoad( + lookup(member_indexed_name->get_length()->get_node_name())); + llvm::Value* i64_index = builder.CreateSExt(i32_index, llvm::Type::getInt64Ty(*context)); + + // Create a indices vector for GEP to return the pointer to the element at the specified index. + std::vector member_indices; + member_indices.push_back(i64_index); + + // The codegen variable type is always a scalar, so we need to transform it to a pointer. Then + // load the member which would be indexed later. + llvm::Type* type = get_codegen_var_type(*codegen_var_with_type->get_type()); + llvm::Value* instance_member = + builder.CreateLoad(llvm::PointerType::get(type, /*AddressSpace=*/0), member_ptr); + + + // If the code is vectorised, then bitcast to a vector pointer. + if (is_kernel_code && vector_width > 1) { + llvm::Type* vector_type = + llvm::PointerType::get(llvm::FixedVectorType::get(type, vector_width), + /*AddressSpace=*/0); + llvm::Value* instance_member_bitcasted = builder.CreateBitCast(instance_member, + vector_type); + return builder.CreateInBoundsGEP(instance_member_bitcasted, member_indices); + } + + return builder.CreateInBoundsGEP(instance_member, member_indices); +} + unsigned CodegenLLVMVisitor::get_array_index_or_length(const ast::IndexedName& indexed_name) { + // \todo: Support indices with expressions and names: k[i + j] = ... auto integer = std::dynamic_pointer_cast(indexed_name.get_length()); if (!integer) - throw std::runtime_error("Error: expecting integer index or length"); + throw std::runtime_error("Error: only integer indices/length are supported!"); // Check if integer value is taken from a macro. 
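     // (For instance, an index introduced through a DEFINE macro is resolved via
     // the symbol table rather than read directly as a literal.)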
if (!integer->get_macro()) @@ -74,6 +154,8 @@ llvm::Type* CodegenLLVMVisitor::get_codegen_var_type(const ast::CodegenVarType& return llvm::Type::getInt1Ty(*context); case ast::AstNodeType::DOUBLE: return get_default_fp_type(); + case ast::AstNodeType::INSTANCE_STRUCT: + return get_instance_struct_type(); case ast::AstNodeType::INTEGER: return llvm::Type::getInt32Ty(*context); case ast::AstNodeType::VOID: @@ -85,6 +167,26 @@ llvm::Type* CodegenLLVMVisitor::get_codegen_var_type(const ast::CodegenVarType& } } +llvm::Value* CodegenLLVMVisitor::get_constant_int_vector(int value) { + llvm::Type* i32_type = llvm::Type::getInt32Ty(*context); + std::vector constants; + for (unsigned i = 0; i < vector_width; ++i) { + const auto& element = llvm::ConstantInt::get(i32_type, value); + constants.push_back(element); + } + return llvm::ConstantVector::get(constants); +} + +llvm::Value* CodegenLLVMVisitor::get_constant_fp_vector(const std::string& value) { + llvm::Type* fp_type = get_default_fp_type(); + std::vector constants; + for (unsigned i = 0; i < vector_width; ++i) { + const auto& element = llvm::ConstantFP::get(fp_type, value); + constants.push_back(element); + } + return llvm::ConstantVector::get(constants); +} + llvm::Type* CodegenLLVMVisitor::get_default_fp_type() { if (use_single_precision) return llvm::Type::getFloatTy(*context); @@ -97,6 +199,59 @@ llvm::Type* CodegenLLVMVisitor::get_default_fp_ptr_type() { return llvm::Type::getDoublePtrTy(*context); } +llvm::Type* CodegenLLVMVisitor::get_instance_struct_type() { + std::vector members; + for (const auto& variable: instance_var_helper.instance->get_codegen_vars()) { + auto is_pointer = variable->get_is_pointer(); + auto nmodl_type = variable->get_type()->get_type(); + + llvm::Type* i32_type = llvm::Type::getInt32Ty(*context); + llvm::Type* i32ptr_type = llvm::Type::getInt32PtrTy(*context); + + switch (nmodl_type) { +#define DISPATCH(type, llvm_ptr_type, llvm_type) \ + case type: \ + members.push_back(is_pointer ? 
(llvm_ptr_type) : (llvm_type)); \ + break; + + DISPATCH(ast::AstNodeType::DOUBLE, get_default_fp_ptr_type(), get_default_fp_type()); + DISPATCH(ast::AstNodeType::INTEGER, i32ptr_type, i32_type); + +#undef DISPATCH + default: + throw std::runtime_error("Error: unsupported type found in instance struct"); + } + } + + llvm::StructType* llvm_struct_type = + llvm::StructType::create(*context, mod_filename + instance_struct_type_name); + llvm_struct_type->setBody(members); + return llvm::PointerType::get(llvm_struct_type, /*AddressSpace=*/0); +} + +llvm::Value* CodegenLLVMVisitor::get_variable_ptr(const ast::VarName& node) { + const auto& identifier = node.get_name(); + if (!identifier->is_name() && !identifier->is_indexed_name() && + !identifier->is_codegen_instance_var()) { + throw std::runtime_error("Error: Unsupported variable type - " + node.get_node_name()); + } + + llvm::Value* ptr; + if (identifier->is_name()) + ptr = lookup(node.get_node_name()); + + if (identifier->is_indexed_name()) { + auto indexed_name = std::dynamic_pointer_cast(identifier); + ptr = codegen_indexed_name(*indexed_name); + } + + if (identifier->is_codegen_instance_var()) { + auto instance_var = std::dynamic_pointer_cast(identifier); + ptr = codegen_instance_var(*instance_var); + } + return ptr; +} + void CodegenLLVMVisitor::run_llvm_opt_passes() { /// run some common optimisation passes that are commonly suggested fpm.add(llvm::createInstructionCombiningPass()); @@ -134,7 +289,7 @@ void CodegenLLVMVisitor::create_external_method_call(const std::string& name, } #define DISPATCH(method_name, intrinsic) \ - if (name == method_name) { \ + if (name == (method_name)) { \ llvm::Value* result = builder.CreateIntrinsic(intrinsic, argument_types, argument_values); \ values.push_back(result); \ return; \ @@ -234,12 +389,12 @@ llvm::Value* CodegenLLVMVisitor::visit_arithmetic_bin_op(llvm::Value* lhs, llvm::Value* result; switch (bin_op) { -#define DISPATCH(binary_op, llvm_fp_op, llvm_int_op) \ - case binary_op: \ - if (lhs_type->isDoubleTy() || lhs_type->isFloatTy()) \ - result = llvm_fp_op(lhs, rhs); \ - else \ - result = llvm_int_op(lhs, rhs); \ +#define DISPATCH(binary_op, llvm_fp_op, llvm_int_op) \ + case binary_op: \ + if (lhs_type->isIntOrIntVectorTy()) \ + result = llvm_int_op(lhs, rhs); \ + else \ + result = llvm_fp_op(lhs, rhs); \ return result; DISPATCH(ast::BinaryOp::BOP_ADDITION, builder.CreateFAdd, builder.CreateAdd); @@ -256,20 +411,11 @@ llvm::Value* CodegenLLVMVisitor::visit_arithmetic_bin_op(llvm::Value* lhs, void CodegenLLVMVisitor::visit_assign_op(const ast::BinaryExpression& node, llvm::Value* rhs) { auto var = dynamic_cast(node.get_lhs().get()); - if (!var) { - throw std::runtime_error("Error: only VarName assignment is currently supported.\n"); - } + if (!var) + throw std::runtime_error("Error: only VarName assignment is supported!"); - const auto& identifier = var->get_name(); - if (identifier->is_name()) { - llvm::Value* alloca = lookup(var->get_node_name()); - builder.CreateStore(rhs, alloca); - } else if (identifier->is_indexed_name()) { - auto indexed_name = std::dynamic_pointer_cast(identifier); - builder.CreateStore(rhs, codegen_indexed_name(*indexed_name)); - } else { - throw std::runtime_error("Error: Unsupported variable type"); - } + llvm::Value* ptr = get_variable_ptr(*var); + builder.CreateStore(rhs, ptr); } llvm::Value* CodegenLLVMVisitor::visit_logical_bin_op(llvm::Value* lhs, @@ -373,6 +519,117 @@ void CodegenLLVMVisitor::visit_boolean(const ast::Boolean& node) { values.push_back(constant); 
} +// Generating FOR loop in LLVM IR creates the following structure: +// +// +---------------------------+ +// | | +// | | +// | br %cond | +// +---------------------------+ +// | +// V +// +-----------------------------+ +// | | +// | %cond = ... |<------+ +// | cond_br %cond, %body, %exit | | +// +-----------------------------+ | +// | | | +// | V | +// | +------------------------+ | +// | | | | +// | | br %inc | | +// | +------------------------+ | +// | | | +// | V | +// | +------------------------+ | +// | | | | +// | | br %cond | | +// | +------------------------+ | +// | | | +// | +---------------+ +// V +// +---------------------------+ +// | | +// +---------------------------+ +void CodegenLLVMVisitor::visit_codegen_for_statement(const ast::CodegenForStatement& node) { + // Get the current and the next blocks within the function. + llvm::BasicBlock* curr_block = builder.GetInsertBlock(); + llvm::BasicBlock* next = curr_block->getNextNode(); + llvm::Function* func = curr_block->getParent(); + + // Create the basic blocks for FOR loop. + llvm::BasicBlock* for_cond = + llvm::BasicBlock::Create(*context, /*Name=*/"for.cond", func, next); + llvm::BasicBlock* for_body = + llvm::BasicBlock::Create(*context, /*Name=*/"for.body", func, next); + llvm::BasicBlock* for_inc = llvm::BasicBlock::Create(*context, /*Name=*/"for.inc", func, next); + llvm::BasicBlock* exit = llvm::BasicBlock::Create(*context, /*Name=*/"for.exit", func, next); + + // First, initialise the loop in the same basic block. + node.get_initialization()->accept(*this); + + // If the loop is to be vectorised, create a separate vector induction variable. + // \todo: See the comment for `kernel_id_prefix`. + if (vector_width > 1) { + // First, create a vector type and alloca for it. + llvm::Type* i32_type = llvm::Type::getInt32Ty(*context); + llvm::Type* vec_type = llvm::FixedVectorType::get(i32_type, vector_width); + llvm::Value* vec_alloca = builder.CreateAlloca(vec_type, + /*ArraySize=*/nullptr, + /*Name=*/kernel_id_prefix + kernel_id); + + // Then, store the initial value of <0, 1, ..., [W-1]> o the alloca pointer, where W is the + // vector width. + std::vector constants; + for (unsigned i = 0; i < vector_width; ++i) { + const auto& element = llvm::ConstantInt::get(i32_type, i); + constants.push_back(element); + } + llvm::Value* vector_id = llvm::ConstantVector::get(constants); + builder.CreateStore(vector_id, vec_alloca); + } + // Branch to condition basic block and insert condition code there. + builder.CreateBr(for_cond); + builder.SetInsertPoint(for_cond); + node.get_condition()->accept(*this); + + // Extract the condition to decide whether to branch to the loop body or loop exit. + llvm::Value* cond = values.back(); + values.pop_back(); + builder.CreateCondBr(cond, for_body, exit); + + // Generate code for the loop body and create the basic block for the increment. + builder.SetInsertPoint(for_body); + is_kernel_code = true; + const auto& statement_block = node.get_statement_block(); + statement_block->accept(*this); + is_kernel_code = false; + builder.CreateBr(for_inc); + + // Process increment. + builder.SetInsertPoint(for_inc); + node.get_increment()->accept(*this); + + // If the code is vectorised, then increment the vector id by where W is the + // vector width. + // \todo: See the comment for `kernel_id_prefix`. + if (vector_width > 1) { + // First, create an increment vector. + llvm::Value* vector_inc = get_constant_int_vector(vector_width); + + // Increment the kernel id elements by a constant vector width. 
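+        // e.g. with W = 4 the lane values go <0,1,2,3> -> <4,5,6,7> -> ..., so
+        // lane i always holds the scalar induction value id + i.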
+ llvm::Value* vector_id_ptr = lookup(kernel_id_prefix + kernel_id); + llvm::Value* vector_id = builder.CreateLoad(vector_id_ptr); + llvm::Value* incremented = builder.CreateAdd(vector_id, vector_inc); + builder.CreateStore(incremented, vector_id_ptr); + } + + // Create a branch to condition block, then generate exit code out of the loop. + builder.CreateBr(for_cond); + builder.SetInsertPoint(exit); +} + + void CodegenLLVMVisitor::visit_codegen_function(const ast::CodegenFunction& node) { const auto& name = node.get_node_name(); const auto& arguments = node.get_arguments(); @@ -406,7 +663,7 @@ void CodegenLLVMVisitor::visit_codegen_function(const ast::CodegenFunction& node block->accept(*this); // If function has a void return type, add a terminator not handled by CodegenReturnVar. - if (node.is_void()) + if (node.get_return_type()->get_type() == ast::AstNodeType::VOID) builder.CreateRetVoid(); // Clear local values stack and remove the pointer to the local symbol table. @@ -419,7 +676,7 @@ void CodegenLLVMVisitor::visit_codegen_return_statement(const ast::CodegenReturn throw std::runtime_error("Error: CodegenReturnStatement must contain a name node\n"); std::string ret = "ret_" + current_func->getName().str(); - llvm::Value* ret_value = builder.CreateLoad(current_func->getValueSymbolTable()->lookup(ret)); + llvm::Value* ret_value = builder.CreateLoad(lookup(ret)); builder.CreateRet(ret_value); } @@ -456,6 +713,10 @@ void CodegenLLVMVisitor::visit_codegen_var_list_statement( } void CodegenLLVMVisitor::visit_double(const ast::Double& node) { + if (is_kernel_code && vector_width > 1) { + values.push_back(get_constant_fp_vector(node.get_value())); + return; + } const auto& constant = llvm::ConstantFP::get(get_default_fp_type(), node.get_value()); values.push_back(constant); } @@ -547,6 +808,10 @@ void CodegenLLVMVisitor::visit_if_statement(const ast::IfStatement& node) { } void CodegenLLVMVisitor::visit_integer(const ast::Integer& node) { + if (is_kernel_code && vector_width > 1) { + values.push_back(get_constant_int_vector(node.get_value())); + return; + } const auto& constant = llvm::ConstantInt::get(llvm::Type::getInt32Ty(*context), node.get_value()); values.push_back(constant); @@ -561,9 +826,7 @@ void CodegenLLVMVisitor::visit_program(const ast::Program& node) { const auto& functions = v.get_codegen_functions(node); instance_var_helper = v.get_instance_var_helper(); - // TODO :: George / Ioannis :: before emitting procedures, we have - // to emmit INSTANCE_STRUCT type as it's used as an argument. - // Currently it's done in node.visit_children which is late. + kernel_id = v.get_kernel_id(); // For every function, generate its declaration. Thus, we can look up // `llvm::Function` in the symbol table in the module. @@ -574,8 +837,15 @@ void CodegenLLVMVisitor::visit_program(const ast::Program& node) { // Set the AST symbol table. sym_tab = node.get_symbol_table(); - // Proceed with code generation. - node.visit_children(*this); + // Proceed with code generation. Right now, we do not do + // node.visit_children(*this); + // The reason is that the node may contain AST nodes for which the visitor functions have been + // defined. In our implementation we assume that the code generation is happening within the + // function scope. To avoid generating code outside of functions, visit only them for now. + // \todo: Handle what is mentioned here. 
+    for (const auto& func: functions) {
+        visit_codegen_function(*func);
+    }
 
     if (opt_passes) {
         logger->info("Running LLVM optimisation passes");
@@ -605,60 +875,21 @@ void CodegenLLVMVisitor::visit_unary_expression(const ast::UnaryExpression& node
 }
 
 void CodegenLLVMVisitor::visit_var_name(const ast::VarName& node) {
-    const auto& identifier = node.get_name();
-    if (!identifier->is_name() && !identifier->is_indexed_name())
-        throw std::runtime_error("Error: Unsupported variable type");
-
-    // TODO :: George :: here instance_var_helper can be used to query
-    // variable type and it's index into structure
-    auto name = node.get_node_name();
-
-    auto codegen_var_with_type = instance_var_helper.get_variable(name);
-    auto codegen_var_index = instance_var_helper.get_variable_index(name);
-    // this will be INTEGER or DOUBLE
-    auto var_type = codegen_var_with_type->get_type()->get_type();
-    auto is_pointer = codegen_var_with_type->get_is_pointer();
-
-    llvm::Value* ptr;
-    if (identifier->is_name())
-        ptr = lookup(node.get_node_name());
-
-    if (identifier->is_indexed_name()) {
-        auto indexed_name = std::dynamic_pointer_cast<ast::IndexedName>(identifier);
-        ptr = codegen_indexed_name(*indexed_name);
-    }
+    llvm::Value* ptr = get_variable_ptr(node);
 
     // Finally, load the variable from the pointer value.
     llvm::Value* var = builder.CreateLoad(ptr);
-    values.push_back(var);
-}
 
-void CodegenLLVMVisitor::visit_instance_struct(const ast::InstanceStruct& node) {
-    std::vector<llvm::Type*> members;
-    for (const auto& variable: node.get_codegen_vars()) {
-        // TODO :: Ioannis / George :: we have now double*, int*, double and int
-        // variables in the instance structure. Each variable is of type
-        // ast::CodegenVarWithType. So we can query variable type and if
-        // it's pointer.
-        auto is_pointer = variable->get_is_pointer();
-        auto type = variable->get_type()->get_type();
-
-        // todo : clean up ?
-        if (type == ast::AstNodeType::DOUBLE) {
-            auto llvm_type = is_pointer ? get_default_fp_ptr_type() : get_default_fp_type();
-            members.push_back(llvm_type);
-        } else {
-            if (is_pointer) {
-                members.push_back(llvm::Type::getInt32PtrTy(*context));
-            } else {
-                members.push_back(llvm::Type::getInt32Ty(*context));
-            }
-        }
+    // If the value should not be vectorised, or it is already a vector, add it to the stack.
+    if (!is_kernel_code || vector_width <= 1 || var->getType()->isVectorTy()) {
+        values.push_back(var);
+        return;
     }
-    llvm_struct = llvm::StructType::create(*context, mod_filename + "_Instance");
-    llvm_struct->setBody(members);
-    module->getOrInsertGlobal("inst", llvm_struct);
+
+    // Otherwise, if we are generating vectorised code inside the loop, replicate the value to
+    // form a vector of `vector_width`.
+    llvm::Value* vector_var = builder.CreateVectorSplat(vector_width, var);
+    values.push_back(vector_var);
 }
 
 void CodegenLLVMVisitor::visit_while_statement(const ast::WhileStatement& node) {
diff --git a/src/codegen/llvm/codegen_llvm_visitor.hpp b/src/codegen/llvm/codegen_llvm_visitor.hpp
index b20a19bac7..c93b76b1d6 100644
--- a/src/codegen/llvm/codegen_llvm_visitor.hpp
+++ b/src/codegen/llvm/codegen_llvm_visitor.hpp
@@ -82,11 +82,14 @@ class CodegenLLVMVisitor: public visitor::ConstAstVisitor {
     // Use 32-bit floating-point type if true. Otherwise, use default 64-bit.
     bool use_single_precision;
 
-    // explicit vectorisation width
+    // Explicit vectorisation width.
     int vector_width;
 
-    // LLVM mechanism struct
-    llvm::StructType* llvm_struct;
+    // The name of the induction variable used in the kernel functions.
+ std::string kernel_id; + + // A flag to indicate that the code is generated for the kernel. + bool is_kernel_code = false; /** *\brief Run LLVM optimisation passes on generated IR @@ -106,8 +109,8 @@ class CodegenLLVMVisitor: public visitor::ConstAstVisitor { CodegenLLVMVisitor(const std::string& mod_filename, const std::string& output_dir, bool opt_passes, - int vector_width = 1, - bool use_single_precision = false) + bool use_single_precision = false, + int vector_width = 1) : mod_filename(mod_filename) , output_dir(output_dir) , opt_passes(opt_passes) @@ -130,6 +133,13 @@ class CodegenLLVMVisitor: public visitor::ConstAstVisitor { */ llvm::Value* codegen_indexed_name(const ast::IndexedName& node); + /** + * Generates LLVM code for the given Instance variable + * \param node CodegenInstanceVar NMODL AST node + * \return LLVM code generated for this AST node + */ + llvm::Value* codegen_instance_var(const ast::CodegenInstanceVar& node); + /** * Returns GEP instruction to 1D array * \param name 1D array name @@ -152,6 +162,20 @@ class CodegenLLVMVisitor: public visitor::ConstAstVisitor { */ llvm::Type* get_codegen_var_type(const ast::CodegenVarType& node); + /** + * Returns LLVM vector with `vector_width` int values. + * \param int value to replicate + * \return LLVM value + */ + llvm::Value* get_constant_int_vector(int value); + + /** + * Returns LLVM vector with `vector_width` double values. + * \param string a double value to replicate + * \return LLVM value + */ + llvm::Value* get_constant_fp_vector(const std::string& value); + /** * Returns 64-bit or 32-bit LLVM floating type * \return \c LLVM floating point type according to `use_single_precision` flag @@ -164,6 +188,18 @@ class CodegenLLVMVisitor: public visitor::ConstAstVisitor { */ llvm::Type* get_default_fp_ptr_type(); + /** + * Returns a pointer to LLVM struct type + * \return LLVM pointer type + */ + llvm::Type* get_instance_struct_type(); + + /** + * Returns a LLVM value corresponding to the VarName node + * \return LLVM value + */ + llvm::Value* get_variable_ptr(const ast::VarName& node); + /** * Create a function call to an external method * \param name external method name @@ -255,6 +291,7 @@ class CodegenLLVMVisitor: public visitor::ConstAstVisitor { void visit_binary_expression(const ast::BinaryExpression& node) override; void visit_boolean(const ast::Boolean& node) override; void visit_statement_block(const ast::StatementBlock& node) override; + void visit_codegen_for_statement(const ast::CodegenForStatement& node) override; void visit_codegen_function(const ast::CodegenFunction& node) override; void visit_codegen_return_statement(const ast::CodegenReturnStatement& node) override; void visit_codegen_var_list_statement(const ast::CodegenVarListStatement& node) override; @@ -267,7 +304,6 @@ class CodegenLLVMVisitor: public visitor::ConstAstVisitor { void visit_program(const ast::Program& node) override; void visit_unary_expression(const ast::UnaryExpression& node) override; void visit_var_name(const ast::VarName& node) override; - void visit_instance_struct(const ast::InstanceStruct& node) override; void visit_while_statement(const ast::WhileStatement& node) override; // \todo: move this to debug mode (e.g. 
-v option or --dump-ir)
diff --git a/src/main.cpp b/src/main.cpp
index f9e083f930..5fa5304776 100644
--- a/src/main.cpp
+++ b/src/main.cpp
@@ -594,7 +594,7 @@ int main(int argc, const char* argv[]) {
             if (llvm_ir) {
                 logger->info("Running LLVM backend code generator");
                 CodegenLLVMVisitor visitor(
-                    modfile, output_dir, llvm_opt_passes, llvm_vec_width, llvm_float_type);
+                    modfile, output_dir, llvm_opt_passes, llvm_float_type, llvm_vec_width);
                 visitor.visit_program(*ast);
                 ast_to_nmodl(*ast, filepath("llvm", "mod"));
                 ast_to_json(*ast, filepath("llvm", "json"));
diff --git a/test/unit/codegen/codegen_llvm_ir.cpp b/test/unit/codegen/codegen_llvm_ir.cpp
index ba0c725c0c..a376bd3f5c 100644
--- a/test/unit/codegen/codegen_llvm_ir.cpp
+++ b/test/unit/codegen/codegen_llvm_ir.cpp
@@ -794,39 +794,3 @@ SCENARIO("Dead code removal", "[visitor][llvm][opt]") {
         }
     }
 }
-
-//=============================================================================
-// Create Instance Struct
-//=============================================================================
-
-SCENARIO("Creation of Instance Struct", "[visitor][llvm][instance_struct]") {
-    GIVEN("NEURON block with RANGE variables and IONS") {
-        std::string nmodl_text = R"(
-            NEURON {
-                USEION na READ ena WRITE ina
-                NONSPECIFIC_CURRENT il
-                RANGE minf, hinf
-            }
-
-            STATE {
-                m
-            }
-
-            ASSIGNED {
-                v (mV)
-                celsius (degC)
-                minf
-                hinf
-            }
-        )";
-
-        THEN("create struct with the declared variables") {
-            std::string module_string = run_llvm_visitor(nmodl_text, true);
-            std::smatch m;
-
-            std::regex instance_struct_declaration(
-                R"(%unknown_Instance = type \{ double\*, double\*, double\*, double\*, double\*, double\*, double\*, double\*, double\*, double\* \})");
-            REQUIRE(std::regex_search(module_string, m, instance_struct_declaration));
-        }
-    }
-}

From c08eb224e26d45770c46f80251b00b26d3125230 Mon Sep 17 00:00:00 2001
From: George Mitenkov
Date: Tue, 9 Mar 2021 11:50:53 +0300
Subject: [PATCH 028/331] Unit test for scalar state kernel generation in LLVM
 (#547)

This PR adds a unit test to check LLVM instructions generated for the
scalar kernel, particularly:

- FOR loop blocks.
- Induction variable increments and comparisons.
- Correct loads through GEPs from the struct.

Tests for vectorised code generation will be added in a separate PR, or
once full vectorisation support (indirect indexing) lands.
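For orientation, the scalar kernel that the new test matches against is
roughly equivalent to the following C++ loop (a simplified sketch: the
struct below is a hand-written stand-in with fewer members than the
generated `__instance_var__type`, and the field layout is illustrative):

```cpp
// Simplified stand-in for the generated instance struct of hh.mod.
struct hh_Instance {
    double* minf;
    double* mtau;
    double* m;
    double* voltage;
    int* node_index;
    int node_count;
};

// C++ equivalent of the generated nrn_state_hh kernel (scalar version).
void nrn_state_hh(hh_Instance* mech) {
    for (int id = 0; id < mech->node_count; id++) {
        // Gather the compartment id and its voltage via indirect indexing.
        int node_id = mech->node_index[id];
        // v is loaded even though this kernel does not use it, mirroring
        // the generated IR.
        double v = mech->voltage[node_id];
        // State update from the DERIVATIVE block.
        mech->m[id] = (mech->minf[id] - mech->m[id]) / mech->mtau[id];
    }
}
```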
--- test/unit/codegen/codegen_llvm_ir.cpp | 112 +++++++++++++++++++++++++- 1 file changed, 110 insertions(+), 2 deletions(-) diff --git a/test/unit/codegen/codegen_llvm_ir.cpp b/test/unit/codegen/codegen_llvm_ir.cpp index a376bd3f5c..d5b531c5d5 100644 --- a/test/unit/codegen/codegen_llvm_ir.cpp +++ b/test/unit/codegen/codegen_llvm_ir.cpp @@ -12,6 +12,8 @@ #include "codegen/llvm/codegen_llvm_visitor.hpp" #include "parser/nmodl_driver.hpp" #include "visitors/checkparent_visitor.hpp" +#include "visitors/neuron_solve_visitor.hpp" +#include "visitors/solve_block_visitor.hpp" #include "visitors/symtab_visitor.hpp" using namespace nmodl; @@ -24,16 +26,20 @@ using nmodl::parser::NmodlDriver; std::string run_llvm_visitor(const std::string& text, bool opt = false, - bool use_single_precision = false) { + bool use_single_precision = false, + int vector_width = 1) { NmodlDriver driver; const auto& ast = driver.parse_string(text); SymtabVisitor().visit_program(*ast); + NeuronSolveVisitor().visit_program(*ast); + SolveBlockVisitor().visit_program(*ast); codegen::CodegenLLVMVisitor llvm_visitor(/*mod_filename=*/"unknown", /*output_dir=*/".", opt, - use_single_precision); + use_single_precision, + vector_width); llvm_visitor.visit_program(*ast); return llvm_visitor.print_module(); } @@ -770,6 +776,108 @@ SCENARIO("While", "[visitor][llvm]") { } } +//============================================================================= +// State scalar kernel +//============================================================================= + +SCENARIO("Scalar state kernel", "[visitor][llvm]") { + GIVEN("A neuron state update") { + std::string nmodl_text = R"( + NEURON { + SUFFIX hh + NONSPECIFIC_CURRENT il + RANGE minf, mtau, gl, el + } + + STATE { + m + } + + ASSIGNED { + v (mV) + minf + mtau (ms) + } + + BREAKPOINT { + SOLVE states METHOD cnexp + il = gl * (v - el) + } + + DERIVATIVE states { + m = (minf-m) / mtau + } + )"; + + THEN("a kernel with instance struct as an argument and a FOR loop is created") { + std::string module_string = run_llvm_visitor(nmodl_text); + std::smatch m; + + // Check the struct type and the kernel declaration. + std::regex struct_type( + "%.*__instance_var__type = type \\{ double\\*, double\\*, double\\*, double\\*, " + "double\\*, double\\*, double\\*, i32\\*, double, double, double, i32, i32 \\}"); + std::regex kernel_declaration( + R"(define void @nrn_state_hh\(%.*__instance_var__type\* .*\))"); + REQUIRE(std::regex_search(module_string, m, struct_type)); + REQUIRE(std::regex_search(module_string, m, kernel_declaration)); + + // Check for correct induction variable initialisation and a branch to condition block. + std::regex alloca_instr(R"(%id = alloca i32)"); + std::regex br(R"(br label %for\.cond)"); + REQUIRE(std::regex_search(module_string, m, alloca_instr)); + REQUIRE(std::regex_search(module_string, m, br)); + + // Check condition block: id < mech->node_count, and a conditional branch to loop body + // or exit. 
+            std::regex condition(
+                " %.* = load %.*__instance_var__type\\*, %.*__instance_var__type\\*\\* %.*,.*\n"
+                " %.* = getelementptr inbounds %.*__instance_var__type, "
+                "%.*__instance_var__type\\* "
+                "%.*, i32 0, i32 [0-9]+\n"
+                " %.* = load i32, i32\\* %.*,.*\n"
+                " %.* = load i32, i32\\* %id,.*\n"
+                " %.* = icmp slt i32 %.*, %.*");
+            std::regex cond_br(R"(br i1 %.*, label %for\.body, label %for\.exit)");
+            REQUIRE(std::regex_search(module_string, m, condition));
+            REQUIRE(std::regex_search(module_string, m, cond_br));
+
+            // In the body block, `node_id` and voltage `v` are initialised with the data from the
+            // struct. Check for variable allocations and correct loads from the struct with GEPs.
+            std::regex initialisation(
+                "for\\.body:.*\n"
+                " %node_id = alloca i32,.*\n"
+                " %v = alloca double,.*");
+            std::regex load_from_struct(
+                " %.* = load %.*__instance_var__type\\*, %.*__instance_var__type\\*\\* %.*\n"
+                " %.* = getelementptr inbounds %.*__instance_var__type, "
+                "%.*__instance_var__type\\* %.*, i32 0, i32 [0-9]+\n"
+                " %.* = load i32, i32\\* %id,.*\n"
+                " %.* = sext i32 %.* to i64\n"
+                " %.* = load (i32|double)\\*, (i32|double)\\*\\* %.*\n"
+                " %.* = getelementptr inbounds (i32|double), (i32|double)\\* %.*, i64 %.*\n"
+                " %.* = load (i32|double), (i32|double)\\* %.*");
+            REQUIRE(std::regex_search(module_string, m, initialisation));
+            REQUIRE(std::regex_search(module_string, m, load_from_struct));
+
+            // Check induction variable is incremented in increment block.
+            std::regex increment(
+                "for.inc:.*\n"
+                " %.* = load i32, i32\\* %id,.*\n"
+                " %.* = add i32 %.*, 1\n"
+                " store i32 %.*, i32\\* %id,.*\n"
+                " br label %for\\.cond");
+            REQUIRE(std::regex_search(module_string, m, increment));
+
+            // Check exit block.
+            std::regex exit(
+                "for\\.exit:.*\n"
+                " ret void");
+            REQUIRE(std::regex_search(module_string, m, exit));
+        }
+    }
+}
+
 //=============================================================================
 // Optimization : dead code removal
 //=============================================================================

From ec9127194d2cc45c343b92972f7493d1d9c7e3fd Mon Sep 17 00:00:00 2001
From: George Mitenkov
Date: Fri, 12 Mar 2021 04:50:38 -0800
Subject: [PATCH 029/331] Indexed name codegen improvements (#550)

Improved index code generation within the LLVM pipeline. The following
issues were addressed:

Array indices are i64 per LLVM's addressing convention. This means that
if the value is not a constant, an additional sext instruction must be
created.

Bounds check is removed since it requires a certain analysis on the
index value. This can be addressed in a separate PR.

`IndexedName` code generation is separated into two functions. The first,
`get_array_length()`, is responsible for array initialisation, and the
second, `get_array_index()`, for indexing. In the latter case, we support
the following cases:

```
...
// Indexing with an integer constant
k[0] = ...

// Indexing with an integer expression
k[10 - 10]

// Indexing with a `Name` AST node that is an integer
// (in our case a FOR loop induction variable or a variable
// with `CodegenVarType` == `Integer`)
k[id] = ...
k[ena_id] = ...
```

Note that the case:

```
// id := loop integer induction variable
k[id + 1] = ...
```

is not supported, for two reasons. First, on the AST level, as per #545
the expression would contain a `Name` and not a `VarName` node, which
fails code generation. Second, the case only arises in kernel functions
like state_update, where indexing is "artificially" created with
indexing by a `Name` only.
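The index handling described above boils down to the following pattern
(an illustrative sketch using LLVM's C++ API, not the exact NMODL
sources; `to_i64_index` is a made-up helper name):

```cpp
#include <stdexcept>

#include "llvm/IR/IRBuilder.h"

// Sketch: normalise an integer index to the i64 type conventionally
// used for GEP array indices. Non-integer (e.g. double) indices are
// rejected instead of being cast.
llvm::Value* to_i64_index(llvm::IRBuilder<>& builder,
                          llvm::LLVMContext& context,
                          llvm::Value* index) {
    if (!index->getType()->isIntOrIntVectorTy())
        throw std::runtime_error("Error: only integer indexing is supported!");
    // Sign-extend (or truncate) i32 and friends to i64; i64 passes through.
    return builder.CreateSExtOrTrunc(index, llvm::Type::getInt64Ty(context));
}
```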
fixes #541
---
 src/codegen/llvm/codegen_llvm_visitor.cpp | 71 +++++++++++++----------
 src/codegen/llvm/codegen_llvm_visitor.hpp | 21 +++----
 test/unit/codegen/codegen_llvm_ir.cpp     | 37 +++++-------
 3 files changed, 65 insertions(+), 64 deletions(-)

diff --git a/src/codegen/llvm/codegen_llvm_visitor.cpp b/src/codegen/llvm/codegen_llvm_visitor.cpp
index 62e69449b7..cd2af2af69 100644
--- a/src/codegen/llvm/codegen_llvm_visitor.cpp
+++ b/src/codegen/llvm/codegen_llvm_visitor.cpp
@@ -39,28 +39,17 @@ static bool is_supported_statement(const ast::Statement& statement) {
            statement.is_if_statement() || statement.is_while_statement();
 }
 
-bool CodegenLLVMVisitor::check_array_bounds(const ast::IndexedName& node, unsigned index) {
-    llvm::Type* array_type = lookup(node.get_node_name())->getType()->getPointerElementType();
-    unsigned length = array_type->getArrayNumElements();
-    return 0 <= index && index < length;
-}
-
-llvm::Value* CodegenLLVMVisitor::create_gep(const std::string& name, unsigned index) {
-    llvm::Type* index_type = llvm::Type::getInt32Ty(*context);
+llvm::Value* CodegenLLVMVisitor::create_gep(const std::string& name, llvm::Value* index) {
+    llvm::Type* index_type = llvm::Type::getInt64Ty(*context);
     std::vector<llvm::Value*> indices;
     indices.push_back(llvm::ConstantInt::get(index_type, 0));
-    indices.push_back(llvm::ConstantInt::get(index_type, index));
+    indices.push_back(index);
 
     return builder.CreateInBoundsGEP(lookup(name), indices);
 }
 
 llvm::Value* CodegenLLVMVisitor::codegen_indexed_name(const ast::IndexedName& node) {
-    unsigned index = get_array_index_or_length(node);
-
-    // Check if index is within array bounds.
-    if (!check_array_bounds(node, index))
-        throw std::runtime_error("Error: Index is out of bounds");
-
+    llvm::Value* index = get_array_index(node);
     return create_gep(node.get_node_name(), index);
 }
 
@@ -96,20 +85,11 @@ llvm::Value* CodegenLLVMVisitor::codegen_instance_var(const ast::CodegenInstance
     if (!member_var_name->get_name()->is_indexed_name())
         throw std::runtime_error("Error: " + member_name + " is not an IndexedName!");
 
-    // Proceed to creating a GEP instruction to get the pointer to the member's element. While LLVM
-    // Helper set the indices to be Name nodes, a sanity check is added here. Note that this step
-    // can be avoided if using `get_array_index_or_length()`. However, it does not support indexing
-    // with Name/Expression at the moment. \todo: Reuse `get_array_index_or_length()` here.
+    // Proceed to creating a GEP instruction to get the pointer to the member's element.
     auto member_indexed_name = std::dynamic_pointer_cast<ast::IndexedName>(
         member_var_name->get_name());
-    if (!member_indexed_name->get_length()->is_name())
-        throw std::runtime_error("Error: " + member_name + " has a non-Name index!");
+    llvm::Value* i64_index = get_array_index(*member_indexed_name);
 
-    // Load the index variable that will be used to access the member's element. Since we index a
-    // pointer variable, we need to extend the 32-bit integer index variable to 64-bit.
-    llvm::Value* i32_index = builder.CreateLoad(
-        lookup(member_indexed_name->get_length()->get_node_name()));
-    llvm::Value* i64_index = builder.CreateSExt(i32_index, llvm::Type::getInt64Ty(*context));
 
     // Create a indices vector for GEP to return the pointer to the element at the specified index.
    std::vector<llvm::Value*> member_indices;
@@ -135,17 +115,44 @@ llvm::Value* CodegenLLVMVisitor::codegen_instance_var(const ast::CodegenInstance
     return builder.CreateInBoundsGEP(instance_member, member_indices);
 }
 
-unsigned CodegenLLVMVisitor::get_array_index_or_length(const ast::IndexedName& indexed_name) {
-    // \todo: Support indices with expressions and names: k[i + j] = ...
-    auto integer = std::dynamic_pointer_cast<ast::Integer>(indexed_name.get_length());
+llvm::Value* CodegenLLVMVisitor::get_array_index(const ast::IndexedName& node) {
+    // Process the index expression. It can either be a Name node:
+    //    k[id]     // id is an integer
+    // or an integer expression.
+    llvm::Value* index_value;
+    if (node.get_length()->is_name()) {
+        llvm::Value* ptr = lookup(node.get_length()->get_node_name());
+        index_value = builder.CreateLoad(ptr);
+    } else {
+        node.get_length()->accept(*this);
+        index_value = values.back();
+        values.pop_back();
+    }
+
+    // Check if index is a double. While it is possible to use casting from double to integer
+    // values, we choose not to support these cases.
+    if (!index_value->getType()->isIntOrIntVectorTy())
+        throw std::runtime_error("Error: only integer indexing is supported!");
+
+    // Conventionally, in LLVM array indices are 64 bit.
+    auto index_type = llvm::cast<llvm::IntegerType>(index_value->getType());
+    llvm::Type* i64_type = llvm::Type::getInt64Ty(*context);
+    if (index_type->getBitWidth() == i64_type->getIntegerBitWidth())
+        return index_value;
+
+    return builder.CreateSExtOrTrunc(index_value, i64_type);
+}
+
+int CodegenLLVMVisitor::get_array_length(const ast::IndexedName& node) {
+    auto integer = std::dynamic_pointer_cast<ast::Integer>(node.get_length());
     if (!integer)
-        throw std::runtime_error("Error: only integer indices/length are supported!");
+        throw std::runtime_error("Error: only integer length is supported!");
 
     // Check if integer value is taken from a macro.
     if (!integer->get_macro())
         return integer->get_value();
     const auto& macro = sym_tab->lookup(integer->get_macro()->get_node_name());
-    return static_cast<unsigned>(*macro->get_value());
+    return static_cast<int>(*macro->get_value());
 }
 
 llvm::Type* CodegenLLVMVisitor::get_codegen_var_type(const ast::CodegenVarType& node) {
@@ -691,7 +698,7 @@ void CodegenLLVMVisitor::visit_codegen_var_list_statement(
         llvm::Type* var_type;
         if (identifier->is_indexed_name()) {
             auto indexed_name = std::dynamic_pointer_cast<ast::IndexedName>(identifier);
-            unsigned length = get_array_index_or_length(*indexed_name);
+            int length = get_array_length(*indexed_name);
             var_type = llvm::ArrayType::get(scalar_var_type, length);
         } else if (identifier->is_name()) {
             // This case corresponds to a scalar local variable. Its type is double by default.
diff --git a/src/codegen/llvm/codegen_llvm_visitor.hpp b/src/codegen/llvm/codegen_llvm_visitor.hpp index c93b76b1d6..1477e0d66d 100644 --- a/src/codegen/llvm/codegen_llvm_visitor.hpp +++ b/src/codegen/llvm/codegen_llvm_visitor.hpp @@ -119,12 +119,6 @@ class CodegenLLVMVisitor: public visitor::ConstAstVisitor { , builder(*context) , fpm(module.get()) {} - /** - * Checks if array index specified by the given IndexedName is within bounds - * \param node IndexedName representing array - * \return \c true if the index is within bounds - */ - bool check_array_bounds(const ast::IndexedName& node, unsigned index); /** * Generates LLVM code for the given IndexedName @@ -146,14 +140,21 @@ class CodegenLLVMVisitor: public visitor::ConstAstVisitor { * \param index element index * \return GEP instruction value */ - llvm::Value* create_gep(const std::string& name, unsigned index); + llvm::Value* create_gep(const std::string& name, llvm::Value* index); + + /** + * Returns array index from given IndexedName + * \param node IndexedName representing array + * \return array index + */ + llvm::Value* get_array_index(const ast::IndexedName& node); /** - * Returns array index or length from given IndexedName + * Returns array length from given IndexedName * \param node IndexedName representing array - * \return array index or length + * \return array length */ - unsigned get_array_index_or_length(const ast::IndexedName& node); + int get_array_length(const ast::IndexedName& node); /** * Returns LLVM type for the given CodegenVarType node diff --git a/test/unit/codegen/codegen_llvm_ir.cpp b/test/unit/codegen/codegen_llvm_ir.cpp index d5b531c5d5..58c1e2a7eb 100644 --- a/test/unit/codegen/codegen_llvm_ir.cpp +++ b/test/unit/codegen/codegen_llvm_ir.cpp @@ -557,6 +557,7 @@ SCENARIO("Indexed name", "[visitor][llvm]") { std::string nmodl_text = R"( PROCEDURE foo() { LOCAL x[2] + x[10 - 10] = 1 x[1] = 3 } )"; @@ -565,14 +566,19 @@ SCENARIO("Indexed name", "[visitor][llvm]") { std::string module_string = run_llvm_visitor(nmodl_text); std::smatch m; - // Check GEP is created correctly to pint at array element. - std::regex GEP( - R"(%1 = getelementptr inbounds \[2 x double\], \[2 x double\]\* %x, i32 0, i32 1)"); - REQUIRE(std::regex_search(module_string, m, GEP)); - - // Check the value is stored to the pointer. - std::regex store(R"(store double 3.000000e\+00, double\* %1)"); - REQUIRE(std::regex_search(module_string, m, store)); + // Check GEPs are created correctly to get the addresses of array elements. + std::regex GEP1( + R"(%1 = getelementptr inbounds \[2 x double\], \[2 x double\]\* %x, i64 0, i64 0)"); + std::regex GEP2( + R"(%2 = getelementptr inbounds \[2 x double\], \[2 x double\]\* %x, i64 0, i64 1)"); + REQUIRE(std::regex_search(module_string, m, GEP1)); + REQUIRE(std::regex_search(module_string, m, GEP2)); + + // Check the value is stored to the correct addresses. + std::regex store1(R"(store double 1.000000e\+00, double\* %1)"); + std::regex store2(R"(store double 3.000000e\+00, double\* %2)"); + REQUIRE(std::regex_search(module_string, m, store1)); + REQUIRE(std::regex_search(module_string, m, store2)); } } @@ -591,7 +597,7 @@ SCENARIO("Indexed name", "[visitor][llvm]") { // Check GEP is created correctly to pint at array element. 
std::regex GEP( - R"(%2 = getelementptr inbounds \[2 x double\], \[2 x double\]\* %x, i32 0, i32 1)"); + R"(%2 = getelementptr inbounds \[2 x double\], \[2 x double\]\* %x, i64 0, i64 1)"); REQUIRE(std::regex_search(module_string, m, GEP)); // Check the value is loaded from the pointer. @@ -603,19 +609,6 @@ SCENARIO("Indexed name", "[visitor][llvm]") { REQUIRE(std::regex_search(module_string, m, store)); } } - - GIVEN("Array with out of bounds access") { - std::string nmodl_text = R"( - PROCEDURE foo() { - LOCAL x[2] - x[5] = 3 - } - )"; - - THEN("error is thrown") { - REQUIRE_THROWS_AS(run_llvm_visitor(nmodl_text), std::runtime_error); - } - } } //============================================================================= From 559d15200a869cc142e83df9f3bbf10688213646 Mon Sep 17 00:00:00 2001 From: Ioannis Magkanaris Date: Sat, 13 Mar 2021 12:35:18 +0100 Subject: [PATCH 030/331] Add InstanceStruct test data generation helper and unit test (#546) * CodegenLLVMHelperVisitor improved without hardcoded parameters * Added get_instance_struct_ptr to get instance structure for variable information * test/unit/codegen/codegen_data_helper.cpp : first draft implementation of codegen data helper * Added test for typecasting to the proper struct type Co-authored-by: Pramod Kumbhar --- .../llvm/codegen_llvm_helper_visitor.cpp | 25 ++- .../llvm/codegen_llvm_helper_visitor.hpp | 14 +- src/codegen/llvm/codegen_llvm_visitor.cpp | 4 + src/codegen/llvm/codegen_llvm_visitor.hpp | 6 + test/unit/CMakeLists.txt | 5 +- test/unit/codegen/codegen_data_helper.cpp | 186 ++++++++++++++++++ test/unit/codegen/codegen_data_helper.hpp | 111 +++++++++++ .../codegen/codegen_llvm_instance_struct.cpp | 174 ++++++++++++++++ 8 files changed, 512 insertions(+), 13 deletions(-) create mode 100644 test/unit/codegen/codegen_data_helper.cpp create mode 100644 test/unit/codegen/codegen_data_helper.hpp create mode 100644 test/unit/codegen/codegen_llvm_instance_struct.cpp diff --git a/src/codegen/llvm/codegen_llvm_helper_visitor.cpp b/src/codegen/llvm/codegen_llvm_helper_visitor.cpp index c34ae2c873..c8143ac393 100644 --- a/src/codegen/llvm/codegen_llvm_helper_visitor.cpp +++ b/src/codegen/llvm/codegen_llvm_helper_visitor.cpp @@ -18,6 +18,13 @@ namespace codegen { using namespace fmt::literals; +/// initialize static member variables +const ast::AstNodeType CodegenLLVMHelperVisitor::INTEGER_TYPE = ast::AstNodeType::INTEGER; +const ast::AstNodeType CodegenLLVMHelperVisitor::FLOAT_TYPE = ast::AstNodeType::DOUBLE; +const std::string CodegenLLVMHelperVisitor::NODECOUNT_VAR = "node_count"; +const std::string CodegenLLVMHelperVisitor::VOLTAGE_VAR = "voltage"; +const std::string CodegenLLVMHelperVisitor::NODE_INDEX_VAR = "node_index"; + /** * \brief Create variable definition statement * @@ -157,7 +164,12 @@ void CodegenLLVMHelperVisitor::create_function_for_node(ast::Block& node) { auto function = std::make_shared(fun_ret_type, name, arguments, block); codegen_functions.push_back(function); } - +/** + * \note : Order of variables is not important but we assume all pointers + * are added first and then scalar variables like t, dt, second_order etc. + * This order is assumed when we allocate data for integration testing + * and benchmarking purpose. See CodegenDataHelper::create_data(). 
+ */ std::shared_ptr CodegenLLVMHelperVisitor::create_instance_struct() { ast::CodegenVarWithTypeVector codegen_vars; @@ -186,15 +198,15 @@ std::shared_ptr CodegenLLVMHelperVisitor::create_instance_s } // add voltage and node index - add_var_with_type("voltage", FLOAT_TYPE, /*is_pointer=*/1); - add_var_with_type("node_index", INTEGER_TYPE, /*is_pointer=*/1); + add_var_with_type(VOLTAGE_VAR, FLOAT_TYPE, /*is_pointer=*/1); + add_var_with_type(NODE_INDEX_VAR, INTEGER_TYPE, /*is_pointer=*/1); // add dt, t, celsius add_var_with_type(naming::NTHREAD_T_VARIABLE, FLOAT_TYPE, /*is_pointer=*/0); add_var_with_type(naming::NTHREAD_DT_VARIABLE, FLOAT_TYPE, /*is_pointer=*/0); add_var_with_type(naming::CELSIUS_VARIABLE, FLOAT_TYPE, /*is_pointer=*/0); add_var_with_type(naming::SECOND_ORDER_VARIABLE, INTEGER_TYPE, /*is_pointer=*/0); - add_var_with_type(MECH_NODECOUNT_VAR, INTEGER_TYPE, /*is_pointer=*/0); + add_var_with_type(NODECOUNT_VAR, INTEGER_TYPE, /*is_pointer=*/0); return std::make_shared(codegen_vars); } @@ -510,7 +522,7 @@ void CodegenLLVMHelperVisitor::visit_nrn_state_block(ast::NrnStateBlock& node) { /// loop constructs : initialization, condition and increment const auto& initialization = loop_initialization_expression(INDUCTION_VAR); - const auto& condition = create_expression("{} < {}"_format(INDUCTION_VAR, MECH_NODECOUNT_VAR)); + const auto& condition = create_expression("{} < {}"_format(INDUCTION_VAR, NODECOUNT_VAR)); const auto& increment = loop_increment_expression(INDUCTION_VAR, vector_width); /// loop body : initialization + solve blocks @@ -524,7 +536,8 @@ void CodegenLLVMHelperVisitor::visit_nrn_state_block(ast::NrnStateBlock& node) { /// access node index and corresponding voltage loop_index_statements.push_back( visitor::create_statement("node_id = node_index[{}]"_format(INDUCTION_VAR))); - loop_body_statements.push_back(visitor::create_statement("v = voltage[node_id]")); + loop_body_statements.push_back( + visitor::create_statement("v = {}[node_id]"_format(VOLTAGE_VAR))); /// read ion variables ion_read_statements(BlockType::State, diff --git a/src/codegen/llvm/codegen_llvm_helper_visitor.hpp b/src/codegen/llvm/codegen_llvm_helper_visitor.hpp index b67aa7ee09..446d5a6fd9 100644 --- a/src/codegen/llvm/codegen_llvm_helper_visitor.hpp +++ b/src/codegen/llvm/codegen_llvm_helper_visitor.hpp @@ -112,13 +112,8 @@ class CodegenLLVMHelperVisitor: public visitor::AstVisitor { /// mechanism data helper InstanceVarHelper instance_var_helper; - /// default integer and float node type - const ast::AstNodeType INTEGER_TYPE = ast::AstNodeType::INTEGER; - const ast::AstNodeType FLOAT_TYPE = ast::AstNodeType::DOUBLE; - /// name of the mechanism instance parameter const std::string MECH_INSTANCE_VAR = "mech"; - const std::string MECH_NODECOUNT_VAR = "node_count"; /// name of induction variable used in the kernel. 
const std::string INDUCTION_VAR = "id"; @@ -130,6 +125,15 @@ class CodegenLLVMHelperVisitor: public visitor::AstVisitor { std::shared_ptr create_instance_struct(); public: + /// default integer and float node type + static const ast::AstNodeType INTEGER_TYPE; + static const ast::AstNodeType FLOAT_TYPE; + + // node count, voltage and node index variables + static const std::string NODECOUNT_VAR; + static const std::string VOLTAGE_VAR; + static const std::string NODE_INDEX_VAR; + CodegenLLVMHelperVisitor(int vector_width) : vector_width(vector_width){}; diff --git a/src/codegen/llvm/codegen_llvm_visitor.cpp b/src/codegen/llvm/codegen_llvm_visitor.cpp index cd2af2af69..b1182d36b9 100644 --- a/src/codegen/llvm/codegen_llvm_visitor.cpp +++ b/src/codegen/llvm/codegen_llvm_visitor.cpp @@ -259,6 +259,10 @@ llvm::Value* CodegenLLVMVisitor::get_variable_ptr(const ast::VarName& node) { return ptr; } +std::shared_ptr CodegenLLVMVisitor::get_instance_struct_ptr() { + return instance_var_helper.instance; +} + void CodegenLLVMVisitor::run_llvm_opt_passes() { /// run some common optimisation passes that are commonly suggested fpm.add(llvm::createInstructionCombiningPass()); diff --git a/src/codegen/llvm/codegen_llvm_visitor.hpp b/src/codegen/llvm/codegen_llvm_visitor.hpp index 1477e0d66d..41235a1ff0 100644 --- a/src/codegen/llvm/codegen_llvm_visitor.hpp +++ b/src/codegen/llvm/codegen_llvm_visitor.hpp @@ -201,6 +201,12 @@ class CodegenLLVMVisitor: public visitor::ConstAstVisitor { */ llvm::Value* get_variable_ptr(const ast::VarName& node); + /** + * Returns shared_ptr to generated ast::InstanceStruct + * \return std::shared_ptr + */ + std::shared_ptr get_instance_struct_ptr(); + /** * Create a function call to an external method * \param name external method name diff --git a/test/unit/CMakeLists.txt b/test/unit/CMakeLists.txt index ef24242b69..077706ef8d 100644 --- a/test/unit/CMakeLists.txt +++ b/test/unit/CMakeLists.txt @@ -96,8 +96,9 @@ target_link_libraries( ${NMODL_WRAPPER_LIBS}) if(NMODL_ENABLE_LLVM) - include_directories(${LLVM_INCLUDE_DIRS}) - add_executable(testllvm visitor/main.cpp codegen/codegen_llvm_ir.cpp) + include_directories(${LLVM_INCLUDE_DIRS} codegen) + add_executable(testllvm visitor/main.cpp codegen/codegen_llvm_ir.cpp + codegen/codegen_data_helper.cpp codegen/codegen_llvm_instance_struct.cpp) add_executable(test_llvm_runner visitor/main.cpp codegen/codegen_llvm_execution.cpp) target_link_libraries( testllvm diff --git a/test/unit/codegen/codegen_data_helper.cpp b/test/unit/codegen/codegen_data_helper.cpp new file mode 100644 index 0000000000..e42cfe01f3 --- /dev/null +++ b/test/unit/codegen/codegen_data_helper.cpp @@ -0,0 +1,186 @@ +#include + +#include "ast/codegen_var_type.hpp" +#include "codegen/llvm/codegen_llvm_helper_visitor.hpp" + +#include "codegen_data_helper.hpp" + +namespace nmodl { +namespace codegen { + +// scalar variables with default values +const double default_nthread_dt_value = 0.025; +const double default_nthread_t_value = 100.0; +const double default_celsius_value = 34.0; +const int default_second_order_value = 0; + +// cleanup all members and struct base pointer +CodegenInstanceData::~CodegenInstanceData() { + // first free num_ptr_members members which are pointers + for (size_t i = 0; i < num_ptr_members; i++) { + free(members[i]); + } + // and then pointer to container struct + free(base_ptr); +} + +/** + * \todo : various things can be improved here + * - if variable is voltage then initialization range could be -65 to +65 + * - if variable is double or 
float then those could be initialize with + * "some" floating point value between range like 1.0 to 100.0. Note + * it would be nice to have unique values to avoid errors like division + * by zero. We have simple implementation that is taking care of this. + * - if variable is integer then initialization range must be between + * 0 and num_elements. In practice, num_elements is number of instances + * of a particular mechanism. This would be <= number of compartments + * in the cell. For now, just initialize integer variables from 0 to + * num_elements - 1. + */ +void initialize_variable(const std::shared_ptr& var, + void* ptr, + size_t initial_value, + size_t num_elements) { + ast::AstNodeType type = var->get_type()->get_type(); + const std::string& name = var->get_name()->get_node_name(); + + if (type == ast::AstNodeType::DOUBLE) { + const auto& generated_double_data = generate_dummy_data(initial_value, + num_elements); + double* data = (double*) ptr; + for (size_t i = 0; i < num_elements; i++) { + data[i] = generated_double_data[i]; + } + } else if (type == ast::AstNodeType::FLOAT) { + const auto& generated_float_data = generate_dummy_data(initial_value, num_elements); + float* data = (float*) ptr; + for (size_t i = 0; i < num_elements; i++) { + data[i] = generated_float_data[i]; + } + } else if (type == ast::AstNodeType::INTEGER) { + const auto& generated_int_data = generate_dummy_data(initial_value, num_elements); + int* data = (int*) ptr; + for (size_t i = 0; i < num_elements; i++) { + data[i] = generated_int_data[i]; + } + } else { + throw std::runtime_error("Unhandled data type during initialize_variable"); + }; +} + +CodegenInstanceData CodegenDataHelper::create_data(size_t num_elements, size_t seed) { + // alignment with 64-byte to generate aligned loads/stores + const unsigned NBYTE_ALIGNMENT = 64; + + // get variable information + const auto& variables = instance->get_codegen_vars(); + + // start building data + CodegenInstanceData data; + data.num_elements = num_elements; + + // base pointer to instance object + void* base = nullptr; + + // max size of each member : pointer / double has maximum size + size_t member_size = std::max(sizeof(double), sizeof(double*)); + + // allocate instance object with memory alignment + posix_memalign(&base, NBYTE_ALIGNMENT, member_size * variables.size()); + data.base_ptr = base; + + size_t offset = 0; + void* ptr = base; + size_t variable_index = 0; + + // allocate each variable and allocate memory at particular offset in base pointer + for (auto& var: variables) { + // only process until first non-pointer variable + if (!var->get_is_pointer()) { + break; + } + + // check type of variable and it's size + size_t member_size = 0; + ast::AstNodeType type = var->get_type()->get_type(); + if (type == ast::AstNodeType::DOUBLE) { + member_size = sizeof(double); + } else if (type == ast::AstNodeType::FLOAT) { + member_size = sizeof(float); + } else if (type == ast::AstNodeType::INTEGER) { + member_size = sizeof(int); + } + + // allocate memory and setup a pointer + void* member; + posix_memalign(&member, NBYTE_ALIGNMENT, member_size * num_elements); + initialize_variable(var, member, variable_index, num_elements); + + // copy address at specific location in the struct + memcpy(ptr, &member, sizeof(double*)); + + data.offsets.push_back(offset); + data.members.push_back(member); + data.num_ptr_members++; + + // all pointer types are of same size, so just use double* + offset += sizeof(double*); + ptr = (char*) base + offset; + + variable_index++; + } + 
+ // we are now switching from pointer type to next member type (e.g. double) + // ideally we should use padding but switching from double* to double should + // already meet alignment requirements + for (auto& var: variables) { + // process only scalar elements + if (var->get_is_pointer()) { + continue; + } + ast::AstNodeType type = var->get_type()->get_type(); + const std::string& name = var->get_name()->get_node_name(); + + // some default values for standard parameters + double value = 0; + if (name == naming::NTHREAD_DT_VARIABLE) { + value = default_nthread_dt_value; + } else if (name == naming::NTHREAD_T_VARIABLE) { + value = default_nthread_t_value; + } else if (name == naming::CELSIUS_VARIABLE) { + value = default_celsius_value; + } else if (name == CodegenLLVMHelperVisitor::NODECOUNT_VAR) { + value = num_elements; + } else if (name == naming::SECOND_ORDER_VARIABLE) { + value = default_second_order_value; + } + + if (type == ast::AstNodeType::DOUBLE) { + *((double*) ptr) = value; + data.offsets.push_back(offset); + data.members.push_back(ptr); + offset += sizeof(double); + ptr = (char*) base + offset; + } else if (type == ast::AstNodeType::FLOAT) { + *((float*) ptr) = float(value); + data.offsets.push_back(offset); + data.members.push_back(ptr); + offset += sizeof(float); + ptr = (char*) base + offset; + } else if (type == ast::AstNodeType::INTEGER) { + *((int*) ptr) = int(value); + data.offsets.push_back(offset); + data.members.push_back(ptr); + offset += sizeof(int); + ptr = (char*) base + offset; + } else { + throw std::runtime_error( + "Unhandled type while allocating data in CodegenDataHelper::create_data()"); + } + } + + return data; +} + +} // namespace codegen +} // namespace nmodl diff --git a/test/unit/codegen/codegen_data_helper.hpp b/test/unit/codegen/codegen_data_helper.hpp new file mode 100644 index 0000000000..368b964147 --- /dev/null +++ b/test/unit/codegen/codegen_data_helper.hpp @@ -0,0 +1,111 @@ +/************************************************************************* + * Copyright (C) 2018-2021 Blue Brain Project + * + * This file is part of NMODL distributed under the terms of the GNU + * Lesser General Public License. See top-level LICENSE file for details. + *************************************************************************/ + +#pragma once + +#include + +#include "ast/ast.hpp" + +/// \file +/// \brief Generate test data for testing and benchmarking compute kernels + +namespace nmodl { +namespace codegen { + +/// common scalar variables +extern const double default_nthread_dt_value; +extern const double default_nthread_t_value; +extern const double default_celsius_value; +extern const int default_second_order_value; + +/** + * \class CodegenInstanceData + * \brief Wrapper class to pack data allocate for instance + */ +struct CodegenInstanceData { + /// base pointer which can be type casted + /// to instance struct at run time + void* base_ptr = nullptr; + + /// length of each member of pointer type + size_t num_elements = 0; + + /// number of pointer members + size_t num_ptr_members = 0; + + /// offset relative to base_ptr to locate + /// each member variable in instance struct + std::vector offsets; + + /// pointer to array allocated for each member variable + /// i.e. 
*(base_ptr + offsets[0]) will be members[0]
+    std::vector<void*> members;
+
+    // cleanup all memory allocated for type and member variables
+    ~CodegenInstanceData();
+};
+
+
+/**
+ * Generate vector of dummy data according to the template type specified
+ *
+ * For double type: generate vector starting from (initial_value + 1e-15)
+ * with increments of 1e-15
+ * For float type: generate vector starting from (initial_value + 1e-6)
+ * with increments of 1e-6
+ * For int type: generate vector starting from (initial_value + 1) with
+ * increments of 1
+ *
+ * \param initial_value Base value for initializing the data
+ * \param num_elements Number of element of the generated vector
+ * \return std::vector of dummy data for testing purposes
+ */
+template <typename T>
+std::vector<T> generate_dummy_data(size_t initial_value, size_t num_elements) {
+    std::vector<T> data(num_elements);
+    T precision;
+    if (std::is_same<T, double>::value) {
+        precision = 1e-15;
+    } else if (std::is_same<T, float>::value) {
+        precision = 1e-6;
+    } else {
+        precision = 1;
+    }
+    for (size_t i = 0; i < num_elements; i++) {
+        data[i] = initial_value + precision * (i + 1);
+    }
+    return data;
+}
+
+/**
+ * \class CodegenDataHelper
+ * \brief Helper to allocate and initialize data for benchmarking
+ *
+ * The `ast::InstanceStruct` has a different number of member
+ * variables for different MOD files and hence we can't instantiate
+ * it at compile time. This class helps to inspect the variables
+ * information gathered from AST and allocate memory block that
+ * can be type cast to the `ast::InstanceStruct` corresponding
+ * to the MOD file.
+ */
+class CodegenDataHelper {
+    std::shared_ptr<ast::Program> program;
+    std::shared_ptr<ast::InstanceStruct> instance;
+
+  public:
+    CodegenDataHelper() = delete;
+    CodegenDataHelper(const std::shared_ptr<ast::Program>& program,
+                      const std::shared_ptr<ast::InstanceStruct>& instance)
+        : program(program)
+        , instance(instance) {}
+
+    CodegenInstanceData create_data(size_t num_elements, size_t seed);
+};
+
+}  // namespace codegen
+}  // namespace nmodl
diff --git a/test/unit/codegen/codegen_llvm_instance_struct.cpp b/test/unit/codegen/codegen_llvm_instance_struct.cpp
new file mode 100644
index 0000000000..4bfa1cd31c
--- /dev/null
+++ b/test/unit/codegen/codegen_llvm_instance_struct.cpp
@@ -0,0 +1,174 @@
+/*************************************************************************
+ * Copyright (C) 2018-2020 Blue Brain Project
+ *
+ * This file is part of NMODL distributed under the terms of the GNU
+ * Lesser General Public License. See top-level LICENSE file for details.
+ *************************************************************************/ + +#include + +#include "ast/all.hpp" +#include "ast/program.hpp" +#include "codegen/llvm/codegen_llvm_visitor.hpp" +#include "codegen_data_helper.hpp" +#include "parser/nmodl_driver.hpp" +#include "visitors/checkparent_visitor.hpp" +#include "visitors/neuron_solve_visitor.hpp" +#include "visitors/solve_block_visitor.hpp" +#include "visitors/symtab_visitor.hpp" + +using namespace nmodl; +using namespace codegen; +using namespace visitor; +using nmodl::parser::NmodlDriver; + +//============================================================================= +// Utility to get initialized Struct Instance data +//============================================================================= + +codegen::CodegenInstanceData generate_instance_data(const std::string& text, + bool opt = false, + bool use_single_precision = false, + int vector_width = 1, + size_t num_elements = 100, + size_t seed = 1) { + NmodlDriver driver; + const auto& ast = driver.parse_string(text); + + // Generate full AST and solve the BREAKPOINT block to be able to generate the Instance Struct + SymtabVisitor().visit_program(*ast); + NeuronSolveVisitor().visit_program(*ast); + + codegen::CodegenLLVMVisitor llvm_visitor(/*mod_filename=*/"test", + /*output_dir=*/".", + opt, + use_single_precision, + vector_width); + llvm_visitor.visit_program(*ast); + llvm_visitor.print_module(); + const auto& generated_instance_struct = llvm_visitor.get_instance_struct_ptr(); + auto codegen_data = codegen::CodegenDataHelper(ast, generated_instance_struct); + auto instance_data = codegen_data.create_data(num_elements, seed); + return instance_data; +} + +template +bool compare(void* instance_struct_data_ptr, const std::vector& generated_data) { + std::vector instance_struct_vector; + std::cout << "Generated data size: " << generated_data.size() << std::endl; + instance_struct_vector.assign(static_cast(instance_struct_data_ptr), + static_cast(instance_struct_data_ptr) + + generated_data.size()); + for (auto value: instance_struct_vector) { + std::cout << value << std::endl; + } + return instance_struct_vector == generated_data; +} + +//============================================================================= +// Simple Instance Struct creation +//============================================================================= + +SCENARIO("Instance Struct creation", "[visitor][llvm][instance_struct]") { + GIVEN("Instantiate simple Instance Struct") { + std::string nmodl_text = R"( + NEURON { + SUFFIX test + USEION na READ ena + RANGE minf, mtau + } + + STATE { + m + } + + ASSIGNED { + v (mV) + celsius (degC) + ena (mV) + minf + mtau + } + + BREAKPOINT { + SOLVE states METHOD cnexp + } + + DERIVATIVE states { + m' = (minf-m)/mtau + } + )"; + + + THEN("instance struct elements are properly initialized") { + const size_t num_elements = 10; + constexpr static double seed = 42; + auto instance_data = generate_instance_data(nmodl_text, + /*opt=*/false, + /*use_single_precision=*/true, + /*vector_width*/ 1, + num_elements, + seed); + size_t minf_index = 0; + size_t mtau_index = 1; + size_t m_index = 2; + size_t Dm_index = 3; + size_t ena_index = 4; + size_t v_unused_index = 5; + size_t g_unused_index = 6; + size_t ion_ena_index = 7; + size_t ion_ena_index_index = 8; + size_t voltage_index = 9; + size_t node_index_index = 10; + size_t t_index = 11; + size_t dt_index = 12; + size_t celsius_index = 13; + size_t secondorder_index = 14; + size_t node_count_index = 15; + // Check if 
the various instance struct fields are properly initialized + REQUIRE(compare(instance_data.members[minf_index], + generate_dummy_data(minf_index, num_elements))); + REQUIRE(compare(instance_data.members[ena_index], + generate_dummy_data(ena_index, num_elements))); + REQUIRE(compare(instance_data.members[ion_ena_index], + generate_dummy_data(ion_ena_index, num_elements))); + REQUIRE(compare(instance_data.members[node_index_index], + generate_dummy_data(node_index_index, num_elements))); + REQUIRE(*static_cast(instance_data.members[t_index]) == + default_nthread_t_value); + REQUIRE(*static_cast(instance_data.members[node_count_index]) == num_elements); + + // Hard code TestInstanceType struct + struct TestInstanceType { + double* minf; + double* mtau; + double* m; + double* Dm; + double* ena; + double* v_unused; + double* g_unused; + double* ion_ena; + int* ion_ena_index; + double* voltage; + int* node_index; + double t; + double dt; + double celsius; + int secondorder; + int node_count; + }; + // Test if TestInstanceType struct is properly initialized + // Cast void ptr instance_data.base_ptr to TestInstanceType* + TestInstanceType* instance = (TestInstanceType*) instance_data.base_ptr; + REQUIRE(compare(instance->minf, generate_dummy_data(minf_index, num_elements))); + REQUIRE(compare(instance->ena, generate_dummy_data(ena_index, num_elements))); + REQUIRE(compare(instance->ion_ena, + generate_dummy_data(ion_ena_index, num_elements))); + REQUIRE(compare(instance->node_index, + generate_dummy_data(node_index_index, num_elements))); + REQUIRE(instance->t == default_nthread_t_value); + REQUIRE(instance->celsius == default_celsius_value); + REQUIRE(instance->secondorder == default_second_order_value); + } + } +} From b5d152f6bbc0dfad959dac684a29bc761ac51e8a Mon Sep 17 00:00:00 2001 From: Nicolas Cornu Date: Wed, 17 Mar 2021 12:57:02 +0100 Subject: [PATCH 031/331] Add the remainder loop for vectorization of DERIVATIVE block (#534) * Implement remainder loop along with main vector loop * Add unit test for the same fixes #532 --- .../llvm/codegen_llvm_helper_visitor.cpp | 56 ++++++++---- src/codegen/llvm/codegen_llvm_visitor.cpp | 6 +- test/unit/codegen/codegen_llvm_ir.cpp | 89 ++++++++++++++++++- 3 files changed, 132 insertions(+), 19 deletions(-) diff --git a/src/codegen/llvm/codegen_llvm_helper_visitor.cpp b/src/codegen/llvm/codegen_llvm_helper_visitor.cpp index c8143ac393..0173664a8c 100644 --- a/src/codegen/llvm/codegen_llvm_helper_visitor.cpp +++ b/src/codegen/llvm/codegen_llvm_helper_visitor.cpp @@ -520,11 +520,6 @@ void CodegenLLVMHelperVisitor::visit_nrn_state_block(ast::NrnStateBlock& node) { /// create now main compute part : for loop over channel instances - /// loop constructs : initialization, condition and increment - const auto& initialization = loop_initialization_expression(INDUCTION_VAR); - const auto& condition = create_expression("{} < {}"_format(INDUCTION_VAR, NODECOUNT_VAR)); - const auto& increment = loop_increment_expression(INDUCTION_VAR, vector_width); - /// loop body : initialization + solve blocks ast::StatementVector loop_def_statements; ast::StatementVector loop_index_statements; @@ -583,20 +578,49 @@ void CodegenLLVMHelperVisitor::visit_nrn_state_block(ast::NrnStateBlock& node) { /// now construct a new code block which will become the body of the loop auto loop_block = std::make_shared(loop_body); - /// convert local statement to codegenvar statement - convert_local_statement(*loop_block); + /// main loop possibly vectorized on vector_width + { + /// loop 
constructs : initialization, condition and increment
+        const auto& initialization = loop_initialization_expression(INDUCTION_VAR);
+        const auto& condition = create_expression("{} < {}"_format(INDUCTION_VAR, NODECOUNT_VAR));
+        const auto& increment = loop_increment_expression(INDUCTION_VAR, vector_width);
+
+        /// clone it
+        auto local_loop_block = std::shared_ptr<ast::StatementBlock>(loop_block->clone());
 
-    /// convert local statement to codegenvar statement
-    convert_local_statement(*loop_block);
+        /// convert local statement to codegenvar statement
+        convert_local_statement(*local_loop_block);
 
-    /// create for loop node
-    auto for_loop_statement = std::make_shared<ast::CodegenForStatement>(initialization,
-                                                                         condition,
-                                                                         increment,
-                                                                         loop_block);
+        auto for_loop_statement_main = std::make_shared<ast::CodegenForStatement>(initialization,
+                                                                                  condition,
+                                                                                  increment,
+                                                                                  local_loop_block);
 
-    /// convert all variables inside loop body to instance variables
-    convert_to_instance_variable(*for_loop_statement, loop_index_var);
+        /// convert all variables inside loop body to instance variables
+        convert_to_instance_variable(*for_loop_statement_main, loop_index_var);
+
+        /// loop itself becomes one of the statement in the function
+        function_statements.push_back(for_loop_statement_main);
+    }
 
-    /// loop itself becomes one of the statement in the function
-    function_statements.push_back(for_loop_statement);
+    /// remainder loop possibly vectorized on vector_width
+    {
+        /// loop constructs : initialization, condition and increment
+        const auto& condition = create_expression("{} < {}"_format(INDUCTION_VAR, NODECOUNT_VAR));
+        const auto& increment = loop_increment_expression(INDUCTION_VAR, 1);
+
+        /// convert local statement to codegenvar statement
+        convert_local_statement(*loop_block);
+
+        auto for_loop_statement_remainder =
+            std::make_shared<ast::CodegenForStatement>(nullptr, condition, increment, loop_block);
+
+        /// convert all variables inside loop body to instance variables
+        convert_to_instance_variable(*for_loop_statement_remainder, loop_index_var);
+
+        /// loop itself becomes one of the statement in the function
+        function_statements.push_back(for_loop_statement_remainder);
+    }
 
     /// new block for the function
     auto function_block = new ast::StatementBlock(function_statements);
diff --git a/src/codegen/llvm/codegen_llvm_visitor.cpp b/src/codegen/llvm/codegen_llvm_visitor.cpp
index b1182d36b9..bed88046a7 100644
--- a/src/codegen/llvm/codegen_llvm_visitor.cpp
+++ b/src/codegen/llvm/codegen_llvm_visitor.cpp
@@ -576,8 +576,10 @@ void CodegenLLVMVisitor::visit_codegen_for_statement(const ast::CodegenForStatem
     llvm::BasicBlock* for_inc = llvm::BasicBlock::Create(*context, /*Name=*/"for.inc", func, next);
     llvm::BasicBlock* exit = llvm::BasicBlock::Create(*context, /*Name=*/"for.exit", func, next);
 
-    // First, initialise the loop in the same basic block.
-    node.get_initialization()->accept(*this);
+    // First, initialise the loop in the same basic block. This block is optional.
+    if (node.get_initialization()) {
+        node.get_initialization()->accept(*this);
+    }
 
     // If the loop is to be vectorised, create a separate vector induction variable.
     // \todo: See the comment for `kernel_id_prefix`.
diff --git a/test/unit/codegen/codegen_llvm_ir.cpp b/test/unit/codegen/codegen_llvm_ir.cpp
index 58c1e2a7eb..3ab0c8d929 100644
--- a/test/unit/codegen/codegen_llvm_ir.cpp
+++ b/test/unit/codegen/codegen_llvm_ir.cpp
@@ -8,16 +8,25 @@
 #include
 #include

+#include "test/unit/utils/test_utils.hpp"
+
 #include "ast/program.hpp"
+#include "ast/statement_block.hpp"
+#include "codegen/llvm/codegen_llvm_helper_visitor.hpp"
 #include "codegen/llvm/codegen_llvm_visitor.hpp"
 #include "parser/nmodl_driver.hpp"
 #include "visitors/checkparent_visitor.hpp"
 #include "visitors/neuron_solve_visitor.hpp"
 #include "visitors/solve_block_visitor.hpp"
 #include "visitors/symtab_visitor.hpp"
+#include "visitors/visitor_utils.hpp"

 using namespace nmodl;
+using namespace codegen;
 using namespace visitor;
+
+using namespace test_utils;
+
 using nmodl::parser::NmodlDriver;

 //=============================================================================
@@ -44,6 +53,24 @@ std::string run_llvm_visitor(const std::string& text,
     return llvm_visitor.print_module();
 }

+//=============================================================================
+// Utility to get specific LLVM nodes
+//=============================================================================
+
+std::vector<std::shared_ptr<const ast::Ast>> run_codegen_visitor_helper(const std::string& text) {
+    NmodlDriver driver;
+    const auto& ast = driver.parse_string(text);
+
+    /// construct symbol table and run codegen helper visitor
+    SymtabVisitor().visit_program(*ast);
+    SolveBlockVisitor().visit_program(*ast);
+    CodegenLLVMHelperVisitor(8).visit_program(*ast);
+
+    const auto& nodes = collect_nodes(*ast, {ast::AstNodeType::CODEGEN_FOR_STATEMENT});
+
+    return nodes;
+}
+
 //=============================================================================
 // BinaryExpression and Double
 //=============================================================================
@@ -864,13 +891,73 @@ SCENARIO("Scalar state kernel", "[visitor][llvm]") {

             // Check exit block.
        std::regex exit(
-            "for\\.exit:.*\n"
+            "for\\.exit[0-9]*:.*\n"
             "  ret void");
            REQUIRE(std::regex_search(module_string, m, exit));
        }
    }
}

+//=============================================================================
+// Derivative block : test optimization
+//=============================================================================
+
+SCENARIO("Derivative block", "[visitor][llvm][derivative]") {
+    GIVEN("After helper visitor") {
+        std::string nmodl_text = R"(
+            NEURON {
+                SUFFIX hh
+                RANGE minf, mtau
+            }
+            STATE {
+                m
+            }
+            ASSIGNED {
+                v (mV)
+                minf
+                mtau (ms)
+            }
+            BREAKPOINT {
+                SOLVE states METHOD cnexp
+            }
+            DERIVATIVE states {
+                m = (minf-m)/mtau
+            }
+        )";
+
+        std::string expected_main_loop = R"(
+            for(id = 0; id<mech->node_count; id = id+8) {
+                INTEGER node_id
+                DOUBLE v
+                node_id = mech->node_index[id]
+                v = mech->voltage[node_id]
+                mech->m[id] = (mech->minf[id]-mech->m[id])/mech->mtau[id]
+                SOLVE states METHOD cnexp
+            })";
+        std::string expected_reminder_loop = R"(
+            for(; id<mech->node_count; id = id+1) {
+                INTEGER node_id
+                DOUBLE v
+                node_id = mech->node_index[id]
+                v = mech->voltage[node_id]
+                mech->m[id] = (mech->minf[id]-mech->m[id])/mech->mtau[id]
+                SOLVE states METHOD cnexp
+            })";
+
+
+        THEN("should contains 2 for loops") {
+            auto result = run_codegen_visitor_helper(nmodl_text);
+            REQUIRE(result.size() == 2);
+
+            auto main_loop = reindent_text(to_nmodl(result[0]));
+            REQUIRE(main_loop == reindent_text(expected_main_loop));
+
+            auto reminder_loop = reindent_text(to_nmodl(result[1]));
+            REQUIRE(reminder_loop == reindent_text(expected_reminder_loop));
+        }
+    }
+}
+
 //=============================================================================
 // Optimization : dead code removal
 //=============================================================================

From c7e5e28e84a7a58c8b2f126d52d093cd91996634 Mon Sep 17 00:00:00 2001
From: Nicolas Cornu
Date: Fri, 19 Mar 2021 20:59:19 +0100
Subject: [PATCH 032/331] Always initialize return variable in function block
 (#554)

* return value in PROCEDURE block was not initialised
* do the initialisation as part of AST transformation
* remove initialisation specific code from LLVM visitor

fixes #530
---
 .../llvm/codegen_llvm_helper_visitor.cpp      | 55 +++++++++++--------
 src/codegen/llvm/codegen_llvm_visitor.cpp     |  9 ---
 test/unit/codegen/codegen_llvm_ir.cpp         |  1 +
 3 files changed, 32 insertions(+), 33 deletions(-)

diff --git a/src/codegen/llvm/codegen_llvm_helper_visitor.cpp b/src/codegen/llvm/codegen_llvm_helper_visitor.cpp
index 0173664a8c..ceced6dc77 100644
--- a/src/codegen/llvm/codegen_llvm_helper_visitor.cpp
+++ b/src/codegen/llvm/codegen_llvm_helper_visitor.cpp
@@ -25,6 +25,28 @@ const std::string CodegenLLVMHelperVisitor::NODECOUNT_VAR = "node_count";
 const std::string CodegenLLVMHelperVisitor::VOLTAGE_VAR = "voltage";
 const std::string CodegenLLVMHelperVisitor::NODE_INDEX_VAR = "node_index";

+/// Create asr::Varname node with given a given variable name
+static ast::VarName* create_varname(const std::string& varname) {
+    return new ast::VarName(new ast::Name(new ast::String(varname)), nullptr, nullptr);
+}
+
+/**
+ * Create initialization expression
+ * @param code Usually "id = 0" as a string
+ * @return Expression representing code
+ * \todo : we can not use `create_statement_as_expression` function because
+ * NMODL parser is using `ast::Double` type to represent all variables
+ * including Integer. See #542.
+ */
+static std::shared_ptr<ast::Expression> int_initialization_expression(
+    const std::string& induction_var,
+    int value = 0) {
+    // create id = 0
+    const auto& id = create_varname(induction_var);
+    const auto& zero = new ast::Integer(value, nullptr);
+    return std::make_shared<ast::BinaryExpression>(id, ast::BinaryOperator(ast::BOP_ASSIGN), zero);
+}
+
 /**
  * \brief Create variable definition statement
  *
@@ -120,7 +142,8 @@ void CodegenLLVMHelperVisitor::create_function_for_node(ast::Block& node) {
     auto name = new ast::Name(new ast::String(function_name));

     /// return variable name has "ret_" prefix
-    auto return_var = new ast::Name(new ast::String("ret_" + function_name));
+    std::string return_var_name = "ret_{}"_format(function_name);
+    auto return_var = new ast::Name(new ast::String(return_var_name));

     /// return type based on node type
     ast::CodegenVarType* ret_var_type = nullptr;
@@ -137,6 +160,11 @@ void CodegenLLVMHelperVisitor::create_function_for_node(ast::Block& node) {
     /// convert local statement to codegenvar statement
     convert_local_statement(*block);

+    if (node.get_node_type() == ast::AstNodeType::PROCEDURE_BLOCK) {
+        block->insert_statement(statements.begin(),
+                                std::make_shared<ast::ExpressionStatement>(
+                                    int_initialization_expression(return_var_name)));
+    }
     /// insert return variable at the start of the block
     ast::CodegenVarVector codegen_vars;
     codegen_vars.emplace_back(new ast::CodegenVar(0, return_var->clone()));
@@ -462,30 +490,9 @@ void CodegenLLVMHelperVisitor::visit_function_block(ast::FunctionBlock& node) {
     create_function_for_node(node);
 }

-/// Create asr::Varname node with given a given variable name
-static ast::VarName* create_varname(const std::string& varname) {
-    return new ast::VarName(new ast::Name(new ast::String(varname)), nullptr, nullptr);
-}
-
-/**
- * Create for loop initialization expression
- * @param code Usually "id = 0" as a string
- * @return Expression representing code
- * \todo : we can not use `create_statement_as_expression` function because
- * NMODL parser is using `ast::Double` type to represent all variables
- * including Integer. See #542.
- */ -static std::shared_ptr loop_initialization_expression( - const std::string& induction_var) { - // create id = 0 - const auto& id = create_varname(induction_var); - const auto& zero = new ast::Integer(0, nullptr); - return std::make_shared(id, ast::BinaryOperator(ast::BOP_ASSIGN), zero); -} - /** * Create loop increment expression `id = id + width` - * \todo : same as loop_initialization_expression() + * \todo : same as int_initialization_expression() */ static std::shared_ptr loop_increment_expression(const std::string& induction_var, int vector_width) { @@ -581,7 +588,7 @@ void CodegenLLVMHelperVisitor::visit_nrn_state_block(ast::NrnStateBlock& node) { /// main loop possibly vectorized on vector_width { /// loop constructs : initialization, condition and increment - const auto& initialization = loop_initialization_expression(INDUCTION_VAR); + const auto& initialization = int_initialization_expression(INDUCTION_VAR); const auto& condition = create_expression("{} < {}"_format(INDUCTION_VAR, NODECOUNT_VAR)); const auto& increment = loop_increment_expression(INDUCTION_VAR, vector_width); diff --git a/src/codegen/llvm/codegen_llvm_visitor.cpp b/src/codegen/llvm/codegen_llvm_visitor.cpp index bed88046a7..37b2e7fc67 100644 --- a/src/codegen/llvm/codegen_llvm_visitor.cpp +++ b/src/codegen/llvm/codegen_llvm_visitor.cpp @@ -713,15 +713,6 @@ void CodegenLLVMVisitor::visit_codegen_var_list_statement( throw std::runtime_error("Error: Unsupported local variable type"); } llvm::Value* alloca = builder.CreateAlloca(var_type, /*ArraySize=*/nullptr, name); - - // Check if the variable we process is a procedure return variable (i.e. it has a name - // "ret_" and the function return type is integer). If so, initialise - // it to 0. - std::string ret_val_name = "ret_" + current_func->getName().str(); - if (name == ret_val_name && current_func->getReturnType()->isIntegerTy()) { - llvm::Value* zero = llvm::ConstantInt::get(llvm::Type::getInt32Ty(*context), 0); - builder.CreateStore(zero, alloca); - } } } diff --git a/test/unit/codegen/codegen_llvm_ir.cpp b/test/unit/codegen/codegen_llvm_ir.cpp index 3ab0c8d929..4a0e440aaf 100644 --- a/test/unit/codegen/codegen_llvm_ir.cpp +++ b/test/unit/codegen/codegen_llvm_ir.cpp @@ -686,6 +686,7 @@ SCENARIO("Procedure", "[visitor][llvm]") { REQUIRE(std::regex_search(module_string, m, signature)); REQUIRE(std::regex_search(module_string, m, alloc)); REQUIRE(std::regex_search(module_string, m, store)); + REQUIRE(std::regex_search(module_string, m, load)); REQUIRE(std::regex_search(module_string, m, ret)); } } From 20dc78502960d4ddbd16ef9411ee91a814b20660 Mon Sep 17 00:00:00 2001 From: George Mitenkov Date: Fri, 9 Apr 2021 19:57:54 +0300 Subject: [PATCH 033/331] Running a kernel with NMODL-LLVM JIT (#549) * Added support for arguments in the JIT llvm runner * Adjusted tests and added a simple kernel test * Removed printfs from the kernel * Fixed kernel number of arguments check * Initial integration of dataHelper for kernel tests * Implemented a test to check the scalar kernel execution --- src/codegen/llvm/codegen_llvm_visitor.cpp | 36 +++++ src/codegen/llvm/codegen_llvm_visitor.hpp | 14 ++ src/codegen/llvm/jit_driver.hpp | 36 +++-- src/codegen/llvm/main.cpp | 2 +- test/unit/CMakeLists.txt | 3 +- test/unit/codegen/codegen_llvm_execution.cpp | 151 +++++++++++++++++-- 6 files changed, 218 insertions(+), 24 deletions(-) diff --git a/src/codegen/llvm/codegen_llvm_visitor.cpp b/src/codegen/llvm/codegen_llvm_visitor.cpp index 37b2e7fc67..5fdd906480 100644 --- 
a/src/codegen/llvm/codegen_llvm_visitor.cpp
+++ b/src/codegen/llvm/codegen_llvm_visitor.cpp
@@ -923,5 +923,41 @@ void CodegenLLVMVisitor::visit_while_statement(const ast::WhileStatement& node)
     builder.SetInsertPoint(exit);
 }

+void CodegenLLVMVisitor::wrap_kernel_function(const std::string& kernel_name) {
+    // Get the kernel function and the instance struct type.
+    auto kernel = module->getFunction(kernel_name);
+    if (!kernel)
+        throw std::runtime_error("Kernel " + kernel_name + " is not found!");
+
+    if (std::distance(kernel->args().begin(), kernel->args().end()) != 1)
+        throw std::runtime_error("Kernel " + kernel_name + " must have a single argument!");
+
+    auto instance_struct_ptr_type = llvm::dyn_cast<llvm::PointerType>(kernel->getArg(0)->getType());
+    if (!instance_struct_ptr_type)
+        throw std::runtime_error("Kernel " + kernel_name +
+                                 " does not have an instance struct pointer argument!");
+
+    // Create a wrapper void function that takes a void pointer as a single argument.
+    llvm::Type* void_type = llvm::Type::getVoidTy(*context);
+    llvm::Type* i32_type = llvm::Type::getInt32Ty(*context);
+    llvm::Type* void_ptr_type = llvm::PointerType::get(void_type, /*AddressSpace=*/0);
+    llvm::Function* wrapper_func = llvm::Function::Create(
+        llvm::FunctionType::get(i32_type, {void_ptr_type}, /*isVarArg=*/false),
+        llvm::Function::ExternalLinkage,
+        "__" + kernel_name + "_wrapper",
+        *module);
+    llvm::BasicBlock* body = llvm::BasicBlock::Create(*context, /*Name=*/"", wrapper_func);
+    builder.SetInsertPoint(body);
+
+    // Proceed with bitcasting the void pointer to the struct pointer type, calling the kernel and
+    // adding a terminator.
+    llvm::Value* bitcasted = builder.CreateBitCast(wrapper_func->getArg(0),
+                                                   instance_struct_ptr_type);
+    std::vector<llvm::Value*> args;
+    args.push_back(bitcasted);
+    builder.CreateCall(kernel, args);
+    builder.CreateRet(llvm::ConstantInt::get(i32_type, 0));
+}
+
 }  // namespace codegen
 }  // namespace nmodl
diff --git a/src/codegen/llvm/codegen_llvm_visitor.hpp b/src/codegen/llvm/codegen_llvm_visitor.hpp
index 41235a1ff0..b099646b07 100644
--- a/src/codegen/llvm/codegen_llvm_visitor.hpp
+++ b/src/codegen/llvm/codegen_llvm_visitor.hpp
@@ -237,6 +237,14 @@ class CodegenLLVMVisitor: public visitor::ConstAstVisitor {
      */
     void emit_procedure_or_function_declaration(const ast::CodegenFunction& node);

+    /**
+     * Return InstanceVarHelper
+     * \return InstanceVarHelper
+     */
+    InstanceVarHelper get_instance_var_helper() {
+        return instance_var_helper;
+    }
+
     /**
      * Return module pointer
      * \return LLVM IR module pointer
@@ -321,6 +329,12 @@ class CodegenLLVMVisitor: public visitor::ConstAstVisitor {
         os.flush();
         return str;
     }
+
+    /**
+     * For the given kernel function, wraps it into another function that uses void* to pass the
+     * data to the kernel
+     * \param kernel_name kernel name to be wrapped
+     */
+    void wrap_kernel_function(const std::string& kernel_name);
 };

 /** \} */  // end of llvm_backends
diff --git a/src/codegen/llvm/jit_driver.hpp b/src/codegen/llvm/jit_driver.hpp
index d1e9a9412f..23c8fca612 100644
--- a/src/codegen/llvm/jit_driver.hpp
+++ b/src/codegen/llvm/jit_driver.hpp
@@ -39,15 +39,27 @@ class JITDriver {
     /// Initialize the JIT.
     void init();

-    /// Lookup the entry-point in the JIT and execute it, returning the result.
-    template <typename T>
-    T execute(const std::string& entry_point) {
+    /// Lookup the entry-point without arguments in the JIT and execute it, returning the result.
+    template <typename ReturnType>
+    ReturnType execute_without_arguments(const std::string& entry_point) {
         auto expected_symbol = jit->lookup(entry_point);
         if (!expected_symbol)
             throw std::runtime_error("Error: entry-point symbol not found in JIT\n");

-        auto (*res)() = (T(*)())(intptr_t) expected_symbol->getAddress();
-        T result = res();
+        auto (*res)() = (ReturnType(*)())(intptr_t) expected_symbol->getAddress();
+        ReturnType result = res();
+        return result;
+    }
+
+    /// Lookup the entry-point with an argument in the JIT and execute it, returning the result.
+    template <typename ReturnType, typename ArgType>
+    ReturnType execute_with_arguments(const std::string& entry_point, ArgType arg) {
+        auto expected_symbol = jit->lookup(entry_point);
+        if (!expected_symbol)
+            throw std::runtime_error("Error: entry-point symbol not found in JIT\n");
+
+        auto (*res)(ArgType) = (ReturnType(*)(ArgType))(intptr_t) expected_symbol->getAddress();
+        ReturnType result = res(arg);
         return result;
     }

@@ -71,10 +83,16 @@ class Runner {
         driver->init();
     }

-    /// Run the entry-point function.
-    template <typename T>
-    double run(const std::string& entry_point) {
-        return driver->execute<T>(entry_point);
+    /// Run the entry-point function without arguments.
+    template <typename ReturnType>
+    ReturnType run_without_arguments(const std::string& entry_point) {
+        return driver->template execute_without_arguments<ReturnType>(entry_point);
+    }
+
+    /// Run the entry-point function with a pointer to the data as an argument.
+    template <typename ReturnType, typename ArgType>
+    ReturnType run_with_argument(const std::string& entry_point, ArgType arg) {
+        return driver->template execute_with_arguments<ReturnType, ArgType>(entry_point, arg);
     }
 };

diff --git a/src/codegen/llvm/main.cpp b/src/codegen/llvm/main.cpp
index 11ea178cb4..acbdc37f19 100644
--- a/src/codegen/llvm/main.cpp
+++ b/src/codegen/llvm/main.cpp
@@ -67,7 +67,7 @@ int main(int argc, const char* argv[]) {
     Runner runner(std::move(module));

     // Since only double type is supported, provide explicit double type to the running function.
- auto r = runner.run(entry_point_name); + auto r = runner.run_without_arguments(entry_point_name); fprintf(stderr, "Result: %f\n", r); return 0; diff --git a/test/unit/CMakeLists.txt b/test/unit/CMakeLists.txt index 077706ef8d..631f8090f0 100644 --- a/test/unit/CMakeLists.txt +++ b/test/unit/CMakeLists.txt @@ -99,7 +99,8 @@ if(NMODL_ENABLE_LLVM) include_directories(${LLVM_INCLUDE_DIRS} codegen) add_executable(testllvm visitor/main.cpp codegen/codegen_llvm_ir.cpp codegen/codegen_data_helper.cpp codegen/codegen_llvm_instance_struct.cpp) - add_executable(test_llvm_runner visitor/main.cpp codegen/codegen_llvm_execution.cpp) + add_executable(test_llvm_runner visitor/main.cpp codegen/codegen_data_helper.cpp + codegen/codegen_llvm_execution.cpp) target_link_libraries( testllvm llvm_codegen diff --git a/test/unit/codegen/codegen_llvm_execution.cpp b/test/unit/codegen/codegen_llvm_execution.cpp index 90e8fb3cc2..c0764c7897 100644 --- a/test/unit/codegen/codegen_llvm_execution.cpp +++ b/test/unit/codegen/codegen_llvm_execution.cpp @@ -6,13 +6,15 @@ *************************************************************************/ #include -#include #include "ast/program.hpp" #include "codegen/llvm/codegen_llvm_visitor.hpp" #include "codegen/llvm/jit_driver.hpp" +#include "codegen_data_helper.hpp" #include "parser/nmodl_driver.hpp" #include "visitors/checkparent_visitor.hpp" +#include "visitors/neuron_solve_visitor.hpp" +#include "visitors/solve_block_visitor.hpp" #include "visitors/symtab_visitor.hpp" using namespace nmodl; @@ -23,7 +25,43 @@ using nmodl::parser::NmodlDriver; static double EPSILON = 1e-15; //============================================================================= -// No optimisations +// Utilities for testing. +//============================================================================= + +struct InstanceTestInfo { + codegen::CodegenInstanceData& instance; + codegen::CodegenLLVMVisitor& visitor; + int num_elements; +}; + +template +bool check_instance_variable(InstanceTestInfo& instance_info, + std::vector& expected, + const std::string& variable_name) { + std::vector actual; + int variable_index = instance_info.visitor.get_instance_var_helper().get_variable_index( + variable_name); + actual.assign(static_cast(instance_info.instance.members[variable_index]), + static_cast(instance_info.instance.members[variable_index]) + + instance_info.num_elements); + // While we are comparing double types as well, for simplicity the test cases are hand-crafted + // so that no floating-point arithmetic is really involved. 
+ return actual == expected; +} + +template +void initialise_instance_variable(InstanceTestInfo& instance_info, + std::vector& data, + const std::string& variable_name) { + int variable_index = instance_info.visitor.get_instance_var_helper().get_variable_index( + variable_name); + T* data_start = static_cast(instance_info.instance.members[variable_index]); + for (int i = 0; i < instance_info.num_elements; ++i) + *(data_start + i) = data[i]; +} + +//============================================================================= +// Simple functions: no optimisations //============================================================================= SCENARIO("Arithmetic expression", "[llvm][runner]") { @@ -60,6 +98,10 @@ SCENARIO("Arithmetic expression", "[llvm][runner]") { PROCEDURE foo() {} + FUNCTION with_argument(x) { + with_argument = x + } + FUNCTION loop() { LOCAL i, j, sum, result result = 0 @@ -92,26 +134,31 @@ SCENARIO("Arithmetic expression", "[llvm][runner]") { Runner runner(std::move(m)); THEN("functions are evaluated correctly") { - auto exp_result = runner.run("exponential"); + auto exp_result = runner.run_without_arguments("exponential"); REQUIRE(fabs(exp_result - 2.718281828459045) < EPSILON); - auto constant_result = runner.run("constant"); + auto constant_result = runner.run_without_arguments("constant"); REQUIRE(fabs(constant_result - 10.0) < EPSILON); - auto arithmetic_result = runner.run("arithmetic"); + auto arithmetic_result = runner.run_without_arguments("arithmetic"); REQUIRE(fabs(arithmetic_result - 2.1) < EPSILON); - auto function_call_result = runner.run("function_call"); + auto function_call_result = runner.run_without_arguments("function_call"); REQUIRE(fabs(function_call_result - 1.0) < EPSILON); - auto loop_result = runner.run("loop"); + double data = 10.0; + auto with_argument_result = runner.run_with_argument("with_argument", + data); + REQUIRE(fabs(with_argument_result - 10.0) < EPSILON); + + auto loop_result = runner.run_without_arguments("loop"); REQUIRE(fabs(loop_result - 90.0) < EPSILON); } } } //============================================================================= -// With optimisations +// Simple functions: with optimisations //============================================================================= SCENARIO("Optimised arithmetic expression", "[llvm][runner]") { @@ -189,23 +236,101 @@ SCENARIO("Optimised arithmetic expression", "[llvm][runner]") { THEN("optimizations preserve function results") { // Check exponential is turned into a constant. - auto exp_result = runner.run("exponential"); + auto exp_result = runner.run_without_arguments("exponential"); REQUIRE(fabs(exp_result - 2.718281828459045) < EPSILON); // Check constant folding. - auto constant_result = runner.run("constant"); + auto constant_result = runner.run_without_arguments("constant"); REQUIRE(fabs(constant_result - 10.0) < EPSILON); // Check nested conditionals - auto conditionals_result = runner.run("conditionals"); + auto conditionals_result = runner.run_without_arguments("conditionals"); REQUIRE(fabs(conditionals_result - 4.0) < EPSILON); // Check constant folding. 
- auto arithmetic_result = runner.run("arithmetic"); + auto arithmetic_result = runner.run_without_arguments("arithmetic"); REQUIRE(fabs(arithmetic_result - 2.1) < EPSILON); - auto function_call_result = runner.run("function_call"); + auto function_call_result = runner.run_without_arguments("function_call"); REQUIRE(fabs(function_call_result - 1.0) < EPSILON); } } } + +//============================================================================= +// State scalar kernel. +//============================================================================= + +SCENARIO("Simple scalar kernel", "[llvm][runner]") { + GIVEN("Simple MOD file with a state update") { + std::string nmodl_text = R"( + NEURON { + SUFFIX test + NONSPECIFIC_CURRENT i + RANGE x0, x1 + } + + STATE { + x + } + + ASSIGNED { + v + x0 + x1 + } + + BREAKPOINT { + SOLVE states METHOD cnexp + i = 0 + } + + DERIVATIVE states { + x = (x0 - x) / x1 + } + )"; + + + NmodlDriver driver; + const auto& ast = driver.parse_string(nmodl_text); + + // Run passes on the AST to generate LLVM. + SymtabVisitor().visit_program(*ast); + NeuronSolveVisitor().visit_program(*ast); + SolveBlockVisitor().visit_program(*ast); + codegen::CodegenLLVMVisitor llvm_visitor(/*mod_filename=*/"unknown", + /*output_dir=*/".", + /*opt_passes=*/false, + /*use_single_precision=*/false, + /*vector_width=*/1); + llvm_visitor.visit_program(*ast); + llvm_visitor.wrap_kernel_function("nrn_state_test"); + + // Create the instance struct data. + int num_elements = 4; + const auto& generated_instance_struct = llvm_visitor.get_instance_struct_ptr(); + auto codegen_data = codegen::CodegenDataHelper(ast, generated_instance_struct); + auto instance_data = codegen_data.create_data(num_elements, /*seed=*/1); + + // Fill the instance struct data with some values. + std::vector x = {1.0, 2.0, 3.0, 4.0}; + std::vector x0 = {5.0, 5.0, 5.0, 5.0}; + std::vector x1 = {1.0, 1.0, 1.0, 1.0}; + + InstanceTestInfo instance_info{instance_data, llvm_visitor, num_elements}; + initialise_instance_variable(instance_info, x, "x"); + initialise_instance_variable(instance_info, x0, "x0"); + initialise_instance_variable(instance_info, x1, "x1"); + + // Set up the JIT runner. 
+            std::unique_ptr<llvm::Module> module = llvm_visitor.get_module();
+            Runner runner(std::move(module));
+
+            THEN("Values in struct have changed according to the formula") {
+                runner.run_with_argument<int, void*>("__nrn_state_test_wrapper",
+                                                     instance_data.base_ptr);
+                std::vector<double> x_expected = {4.0, 3.0, 2.0, 1.0};
+                REQUIRE(check_instance_variable(instance_info, x_expected, "x"));
+            }
+        }
+    }
+}

From f6844660b335355cf9ab9bf5791686d61eebf3b6 Mon Sep 17 00:00:00 2001
From: George Mitenkov
Date: Fri, 9 Apr 2021 21:42:09 +0300
Subject: [PATCH 034/331] Loop epilogue fix for LLVM visitor helper (#567)

* Added renaming for loop local variables in CodegenForStatement
* Fixed trip count in main loop and removed epilogue loop for scalar case
* Refactored loop remainder tests and added a scalar case
* Change `reminder` to `epilogue` in the test
---
 .../llvm/codegen_llvm_helper_visitor.cpp      | 68 +++++++++++++--
 test/unit/codegen/codegen_llvm_ir.cpp         | 84 +++++++++++++++----
 2 files changed, 126 insertions(+), 26 deletions(-)

diff --git a/src/codegen/llvm/codegen_llvm_helper_visitor.cpp b/src/codegen/llvm/codegen_llvm_helper_visitor.cpp
index ceced6dc77..c3e9159dfa 100644
--- a/src/codegen/llvm/codegen_llvm_helper_visitor.cpp
+++ b/src/codegen/llvm/codegen_llvm_helper_visitor.cpp
@@ -11,6 +11,7 @@
 #include "ast/all.hpp"
 #include "codegen/codegen_helper_visitor.hpp"
 #include "utils/logger.hpp"
+#include "visitors/rename_visitor.hpp"
 #include "visitors/visitor_utils.hpp"

 namespace nmodl {
@@ -25,6 +26,8 @@ const std::string CodegenLLVMHelperVisitor::NODECOUNT_VAR = "node_count";
 const std::string CodegenLLVMHelperVisitor::VOLTAGE_VAR = "voltage";
 const std::string CodegenLLVMHelperVisitor::NODE_INDEX_VAR = "node_index";

+static constexpr const char epilogue_variable_prefix[] = "epilogue_";
+
 /// Create asr::Varname node with given a given variable name
 static ast::VarName* create_varname(const std::string& varname) {
     return new ast::VarName(new ast::Name(new ast::String(varname)), nullptr, nullptr);
@@ -507,6 +510,39 @@ static std::shared_ptr<ast::Expression> loop_increment_expression(const std::str
                                                   inc_expr);
 }

+/**
+ * Create loop count comparison expression
+ *
+ * Based on if loop is vectorised or not, the condition for loop
+ * is different.
For example: + * - serial loop : `id < node_count` + * - vector loop : `id < (node_count - vector_width + 1)` + * + * \todo : same as int_initialization_expression() + */ +static std::shared_ptr loop_count_expression(const std::string& induction_var, + const std::string& node_count, + int vector_width) { + const auto& id = create_varname(induction_var); + const auto& mech_node_count = create_varname(node_count); + + // For non-vectorised loop, the condition is id < mech->node_count + if (vector_width == 1) { + return std::make_shared(id->clone(), + ast::BinaryOperator(ast::BOP_LESS), + mech_node_count); + } + + // For vectorised loop, the condition is id < mech->node_count - vector_width + 1 + const auto& remainder = new ast::Integer(vector_width - 1, /*macro=*/nullptr); + const auto& count = new ast::BinaryExpression(mech_node_count, + ast::BinaryOperator(ast::BOP_SUBTRACTION), + remainder); + return std::make_shared(id->clone(), + ast::BinaryOperator(ast::BOP_LESS), + count); +} + /** * \brief Convert ast::NrnStateBlock to corresponding code generation function nrn_state * @param node AST node representing ast::NrnStateBlock @@ -522,8 +558,9 @@ void CodegenLLVMHelperVisitor::visit_nrn_state_block(ast::NrnStateBlock& node) { /// create variable definition for loop index and insert at the beginning std::string loop_index_var = "id"; - std::vector int_variables{"id"}; - function_statements.push_back(create_local_variable_statement(int_variables, INTEGER_TYPE)); + std::vector induction_variables{"id"}; + function_statements.push_back( + create_local_variable_statement(induction_variables, INTEGER_TYPE)); /// create now main compute part : for loop over channel instances @@ -531,10 +568,10 @@ void CodegenLLVMHelperVisitor::visit_nrn_state_block(ast::NrnStateBlock& node) { ast::StatementVector loop_def_statements; ast::StatementVector loop_index_statements; ast::StatementVector loop_body_statements; - { - std::vector int_variables{"node_id"}; - std::vector double_variables{"v"}; + std::vector int_variables{"node_id"}; + std::vector double_variables{"v"}; + { /// access node index and corresponding voltage loop_index_statements.push_back( visitor::create_statement("node_id = node_index[{}]"_format(INDUCTION_VAR))); @@ -589,7 +626,7 @@ void CodegenLLVMHelperVisitor::visit_nrn_state_block(ast::NrnStateBlock& node) { { /// loop constructs : initialization, condition and increment const auto& initialization = int_initialization_expression(INDUCTION_VAR); - const auto& condition = create_expression("{} < {}"_format(INDUCTION_VAR, NODECOUNT_VAR)); + const auto& condition = loop_count_expression(INDUCTION_VAR, NODECOUNT_VAR, vector_width); const auto& increment = loop_increment_expression(INDUCTION_VAR, vector_width); /// clone it @@ -611,10 +648,11 @@ void CodegenLLVMHelperVisitor::visit_nrn_state_block(ast::NrnStateBlock& node) { } /// remainder loop possibly vectorized on vector_width - { + if (vector_width > 1) { /// loop constructs : initialization, condition and increment - const auto& condition = create_expression("{} < {}"_format(INDUCTION_VAR, NODECOUNT_VAR)); - const auto& increment = loop_increment_expression(INDUCTION_VAR, 1); + const auto& condition = + loop_count_expression(INDUCTION_VAR, NODECOUNT_VAR, /*vector_width=*/1); + const auto& increment = loop_increment_expression(INDUCTION_VAR, /*vector_width=*/1); /// convert local statement to codegenvar statement convert_local_statement(*loop_block); @@ -622,6 +660,18 @@ void 
CodegenLLVMHelperVisitor::visit_nrn_state_block(ast::NrnStateBlock& node) {
         auto for_loop_statement_remainder =
             std::make_shared<ast::CodegenForStatement>(nullptr, condition, increment, loop_block);

+        const auto& loop_statements = for_loop_statement_remainder->get_statement_block();
+        // \todo: Change RenameVisitor to take a vector of names to which it would append a single
+        // prefix.
+        for (const auto& name: int_variables) {
+            visitor::RenameVisitor v(name, epilogue_variable_prefix + name);
+            loop_statements->accept(v);
+        }
+        for (const auto& name: double_variables) {
+            visitor::RenameVisitor v(name, epilogue_variable_prefix + name);
+            loop_statements->accept(v);
+        }
+
         /// convert all variables inside loop body to instance variables
         convert_to_instance_variable(*for_loop_statement_remainder, loop_index_var);

diff --git a/test/unit/codegen/codegen_llvm_ir.cpp b/test/unit/codegen/codegen_llvm_ir.cpp
index 4a0e440aaf..b51a4e3d58 100644
--- a/test/unit/codegen/codegen_llvm_ir.cpp
+++ b/test/unit/codegen/codegen_llvm_ir.cpp
@@ -54,19 +54,21 @@ std::string run_llvm_visitor(const std::string& text,
 }

 //=============================================================================
-// Utility to get specific LLVM nodes
+// Utility to get specific NMODL AST nodes
 //=============================================================================

-std::vector<std::shared_ptr<const ast::Ast>> run_codegen_visitor_helper(const std::string& text) {
+std::vector<std::shared_ptr<const ast::Ast>> run_llvm_visitor_helper(
+    const std::string& text,
+    int vector_width,
+    const std::vector<ast::AstNodeType>& nodes_to_collect) {
     NmodlDriver driver;
     const auto& ast = driver.parse_string(text);

-    /// construct symbol table and run codegen helper visitor
     SymtabVisitor().visit_program(*ast);
     SolveBlockVisitor().visit_program(*ast);
-    CodegenLLVMHelperVisitor(8).visit_program(*ast);
+    CodegenLLVMHelperVisitor(vector_width).visit_program(*ast);

-    const auto& nodes = collect_nodes(*ast, {ast::AstNodeType::CODEGEN_FOR_STATEMENT});
+    const auto& nodes = collect_nodes(*ast, nodes_to_collect);

     return nodes;
 }
@@ -903,11 +905,12 @@ SCENARIO("Scalar state kernel", "[visitor][llvm]") {
 // Derivative block : test optimization
 //=============================================================================

-SCENARIO("Derivative block", "[visitor][llvm][derivative]") {
-    GIVEN("After helper visitor") {
+SCENARIO("Scalar derivative block", "[visitor][llvm][derivative]") {
+    GIVEN("After LLVM helper visitor transformations") {
         std::string nmodl_text = R"(
             NEURON {
                 SUFFIX hh
+                NONSPECIFIC_CURRENT il
                 RANGE minf, mtau
             }
             STATE {
                 m
             }
             ASSIGNED {
                 v (mV)
                 minf
                 mtau (ms)
             }
             BREAKPOINT {
                 SOLVE states METHOD cnexp
+                il = 2
             }
             DERIVATIVE states {
                 m = (minf-m)/mtau
             }
         )";

-        std::string expected_main_loop = R"(
-            for(id = 0; id<mech->node_count; id = id+8) {
+        std::string expected_loop = R"(
+            for(id = 0; id<mech->node_count; id = id+1) {
                 INTEGER node_id
                 DOUBLE v
                 node_id = mech->node_index[id]
                 v = mech->voltage[node_id]
                 mech->m[id] = (mech->minf[id]-mech->m[id])/mech->mtau[id]
-                SOLVE states METHOD cnexp
             })";

+        THEN("a single scalar loops is constructed") {
+            auto result = run_llvm_visitor_helper(nmodl_text,
+                                                  /*vector_width=*/1,
+                                                  {ast::AstNodeType::CODEGEN_FOR_STATEMENT});
+            REQUIRE(result.size() == 1);
+
+            auto main_loop = reindent_text(to_nmodl(result[0]));
+            REQUIRE(main_loop == reindent_text(expected_loop));
+        }
+    }
+}
+
+SCENARIO("Vectorised derivative block", "[visitor][llvm][derivative]") {
+    GIVEN("After LLVM helper visitor transformations") {
+        std::string nmodl_text = R"(
+            NEURON {
+                SUFFIX hh
+                NONSPECIFIC_CURRENT il
+                RANGE minf, mtau
+            }
+            STATE {
+                m
+            }
+            ASSIGNED {
+                v (mV)
+                minf
+                mtau (ms)
+            }
+            BREAKPOINT {
+                SOLVE states METHOD cnexp
+                il = 2
+            }
+            DERIVATIVE states {
+                m = (minf-m)/mtau
+            }
+        )";

+        std::string expected_main_loop = R"(
+            for(id = 0; id<mech->node_count-7; id = id+8) {
+                INTEGER node_id
+                DOUBLE v
+                node_id = mech->node_index[id]
+                v = mech->voltage[node_id]
+                mech->m[id] = (mech->minf[id]-mech->m[id])/mech->mtau[id]
+            })";
+        std::string expected_epilogue_loop = R"(
+            for(; id<mech->node_count; id = id+1) {
+                INTEGER epilogue_node_id
+                DOUBLE epilogue_v
+                epilogue_node_id = mech->node_index[id]
+                epilogue_v = mech->voltage[epilogue_node_id]
+                mech->m[id] = (mech->minf[id]-mech->m[id])/mech->mtau[id]
+            })";

-        THEN("should contains 2 for loops") {
-            auto result = run_codegen_visitor_helper(nmodl_text);
+        THEN("vector and epilogue scalar loops are constructed") {
+            auto result = run_llvm_visitor_helper(nmodl_text,
+                                                  /*vector_width=*/8,
+                                                  {ast::AstNodeType::CODEGEN_FOR_STATEMENT});
             REQUIRE(result.size() == 2);

             auto main_loop = reindent_text(to_nmodl(result[0]));
             REQUIRE(main_loop == reindent_text(expected_main_loop));

-            auto reminder_loop = reindent_text(to_nmodl(result[1]));
-            REQUIRE(reminder_loop == reindent_text(expected_reminder_loop));
+            auto epilogue_loop = reindent_text(to_nmodl(result[1]));
+            REQUIRE(epilogue_loop == reindent_text(expected_epilogue_loop));
         }
     }
 }

From cc92fc27655a3bcc0a25f20ea86aae363170c738 Mon Sep 17 00:00:00 2001
From: George Mitenkov
Date: Sat, 10 Apr 2021 22:00:50 +0300
Subject: [PATCH 035/331] Gather support and vectorisation fixes for LLVM code
 generation (#568)

* Add gather support
* Fixed vectorisation patterns and added simple JIT tests
* Added IR regex test for gather
---
 src/codegen/llvm/codegen_llvm_visitor.cpp     | 103 ++++++++-----------
 test/unit/codegen/codegen_llvm_execution.cpp  | 103 +++++++++++++++++--
 test/unit/codegen/codegen_llvm_ir.cpp         |  55 ++++++++++
 3 files changed, 191 insertions(+), 70 deletions(-)

diff --git a/src/codegen/llvm/codegen_llvm_visitor.cpp b/src/codegen/llvm/codegen_llvm_visitor.cpp
index 5fdd906480..a42201824c 100644
--- a/src/codegen/llvm/codegen_llvm_visitor.cpp
+++ b/src/codegen/llvm/codegen_llvm_visitor.cpp
@@ -23,11 +23,6 @@ namespace codegen {

 static constexpr const char instance_struct_type_name[] = "__instance_var__type";

-// The prefix is used to create a vectorised id that can be used as index to GEPs. However, for
-// simple aligned vector loads and stores vector id is not needed. This is because we can bitcast
-// the pointer to the vector pointer! \todo: Consider removing this.
-static constexpr const char kernel_id_prefix[] = "__vec_";
-

 /****************************************************************************************/
 /*                            Helper routines                                           */
 /****************************************************************************************/
@@ -88,12 +83,11 @@ llvm::Value* CodegenLLVMVisitor::codegen_instance_var(const ast::CodegenInstance
     // Proceed to creating a GEP instruction to get the pointer to the member's element.
     auto member_indexed_name = std::dynamic_pointer_cast<ast::IndexedName>(
         member_var_name->get_name());

-    llvm::Value* i64_index = get_array_index(*member_indexed_name);
+    if (!member_indexed_name->get_length()->is_name())
+        throw std::runtime_error("Error: " + member_name + " must be indexed with a variable!");

-    // Create a indices vector for GEP to return the pointer to the element at the specified index.
- std::vector member_indices; - member_indices.push_back(i64_index); + llvm::Value* i64_index = get_array_index(*member_indexed_name); // The codegen variable type is always a scalar, so we need to transform it to a pointer. Then // load the member which would be indexed later. @@ -101,18 +95,25 @@ llvm::Value* CodegenLLVMVisitor::codegen_instance_var(const ast::CodegenInstance llvm::Value* instance_member = builder.CreateLoad(llvm::PointerType::get(type, /*AddressSpace=*/0), member_ptr); + // Check if the code is vectorised and the index is indirect. + std::string id = member_indexed_name->get_length()->get_node_name(); + if (id != kernel_id && is_kernel_code && vector_width > 1) { + // Calculate a vector of addresses via GEP instruction, and then created a gather to load + // indirectly. + llvm::Value* addresses = builder.CreateInBoundsGEP(instance_member, {i64_index}); + return builder.CreateMaskedGather(addresses, llvm::Align()); + } + + llvm::Value* member_addr = builder.CreateInBoundsGEP(instance_member, {i64_index}); // If the code is vectorised, then bitcast to a vector pointer. if (is_kernel_code && vector_width > 1) { llvm::Type* vector_type = llvm::PointerType::get(llvm::FixedVectorType::get(type, vector_width), /*AddressSpace=*/0); - llvm::Value* instance_member_bitcasted = builder.CreateBitCast(instance_member, - vector_type); - return builder.CreateInBoundsGEP(instance_member_bitcasted, member_indices); + return builder.CreateBitCast(member_addr, vector_type); } - - return builder.CreateInBoundsGEP(instance_member, member_indices); + return member_addr; } llvm::Value* CodegenLLVMVisitor::get_array_index(const ast::IndexedName& node) { @@ -135,12 +136,19 @@ llvm::Value* CodegenLLVMVisitor::get_array_index(const ast::IndexedName& node) { throw std::runtime_error("Error: only integer indexing is supported!"); // Conventionally, in LLVM array indices are 64 bit. 
- auto index_type = llvm::cast(index_value->getType()); llvm::Type* i64_type = llvm::Type::getInt64Ty(*context); - if (index_type->getBitWidth() == i64_type->getIntegerBitWidth()) - return index_value; + if (auto index_type = llvm::dyn_cast(index_value->getType())) { + if (index_type->getBitWidth() == i64_type->getIntegerBitWidth()) + return index_value; + return builder.CreateSExtOrTrunc(index_value, i64_type); + } - return builder.CreateSExtOrTrunc(index_value, i64_type); + auto vector_type = llvm::cast(index_value->getType()); + auto element_type = llvm::cast(vector_type->getElementType()); + if (element_type->getBitWidth() == i64_type->getIntegerBitWidth()) + return index_value; + return builder.CreateSExtOrTrunc(index_value, + llvm::FixedVectorType::get(i64_type, vector_width)); } int CodegenLLVMVisitor::get_array_length(const ast::IndexedName& node) { @@ -167,8 +175,6 @@ llvm::Type* CodegenLLVMVisitor::get_codegen_var_type(const ast::CodegenVarType& return llvm::Type::getInt32Ty(*context); case ast::AstNodeType::VOID: return llvm::Type::getVoidTy(*context); - // TODO :: George/Ioannis : Here we have to also return INSTANCE_STRUCT type - // as it is used as an argument to nrn_state function default: throw std::runtime_error("Error: expecting a type in CodegenVarType node\n"); } @@ -576,31 +582,15 @@ void CodegenLLVMVisitor::visit_codegen_for_statement(const ast::CodegenForStatem llvm::BasicBlock* for_inc = llvm::BasicBlock::Create(*context, /*Name=*/"for.inc", func, next); llvm::BasicBlock* exit = llvm::BasicBlock::Create(*context, /*Name=*/"for.exit", func, next); - // First, initialise the loop in the same basic block. This block is optional. + // First, initialise the loop in the same basic block. This block is optional. Also, reset + // vector width to 1 if processing the remainder of the loop. + int tmp_vector_width = vector_width; if (node.get_initialization()) { node.get_initialization()->accept(*this); + } else { + vector_width = 1; } - // If the loop is to be vectorised, create a separate vector induction variable. - // \todo: See the comment for `kernel_id_prefix`. - if (vector_width > 1) { - // First, create a vector type and alloca for it. - llvm::Type* i32_type = llvm::Type::getInt32Ty(*context); - llvm::Type* vec_type = llvm::FixedVectorType::get(i32_type, vector_width); - llvm::Value* vec_alloca = builder.CreateAlloca(vec_type, - /*ArraySize=*/nullptr, - /*Name=*/kernel_id_prefix + kernel_id); - - // Then, store the initial value of <0, 1, ..., [W-1]> o the alloca pointer, where W is the - // vector width. - std::vector constants; - for (unsigned i = 0; i < vector_width; ++i) { - const auto& element = llvm::ConstantInt::get(i32_type, i); - constants.push_back(element); - } - llvm::Value* vector_id = llvm::ConstantVector::get(constants); - builder.CreateStore(vector_id, vec_alloca); - } // Branch to condition basic block and insert condition code there. builder.CreateBr(for_cond); builder.SetInsertPoint(for_cond); @@ -623,23 +613,11 @@ void CodegenLLVMVisitor::visit_codegen_for_statement(const ast::CodegenForStatem builder.SetInsertPoint(for_inc); node.get_increment()->accept(*this); - // If the code is vectorised, then increment the vector id by where W is the + // Create a branch to condition block, then generate exit code out of the loop. Restore the // vector width. - // \todo: See the comment for `kernel_id_prefix`. - if (vector_width > 1) { - // First, create an increment vector. 
- llvm::Value* vector_inc = get_constant_int_vector(vector_width); - - // Increment the kernel id elements by a constant vector width. - llvm::Value* vector_id_ptr = lookup(kernel_id_prefix + kernel_id); - llvm::Value* vector_id = builder.CreateLoad(vector_id_ptr); - llvm::Value* incremented = builder.CreateAdd(vector_id, vector_inc); - builder.CreateStore(incremented, vector_id_ptr); - } - - // Create a branch to condition block, then generate exit code out of the loop. builder.CreateBr(for_cond); builder.SetInsertPoint(exit); + vector_width = tmp_vector_width; } @@ -707,8 +685,12 @@ void CodegenLLVMVisitor::visit_codegen_var_list_statement( int length = get_array_length(*indexed_name); var_type = llvm::ArrayType::get(scalar_var_type, length); } else if (identifier->is_name()) { - // This case corresponds to a scalar local variable. Its type is double by default. - var_type = scalar_var_type; + // This case corresponds to a scalar or vector local variable. + if (is_kernel_code && vector_width > 1) { + var_type = llvm::FixedVectorType::get(scalar_var_type, vector_width); + } else { + var_type = scalar_var_type; + } } else { throw std::runtime_error("Error: Unsupported local variable type"); } @@ -881,10 +863,11 @@ void CodegenLLVMVisitor::visit_unary_expression(const ast::UnaryExpression& node void CodegenLLVMVisitor::visit_var_name(const ast::VarName& node) { llvm::Value* ptr = get_variable_ptr(node); - // Finally, load the variable from the pointer value. - llvm::Value* var = builder.CreateLoad(ptr); + // Finally, load the variable from the pointer value unless it has already been loaded (e.g. via + // gather instruction). + llvm::Value* var = ptr->getType()->isPointerTy() ? builder.CreateLoad(ptr) : ptr; - // If the vale should not be vectorised, or it is already a vector, add it to the stack. + // If the value should not be vectorised, or it is already a vector, add it to the stack. if (!is_kernel_code || vector_width <= 1 || var->getType()->isVectorTy()) { values.push_back(var); return; diff --git a/test/unit/codegen/codegen_llvm_execution.cpp b/test/unit/codegen/codegen_llvm_execution.cpp index c0764c7897..782a3374b8 100644 --- a/test/unit/codegen/codegen_llvm_execution.cpp +++ b/test/unit/codegen/codegen_llvm_execution.cpp @@ -29,8 +29,8 @@ static double EPSILON = 1e-15; //============================================================================= struct InstanceTestInfo { - codegen::CodegenInstanceData& instance; - codegen::CodegenLLVMVisitor& visitor; + codegen::CodegenInstanceData* instance; + codegen::InstanceVarHelper helper; int num_elements; }; @@ -39,11 +39,11 @@ bool check_instance_variable(InstanceTestInfo& instance_info, std::vector& expected, const std::string& variable_name) { std::vector actual; - int variable_index = instance_info.visitor.get_instance_var_helper().get_variable_index( - variable_name); - actual.assign(static_cast(instance_info.instance.members[variable_index]), - static_cast(instance_info.instance.members[variable_index]) + + int variable_index = instance_info.helper.get_variable_index(variable_name); + actual.assign(static_cast(instance_info.instance->members[variable_index]), + static_cast(instance_info.instance->members[variable_index]) + instance_info.num_elements); + // While we are comparing double types as well, for simplicity the test cases are hand-crafted // so that no floating-point arithmetic is really involved. 
return actual == expected; @@ -53,9 +53,8 @@ template void initialise_instance_variable(InstanceTestInfo& instance_info, std::vector& data, const std::string& variable_name) { - int variable_index = instance_info.visitor.get_instance_var_helper().get_variable_index( - variable_name); - T* data_start = static_cast(instance_info.instance.members[variable_index]); + int variable_index = instance_info.helper.get_variable_index(variable_name); + T* data_start = static_cast(instance_info.instance->members[variable_index]); for (int i = 0; i < instance_info.num_elements; ++i) *(data_start + i) = data[i]; } @@ -317,7 +316,9 @@ SCENARIO("Simple scalar kernel", "[llvm][runner]") { std::vector x0 = {5.0, 5.0, 5.0, 5.0}; std::vector x1 = {1.0, 1.0, 1.0, 1.0}; - InstanceTestInfo instance_info{instance_data, llvm_visitor, num_elements}; + InstanceTestInfo instance_info{&instance_data, + llvm_visitor.get_instance_var_helper(), + num_elements}; initialise_instance_variable(instance_info, x, "x"); initialise_instance_variable(instance_info, x0, "x0"); initialise_instance_variable(instance_info, x1, "x1"); @@ -334,3 +335,85 @@ SCENARIO("Simple scalar kernel", "[llvm][runner]") { } } } + +//============================================================================= +// State vectorised kernel with optimisations on. +//============================================================================= + +SCENARIO("Simple vectorised kernel", "[llvm][runner]") { + GIVEN("Simple MOD file with a state update") { + std::string nmodl_text = R"( + NEURON { + SUFFIX test + NONSPECIFIC_CURRENT i + RANGE x0, x1 + } + + STATE { + x + } + + ASSIGNED { + v + x0 + x1 + } + + BREAKPOINT { + SOLVE states METHOD cnexp + i = 0 + } + + DERIVATIVE states { + x = (x0 - x) / x1 + } + )"; + + + NmodlDriver driver; + const auto& ast = driver.parse_string(nmodl_text); + + // Run passes on the AST to generate LLVM. + SymtabVisitor().visit_program(*ast); + NeuronSolveVisitor().visit_program(*ast); + SolveBlockVisitor().visit_program(*ast); + codegen::CodegenLLVMVisitor llvm_visitor(/*mod_filename=*/"unknown", + /*output_dir=*/".", + /*opt_passes=*/true, + /*use_single_precision=*/false, + /*vector_width=*/4); + llvm_visitor.visit_program(*ast); + llvm_visitor.wrap_kernel_function("nrn_state_test"); + + // Create the instance struct data. + int num_elements = 10; + const auto& generated_instance_struct = llvm_visitor.get_instance_struct_ptr(); + auto codegen_data = codegen::CodegenDataHelper(ast, generated_instance_struct); + auto instance_data = codegen_data.create_data(num_elements, /*seed=*/1); + + // Fill the instance struct data with some values for unit testing. + std::vector x = {1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0}; + std::vector x0 = {11.0, 11.0, 11.0, 11.0, 11.0, 11.0, 11.0, 11.0, 11.0, 11.0}; + std::vector x1 = {1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0}; + + InstanceTestInfo instance_info{&instance_data, + llvm_visitor.get_instance_var_helper(), + num_elements}; + initialise_instance_variable(instance_info, x, "x"); + initialise_instance_variable(instance_info, x0, "x0"); + initialise_instance_variable(instance_info, x1, "x1"); + + // Set up the JIT runner. 
+    std::unique_ptr<llvm::Module> module = llvm_visitor.get_module();
+    Runner runner(std::move(module));
+
+    THEN("Values in struct have changed according to the formula") {
+        runner.run_with_argument<int, void*>("__nrn_state_test_wrapper",
+                                             instance_data.base_ptr);
+        std::vector<double> x_expected = {10.0, 9.0, 8.0, 7.0, 6.0, 5.0, 4.0, 3.0, 2.0, 1.0};
+
+        // Check that the main and remainder loops correctly change the data stored in x.
+        REQUIRE(check_instance_variable(instance_info, x_expected, "x"));
+    }
+  }
+}

From 3803ae1237ff7b42c216830b40c6554c796077a0 Mon Sep 17 00:00:00 2001
From: George Mitenkov
Date: Tue, 13 Apr 2021 08:31:28 +0300
Subject: [PATCH 036/331] Verification and file utilities for LLVM IR codegen
 (#582)

Added several minor improvements to the current pipeline infrastructure.
Particularly, the following was addressed:
- The generated IR module is now verified after running the visitor
- The kernel is checked if it can be vectorised or not
- The generated IR can be dumped to `.ll` file with `-o `
- Printing LLVM IR is moved to debug mode
---
 src/codegen/llvm/codegen_llvm_visitor.cpp     | 58 ++++++++++++++++++-
 src/codegen/llvm/codegen_llvm_visitor.hpp     |  6 +-
 .../codegen/codegen_llvm_instance_struct.cpp  |  2 +-
 test/unit/codegen/codegen_llvm_ir.cpp         |  2 +-
 4 files changed, 61 insertions(+), 7 deletions(-)

diff --git a/src/codegen/llvm/codegen_llvm_visitor.cpp b/src/codegen/llvm/codegen_llvm_visitor.cpp
index a42201824c..b080a1638f 100644
--- a/src/codegen/llvm/codegen_llvm_visitor.cpp
+++ b/src/codegen/llvm/codegen_llvm_visitor.cpp
@@ -9,13 +9,17 @@

 #include "ast/all.hpp"
 #include "visitors/rename_visitor.hpp"
+#include "visitors/visitor_utils.hpp"

+#include "llvm/IR/AssemblyAnnotationWriter.h"
 #include "llvm/IR/BasicBlock.h"
 #include "llvm/IR/Constants.h"
 #include "llvm/IR/Function.h"
 #include "llvm/IR/LLVMContext.h"
 #include "llvm/IR/Type.h"
 #include "llvm/IR/ValueSymbolTable.h"
+#include "llvm/Support/FileSystem.h"
+#include "llvm/Support/ToolOutputFile.h"

 namespace nmodl {
 namespace codegen {
@@ -28,12 +32,31 @@ static constexpr const char instance_struct_type_name[] = "__instance_var__type"

 /*                            Helper routines                                           */
 /****************************************************************************************/

+/// A utility to check for supported Statement AST nodes.
 static bool is_supported_statement(const ast::Statement& statement) {
     return statement.is_codegen_var_list_statement() || statement.is_expression_statement() ||
            statement.is_codegen_for_statement() || statement.is_codegen_return_statement() ||
            statement.is_if_statement() || statement.is_while_statement();
 }

+/// A utility to check of the kernel body can be vectorised.
+static bool can_vectorise(const ast::CodegenForStatement& statement, symtab::SymbolTable* sym_tab) {
+    // Check that function calls are made to external methods only.
+    const auto& function_calls = collect_nodes(statement, {ast::AstNodeType::FUNCTION_CALL});
+    for (const auto& call: function_calls) {
+        const auto& name = call->get_node_name();
+        auto symbol = sym_tab->lookup(name);
+        if (symbol && !symbol->has_any_property(symtab::syminfo::NmodlType::extern_method))
+            return false;
+    }
+
+    // Check there is no control flow in the kernel.
+    const std::vector<ast::AstNodeType> unsupported_nodes = {ast::AstNodeType::IF_STATEMENT};
+    const auto& collected = collect_nodes(statement, unsupported_nodes);
+
+    return collected.empty();
+}
+
 llvm::Value* CodegenLLVMVisitor::create_gep(const std::string& name, llvm::Value* index) {
     llvm::Type* index_type = llvm::Type::getInt64Ty(*context);
     std::vector<llvm::Value*> indices;
@@ -582,9 +605,18 @@ void CodegenLLVMVisitor::visit_codegen_for_statement(const ast::CodegenForStatem
     llvm::BasicBlock* for_inc = llvm::BasicBlock::Create(*context, /*Name=*/"for.inc", func, next);
     llvm::BasicBlock* exit = llvm::BasicBlock::Create(*context, /*Name=*/"for.exit", func, next);

+    // Save the vector width.
+    int tmp_vector_width = vector_width;
+
+    // Check if the kernel can be vectorised. If not, generate scalar code.
+    if (!can_vectorise(node, sym_tab)) {
+        logger->info("Cannot vectorise the for loop in '" + current_func->getName().str() + "'");
+        logger->info("Generating scalar code...");
+        vector_width = 1;
+    }
+
     // First, initialise the loop in the same basic block. This block is optional. Also, reset
    // vector width to 1 if processing the remainder of the loop.
-    int tmp_vector_width = vector_width;
     if (node.get_initialization()) {
         node.get_initialization()->accept(*this);
     } else {
         vector_width = 1;
     }

@@ -833,13 +865,33 @@ void CodegenLLVMVisitor::visit_program(const ast::Program& node) {
         visit_codegen_function(*func);
     }

+    // Verify the generated LLVM IR module.
+    std::string error;
+    llvm::raw_string_ostream ostream(error);
+    if (verifyModule(*module, &ostream)) {
+        throw std::runtime_error("Error: incorrect IR has been generated!\n" + ostream.str());
+    }
+
     if (opt_passes) {
         logger->info("Running LLVM optimisation passes");
         run_llvm_opt_passes();
     }

-    // Keep this for easier development (maybe move to debug mode later).
-    std::cout << print_module();
+    // If the output directory is specified, save the IR to .ll file.
+    // \todo: Consider saving the generated LLVM IR to bytecode (.bc) file instead.
+    if (output_dir != ".") {
+        std::error_code error_code;
+        std::unique_ptr<llvm::ToolOutputFile> out = std::make_unique<llvm::ToolOutputFile>(
+            output_dir + "/" + mod_filename + ".ll", error_code, llvm::sys::fs::OF_Text);
+        if (error_code)
+            throw std::runtime_error("Error: " + error_code.message());
+
+        std::unique_ptr<llvm::AssemblyAnnotationWriter> annotator;
+        module->print(out->os(), annotator.get());
+        out->keep();
+    }
+
+    logger->debug("Dumping generated IR...\n" + dump_module());
 }

 void CodegenLLVMVisitor::visit_procedure_block(const ast::ProcedureBlock& node) {
diff --git a/src/codegen/llvm/codegen_llvm_visitor.hpp b/src/codegen/llvm/codegen_llvm_visitor.hpp
index b099646b07..f001c2c2fe 100644
--- a/src/codegen/llvm/codegen_llvm_visitor.hpp
+++ b/src/codegen/llvm/codegen_llvm_visitor.hpp
@@ -321,8 +321,10 @@ class CodegenLLVMVisitor: public visitor::ConstAstVisitor {
     void visit_var_name(const ast::VarName& node) override;
     void visit_while_statement(const ast::WhileStatement& node) override;

-    // \todo: move this to debug mode (e.g. -v option or --dump-ir)
-    std::string print_module() const {
+    /**
+     * Dumps the generated LLVM IR module to string.
+ */ + std::string dump_module() const { std::string str; llvm::raw_string_ostream os(str); os << *module; diff --git a/test/unit/codegen/codegen_llvm_instance_struct.cpp b/test/unit/codegen/codegen_llvm_instance_struct.cpp index 4bfa1cd31c..52b9bb9868 100644 --- a/test/unit/codegen/codegen_llvm_instance_struct.cpp +++ b/test/unit/codegen/codegen_llvm_instance_struct.cpp @@ -45,7 +45,7 @@ codegen::CodegenInstanceData generate_instance_data(const std::string& text, use_single_precision, vector_width); llvm_visitor.visit_program(*ast); - llvm_visitor.print_module(); + llvm_visitor.dump_module(); const auto& generated_instance_struct = llvm_visitor.get_instance_struct_ptr(); auto codegen_data = codegen::CodegenDataHelper(ast, generated_instance_struct); auto instance_data = codegen_data.create_data(num_elements, seed); diff --git a/test/unit/codegen/codegen_llvm_ir.cpp b/test/unit/codegen/codegen_llvm_ir.cpp index dfa6d271dc..83807fedbf 100644 --- a/test/unit/codegen/codegen_llvm_ir.cpp +++ b/test/unit/codegen/codegen_llvm_ir.cpp @@ -50,7 +50,7 @@ std::string run_llvm_visitor(const std::string& text, use_single_precision, vector_width); llvm_visitor.visit_program(*ast); - return llvm_visitor.print_module(); + return llvm_visitor.dump_module(); } //============================================================================= From 60e68c9594b8a83d6fe6640be0d9501e6a12def6 Mon Sep 17 00:00:00 2001 From: George Mitenkov Date: Fri, 16 Apr 2021 19:20:29 +0300 Subject: [PATCH 037/331] Add gather execution test (#591) --- test/unit/codegen/codegen_llvm_execution.cpp | 17 ++++++++++++++--- 1 file changed, 14 insertions(+), 3 deletions(-) diff --git a/test/unit/codegen/codegen_llvm_execution.cpp b/test/unit/codegen/codegen_llvm_execution.cpp index 782a3374b8..b191f350df 100644 --- a/test/unit/codegen/codegen_llvm_execution.cpp +++ b/test/unit/codegen/codegen_llvm_execution.cpp @@ -350,7 +350,7 @@ SCENARIO("Simple vectorised kernel", "[llvm][runner]") { } STATE { - x + x y } ASSIGNED { @@ -366,6 +366,7 @@ SCENARIO("Simple vectorised kernel", "[llvm][runner]") { DERIVATIVE states { x = (x0 - x) / x1 + y = v } )"; @@ -396,6 +397,9 @@ SCENARIO("Simple vectorised kernel", "[llvm][runner]") { std::vector x0 = {11.0, 11.0, 11.0, 11.0, 11.0, 11.0, 11.0, 11.0, 11.0, 11.0}; std::vector x1 = {1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0}; + std::vector voltage = {3.0, 4.0, 7.0, 1.0, 2.0, 5.0, 8.0, 6.0, 10.0, 9.0}; + std::vector node_index = {3, 4, 0, 1, 5, 7, 2, 6, 9, 8}; + InstanceTestInfo instance_info{&instance_data, llvm_visitor.get_instance_var_helper(), num_elements}; @@ -403,6 +407,9 @@ SCENARIO("Simple vectorised kernel", "[llvm][runner]") { initialise_instance_variable(instance_info, x0, "x0"); initialise_instance_variable(instance_info, x1, "x1"); + initialise_instance_variable(instance_info, voltage, "voltage"); + initialise_instance_variable(instance_info, node_index, "node_index"); + // Set up the JIT runner. std::unique_ptr module = llvm_visitor.get_module(); Runner runner(std::move(module)); @@ -410,10 +417,14 @@ SCENARIO("Simple vectorised kernel", "[llvm][runner]") { THEN("Values in struct have changed according to the formula") { runner.run_with_argument("__nrn_state_test_wrapper", instance_data.base_ptr); - std::vector x_expected = {10.0, 9.0, 8.0, 7.0, 6.0, 5.0, 4.0, 3.0, 2.0, 1.0}; - // Check that the main and remainder loops correctly change the data stored in x. 
+        std::vector x_expected = {10.0, 9.0, 8.0, 7.0, 6.0, 5.0, 4.0, 3.0, 2.0, 1.0};
         REQUIRE(check_instance_variable(instance_info, x_expected, "x"));
+
+        // Check that the gather load produces correct results in y:
+        //     y[id] = voltage[node_index[id]]
+        std::vector y_expected = {1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0};
+        REQUIRE(check_instance_variable(instance_info, y_expected, "y"));
        }
    }
}

From 8f1fbae3bd57c60452a59a2ea19433208985c34d Mon Sep 17 00:00:00 2001
From: George Mitenkov
Date: Sat, 17 Apr 2021 09:34:46 +0300
Subject: [PATCH 038/331] Fixed loop allocations (#590)

* avoid local variables inside the loop so that they do not become allocas
* these allocas were causing a stack overflow for large instance counts
---
 .../llvm/codegen_llvm_helper_visitor.cpp  | 32 ++++++++++++++-----
 src/codegen/llvm/codegen_llvm_visitor.cpp | 28 +++++++++++++---
 test/unit/codegen/codegen_llvm_ir.cpp     | 24 +++++---------
 3 files changed, 55 insertions(+), 29 deletions(-)

diff --git a/src/codegen/llvm/codegen_llvm_helper_visitor.cpp b/src/codegen/llvm/codegen_llvm_helper_visitor.cpp
index c3e9159dfa..eec79370f6 100644
--- a/src/codegen/llvm/codegen_llvm_helper_visitor.cpp
+++ b/src/codegen/llvm/codegen_llvm_helper_visitor.cpp
@@ -562,15 +562,16 @@ void CodegenLLVMHelperVisitor::visit_nrn_state_block(ast::NrnStateBlock& node) {
     function_statements.push_back(
         create_local_variable_statement(induction_variables, INTEGER_TYPE));

+    /// create vectors of local variables that would be used in compute part
+    std::vector int_variables{"node_id"};
+    std::vector double_variables{"v"};
+
     /// create now main compute part : for loop over channel instances
     /// loop body : initialization + solve blocks
     ast::StatementVector loop_def_statements;
     ast::StatementVector loop_index_statements;
     ast::StatementVector loop_body_statements;
-
-    std::vector int_variables{"node_id"};
-    std::vector double_variables{"v"};
     {
         /// access node index and corresponding voltage
         loop_index_statements.push_back(
@@ -597,6 +598,7 @@ void CodegenLLVMHelperVisitor::visit_nrn_state_block(ast::NrnStateBlock& node) {
         /// add breakpoint block if no current
         if (info.currents.empty() && info.breakpoint_node != nullptr) {
             auto block = info.breakpoint_node->get_statement_block();
+            // \todo this automatically adds `SOLVE states METHOD ...`
             append_statements_from_block(loop_body_statements, block);
         }

@@ -607,10 +609,6 @@ void CodegenLLVMHelperVisitor::visit_nrn_state_block(ast::NrnStateBlock& node) {
                                       loop_index_statements,
                                       loop_body_statements);

-        loop_def_statements.push_back(create_local_variable_statement(int_variables, INTEGER_TYPE));
-        loop_def_statements.push_back(
-            create_local_variable_statement(double_variables, FLOAT_TYPE));
-
         // \todo handle process_shadow_update_statement and wrote_conc_call yet
     }

@@ -622,6 +620,10 @@ void CodegenLLVMHelperVisitor::visit_nrn_state_block(ast::NrnStateBlock& node) {
     /// now construct a new code block which will become the body of the loop
     auto loop_block = std::make_shared(loop_body);

+    /// declare main FOR loop local variables
+    function_statements.push_back(create_local_variable_statement(int_variables, INTEGER_TYPE));
+    function_statements.push_back(create_local_variable_statement(double_variables, FLOAT_TYPE));
+
     /// main loop possibly vectorized on vector_width
     {
         /// loop constructs : initialization, condition and increment
@@ -647,6 +649,10 @@ void CodegenLLVMHelperVisitor::visit_nrn_state_block(ast::NrnStateBlock& node) {
         function_statements.push_back(for_loop_statement_main);
     }

+    /// vectors containing renamed FOR loop local
variables + std::vector renamed_int_variables; + std::vector renamed_double_variables; + /// remainder loop possibly vectorized on vector_width if (vector_width > 1) { /// loop constructs : initialization, condition and increment @@ -664,14 +670,24 @@ void CodegenLLVMHelperVisitor::visit_nrn_state_block(ast::NrnStateBlock& node) { // \todo: Change RenameVisitor to take a vector of names to which it would append a single // prefix. for (const auto& name: int_variables) { - visitor::RenameVisitor v(name, epilogue_variable_prefix + name); + std::string new_name = epilogue_variable_prefix + name; + renamed_int_variables.push_back(new_name); + visitor::RenameVisitor v(name, new_name); loop_statements->accept(v); } for (const auto& name: double_variables) { + std::string new_name = epilogue_variable_prefix + name; + renamed_double_variables.push_back(new_name); visitor::RenameVisitor v(name, epilogue_variable_prefix + name); loop_statements->accept(v); } + /// declare remainder FOR loop local variables + function_statements.push_back( + create_local_variable_statement(renamed_int_variables, INTEGER_TYPE)); + function_statements.push_back( + create_local_variable_statement(renamed_double_variables, FLOAT_TYPE)); + /// convert all variables inside loop body to instance variables convert_to_instance_variable(*for_loop_statement_remainder, loop_index_var); diff --git a/src/codegen/llvm/codegen_llvm_visitor.cpp b/src/codegen/llvm/codegen_llvm_visitor.cpp index b080a1638f..3a165e465a 100644 --- a/src/codegen/llvm/codegen_llvm_visitor.cpp +++ b/src/codegen/llvm/codegen_llvm_visitor.cpp @@ -592,6 +592,9 @@ void CodegenLLVMVisitor::visit_boolean(const ast::Boolean& node) { // | | // +---------------------------+ void CodegenLLVMVisitor::visit_codegen_for_statement(const ast::CodegenForStatement& node) { + // Disable vector code generation for condition and increment blocks. + is_kernel_code = false; + // Get the current and the next blocks within the function. llvm::BasicBlock* curr_block = builder.GetInsertBlock(); llvm::BasicBlock* next = curr_block->getNextNode(); @@ -650,6 +653,7 @@ void CodegenLLVMVisitor::visit_codegen_for_statement(const ast::CodegenForStatem builder.CreateBr(for_cond); builder.SetInsertPoint(exit); vector_width = tmp_vector_width; + is_kernel_code = true; } @@ -682,11 +686,19 @@ void CodegenLLVMVisitor::visit_codegen_function(const ast::CodegenFunction& node builder.CreateStore(&arg, alloca); } - // Process function or procedure body. The return statement is handled in a separate visitor. - block->accept(*this); + // Process function or procedure body. If the function is a compute kernel, then set the + // corresponding flags. The return statement is handled in a separate visitor. + bool has_void_ret_type = node.get_return_type()->get_type() == ast::AstNodeType::VOID; + if (has_void_ret_type) { + is_kernel_code = true; + block->accept(*this); + is_kernel_code = false; + } else { + block->accept(*this); + } // If function has a void return type, add a terminator not handled by CodegenReturnVar. - if (node.get_return_type()->get_type() == ast::AstNodeType::VOID) + if (has_void_ret_type) builder.CreateRetVoid(); // Clear local values stack and remove the pointer to the local symbol table. @@ -718,7 +730,13 @@ void CodegenLLVMVisitor::visit_codegen_var_list_statement( var_type = llvm::ArrayType::get(scalar_var_type, length); } else if (identifier->is_name()) { // This case corresponds to a scalar or vector local variable. 
- if (is_kernel_code && vector_width > 1) { + const auto& identifier_name = identifier->get_node_name(); + + // Even if generating vectorised code, some variables still need to be scalar. + // Particularly, the induction variable "id" and remainder loop variables (that start + // with "epilogue"). + if (is_kernel_code && vector_width > 1 && identifier_name != kernel_id && + identifier_name.rfind("epilogue", 0)) { var_type = llvm::FixedVectorType::get(scalar_var_type, vector_width); } else { var_type = scalar_var_type; @@ -726,7 +744,7 @@ void CodegenLLVMVisitor::visit_codegen_var_list_statement( } else { throw std::runtime_error("Error: Unsupported local variable type"); } - llvm::Value* alloca = builder.CreateAlloca(var_type, /*ArraySize=*/nullptr, name); + builder.CreateAlloca(var_type, /*ArraySize=*/nullptr, name); } } diff --git a/test/unit/codegen/codegen_llvm_ir.cpp b/test/unit/codegen/codegen_llvm_ir.cpp index 83807fedbf..207548ee46 100644 --- a/test/unit/codegen/codegen_llvm_ir.cpp +++ b/test/unit/codegen/codegen_llvm_ir.cpp @@ -845,10 +845,14 @@ SCENARIO("Scalar state kernel", "[visitor][llvm]") { REQUIRE(std::regex_search(module_string, m, struct_type)); REQUIRE(std::regex_search(module_string, m, kernel_declaration)); - // Check for correct induction variable initialisation and a branch to condition block. - std::regex alloca_instr(R"(%id = alloca i32)"); + // Check for correct variables initialisation and a branch to condition block. + std::regex id_initialisation(R"(%id = alloca i32)"); + std::regex node_id_initialisation(R"(%node_id = alloca i32)"); + std::regex v_initialisation(R"(%v = alloca double)"); std::regex br(R"(br label %for\.cond)"); - REQUIRE(std::regex_search(module_string, m, alloca_instr)); + REQUIRE(std::regex_search(module_string, m, id_initialisation)); + REQUIRE(std::regex_search(module_string, m, node_id_initialisation)); + REQUIRE(std::regex_search(module_string, m, v_initialisation)); REQUIRE(std::regex_search(module_string, m, br)); // Check condition block: id < mech->node_count, and a conditional branch to loop body @@ -865,12 +869,7 @@ SCENARIO("Scalar state kernel", "[visitor][llvm]") { REQUIRE(std::regex_search(module_string, m, condition)); REQUIRE(std::regex_search(module_string, m, cond_br)); - // In the body block, `node_id` and voltage `v` are initialised with the data from the - // struct. Check for variable allocations and correct loads from the struct with GEPs. - std::regex initialisation( - "for\\.body:.*\n" - " %node_id = alloca i32,.*\n" - " %v = alloca double,.*"); + // Check for correct loads from the struct with GEPs. std::regex load_from_struct( " %.* = load %.*__instance_var__type\\*, %.*__instance_var__type\\*\\* %.*\n" " %.* = getelementptr inbounds %.*__instance_var__type, " @@ -880,7 +879,6 @@ SCENARIO("Scalar state kernel", "[visitor][llvm]") { " %.* = load (i32|double)\\*, (i32|double)\\*\\* %.*\n" " %.* = getelementptr inbounds (i32|double), (i32|double)\\* %.*, i64 %.*\n" " %.* = load (i32|double), (i32|double)\\* %.*"); - REQUIRE(std::regex_search(module_string, m, initialisation)); REQUIRE(std::regex_search(module_string, m, load_from_struct)); // Check induction variable is incremented in increment block. 
@@ -987,8 +985,6 @@ SCENARIO("Scalar derivative block", "[visitor][llvm][derivative]") { std::string expected_loop = R"( for(id = 0; idnode_count; id = id+1) { - INTEGER node_id - DOUBLE v node_id = mech->node_index[id] v = mech->voltage[node_id] mech->m[id] = (mech->minf[id]-mech->m[id])/mech->mtau[id] @@ -1033,16 +1029,12 @@ SCENARIO("Vectorised derivative block", "[visitor][llvm][derivative]") { std::string expected_main_loop = R"( for(id = 0; idnode_count-7; id = id+8) { - INTEGER node_id - DOUBLE v node_id = mech->node_index[id] v = mech->voltage[node_id] mech->m[id] = (mech->minf[id]-mech->m[id])/mech->mtau[id] })"; std::string expected_epilogue_loop = R"( for(; idnode_count; id = id+1) { - INTEGER epilogue_node_id - DOUBLE epilogue_v epilogue_node_id = mech->node_index[id] epilogue_v = mech->voltage[epilogue_node_id] mech->m[id] = (mech->minf[id]-mech->m[id])/mech->mtau[id] From 53cd1cce7374010017315cd59904cbd7c194e171 Mon Sep 17 00:00:00 2001 From: George Mitenkov Date: Sat, 17 Apr 2021 09:51:19 +0300 Subject: [PATCH 039/331] Benchmarking LLVM code generation (#583) Introduced the benchmarking for LLVM code generation pipeline. For that, new options have been added: ``` benchmark LLVM benchmark option Options: --run Run LLVM benchmark (false) --instance-size INT Instance struct size (10000) --repeat INT Number of experiments for benchmarking (100) --backend TEXT:{avx2, default, sse2} Target's backend (default) ``` The JIT runner has also been modified to extract the target information correctly, and disable available CPU features for benchmarking a specific backend. Example: ``` $ nmodl hh.mod llvm --ir --vector-width 1 benchmark --run --instance-size 100 --repeat 2 --backend default Created LLVM IR module from NMODL AST in 0.006765817 Benchmarking kernel 'nrn_state_hh' Experiment 0: compute time = 0.013977749 Experiment 1: compute time = 0.004847989 Average compute time = 0.0058550929 ``` Co-authored-by: Pramod Kumbhar --- src/CMakeLists.txt | 2 +- src/codegen/llvm/CMakeLists.txt | 5 +- src/codegen/llvm/codegen_llvm_visitor.cpp | 83 ++++++---- src/codegen/llvm/codegen_llvm_visitor.hpp | 11 +- src/codegen/llvm/jit_driver.cpp | 59 +++---- src/codegen/llvm/jit_driver.hpp | 11 +- src/codegen/llvm/llvm_benchmark.cpp | 157 +++++++++++++++++++ src/codegen/llvm/llvm_benchmark.hpp | 85 ++++++++++ src/main.cpp | 43 ++++- test/unit/CMakeLists.txt | 5 + test/unit/codegen/codegen_llvm_execution.cpp | 4 +- 11 files changed, 390 insertions(+), 75 deletions(-) create mode 100644 src/codegen/llvm/llvm_benchmark.cpp create mode 100644 src/codegen/llvm/llvm_benchmark.hpp diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index cf4acc4de0..bda007c3a0 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -30,7 +30,7 @@ target_link_libraries( ${NMODL_WRAPPER_LIBS}) if(NMODL_ENABLE_LLVM) - target_link_libraries(nmodl llvm_codegen ${LLVM_LIBS_TO_LINK}) + target_link_libraries(nmodl llvm_codegen llvm_benchmark ${LLVM_LIBS_TO_LINK}) endif() # ============================================================================= diff --git a/src/codegen/llvm/CMakeLists.txt b/src/codegen/llvm/CMakeLists.txt index bd54f4143d..8c2a295598 100644 --- a/src/codegen/llvm/CMakeLists.txt +++ b/src/codegen/llvm/CMakeLists.txt @@ -7,7 +7,9 @@ set(LLVM_CODEGEN_SOURCE_FILES ${CMAKE_CURRENT_SOURCE_DIR}/codegen_llvm_helper_visitor.cpp ${CMAKE_CURRENT_SOURCE_DIR}/codegen_llvm_helper_visitor.hpp ${CMAKE_CURRENT_SOURCE_DIR}/jit_driver.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/jit_driver.hpp) + 
${CMAKE_CURRENT_SOURCE_DIR}/jit_driver.hpp + ${CMAKE_CURRENT_SOURCE_DIR}/llvm_benchmark.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/llvm_benchmark.hpp) # ============================================================================= # LLVM codegen library and executable @@ -29,6 +31,7 @@ if(NOT NMODL_AS_SUBPROJECT) nmodl_llvm_runner llvm_codegen codegen + llvm_benchmark visitor symtab lexer diff --git a/src/codegen/llvm/codegen_llvm_visitor.cpp b/src/codegen/llvm/codegen_llvm_visitor.cpp index 3a165e465a..ea7e828035 100644 --- a/src/codegen/llvm/codegen_llvm_visitor.cpp +++ b/src/codegen/llvm/codegen_llvm_visitor.cpp @@ -976,40 +976,57 @@ void CodegenLLVMVisitor::visit_while_statement(const ast::WhileStatement& node) builder.SetInsertPoint(exit); } -void CodegenLLVMVisitor::wrap_kernel_function(const std::string& kernel_name) { - // Get the kernel function and the instance struct type. - auto kernel = module->getFunction(kernel_name); - if (!kernel) - throw std::runtime_error("Kernel " + kernel_name + " is not found!"); - - if (std::distance(kernel->args().begin(), kernel->args().end()) != 1) - throw std::runtime_error("Kernel " + kernel_name + " must have a single argument!"); - - auto instance_struct_ptr_type = llvm::dyn_cast(kernel->getArg(0)->getType()); - if (!instance_struct_ptr_type) - throw std::runtime_error("Kernel " + kernel_name + - " does not have an instance struct pointer argument!"); - - // Create a wrapper void function that takes a void pointer as a single argument. - llvm::Type* void_type = llvm::Type::getVoidTy(*context); - llvm::Type* i32_type = llvm::Type::getInt32Ty(*context); - llvm::Type* void_ptr_type = llvm::PointerType::get(void_type, /*AddressSpace=*/0); - llvm::Function* wrapper_func = llvm::Function::Create( - llvm::FunctionType::get(i32_type, {void_ptr_type}, /*isVarArg=*/false), - llvm::Function::ExternalLinkage, - "__" + kernel_name + "_wrapper", - *module); - llvm::BasicBlock* body = llvm::BasicBlock::Create(*context, /*Name=*/"", wrapper_func); - builder.SetInsertPoint(body); +void CodegenLLVMVisitor::find_kernel_names(std::vector& container) { + // By convention, only the kernel functions return void type. + const auto& functions = module->getFunctionList(); + for (const auto& func: functions) { + if (func.getReturnType()->isVoidTy()) { + container.push_back(func.getName().str()); + } + } +} - // Proceed with bitcasting the void pointer to the struct pointer type, calling the kernel and - // adding a terminator. - llvm::Value* bitcasted = builder.CreateBitCast(wrapper_func->getArg(0), - instance_struct_ptr_type); - std::vector args; - args.push_back(bitcasted); - builder.CreateCall(kernel, args); - builder.CreateRet(llvm::ConstantInt::get(i32_type, 0)); +void CodegenLLVMVisitor::wrap_kernel_functions() { + // First, identify all kernels. + std::vector kernel_names; + find_kernel_names(kernel_names); + + for (const auto& kernel_name: kernel_names) { + // Get the kernel function and the instance struct type. 
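+        // (By convention, a kernel is any function in the module with a void
+        // return type; see find_kernel_names above.)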
+ auto kernel = module->getFunction(kernel_name); + if (!kernel) + throw std::runtime_error("Kernel " + kernel_name + " is not found!"); + + if (std::distance(kernel->args().begin(), kernel->args().end()) != 1) + throw std::runtime_error("Kernel " + kernel_name + " must have a single argument!"); + + auto instance_struct_ptr_type = llvm::dyn_cast( + kernel->getArg(0)->getType()); + if (!instance_struct_ptr_type) + throw std::runtime_error("Kernel " + kernel_name + + " does not have an instance struct pointer argument!"); + + // Create a wrapper void function that takes a void pointer as a single argument. + llvm::Type* void_type = llvm::Type::getVoidTy(*context); + llvm::Type* i32_type = llvm::Type::getInt32Ty(*context); + llvm::Type* void_ptr_type = llvm::PointerType::get(void_type, /*AddressSpace=*/0); + llvm::Function* wrapper_func = llvm::Function::Create( + llvm::FunctionType::get(i32_type, {void_ptr_type}, /*isVarArg=*/false), + llvm::Function::ExternalLinkage, + "__" + kernel_name + "_wrapper", + *module); + llvm::BasicBlock* body = llvm::BasicBlock::Create(*context, /*Name=*/"", wrapper_func); + builder.SetInsertPoint(body); + + // Proceed with bitcasting the void pointer to the struct pointer type, calling the kernel + // and adding a terminator. + llvm::Value* bitcasted = builder.CreateBitCast(wrapper_func->getArg(0), + instance_struct_ptr_type); + std::vector args; + args.push_back(bitcasted); + builder.CreateCall(kernel, args); + builder.CreateRet(llvm::ConstantInt::get(i32_type, 0)); + } } } // namespace codegen diff --git a/src/codegen/llvm/codegen_llvm_visitor.hpp b/src/codegen/llvm/codegen_llvm_visitor.hpp index f001c2c2fe..1007258010 100644 --- a/src/codegen/llvm/codegen_llvm_visitor.hpp +++ b/src/codegen/llvm/codegen_llvm_visitor.hpp @@ -333,10 +333,15 @@ class CodegenLLVMVisitor: public visitor::ConstAstVisitor { } /** - * For the given kernel function, wraps it into another function that uses void* to pass the - * data to the kernel \param kernel_name kernel name to be wrapped + * Fills the container with the names of kernel functions from the MOD file. */ - void wrap_kernel_function(const std::string& kernel_name); + void find_kernel_names(std::vector& container); + + /** + * Wraps all kernel function calls into wrapper functions that use void* to pass the data to the + * kernel. + */ + void wrap_kernel_functions(); }; /** \} */ // end of llvm_backends diff --git a/src/codegen/llvm/jit_driver.cpp b/src/codegen/llvm/jit_driver.cpp index a7673bb2ff..842c500810 100644 --- a/src/codegen/llvm/jit_driver.cpp +++ b/src/codegen/llvm/jit_driver.cpp @@ -22,24 +22,27 @@ namespace nmodl { namespace runner { -void JITDriver::init() { +void JITDriver::init(std::string features) { llvm::InitializeNativeTarget(); llvm::InitializeNativeTargetAsmPrinter(); - set_target_triple(module.get()); - auto data_layout = module->getDataLayout(); - // Create IR compile function callback. auto compile_function_creator = [&](llvm::orc::JITTargetMachineBuilder tm_builder) -> llvm::Expected> { - auto tm = tm_builder.createTargetMachine(); - if (!tm) - return tm.takeError(); - return std::make_unique(std::move(*tm)); + // Create target machine with some features possibly turned off. + auto tm = create_target(&tm_builder, features); + + // Set the target triple and the data layout for the module. 
+ module->setDataLayout(tm->createDataLayout()); + module->setTargetTriple(tm->getTargetTriple().getTriple()); + + return std::make_unique(std::move(tm)); }; + // Set JIT instance and extract the data layout from the module. auto jit_instance = cantFail( llvm::orc::LLJITBuilder().setCompileFunctionCreator(compile_function_creator).create()); + auto data_layout = module->getDataLayout(); // Add a ThreadSafeModule to the driver. llvm::orc::ThreadSafeModule tsm(std::move(module), std::make_unique()); @@ -52,29 +55,29 @@ void JITDriver::init() { data_layout.getGlobalPrefix()))); } -void JITDriver::set_target_triple(llvm::Module* module) { - auto target_triple = llvm::sys::getDefaultTargetTriple(); - std::string error; - auto target = llvm::TargetRegistry::lookupTarget(target_triple, error); +std::unique_ptr JITDriver::create_target( + llvm::orc::JITTargetMachineBuilder* builder, + const std::string& features) { + // First, look up the target. + std::string error_msg; + auto target_triple = builder->getTargetTriple().getTriple(); + auto* target = llvm::TargetRegistry::lookupTarget(target_triple, error_msg); if (!target) - throw std::runtime_error("Error: " + error + "\n"); - - std::string cpu(llvm::sys::getHostCPUName()); - llvm::SubtargetFeatures features; - llvm::StringMap host_features; - - if (llvm::sys::getHostCPUFeatures(host_features)) { - for (auto& f: host_features) - features.AddFeature(f.first(), f.second); - } + throw std::runtime_error("Error " + error_msg + "\n"); - std::unique_ptr machine( - target->createTargetMachine(target_triple, cpu, features.getString(), {}, {})); - if (!machine) - throw std::runtime_error("Error: failed to create a target machine\n"); + // Create default target machine with provided features. + auto tm = target->createTargetMachine(target_triple, + llvm::sys::getHostCPUName().str(), + features, + builder->getOptions(), + builder->getRelocationModel(), + builder->getCodeModel(), + /*OL=*/llvm::CodeGenOpt::Default, + /*JIT=*/true); + if (!tm) + throw std::runtime_error("Error: could not create the target machine\n"); - module->setDataLayout(machine->createDataLayout()); - module->setTargetTriple(target_triple); + return std::unique_ptr(tm); } } // namespace runner diff --git a/src/codegen/llvm/jit_driver.hpp b/src/codegen/llvm/jit_driver.hpp index 23c8fca612..f994a57303 100644 --- a/src/codegen/llvm/jit_driver.hpp +++ b/src/codegen/llvm/jit_driver.hpp @@ -37,7 +37,7 @@ class JITDriver { : module(std::move(m)) {} /// Initialize the JIT. - void init(); + void init(std::string features); /// Lookup the entry-point without arguments in the JIT and execute it, returning the result. template @@ -63,8 +63,9 @@ class JITDriver { return result; } - /// Set the target triple on the module. - static void set_target_triple(llvm::Module* module); + /// A wrapper around llvm::createTargetMachine to turn on/off certain CPU features. + std::unique_ptr create_target(llvm::orc::JITTargetMachineBuilder* builder, + const std::string& features); }; /** @@ -78,9 +79,9 @@ class Runner { std::unique_ptr driver = std::make_unique(std::move(module)); public: - Runner(std::unique_ptr m) + Runner(std::unique_ptr m, std::string features = "") : module(std::move(m)) { - driver->init(); + driver->init(features); } /// Run the entry-point function without arguments. 
diff --git a/src/codegen/llvm/llvm_benchmark.cpp b/src/codegen/llvm/llvm_benchmark.cpp new file mode 100644 index 0000000000..57e0d05c5b --- /dev/null +++ b/src/codegen/llvm/llvm_benchmark.cpp @@ -0,0 +1,157 @@ +/************************************************************************* + * Copyright (C) 2018-2021 Blue Brain Project + * + * This file is part of NMODL distributed under the terms of the GNU + * Lesser General Public License. See top-level LICENSE file for details. + *************************************************************************/ + +#include +#include + +#include "codegen/llvm/codegen_llvm_visitor.hpp" +#include "codegen/llvm/jit_driver.hpp" +#include "llvm_benchmark.hpp" +#include "llvm/Support/Host.h" + +#include "test/unit/codegen/codegen_data_helper.hpp" + + +namespace nmodl { +namespace benchmark { + + +/// Precision for the timing measurements. +static constexpr int PRECISION = 9; + + +void LLVMBenchmark::disable(const std::string& feature, std::vector& host_features) { + for (auto& host_feature: host_features) { + if (feature == host_feature.substr(1)) { + host_feature[0] = '-'; + *log_stream << host_feature << "\n"; + return; + } + } +} + +void LLVMBenchmark::benchmark(const std::shared_ptr& node) { + // First, set the output stream for the logs. + set_log_output(); + + // Then, record the time taken for building the LLVM IR module. + codegen::CodegenLLVMVisitor visitor(mod_filename, + output_dir, + llvm_build_info.opt_passes, + llvm_build_info.use_single_precision, + llvm_build_info.vector_width); + generate_llvm(visitor, node); + + // Finally, run the benchmark and log the measurements. + run_benchmark(visitor, node); +} + +void LLVMBenchmark::generate_llvm(codegen::CodegenLLVMVisitor& visitor, + const std::shared_ptr& node) { + // First, visit the AST to build the LLVM IR module and wrap the kernel function calls. + auto start = std::chrono::high_resolution_clock::now(); + visitor.visit_program(*node); + visitor.wrap_kernel_functions(); + auto end = std::chrono::high_resolution_clock::now(); + + // Log the time taken to visit the AST and build LLVM IR. + std::chrono::duration diff = end - start; + *log_stream << "Created LLVM IR module from NMODL AST in " << std::setprecision(PRECISION) + << diff.count() << "\n\n"; +} + +std::vector LLVMBenchmark::get_cpu_features() { + std::string cpu(llvm::sys::getHostCPUName()); + + llvm::SubtargetFeatures features; + llvm::StringMap host_features; + if (llvm::sys::getHostCPUFeatures(host_features)) { + for (auto& f: host_features) + features.AddFeature(f.first(), f.second); + } + return features.getFeatures(); +} + +void LLVMBenchmark::run_benchmark(codegen::CodegenLLVMVisitor& visitor, + const std::shared_ptr& node) { + // Set the codegen data helper and find the kernels. + auto codegen_data = codegen::CodegenDataHelper(node, visitor.get_instance_struct_ptr()); + std::vector kernel_names; + visitor.find_kernel_names(kernel_names); + + // Get feature's string and turn them off depending on the backend. + std::vector features = get_cpu_features(); + *log_stream << "Backend: " << backend << "\n"; + if (backend == "avx2") { + // Disable SSE. + *log_stream << "Disabling features:\n"; + disable("sse", features); + disable("sse2", features); + disable("sse3", features); + disable("sse4.1", features); + disable("sse4.2", features); + } else if (backend == "sse2") { + // Disable AVX. 
+ *log_stream << "Disabling features:\n"; + disable("avx", features); + disable("avx2", features); + } + + std::string features_str = llvm::join(features.begin(), features.end(), ","); + std::unique_ptr m = visitor.get_module(); + runner::Runner runner(std::move(m), features_str); + + // Benchmark every kernel. + for (const auto& kernel_name: kernel_names) { + *log_stream << "Benchmarking kernel '" << kernel_name << "'\n"; + + // For every kernel run the benchmark `num_experiments` times. + double time_sum = 0.0; + for (int i = 0; i < num_experiments; ++i) { + // Initialise the data. + auto instance_data = codegen_data.create_data(instance_size, /*seed=*/1); + + // Record the execution time of the kernel. + std::string wrapper_name = "__" + kernel_name + "_wrapper"; + auto start = std::chrono::high_resolution_clock::now(); + runner.run_with_argument(kernel_name, instance_data.base_ptr); + auto end = std::chrono::high_resolution_clock::now(); + std::chrono::duration diff = end - start; + + // Log the time taken for each run. + *log_stream << "Experiment " << i << ": compute time = " << std::setprecision(9) + << diff.count() << "\n"; + + time_sum += diff.count(); + } + // Log the average time taken for the kernel. + *log_stream << "Average compute time = " << std::setprecision(PRECISION) + << time_sum / num_experiments << "\n\n"; + } +} + +void LLVMBenchmark::set_log_output() { + // If the output directory is not specified, dump logs to the console. + if (output_dir == ".") { + log_stream = std::make_shared(std::cout.rdbuf()); + return; + } + + // Otherwise, dump logs to the specified file. + std::string filename = output_dir + "/" + mod_filename + ".log"; + std::ofstream ofs; + + ofs.open(filename.c_str()); + + if (ofs.fail()) + throw std::runtime_error("Error while opening a file '" + filename + "'"); + + log_stream = std::make_shared(ofs.rdbuf()); +} + +} // namespace benchmark +} // namespace nmodl diff --git a/src/codegen/llvm/llvm_benchmark.hpp b/src/codegen/llvm/llvm_benchmark.hpp new file mode 100644 index 0000000000..30ebf182e8 --- /dev/null +++ b/src/codegen/llvm/llvm_benchmark.hpp @@ -0,0 +1,85 @@ +/************************************************************************* + * Copyright (C) 2018-2021 Blue Brain Project + * + * This file is part of NMODL distributed under the terms of the GNU + * Lesser General Public License. See top-level LICENSE file for details. + *************************************************************************/ + +#pragma once + +#include + +#include "codegen/llvm/codegen_llvm_visitor.hpp" + + +namespace nmodl { +namespace benchmark { + +/// A struct to hold LLVM visitor information. +struct LLVMBuildInfo { + int vector_width; + bool opt_passes; + bool use_single_precision; +}; + +/** + * \class LLVMBenchmark + * \brief A wrapper to execute MOD file kernels via LLVM IR backend, and + * benchmark compile-time and runtime. + */ +class LLVMBenchmark { + private: + std::string mod_filename; + + std::string output_dir; + + int num_experiments; + + int instance_size; + + std::string backend; + + LLVMBuildInfo llvm_build_info; + + std::shared_ptr log_stream; + + /// Disable the specified feature. + void disable(const std::string& feature, std::vector& host_features); + + /// Visits the AST to construct the LLVM IR module. + void generate_llvm(codegen::CodegenLLVMVisitor& visitor, + const std::shared_ptr& node); + + /// Get the host CPU features in the format: + /// +feature,+feature,-feature,+feature,... + /// where `+` indicates that the feature is enabled. 
+ std::vector get_cpu_features(); + + /// Runs the main body of the benchmark, executing the compute kernels. + void run_benchmark(codegen::CodegenLLVMVisitor& visitor, + const std::shared_ptr& node); + + /// Sets the log output stream (file or console). + void set_log_output(); + + public: + LLVMBenchmark(const std::string& mod_filename, + const std::string& output_dir, + LLVMBuildInfo info, + int num_experiments, + int instance_size, + const std::string& backend) + : mod_filename(mod_filename) + , output_dir(output_dir) + , num_experiments(num_experiments) + , instance_size(instance_size) + , backend(backend) + , llvm_build_info(info) {} + + /// Runs the benchmark. + void benchmark(const std::shared_ptr& node); +}; + + +} // namespace benchmark +} // namespace nmodl diff --git a/src/main.cpp b/src/main.cpp index 5fa5304776..79d8d32bef 100644 --- a/src/main.cpp +++ b/src/main.cpp @@ -20,6 +20,7 @@ #ifdef NMODL_LLVM_BACKEND #include "codegen/llvm/codegen_llvm_visitor.hpp" +#include "codegen/llvm/llvm_benchmark.hpp" #endif #include "config/config.h" @@ -173,8 +174,20 @@ int main(int argc, const char* argv[]) { /// run llvm optimisation passes bool llvm_opt_passes(false); - /// llvm vector width; + /// llvm vector width int llvm_vec_width = 1; + + /// run llvm benchmark + bool run_benchmark(false); + + /// the size of the instance struct for the benchmark + int instance_size = 10000; + + /// the number of experiments to run for the benchmarking + int repeat = 100; + + /// specify the backend for LLVM IR to target + std::string backend = "default"; #endif app.get_formatter()->column_width(40); @@ -281,6 +294,8 @@ int main(int argc, const char* argv[]) { "Optimize copies of ion variables ({})"_format(optimize_ionvar_copies_codegen))->ignore_case(); #ifdef NMODL_LLVM_BACKEND + + // LLVM IR code generation options. auto llvm_opt = app.add_subcommand("llvm", "LLVM code generation option")->ignore_case(); llvm_opt->add_flag("--ir", llvm_ir, @@ -294,6 +309,21 @@ int main(int argc, const char* argv[]) { llvm_opt->add_option("--vector-width", llvm_vec_width, "LLVM explicit vectorisation width ({})"_format(llvm_vec_width))->ignore_case(); + + // LLVM IR benchmark options. 
+ auto benchmark_opt = app.add_subcommand("benchmark", "LLVM benchmark option")->ignore_case(); + benchmark_opt->add_flag("--run", + run_benchmark, + "Run LLVM benchmark ({})"_format(run_benchmark))->ignore_case(); + benchmark_opt->add_option("--instance-size", + instance_size, + "Instance struct size ({})"_format(instance_size))->ignore_case(); + benchmark_opt->add_option("--repeat", + repeat, + "Number of experiments for benchmarking ({})"_format(repeat))->ignore_case(); + benchmark_opt->add_option("--backend", + backend, + "Target's backend ({})"_format(backend))->ignore_case()->check(CLI::IsMember({"avx2", "default", "sse2"}));; #endif // clang-format on @@ -591,7 +621,16 @@ int main(int argc, const char* argv[]) { } #ifdef NMODL_LLVM_BACKEND - if (llvm_ir) { + + if (run_benchmark) { + logger->info("Running LLVM benchmark"); + benchmark::LLVMBuildInfo info{llvm_vec_width, llvm_opt_passes, llvm_float_type}; + benchmark::LLVMBenchmark bench( + modfile, output_dir, info, repeat, instance_size, backend); + bench.benchmark(ast); + } + + else if (llvm_ir) { logger->info("Running LLVM backend code generator"); CodegenLLVMVisitor visitor( modfile, output_dir, llvm_opt_passes, llvm_float_type, llvm_vec_width); diff --git a/test/unit/CMakeLists.txt b/test/unit/CMakeLists.txt index 631f8090f0..b4fa2f7837 100644 --- a/test/unit/CMakeLists.txt +++ b/test/unit/CMakeLists.txt @@ -84,6 +84,7 @@ target_link_libraries( test_util printer ${NMODL_WRAPPER_LIBS}) + target_link_libraries( testcodegen codegen @@ -97,6 +98,10 @@ target_link_libraries( if(NMODL_ENABLE_LLVM) include_directories(${LLVM_INCLUDE_DIRS} codegen) + + add_library(llvm_benchmark STATIC codegen/codegen_data_helper.cpp) + add_dependencies(llvm_benchmark lexer) + add_executable(testllvm visitor/main.cpp codegen/codegen_llvm_ir.cpp codegen/codegen_data_helper.cpp codegen/codegen_llvm_instance_struct.cpp) add_executable(test_llvm_runner visitor/main.cpp codegen/codegen_data_helper.cpp diff --git a/test/unit/codegen/codegen_llvm_execution.cpp b/test/unit/codegen/codegen_llvm_execution.cpp index b191f350df..4e2717e45c 100644 --- a/test/unit/codegen/codegen_llvm_execution.cpp +++ b/test/unit/codegen/codegen_llvm_execution.cpp @@ -303,7 +303,7 @@ SCENARIO("Simple scalar kernel", "[llvm][runner]") { /*use_single_precision=*/false, /*vector_width=*/1); llvm_visitor.visit_program(*ast); - llvm_visitor.wrap_kernel_function("nrn_state_test"); + llvm_visitor.wrap_kernel_functions(); // Create the instance struct data. int num_elements = 4; @@ -384,7 +384,7 @@ SCENARIO("Simple vectorised kernel", "[llvm][runner]") { /*use_single_precision=*/false, /*vector_width=*/4); llvm_visitor.visit_program(*ast); - llvm_visitor.wrap_kernel_function("nrn_state_test"); + llvm_visitor.wrap_kernel_functions(); // Create the instance struct data. 
int num_elements = 10;

From db80372cafff02d624a880c099ba079fc26e9d36 Mon Sep 17 00:00:00 2001
From: Pramod Kumbhar
Date: Sun, 18 Apr 2021 23:47:28 +0200
Subject: [PATCH 040/331] Minor benchmarking improvement (#593)

- allocate instance data only once
- store memory size with instance data
- print memory size while running benchmarking kernel
---
 src/codegen/llvm/llvm_benchmark.cpp       | 9 +++++----
 test/unit/codegen/codegen_data_helper.cpp | 2 ++
 test/unit/codegen/codegen_data_helper.hpp | 3 +++
 3 files changed, 10 insertions(+), 4 deletions(-)

diff --git a/src/codegen/llvm/llvm_benchmark.cpp b/src/codegen/llvm/llvm_benchmark.cpp
index 57e0d05c5b..6ab9ff4982 100644
--- a/src/codegen/llvm/llvm_benchmark.cpp
+++ b/src/codegen/llvm/llvm_benchmark.cpp
@@ -107,14 +107,15 @@ void LLVMBenchmark::run_benchmark(codegen::CodegenLLVMVisitor& visitor,

     // Benchmark every kernel.
     for (const auto& kernel_name: kernel_names) {
-        *log_stream << "Benchmarking kernel '" << kernel_name << "'\n";
+        // Initialise the data.
+        auto instance_data = codegen_data.create_data(instance_size, /*seed=*/1);
+
+        double size_mbs = instance_data.num_bytes / (1024.0 * 1024.0);
+        *log_stream << "Benchmarking kernel '" << kernel_name << "', with " << size_mbs << " MBs\n";

         // For every kernel run the benchmark `num_experiments` times.
         double time_sum = 0.0;
         for (int i = 0; i < num_experiments; ++i) {
-            // Initialise the data.
-            auto instance_data = codegen_data.create_data(instance_size, /*seed=*/1);
-
             // Record the execution time of the kernel.
             std::string wrapper_name = "__" + kernel_name + "_wrapper";
             auto start = std::chrono::high_resolution_clock::now();
diff --git a/test/unit/codegen/codegen_data_helper.cpp b/test/unit/codegen/codegen_data_helper.cpp
index e42cfe01f3..4bf94f583d 100644
--- a/test/unit/codegen/codegen_data_helper.cpp
+++ b/test/unit/codegen/codegen_data_helper.cpp
@@ -88,6 +88,7 @@ CodegenInstanceData CodegenDataHelper::create_data(size_t num_elements, size_t s
     // allocate instance object with memory alignment
     posix_memalign(&base, NBYTE_ALIGNMENT, member_size * variables.size());
     data.base_ptr = base;
+    data.num_bytes += member_size * variables.size();

     size_t offset = 0;
     void* ptr = base;
@@ -115,6 +116,7 @@ CodegenInstanceData CodegenDataHelper::create_data(size_t num_elements, size_t s
         void* member;
         posix_memalign(&member, NBYTE_ALIGNMENT, member_size * num_elements);
         initialize_variable(var, member, variable_index, num_elements);
+        data.num_bytes += member_size * num_elements;

         // copy address at specific location in the struct
         memcpy(ptr, &member, sizeof(double*));
diff --git a/test/unit/codegen/codegen_data_helper.hpp b/test/unit/codegen/codegen_data_helper.hpp
index 368b964147..ef8e869366 100644
--- a/test/unit/codegen/codegen_data_helper.hpp
+++ b/test/unit/codegen/codegen_data_helper.hpp
@@ -46,6 +46,9 @@ struct CodegenInstanceData {
     /// i.e. *(base_ptr + offsets[0]) will be members[0]
     std::vector members;

+    /// size in bytes
+    size_t num_bytes = 0;
+
     // cleanup all memory allocated for type and member variables
     ~CodegenInstanceData();
 };

From 2a699a8120556d08aebaa9c457e92eb93bc2b976 Mon Sep 17 00:00:00 2001
From: Pramod Kumbhar
Date: Mon, 19 Apr 2021 19:22:07 +0200
Subject: [PATCH 041/331] Bug fix in codegen helper: delete LOCAL statement (#595)

- the LOCAL statement was not deleted correctly
- instead of getting the first element from the statement vector, use the
  local statement pointer to erase it from the node.
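In short, the fix replaces positional erasure with pointer-based erasure (a simplified sketch; the exact change is in the diff below):

```
// Before: always removed the first statement in the block, which is not
// necessarily the LOCAL statement.
// node.erase_statement(statements.begin());

// After: remove exactly the LOCAL list statement, wherever it appears.
std::unordered_set<nmodl::ast::Statement*> to_delete({local_statement.get()});
node.erase_statement(to_delete);
```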
Related to #594 --- src/codegen/llvm/codegen_llvm_helper_visitor.cpp | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/src/codegen/llvm/codegen_llvm_helper_visitor.cpp b/src/codegen/llvm/codegen_llvm_helper_visitor.cpp index eec79370f6..8105fec848 100644 --- a/src/codegen/llvm/codegen_llvm_helper_visitor.cpp +++ b/src/codegen/llvm/codegen_llvm_helper_visitor.cpp @@ -473,12 +473,13 @@ void CodegenLLVMHelperVisitor::convert_local_statement(ast::StatementBlock& node } /// remove local list statement now - const auto& statements = node.get_statements(); - node.erase_statement(statements.begin()); + std::unordered_set to_delete({local_statement.get()}); + node.erase_statement(to_delete); /// create new codegen variable statement and insert at the beginning of the block auto type = new ast::CodegenVarType(FLOAT_TYPE); auto statement = std::make_shared(type, variables); + const auto& statements = node.get_statements(); node.insert_statement(statements.begin(), statement); } } From 630033c20fd3514617cb422585135256dc898de5 Mon Sep 17 00:00:00 2001 From: George Mitenkov Date: Tue, 20 Apr 2021 21:36:39 +0300 Subject: [PATCH 042/331] LLVM 13 compatibility and fixing void* type (#603) * Made compatible with LLVM 13 and replaced void* with i8* --- cmake/LLVMHelper.cmake | 9 ++++++++- src/codegen/llvm/codegen_llvm_visitor.cpp | 3 +-- 2 files changed, 9 insertions(+), 3 deletions(-) diff --git a/cmake/LLVMHelper.cmake b/cmake/LLVMHelper.cmake index a731fa0151..e27ac8d553 100644 --- a/cmake/LLVMHelper.cmake +++ b/cmake/LLVMHelper.cmake @@ -5,7 +5,14 @@ find_package(LLVM REQUIRED CONFIG) # include LLVM header and core library -llvm_map_components_to_libnames(LLVM_LIBS_TO_LINK core orcjit native) +llvm_map_components_to_libnames( + LLVM_LIBS_TO_LINK + core + instcombine + native + orcjit + scalaropts + support) set(CMAKE_REQUIRED_INCLUDES ${LLVM_INCLUDE_DIRS}) set(CMAKE_REQUIRED_LIBRARIES ${LLVM_LIBS_TO_LINK}) diff --git a/src/codegen/llvm/codegen_llvm_visitor.cpp b/src/codegen/llvm/codegen_llvm_visitor.cpp index ea7e828035..cd42fffae3 100644 --- a/src/codegen/llvm/codegen_llvm_visitor.cpp +++ b/src/codegen/llvm/codegen_llvm_visitor.cpp @@ -1007,9 +1007,8 @@ void CodegenLLVMVisitor::wrap_kernel_functions() { " does not have an instance struct pointer argument!"); // Create a wrapper void function that takes a void pointer as a single argument. - llvm::Type* void_type = llvm::Type::getVoidTy(*context); llvm::Type* i32_type = llvm::Type::getInt32Ty(*context); - llvm::Type* void_ptr_type = llvm::PointerType::get(void_type, /*AddressSpace=*/0); + llvm::Type* void_ptr_type = llvm::Type::getInt8PtrTy(*context); llvm::Function* wrapper_func = llvm::Function::Create( llvm::FunctionType::get(i32_type, {void_ptr_type}, /*isVarArg=*/false), llvm::Function::ExternalLinkage, From fe3d856919c91ab0a52f70be8c93b0a3974e234e Mon Sep 17 00:00:00 2001 From: Pramod Kumbhar Date: Tue, 20 Apr 2021 23:31:40 +0200 Subject: [PATCH 043/331] Allow LOCAL variable inside StatementBlock for LLVM IR generation (#599) - if LOCAL variable was declared inside DERIVATIVE block then we were getting error: "Stored value type does not match pointer operand type!" - the error was happening because scalar variable from epilogue loop was conflicting with the vector type variable in main loop - to avoid conflict between main and epilogue loop, rename all local variables in epilogue. 
- bug fix for recursive handling of LocalList statement fixes #594 --- .../llvm/codegen_llvm_helper_visitor.cpp | 57 ++++++++++++++++--- .../llvm/codegen_llvm_helper_visitor.hpp | 1 + 2 files changed, 49 insertions(+), 9 deletions(-) diff --git a/src/codegen/llvm/codegen_llvm_helper_visitor.cpp b/src/codegen/llvm/codegen_llvm_helper_visitor.cpp index 8105fec848..0df364e649 100644 --- a/src/codegen/llvm/codegen_llvm_helper_visitor.cpp +++ b/src/codegen/llvm/codegen_llvm_helper_visitor.cpp @@ -459,12 +459,13 @@ void CodegenLLVMHelperVisitor::convert_to_instance_variable(ast::Node& node, * it to CodegenVarListStatement that will represent all variables as double. */ void CodegenLLVMHelperVisitor::convert_local_statement(ast::StatementBlock& node) { - /// first process all children blocks if any - node.visit_children(*this); + /// collect all local statement block + const auto& statements = collect_nodes(node, {ast::AstNodeType::LOCAL_LIST_STATEMENT}); + + /// iterate over all statements and replace each with codegen variable + for (const auto& statement: statements) { + const auto& local_statement = std::dynamic_pointer_cast(statement); - /// check if block contains LOCAL statement - const auto& local_statement = visitor::get_local_list_statement(node); - if (local_statement) { /// create codegen variables from local variables /// clone variable to make new independent statement ast::CodegenVarVector variables; @@ -474,16 +475,51 @@ void CodegenLLVMHelperVisitor::convert_local_statement(ast::StatementBlock& node /// remove local list statement now std::unordered_set to_delete({local_statement.get()}); - node.erase_statement(to_delete); + /// local list statement is enclosed in statement block + const auto& parent_node = dynamic_cast(local_statement->get_parent()); + parent_node->erase_statement(to_delete); /// create new codegen variable statement and insert at the beginning of the block auto type = new ast::CodegenVarType(FLOAT_TYPE); - auto statement = std::make_shared(type, variables); - const auto& statements = node.get_statements(); - node.insert_statement(statements.begin(), statement); + auto new_statement = std::make_shared(type, variables); + const auto& statements = parent_node->get_statements(); + parent_node->insert_statement(statements.begin(), new_statement); } } +/** + * \brief Visit StatementBlock and rename all LOCAL variables + * @param node AST node representing Statement block + * + * Statement block in remainder loop will have same LOCAL variables from + * main loop. In order to avoid conflict during lookup, rename each local + * variable by appending unique number. The number used as suffix is just + * a counter used for Statement block. 
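+ * For example, a LOCAL variable `x` becomes `x_1` in the first statement
+ * block processed, `x_2` in the second, and so on.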
+ */ +void CodegenLLVMHelperVisitor::rename_local_variables(ast::StatementBlock& node) { + /// local block counter just to append unique number + static int local_block_counter = 1; + + /// collect all local statement block + const auto& statements = collect_nodes(node, {ast::AstNodeType::LOCAL_LIST_STATEMENT}); + + /// iterate over each statement and rename all variables + for (const auto& statement: statements) { + const auto& local_statement = std::dynamic_pointer_cast(statement); + + /// rename local variable in entire statement block + for (auto& var: local_statement->get_variables()) { + std::string old_name = var->get_node_name(); + std::string new_name = "{}_{}"_format(old_name, local_block_counter); + visitor::RenameVisitor(old_name, new_name).visit_statement_block(node); + } + } + + /// make it unique for next statement block + local_block_counter++; +} + + void CodegenLLVMHelperVisitor::visit_procedure_block(ast::ProcedureBlock& node) { node.visit_children(*this); create_function_for_node(node); @@ -661,6 +697,9 @@ void CodegenLLVMHelperVisitor::visit_nrn_state_block(ast::NrnStateBlock& node) { loop_count_expression(INDUCTION_VAR, NODECOUNT_VAR, /*vector_width=*/1); const auto& increment = loop_increment_expression(INDUCTION_VAR, /*vector_width=*/1); + /// rename local variables to avoid conflict with main loop + rename_local_variables(*loop_block); + /// convert local statement to codegenvar statement convert_local_statement(*loop_block); diff --git a/src/codegen/llvm/codegen_llvm_helper_visitor.hpp b/src/codegen/llvm/codegen_llvm_helper_visitor.hpp index 446d5a6fd9..bbff588675 100644 --- a/src/codegen/llvm/codegen_llvm_helper_visitor.hpp +++ b/src/codegen/llvm/codegen_llvm_helper_visitor.hpp @@ -163,6 +163,7 @@ class CodegenLLVMHelperVisitor: public visitor::AstVisitor { void convert_to_instance_variable(ast::Node& node, std::string& index_var); void convert_local_statement(ast::StatementBlock& node); + void rename_local_variables(ast::StatementBlock& node); void visit_procedure_block(ast::ProcedureBlock& node) override; void visit_function_block(ast::FunctionBlock& node) override; From dddffed5abcf86b7e1a8234dcd04449c2034ac5e Mon Sep 17 00:00:00 2001 From: Pramod Kumbhar Date: Thu, 22 Apr 2021 17:11:09 +0200 Subject: [PATCH 044/331] Update CI with LLVM v13 (trunk) (#605) * In order to use VecLibReplace pass, we need LLVM 13 / trunk * Change ubuntu image on azure from 16.04 to 18.04 * Install llvm-13 nightly snapshot * Enable LLVM build on Ubuntu * For Mac OS use pre-built binary package from https://github.com/pramodk/llvm-nightly * We will see if we get OS X bottle from BlueBrain/homebrew-tap/pull/7 --- azure-pipelines.yml | 19 ++++++++++++++----- ci/bb5-pr.sh | 4 ++-- 2 files changed, 16 insertions(+), 7 deletions(-) diff --git a/azure-pipelines.yml b/azure-pipelines.yml index f9d7d8ee80..ffe744d6f9 100644 --- a/azure-pipelines.yml +++ b/azure-pipelines.yml @@ -47,6 +47,11 @@ jobs: url="https://github.com/ispc/ispc/releases/download/${ispc_version}/ispc-${ispc_version}${ispc_version_suffix}-${url_os}.tar.gz"; mkdir $(pwd)/$CMAKE_PKG/ispc wget --output-document=- $url | tar -xvzf - -C $(pwd)/$CMAKE_PKG/ispc --strip 1; + # install llvm nightly (future v13) + wget https://apt.llvm.org/llvm.sh + chmod +x llvm.sh + sudo ./llvm.sh 13 + env: CMAKE_PKG: 'cmake-3.10.2-Linux-x86_64' displayName: 'Install Dependencies' @@ -56,7 +61,7 @@ jobs: mkdir -p $(Build.Repository.LocalPath)/build cd $(Build.Repository.LocalPath)/build cmake --version - cmake .. 
-DPYTHON_EXECUTABLE=$(which python3.7) -DCMAKE_INSTALL_PREFIX=$HOME/nmodl -DCMAKE_BUILD_TYPE=Release -DNMODL_ENABLE_LLVM=OFF + cmake .. -DPYTHON_EXECUTABLE=$(which python3.7) -DCMAKE_INSTALL_PREFIX=$HOME/nmodl -DCMAKE_BUILD_TYPE=Release -DNMODL_ENABLE_LLVM=ON -DLLVM_DIR=/usr/lib/llvm-13/share/llvm/cmake/ make -j 2 if [ $? -ne 0 ] then @@ -115,10 +120,10 @@ jobs: env: CMAKE_PKG: 'cmake-3.10.2-Linux-x86_64' displayName: 'Build CoreNEURON and Run Integration Tests with ISPC compiler' -- job: 'osx1014' +- job: 'osx1015' pool: - vmImage: 'macOS-10.14' - displayName: 'MacOS (10.14), AppleClang 10.0' + vmImage: 'macOS-10.15' + displayName: 'MacOS (10.15), AppleClang 11.0' steps: - checkout: self submodules: True @@ -127,11 +132,15 @@ jobs: python3 -m pip install -U pip setuptools python3 -m pip install --user 'Jinja2>=2.9.3' 'PyYAML>=3.13' pytest pytest-cov numpy 'sympy>=1.3' displayName: 'Install Dependencies' + - script: | + cd $HOME + git clone https://github.com/pramodk/llvm-nightly.git + displayName: 'Setup LLVM v13' - script: | export PATH=/usr/local/opt/flex/bin:/usr/local/opt/bison/bin:$PATH; mkdir -p $(Build.Repository.LocalPath)/build cd $(Build.Repository.LocalPath)/build - cmake .. -DPYTHON_EXECUTABLE=$(which python3) -DCMAKE_INSTALL_PREFIX=$HOME/nmodl -DCMAKE_BUILD_TYPE=RelWithDebInfo -DNMODL_ENABLE_PYTHON_BINDINGS=OFF -DLLVM_DIR=`brew --prefix llvm`/lib/cmake/llvm -DNMODL_ENABLE_LLVM=ON + cmake .. -DPYTHON_EXECUTABLE=$(which python3) -DCMAKE_INSTALL_PREFIX=$HOME/nmodl -DCMAKE_BUILD_TYPE=RelWithDebInfo -DNMODL_ENABLE_PYTHON_BINDINGS=OFF -DLLVM_DIR=$HOME/llvm-nightly/0421/osx/lib/cmake/llvm -DNMODL_ENABLE_LLVM=ON make -j 2 if [ $? -ne 0 ] then diff --git a/ci/bb5-pr.sh b/ci/bb5-pr.sh index 9f65c3783f..a840d38e17 100755 --- a/ci/bb5-pr.sh +++ b/ci/bb5-pr.sh @@ -7,7 +7,7 @@ git show HEAD source /gpfs/bbp.cscs.ch/apps/hpc/jenkins/config/modules.sh module use /gpfs/bbp.cscs.ch/apps/tools/modules/tcl/linux-rhel7-x86_64/ -module load archive/2020-10 cmake bison flex python-dev doxygen +module load unstable cmake bison flex python-dev doxygen module list function bb5_pr_setup_virtualenv() { @@ -41,7 +41,7 @@ function build_with() { -DPYTHON_EXECUTABLE=$(which python3) \ -DNMODL_FORMATTING:BOOL=ON \ -DClangFormat_EXECUTABLE=$clang_format_exe \ - -DLLVM_DIR=/gpfs/bbp.cscs.ch/apps/hpc/jenkins/merge/deploy/externals/latest/linux-rhel7-x86_64/gcc-9.3.0/llvm-11.0.0-kzl4o5/lib/cmake/llvm + -DLLVM_DIR=/gpfs/bbp.cscs.ch/data/project/proj16/software/llvm/install/0421/lib/cmake/llvm make -j6 popd } From 068ba5d0deaa20f6623b911c8e30b6720a634b97 Mon Sep 17 00:00:00 2001 From: George Mitenkov Date: Thu, 22 Apr 2021 09:08:33 -0700 Subject: [PATCH 045/331] Integrating vector maths library into LLVM codegen (#604) Added support for replacing LLVM IR maths intrinsics with vector maths functions from Accelerate, libmvec, MASSV, and SVML. To trigger the replacement, a new `--veclib` option should be used. This is only supported on LLVM 13+. 
Example: ``` $ bin/nmodl hh.mod llvm --ir --vector-width 4 --veclib SVML ``` fixes #589 Co-authored-by: Pramod Kumbhar --- CMakeLists.txt | 3 + cmake/LLVMHelper.cmake | 3 + src/codegen/llvm/codegen_llvm_visitor.cpp | 52 ++++++++++-- src/codegen/llvm/codegen_llvm_visitor.hpp | 29 ++++++- src/main.cpp | 24 ++++-- test/unit/codegen/codegen_llvm_ir.cpp | 98 ++++++++++++++++++++++- 6 files changed, 188 insertions(+), 21 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index a280906edd..2a0b47a599 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -149,6 +149,9 @@ if(NMODL_ENABLE_LLVM) include(LLVMHelper) include_directories(${LLVM_INCLUDE_DIRS}) add_definitions(-DNMODL_LLVM_BACKEND) + if(LLVM_VERSION VERSION_LESS_EQUAL 12) + add_definitions(-DLLVM_VERSION_LESS_THAN_13) + endif() endif() # ============================================================================= diff --git a/cmake/LLVMHelper.cmake b/cmake/LLVMHelper.cmake index e27ac8d553..f81a5a62e8 100644 --- a/cmake/LLVMHelper.cmake +++ b/cmake/LLVMHelper.cmake @@ -7,8 +7,11 @@ find_package(LLVM REQUIRED CONFIG) # include LLVM header and core library llvm_map_components_to_libnames( LLVM_LIBS_TO_LINK + analysis + codegen core instcombine + mc native orcjit scalaropts diff --git a/src/codegen/llvm/codegen_llvm_visitor.cpp b/src/codegen/llvm/codegen_llvm_visitor.cpp index cd42fffae3..1738d4139e 100644 --- a/src/codegen/llvm/codegen_llvm_visitor.cpp +++ b/src/codegen/llvm/codegen_llvm_visitor.cpp @@ -11,6 +11,7 @@ #include "visitors/rename_visitor.hpp" #include "visitors/visitor_utils.hpp" +#include "llvm/Analysis/TargetLibraryInfo.h" #include "llvm/IR/AssemblyAnnotationWriter.h" #include "llvm/IR/BasicBlock.h" #include "llvm/IR/Constants.h" @@ -19,8 +20,13 @@ #include "llvm/IR/Type.h" #include "llvm/IR/ValueSymbolTable.h" #include "llvm/Support/FileSystem.h" +#include "llvm/Support/Host.h" #include "llvm/Support/ToolOutputFile.h" +#ifndef LLVM_VERSION_LESS_THAN_13 +#include "llvm/CodeGen/ReplaceWithVeclib.h" +#endif + namespace nmodl { namespace codegen { @@ -292,21 +298,21 @@ std::shared_ptr CodegenLLVMVisitor::get_instance_struct_ptr return instance_var_helper.instance; } -void CodegenLLVMVisitor::run_llvm_opt_passes() { +void CodegenLLVMVisitor::run_ir_opt_passes() { /// run some common optimisation passes that are commonly suggested - fpm.add(llvm::createInstructionCombiningPass()); - fpm.add(llvm::createReassociatePass()); - fpm.add(llvm::createGVNPass()); - fpm.add(llvm::createCFGSimplificationPass()); + opt_pm.add(llvm::createInstructionCombiningPass()); + opt_pm.add(llvm::createReassociatePass()); + opt_pm.add(llvm::createGVNPass()); + opt_pm.add(llvm::createCFGSimplificationPass()); /// initialize pass manager - fpm.doInitialization(); + opt_pm.doInitialization(); /// iterate over all functions and run the optimisation passes auto& functions = module->getFunctionList(); for (auto& function: functions) { llvm::verifyFunction(function); - fpm.run(function); + opt_pm.run(function); } } @@ -892,7 +898,37 @@ void CodegenLLVMVisitor::visit_program(const ast::Program& node) { if (opt_passes) { logger->info("Running LLVM optimisation passes"); - run_llvm_opt_passes(); + run_ir_opt_passes(); + } + + // Optionally, replace LLVM's maths intrinsics with vector library calls. 
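+    // (For instance, with SVML the llvm.exp.v4f64 intrinsic can be replaced
+    // by a call to __svml_exp4; the mappings come from TargetLibraryInfo.)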
+ if (vector_width > 1 && vector_library != llvm::TargetLibraryInfoImpl::NoLibrary) { +#ifdef LLVM_VERSION_LESS_THAN_13 + logger->warn( + "This version of LLVM does not support replacement of LLVM intrinsics with vector " + "library calls"); +#else + // First, get the target library information. + llvm::Triple triple(llvm::sys::getDefaultTargetTriple()); + llvm::TargetLibraryInfoImpl target_lib_info = llvm::TargetLibraryInfoImpl(triple); + + // Populate target library information with vectorisable functions. Since libmvec is + // supported for x86_64 only, have a check to catch other architectures. + if (vector_library != llvm::TargetLibraryInfoImpl::LIBMVEC_X86 || + (triple.isX86() && triple.isArch64Bit())) { + target_lib_info.addVectorizableFunctionsFromVecLib(vector_library); + } + + // Run the codegen optimisation passes that replace maths intrinsics. + codegen_pm.add(new llvm::TargetLibraryInfoWrapperPass(target_lib_info)); + codegen_pm.add(new llvm::ReplaceWithVeclibLegacy); + codegen_pm.doInitialization(); + for (auto& function: module->getFunctionList()) { + if (!function.isDeclaration()) + codegen_pm.run(function); + } + codegen_pm.doFinalization(); +#endif } // If the output directory is specified, save the IR to .ll file. diff --git a/src/codegen/llvm/codegen_llvm_visitor.hpp b/src/codegen/llvm/codegen_llvm_visitor.hpp index 1007258010..099613f8d4 100644 --- a/src/codegen/llvm/codegen_llvm_visitor.hpp +++ b/src/codegen/llvm/codegen_llvm_visitor.hpp @@ -23,6 +23,7 @@ #include "utils/logger.hpp" #include "visitors/ast_visitor.hpp" +#include "llvm/Analysis/TargetLibraryInfo.h" #include "llvm/IR/IRBuilder.h" #include "llvm/IR/LLVMContext.h" #include "llvm/IR/LegacyPassManager.h" @@ -45,6 +46,16 @@ namespace codegen { * @{ */ +/// A map to query vector library by its string value. +static const std::map veclib_map = { + {"Accelerate", llvm::TargetLibraryInfoImpl::Accelerate}, +#ifndef LLVM_VERSION_LESS_THAN_13 + {"libmvec", llvm::TargetLibraryInfoImpl::LIBMVEC_X86}, +#endif + {"MASSV", llvm::TargetLibraryInfoImpl::MASSV}, + {"SVML", llvm::TargetLibraryInfoImpl::SVML}, + {"none", llvm::TargetLibraryInfoImpl::NoLibrary}}; + /** * \class CodegenLLVMVisitor * \brief %Visitor for transforming NMODL AST to LLVM IR @@ -65,7 +76,14 @@ class CodegenLLVMVisitor: public visitor::ConstAstVisitor { llvm::IRBuilder<> builder; - llvm::legacy::FunctionPassManager fpm; + // Pass manager for optimisation passes that are used for target code generation. + llvm::legacy::FunctionPassManager codegen_pm; + + // Vector library used for maths functions. + llvm::TargetLibraryInfoImpl::VectorLibrary vector_library; + + // Pass manager for optimisation passes that are run on IR and are not related to target. + llvm::legacy::FunctionPassManager opt_pm; // Stack to hold visited values std::vector values; @@ -97,7 +115,7 @@ class CodegenLLVMVisitor: public visitor::ConstAstVisitor { * LLVM provides number of optimisation passes that can be run on the generated IR. * Here we run common optimisation LLVM passes that benefits code optimisation. 
*/ - void run_llvm_opt_passes(); + void run_ir_opt_passes(); public: /** @@ -110,14 +128,17 @@ class CodegenLLVMVisitor: public visitor::ConstAstVisitor { const std::string& output_dir, bool opt_passes, bool use_single_precision = false, - int vector_width = 1) + int vector_width = 1, + std::string vec_lib = "none") : mod_filename(mod_filename) , output_dir(output_dir) , opt_passes(opt_passes) , use_single_precision(use_single_precision) , vector_width(vector_width) + , vector_library(veclib_map.at(vec_lib)) , builder(*context) - , fpm(module.get()) {} + , codegen_pm(module.get()) + , opt_pm(module.get()) {} /** diff --git a/src/main.cpp b/src/main.cpp index 79d8d32bef..69a7109dde 100644 --- a/src/main.cpp +++ b/src/main.cpp @@ -172,11 +172,14 @@ int main(int argc, const char* argv[]) { bool llvm_float_type(false); /// run llvm optimisation passes - bool llvm_opt_passes(false); + bool llvm_ir_opt_passes(false); /// llvm vector width int llvm_vec_width = 1; + /// vector library + std::string vec_lib("none"); + /// run llvm benchmark bool run_benchmark(false); @@ -301,14 +304,17 @@ int main(int argc, const char* argv[]) { llvm_ir, "Generate LLVM IR ({})"_format(llvm_ir))->ignore_case(); llvm_opt->add_flag("--opt", - llvm_opt_passes, - "Run LLVM optimisation passes ({})"_format(llvm_opt_passes))->ignore_case(); + llvm_ir_opt_passes, + "Run LLVM optimisation passes ({})"_format(llvm_ir_opt_passes))->ignore_case(); llvm_opt->add_flag("--single-precision", llvm_float_type, "Use single precision floating-point types ({})"_format(llvm_float_type))->ignore_case(); llvm_opt->add_option("--vector-width", llvm_vec_width, "LLVM explicit vectorisation width ({})"_format(llvm_vec_width))->ignore_case(); + llvm_opt->add_option("--veclib", + vec_lib, + "Vector library for maths functions ({})"_format(vec_lib))->check(CLI::IsMember({"Accelerate", "libmvec", "MASSV", "SVML", "none"})); // LLVM IR benchmark options. 
auto benchmark_opt = app.add_subcommand("benchmark", "LLVM benchmark option")->ignore_case(); @@ -323,7 +329,7 @@ int main(int argc, const char* argv[]) { "Number of experiments for benchmarking ({})"_format(repeat))->ignore_case(); benchmark_opt->add_option("--backend", backend, - "Target's backend ({})"_format(backend))->ignore_case()->check(CLI::IsMember({"avx2", "default", "sse2"}));; + "Target's backend ({})"_format(backend))->ignore_case()->check(CLI::IsMember({"avx2", "default", "sse2"})); #endif // clang-format on @@ -624,7 +630,7 @@ int main(int argc, const char* argv[]) { if (run_benchmark) { logger->info("Running LLVM benchmark"); - benchmark::LLVMBuildInfo info{llvm_vec_width, llvm_opt_passes, llvm_float_type}; + benchmark::LLVMBuildInfo info{llvm_vec_width, llvm_ir_opt_passes, llvm_float_type}; benchmark::LLVMBenchmark bench( modfile, output_dir, info, repeat, instance_size, backend); bench.benchmark(ast); @@ -632,8 +638,12 @@ int main(int argc, const char* argv[]) { else if (llvm_ir) { logger->info("Running LLVM backend code generator"); - CodegenLLVMVisitor visitor( - modfile, output_dir, llvm_opt_passes, llvm_float_type, llvm_vec_width); + CodegenLLVMVisitor visitor(modfile, + output_dir, + llvm_ir_opt_passes, + llvm_float_type, + llvm_vec_width, + vec_lib); visitor.visit_program(*ast); ast_to_nmodl(*ast, filepath("llvm", "mod")); ast_to_json(*ast, filepath("llvm", "json")); diff --git a/test/unit/codegen/codegen_llvm_ir.cpp b/test/unit/codegen/codegen_llvm_ir.cpp index 207548ee46..93fd269b8e 100644 --- a/test/unit/codegen/codegen_llvm_ir.cpp +++ b/test/unit/codegen/codegen_llvm_ir.cpp @@ -36,7 +36,8 @@ using nmodl::parser::NmodlDriver; std::string run_llvm_visitor(const std::string& text, bool opt = false, bool use_single_precision = false, - int vector_width = 1) { + int vector_width = 1, + std::string vec_lib = "none") { NmodlDriver driver; const auto& ast = driver.parse_string(text); @@ -48,7 +49,8 @@ std::string run_llvm_visitor(const std::string& text, /*output_dir=*/".", opt, use_single_precision, - vector_width); + vector_width, + vec_lib); llvm_visitor.visit_program(*ast); return llvm_visitor.dump_module(); } @@ -1056,6 +1058,98 @@ SCENARIO("Vectorised derivative block", "[visitor][llvm][derivative]") { } } +//============================================================================= +// Vector library calls. +//============================================================================= + +SCENARIO("Vector library calls", "[visitor][llvm][vector_lib]") { + GIVEN("A vector LLVM intrinsic") { + std::string nmodl_text = R"( + NEURON { + SUFFIX hh + NONSPECIFIC_CURRENT il + } + STATE { + m + } + ASSIGNED { + v (mV) + } + BREAKPOINT { + SOLVE states METHOD cnexp + il = 2 + } + DERIVATIVE states { + m = exp(m) + } + )"; + + THEN("it is replaced with an appropriate vector library call") { + std::smatch m; + + // Check exponential intrinsic is created. + std::string no_library_module_str = run_llvm_visitor(nmodl_text, + /*opt=*/false, + /*use_single_precision=*/false, + /*vector_width=*/2); + std::regex exp_decl(R"(declare <2 x double> @llvm\.exp\.v2f64\(<2 x double>\))"); + std::regex exp_call(R"(call <2 x double> @llvm\.exp\.v2f64\(<2 x double> .*\))"); + REQUIRE(std::regex_search(no_library_module_str, m, exp_decl)); + REQUIRE(std::regex_search(no_library_module_str, m, exp_call)); + +#ifndef LLVM_VERSION_LESS_THAN_13 + // Check exponential calls are replaced with calls to SVML library. 
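+ // (The SVML suffix encodes the vector width: __svml_exp2 is the 2-wide
+ // double-precision exponential, not a base-2 exponential.)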
+ std::string svml_library_module_str = run_llvm_visitor(nmodl_text, + /*opt=*/false, + /*use_single_precision=*/false, + /*vector_width=*/2, + /*vec_lib=*/"SVML"); + std::regex svml_exp_decl(R"(declare <2 x double> @__svml_exp2\(<2 x double>\))"); + std::regex svml_exp_call(R"(call <2 x double> @__svml_exp2\(<2 x double> .*\))"); + REQUIRE(std::regex_search(svml_library_module_str, m, svml_exp_decl)); + REQUIRE(std::regex_search(svml_library_module_str, m, svml_exp_call)); + REQUIRE(!std::regex_search(svml_library_module_str, m, exp_call)); + + // Check that supported exponential calls are replaced with calls to MASSV library (i.e. + // operating on vector of width 2). + std::string massv2_library_module_str = run_llvm_visitor(nmodl_text, + /*opt=*/false, + /*use_single_precision=*/false, + /*vector_width=*/2, + /*vec_lib=*/"MASSV"); + std::regex massv2_exp_decl(R"(declare <2 x double> @__expd2_P8\(<2 x double>\))"); + std::regex massv2_exp_call(R"(call <2 x double> @__expd2_P8\(<2 x double> .*\))"); + REQUIRE(std::regex_search(massv2_library_module_str, m, massv2_exp_decl)); + REQUIRE(std::regex_search(massv2_library_module_str, m, massv2_exp_call)); + REQUIRE(!std::regex_search(massv2_library_module_str, m, exp_call)); + + // Check no replacement for MASSV happens for non-supported vector widths. + std::string massv4_library_module_str = run_llvm_visitor(nmodl_text, + /*opt=*/false, + /*use_single_precision=*/false, + /*vector_width=*/4, + /*vec_lib=*/"MASSV"); + std::regex exp4_call(R"(call <4 x double> @llvm\.exp\.v4f64\(<4 x double> .*\))"); + REQUIRE(std::regex_search(massv4_library_module_str, m, exp4_call)); + + // Check correct replacement of @llvm.exp.v4f32 into @vexpf when using Accelerate. + std::string accelerate_library_module_str = + run_llvm_visitor(nmodl_text, + /*opt=*/false, + /*use_single_precision=*/true, + /*vector_width=*/4, + /*vec_lib=*/"Accelerate"); + std::regex accelerate_exp_decl(R"(declare <4 x float> @vexpf\(<4 x float>\))"); + std::regex accelerate_exp_call(R"(call <4 x float> @vexpf\(<4 x float> .*\))"); + std::regex fexp_call(R"(call <4 x float> @llvm\.exp\.v4f32\(<4 x float> .*\))"); + REQUIRE(std::regex_search(accelerate_library_module_str, m, accelerate_exp_decl)); + REQUIRE(std::regex_search(accelerate_library_module_str, m, accelerate_exp_call)); + REQUIRE(!std::regex_search(accelerate_library_module_str, m, fexp_call)); +#endif + } + } +} + //============================================================================= // Optimization : dead code removal //============================================================================= From 98a88d505b28cf92817d279d0502f4d4d57b63d2 Mon Sep 17 00:00:00 2001 From: George Mitenkov Date: Thu, 22 Apr 2021 16:25:35 -0700 Subject: [PATCH 046/331] Using shared libraries in LLVM JIT (#609) * Integrated veclibs in benchmark and added shared libs support for JIT * Tested on BBP Ubuntu Linux box * Make sure to set LD_LIBRARY_PATH for Intel library dir --- cmake/LLVMHelper.cmake | 1 + src/codegen/llvm/jit_driver.cpp | 68 +++++++++++++++++++++++++---- src/codegen/llvm/jit_driver.hpp | 11 +++-- src/codegen/llvm/llvm_benchmark.cpp | 5 ++- src/codegen/llvm/llvm_benchmark.hpp | 5 +++ src/main.cpp | 15 +++++-- 6 files changed, 88 insertions(+), 17 deletions(-) diff --git a/cmake/LLVMHelper.cmake b/cmake/LLVMHelper.cmake index f81a5a62e8..2b7db94a85 100644 --- a/cmake/LLVMHelper.cmake +++ b/cmake/LLVMHelper.cmake @@ -10,6 +10,7 @@ llvm_map_components_to_libnames( analysis codegen core + executionengine instcombine mc 
native diff --git a/src/codegen/llvm/jit_driver.cpp b/src/codegen/llvm/jit_driver.cpp index 842c500810..ec08e8856d 100644 --- a/src/codegen/llvm/jit_driver.cpp +++ b/src/codegen/llvm/jit_driver.cpp @@ -11,9 +11,11 @@ #include "llvm/ExecutionEngine/JITEventListener.h" #include "llvm/ExecutionEngine/ObjectCache.h" #include "llvm/ExecutionEngine/Orc/CompileUtils.h" +#include "llvm/ExecutionEngine/Orc/Core.h" #include "llvm/ExecutionEngine/Orc/IRCompileLayer.h" #include "llvm/ExecutionEngine/Orc/JITTargetMachineBuilder.h" #include "llvm/ExecutionEngine/Orc/LLJIT.h" +#include "llvm/ExecutionEngine/Orc/RTDyldObjectLinkingLayer.h" #include "llvm/ExecutionEngine/SectionMemoryManager.h" #include "llvm/Support/Host.h" #include "llvm/Support/TargetRegistry.h" @@ -22,27 +24,55 @@ namespace nmodl { namespace runner { -void JITDriver::init(std::string features) { +void JITDriver::init(std::string features, std::vector& lib_paths) { llvm::InitializeNativeTarget(); llvm::InitializeNativeTargetAsmPrinter(); + // Set the target triple and the data layout for the module. + set_triple_and_data_layout(features); + auto data_layout = module->getDataLayout(); + + // Create object linking function callback. + auto object_linking_layer_creator = [&](llvm::orc::ExecutionSession& session, + const llvm::Triple& triple) { + // Create linking layer. + auto layer = std::make_unique(session, []() { + return std::make_unique(); + }); + for (const auto& lib_path: lib_paths) { + // For every library path, create a corresponding memory buffer. + auto memory_buffer = llvm::MemoryBuffer::getFile(lib_path); + if (!memory_buffer) + throw std::runtime_error("Unable to create memory buffer for " + lib_path); + + // Create a new JIT library instance for this session and resolve symbols. + auto& jd = session.createBareJITDylib(std::string(lib_path)); + auto loaded = + llvm::orc::DynamicLibrarySearchGenerator::Load(lib_path.data(), + data_layout.getGlobalPrefix()); + + if (!loaded) + throw std::runtime_error("Unable to load " + lib_path); + jd.addGenerator(std::move(*loaded)); + cantFail(layer->add(jd, std::move(*memory_buffer))); + } + + return layer; + }; + // Create IR compile function callback. auto compile_function_creator = [&](llvm::orc::JITTargetMachineBuilder tm_builder) -> llvm::Expected> { // Create target machine with some features possibly turned off. auto tm = create_target(&tm_builder, features); - - // Set the target triple and the data layout for the module. - module->setDataLayout(tm->createDataLayout()); - module->setTargetTriple(tm->getTargetTriple().getTriple()); - return std::make_unique(std::move(tm)); }; // Set JIT instance and extract the data layout from the module. - auto jit_instance = cantFail( - llvm::orc::LLJITBuilder().setCompileFunctionCreator(compile_function_creator).create()); - auto data_layout = module->getDataLayout(); + auto jit_instance = cantFail(llvm::orc::LLJITBuilder() + .setCompileFunctionCreator(compile_function_creator) + .setObjectLinkingLayerCreator(object_linking_layer_creator) + .create()); // Add a ThreadSafeModule to the driver. llvm::orc::ThreadSafeModule tsm(std::move(module), std::make_unique()); @@ -80,5 +110,25 @@ std::unique_ptr JITDriver::create_target( return std::unique_ptr(tm); } +void JITDriver::set_triple_and_data_layout(const std::string& features) { + // Get the default target triple for the host. 
+ auto target_triple = llvm::sys::getDefaultTargetTriple(); + std::string error_msg; + auto* target = llvm::TargetRegistry::lookupTarget(target_triple, error_msg); + if (!target) + throw std::runtime_error("Error " + error_msg + "\n"); + + // Get the CPU information and set a target machine to create the data layout. + std::string cpu(llvm::sys::getHostCPUName()); + + std::unique_ptr tm( + target->createTargetMachine(target_triple, cpu, features, {}, {})); + if (!tm) + throw std::runtime_error("Error: could not create the target machine\n"); + + // Set data layout and the target triple to the module. + module->setDataLayout(tm->createDataLayout()); + module->setTargetTriple(target_triple); +} } // namespace runner } // namespace nmodl diff --git a/src/codegen/llvm/jit_driver.hpp b/src/codegen/llvm/jit_driver.hpp index f994a57303..d46e605054 100644 --- a/src/codegen/llvm/jit_driver.hpp +++ b/src/codegen/llvm/jit_driver.hpp @@ -37,7 +37,7 @@ class JITDriver { : module(std::move(m)) {} /// Initialize the JIT. - void init(std::string features); + void init(std::string features, std::vector& lib_paths); /// Lookup the entry-point without arguments in the JIT and execute it, returning the result. template @@ -66,6 +66,9 @@ class JITDriver { /// A wrapper around llvm::createTargetMachine to turn on/off certain CPU features. std::unique_ptr create_target(llvm::orc::JITTargetMachineBuilder* builder, const std::string& features); + + /// Sets the triple and the data layout for the module. + void set_triple_and_data_layout(const std::string& features); }; /** @@ -79,9 +82,11 @@ class Runner { std::unique_ptr driver = std::make_unique(std::move(module)); public: - Runner(std::unique_ptr m, std::string features = "") + Runner(std::unique_ptr m, + std::string features = "", + std::vector lib_paths = {}) : module(std::move(m)) { - driver->init(features); + driver->init(features, lib_paths); } /// Run the entry-point function without arguments. diff --git a/src/codegen/llvm/llvm_benchmark.cpp b/src/codegen/llvm/llvm_benchmark.cpp index 6ab9ff4982..4c49ce30df 100644 --- a/src/codegen/llvm/llvm_benchmark.cpp +++ b/src/codegen/llvm/llvm_benchmark.cpp @@ -43,7 +43,8 @@ void LLVMBenchmark::benchmark(const std::shared_ptr& node) { output_dir, llvm_build_info.opt_passes, llvm_build_info.use_single_precision, - llvm_build_info.vector_width); + llvm_build_info.vector_width, + llvm_build_info.vec_lib); generate_llvm(visitor, node); // Finally, run the benchmark and log the measurements. @@ -103,7 +104,7 @@ void LLVMBenchmark::run_benchmark(codegen::CodegenLLVMVisitor& visitor, std::string features_str = llvm::join(features.begin(), features.end(), ","); std::unique_ptr m = visitor.get_module(); - runner::Runner runner(std::move(m), features_str); + runner::Runner runner(std::move(m), features_str, shared_libs); // Benchmark every kernel. 
for (const auto& kernel_name: kernel_names) { diff --git a/src/codegen/llvm/llvm_benchmark.hpp b/src/codegen/llvm/llvm_benchmark.hpp index 30ebf182e8..d23567d79d 100644 --- a/src/codegen/llvm/llvm_benchmark.hpp +++ b/src/codegen/llvm/llvm_benchmark.hpp @@ -20,6 +20,7 @@ struct LLVMBuildInfo { int vector_width; bool opt_passes; bool use_single_precision; + std::string vec_lib; }; /** @@ -33,6 +34,8 @@ class LLVMBenchmark { std::string output_dir; + std::vector shared_libs; + int num_experiments; int instance_size; @@ -65,12 +68,14 @@ public: LLVMBenchmark(const std::string& mod_filename, const std::string& output_dir, + std::vector shared_libs, LLVMBuildInfo info, int num_experiments, int instance_size, const std::string& backend) : mod_filename(mod_filename) , output_dir(output_dir) + , shared_libs(shared_libs) , num_experiments(num_experiments) , instance_size(instance_size) , backend(backend) diff --git a/src/main.cpp b/src/main.cpp index 69a7109dde..26958ae129 100644 --- a/src/main.cpp +++ b/src/main.cpp @@ -177,9 +177,12 @@ int main(int argc, const char* argv[]) { /// llvm vector width int llvm_vec_width = 1; - /// vector library + /// vector library name std::string vec_lib("none"); + /// list of shared libraries to link + std::vector libs; + /// run llvm benchmark bool run_benchmark(false); @@ -321,6 +324,9 @@ int main(int argc, const char* argv[]) { benchmark_opt->add_flag("--run", run_benchmark, "Run LLVM benchmark ({})"_format(run_benchmark))->ignore_case(); + benchmark_opt->add_option("--libs", libs, "Shared libraries to link IR against") + ->ignore_case() + ->check(CLI::ExistingFile); benchmark_opt->add_option("--instance-size", instance_size, "Instance struct size ({})"_format(instance_size))->ignore_case(); @@ -630,9 +636,12 @@ int main(int argc, const char* argv[]) { if (run_benchmark) { logger->info("Running LLVM benchmark"); - benchmark::LLVMBuildInfo info{llvm_vec_width, llvm_ir_opt_passes, llvm_float_type}; + benchmark::LLVMBuildInfo info{llvm_vec_width, + llvm_ir_opt_passes, + llvm_float_type, + vec_lib}; benchmark::LLVMBenchmark bench( - modfile, output_dir, info, repeat, instance_size, backend); + modfile, output_dir, libs, info, repeat, instance_size, backend); bench.benchmark(ast); } From 16504c770a49233d36aee975c11d32e24dc1f5a7 Mon Sep 17 00:00:00 2001 From: Pramod Kumbhar Date: Sat, 24 Apr 2021 21:43:05 +0200 Subject: [PATCH 047/331] Avoid local std::ofstream object causing segfault (#614) - std::ofstream().rdbuf() was used, but since the stream was a local object its buffer became invalid at the end of the function scope - make the std::ofstream a member variable so the buffer stays valid for as long as the log stream uses it --- src/codegen/llvm/llvm_benchmark.cpp | 2 -- src/codegen/llvm/llvm_benchmark.hpp | 2 ++ 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/src/codegen/llvm/llvm_benchmark.cpp b/src/codegen/llvm/llvm_benchmark.cpp index 4c49ce30df..c93b723cb0 100644 --- a/src/codegen/llvm/llvm_benchmark.cpp +++ b/src/codegen/llvm/llvm_benchmark.cpp @@ -145,8 +145,6 @@ void LLVMBenchmark::set_log_output() { // Otherwise, dump logs to the specified file. std::string filename = output_dir + "/" + mod_filename + ".log"; - std::ofstream ofs; - ofs.open(filename.c_str()); if (ofs.fail()) diff --git a/src/codegen/llvm/llvm_benchmark.hpp b/src/codegen/llvm/llvm_benchmark.hpp index d23567d79d..646912c253 100644 --- a/src/codegen/llvm/llvm_benchmark.hpp +++ b/src/codegen/llvm/llvm_benchmark.hpp @@ -46,6 +46,8 @@ class LLVMBenchmark { std::shared_ptr log_stream; + std::ofstream ofs; + /// Disable the specified feature.
void disable(const std::string& feature, std::vector& host_features); From dd2889d2c14f44eb43dc5375ecf3b3128fc4ff3b Mon Sep 17 00:00:00 2001 From: George Mitenkov Date: Fri, 30 Apr 2021 15:29:31 -0700 Subject: [PATCH 048/331] Refactoring of runners' infrastructure and dumping object files (#620) The following is added: 1. Dumping object files in JIT. Functionality to dump the binary generated from the LLVM IR module to a `.o` file (enabled by default) has been added to benchmarking. Now, in addition to logs, a `v<vector_width>_<mod_filename>.o` file is generated. The reasons it is an object file and not assembly (hence not included in the logs) are the following: - LLVM does not have library functions that take an object and turn it back into assembly directly; only the `object -> file -> assembly` path is supported. It also has an `llvm-objdump` tool, but it is intended as a command-line utility and does not have a well-defined API. - Writing custom functions to produce readable assembly is not a priority. Also, mimicking `objdump` functionality would be difficult. - Both `objdump` and `llvm-objdump` can be used to inspect the `.o` file manually. 2. Refactoring of the `Runner` class. In addition to the support for dumping the binary, the `Runner` and `JITDriver` classes were refactored to have a nicer OOP style. fixes #611 Co-authored-by: Pramod S Kumbhar --- src/codegen/llvm/jit_driver.cpp | 11 ++- src/codegen/llvm/jit_driver.hpp | 98 +++++++++++++++----- src/codegen/llvm/llvm_benchmark.cpp | 6 +- src/codegen/llvm/main.cpp | 3 +- test/unit/codegen/codegen_llvm_execution.cpp | 12 ++- 5 files changed, 102 insertions(+), 28 deletions(-) diff --git a/src/codegen/llvm/jit_driver.cpp b/src/codegen/llvm/jit_driver.cpp index ec08e8856d..7910036848 100644 --- a/src/codegen/llvm/jit_driver.cpp +++ b/src/codegen/llvm/jit_driver.cpp @@ -15,6 +15,7 @@ #include "llvm/ExecutionEngine/Orc/IRCompileLayer.h" #include "llvm/ExecutionEngine/Orc/JITTargetMachineBuilder.h" #include "llvm/ExecutionEngine/Orc/LLJIT.h" +#include "llvm/ExecutionEngine/Orc/ObjectTransformLayer.h" #include "llvm/ExecutionEngine/Orc/RTDyldObjectLinkingLayer.h" #include "llvm/ExecutionEngine/SectionMemoryManager.h" #include "llvm/Support/Host.h" @@ -24,7 +25,9 @@ namespace nmodl { namespace runner { -void JITDriver::init(std::string features, std::vector& lib_paths) { +void JITDriver::init(std::string features, + std::vector lib_paths, + ObjDumpInfo* dump_info) { llvm::InitializeNativeTarget(); llvm::InitializeNativeTargetAsmPrinter(); @@ -83,6 +86,12 @@ void JITDriver::init(std::string features, std::vector& lib_paths) llvm::orc::JITDylib& sym_tab = jit->getMainJITDylib(); sym_tab.addGenerator(cantFail(llvm::orc::DynamicLibrarySearchGenerator::GetForCurrentProcess( data_layout.getGlobalPrefix()))); + + // Optionally, dump the binary to the object file. + if (dump_info) { + jit->getObjTransformLayer().setTransform( + llvm::orc::DumpObjects(dump_info->output_dir, dump_info->filename)); + } } std::unique_ptr JITDriver::create_target( diff --git a/src/codegen/llvm/jit_driver.hpp b/src/codegen/llvm/jit_driver.hpp index d46e605054..dfd06ca7ee 100644 --- a/src/codegen/llvm/jit_driver.hpp +++ b/src/codegen/llvm/jit_driver.hpp @@ -20,9 +20,18 @@ namespace nmodl { namespace runner { +/// A struct to hold the information for dumping object file. +struct ObjDumpInfo { + /// Object file name. + std::string filename; + + /// Object file output directory.
+ std::string output_dir; +}; + /** * \class JITDriver - * \brief Driver to execute MOD file function via LLVM IR backend + * \brief Driver to execute a MOD file function via LLVM IR backend. */ class JITDriver { private: @@ -33,13 +42,15 @@ class JITDriver { std::unique_ptr module; public: - JITDriver(std::unique_ptr m) + explicit JITDriver(std::unique_ptr m) : module(std::move(m)) {} - /// Initialize the JIT. - void init(std::string features, std::vector& lib_paths); + /// Initializes the JIT. + void init(std::string features = "", + std::vector lib_paths = {}, + ObjDumpInfo* dump_info = nullptr); - /// Lookup the entry-point without arguments in the JIT and execute it, returning the result. + /// Lookups the entry-point without arguments in the JIT and executes it, returning the result. template ReturnType execute_without_arguments(const std::string& entry_point) { auto expected_symbol = jit->lookup(entry_point); @@ -51,7 +62,7 @@ class JITDriver { return result; } - /// Lookup the entry-point with an argument in the JIT and execute it, returning the result. + /// Lookups the entry-point with an argument in the JIT and executes it, returning the result. template ReturnType execute_with_arguments(const std::string& entry_point, ArgType arg) { auto expected_symbol = jit->lookup(entry_point); @@ -63,7 +74,8 @@ class JITDriver { return result; } - /// A wrapper around llvm::createTargetMachine to turn on/off certain CPU features. + private: + /// Creates llvm::TargetMachine with certain CPU features turned on/off. std::unique_ptr create_target(llvm::orc::JITTargetMachineBuilder* builder, const std::string& features); @@ -72,35 +84,79 @@ class JITDriver { }; /** - * \class Runner - * \brief A wrapper around JITDriver to execute an entry point in the LLVM IR module. + * \class BaseRunner + * \brief A base runner class that provides functionality to execute an + * entry point in the LLVM IR module. */ -class Runner { - private: - std::unique_ptr module; +class BaseRunner { + protected: + std::unique_ptr driver; - std::unique_ptr driver = std::make_unique(std::move(module)); + explicit BaseRunner(std::unique_ptr m) + : driver(std::make_unique(std::move(m))) {} public: - Runner(std::unique_ptr m, - std::string features = "", - std::vector lib_paths = {}) - : module(std::move(m)) { - driver->init(features, lib_paths); - } + /// Sets up the JIT driver. + virtual void initialize_driver() = 0; - /// Run the entry-point function without arguments. + /// Runs the entry-point function without arguments. template ReturnType run_without_arguments(const std::string& entry_point) { return driver->template execute_without_arguments(entry_point); } - /// Run the entry-point function with a pointer to the data as an argument. + /// Runs the entry-point function with a pointer to the data as an argument. template ReturnType run_with_argument(const std::string& entry_point, ArgType arg) { return driver->template execute_with_arguments(entry_point, arg); } }; +/** + * \class TestRunner + * \brief A simple runner for testing purposes. + */ +class TestRunner: public BaseRunner { + public: + explicit TestRunner(std::unique_ptr m) + : BaseRunner(std::move(m)) {} + + virtual void initialize_driver() { + driver->init(); + } +}; + +/** + * \class BenchmarkRunner + * \brief A runner with benchmarking functionality. It takes user-specified CPU + * features into account, as well as it can link against shared libraries. 
+ */ +class BenchmarkRunner: public BaseRunner { + private: + /// Information on dumping object file generated from LLVM IR. + ObjDumpInfo dump_info; + + /// CPU features specified by the user. + std::string features; + + /// Shared libraries' paths to link against. + std::vector shared_lib_paths; + + public: + BenchmarkRunner(std::unique_ptr m, + std::string filename, + std::string output_dir, + std::string features = "", + std::vector lib_paths = {}) + : BaseRunner(std::move(m)) + , dump_info{filename, output_dir} + , features(features) + , shared_lib_paths(lib_paths) {} + + virtual void initialize_driver() { + driver->init(features, shared_lib_paths, &dump_info); + } +}; + } // namespace runner } // namespace nmodl diff --git a/src/codegen/llvm/llvm_benchmark.cpp b/src/codegen/llvm/llvm_benchmark.cpp index c93b723cb0..87e36ec822 100644 --- a/src/codegen/llvm/llvm_benchmark.cpp +++ b/src/codegen/llvm/llvm_benchmark.cpp @@ -104,7 +104,11 @@ void LLVMBenchmark::run_benchmark(codegen::CodegenLLVMVisitor& visitor, std::string features_str = llvm::join(features.begin(), features.end(), ","); std::unique_ptr m = visitor.get_module(); - runner::Runner runner(std::move(m), features_str, shared_libs); + + // Create the benchmark runner and intialize it. + std::string filename = "v" + std::to_string(llvm_build_info.vector_width) + "_" + mod_filename; + runner::BenchmarkRunner runner(std::move(m), filename, output_dir, features_str, shared_libs); + runner.initialize_driver(); // Benchmark every kernel. for (const auto& kernel_name: kernel_names) { diff --git a/src/codegen/llvm/main.cpp b/src/codegen/llvm/main.cpp index acbdc37f19..b700f5ad59 100644 --- a/src/codegen/llvm/main.cpp +++ b/src/codegen/llvm/main.cpp @@ -64,7 +64,8 @@ int main(int argc, const char* argv[]) { throw std::runtime_error( "Error: entry-point functions with non-double return type are not supported\n"); - Runner runner(std::move(module)); + TestRunner runner(std::move(module)); + runner.initialize_driver(); // Since only double type is supported, provide explicit double type to the running function. auto r = runner.run_without_arguments(entry_point_name); diff --git a/test/unit/codegen/codegen_llvm_execution.cpp b/test/unit/codegen/codegen_llvm_execution.cpp index 4e2717e45c..cec4e5017b 100644 --- a/test/unit/codegen/codegen_llvm_execution.cpp +++ b/test/unit/codegen/codegen_llvm_execution.cpp @@ -130,7 +130,8 @@ SCENARIO("Arithmetic expression", "[llvm][runner]") { llvm_visitor.visit_program(*ast); std::unique_ptr m = llvm_visitor.get_module(); - Runner runner(std::move(m)); + TestRunner runner(std::move(m)); + runner.initialize_driver(); THEN("functions are evaluated correctly") { auto exp_result = runner.run_without_arguments("exponential"); @@ -231,7 +232,8 @@ SCENARIO("Optimised arithmetic expression", "[llvm][runner]") { llvm_visitor.visit_program(*ast); std::unique_ptr m = llvm_visitor.get_module(); - Runner runner(std::move(m)); + TestRunner runner(std::move(m)); + runner.initialize_driver(); THEN("optimizations preserve function results") { // Check exponential is turned into a constant. @@ -325,7 +327,8 @@ SCENARIO("Simple scalar kernel", "[llvm][runner]") { // Set up the JIT runner. 
std::unique_ptr module = llvm_visitor.get_module(); - Runner runner(std::move(module)); + TestRunner runner(std::move(module)); + runner.initialize_driver(); THEN("Values in struct have changed according to the formula") { runner.run_with_argument("__nrn_state_test_wrapper", From 93732e0c0080c79b205d0a64a72601be741599e6 Mon Sep 17 00:00:00 2001 From: George Mitenkov Date: Fri, 7 May 2021 15:13:53 -0700 Subject: [PATCH 049/331] Optimisation levels for benchmarking (#623) This PR adds two flags to the benchmarking pipeline: * `--opt-level-ir`: This flag is used to run `-On` passes on the generated LLVM IR module. * `--opt-level-codegen`: This flag is used for setting the optimisation level for machine code generation inside the JIT target machine. Example: ```bash $ ./nmodl file.mod \ llvm --ir --vector-width 1 \ benchmark --run --instance-size 10000000 --repeat 20 --opt-level-ir 2 --opt-level-codegen 2 ``` fixes #616 --- cmake/LLVMHelper.cmake | 3 + src/codegen/llvm/jit_driver.cpp | 197 ++++++++++++++++++++-------- src/codegen/llvm/jit_driver.hpp | 36 ++--- src/codegen/llvm/llvm_benchmark.cpp | 40 +++--- src/codegen/llvm/llvm_benchmark.hpp | 59 +++++---- src/main.cpp | 59 ++++++--- 6 files changed, 266 insertions(+), 128 deletions(-) diff --git a/cmake/LLVMHelper.cmake b/cmake/LLVMHelper.cmake index 2b7db94a85..b0c8b2a48b 100644 --- a/cmake/LLVMHelper.cmake +++ b/cmake/LLVMHelper.cmake @@ -12,9 +12,12 @@ llvm_map_components_to_libnames( core executionengine instcombine + ipo mc native orcjit + target + transformutils scalaropts support) set(CMAKE_REQUIRED_INCLUDES ${LLVM_INCLUDE_DIRS}) diff --git a/src/codegen/llvm/jit_driver.cpp b/src/codegen/llvm/jit_driver.cpp index 7910036848..1e8eb4bfd0 100644 --- a/src/codegen/llvm/jit_driver.cpp +++ b/src/codegen/llvm/jit_driver.cpp @@ -8,6 +8,7 @@ #include "jit_driver.hpp" #include "codegen/llvm/codegen_llvm_visitor.hpp" +#include "llvm/Analysis/TargetTransformInfo.h" #include "llvm/ExecutionEngine/JITEventListener.h" #include "llvm/ExecutionEngine/ObjectCache.h" #include "llvm/ExecutionEngine/Orc/CompileUtils.h" @@ -18,21 +19,139 @@ #include "llvm/ExecutionEngine/Orc/ObjectTransformLayer.h" #include "llvm/ExecutionEngine/Orc/RTDyldObjectLinkingLayer.h" #include "llvm/ExecutionEngine/SectionMemoryManager.h" +#include "llvm/IR/AssemblyAnnotationWriter.h" +#include "llvm/InitializePasses.h" #include "llvm/Support/Host.h" #include "llvm/Support/TargetRegistry.h" #include "llvm/Support/TargetSelect.h" +#include "llvm/Support/ToolOutputFile.h" +#include "llvm/Transforms/IPO/PassManagerBuilder.h" namespace nmodl { namespace runner { +/****************************************************************************************/ +/* Utilities for JIT driver */ +/****************************************************************************************/ + +/// Initialises some LLVM optimisation passes.
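+/// (Registering the core, transform-utils, scalar, instcombine and analysis
+/// passes with the global PassRegistry makes them available to the legacy
+/// pass managers used below.)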
+static void initialise_optimisation_passes() { + auto& registry = *llvm::PassRegistry::getPassRegistry(); + llvm::initializeCore(registry); + llvm::initializeTransformUtils(registry); + llvm::initializeScalarOpts(registry); + llvm::initializeInstCombine(registry); + llvm::initializeAnalysis(registry); +} + +/// Populates pass managers with passes for the given optimisation levels. +static void populate_pms(llvm::legacy::FunctionPassManager& func_pm, + llvm::legacy::PassManager& module_pm, + int opt_level, + int size_level, + llvm::TargetMachine* tm) { + // First, set the pass manager builder with some basic optimisation information. + llvm::PassManagerBuilder pm_builder; + pm_builder.OptLevel = opt_level; + pm_builder.SizeLevel = size_level; + pm_builder.DisableUnrollLoops = opt_level == 0; + + // If target machine is defined, then initialise the TargetTransformInfo for the target. + if (tm) { + module_pm.add(createTargetTransformInfoWrapperPass(tm->getTargetIRAnalysis())); + func_pm.add(createTargetTransformInfoWrapperPass(tm->getTargetIRAnalysis())); + } + + // Populate pass managers. + pm_builder.populateModulePassManager(module_pm); + pm_builder.populateFunctionPassManager(func_pm); +} + +/// Runs the function and module passes on the provided module. +static void run_optimisation_passes(llvm::Module& module, + llvm::legacy::FunctionPassManager& func_pm, + llvm::legacy::PassManager& module_pm) { + func_pm.doInitialization(); + auto& functions = module.getFunctionList(); + for (auto& function: functions) { + llvm::verifyFunction(function); + func_pm.run(function); + } + func_pm.doFinalization(); + module_pm.run(module); +} + +/// Optimises the given LLVM IR module. +static void optimise_module(llvm::Module& module, + int opt_level, + llvm::TargetMachine* tm = nullptr) { + llvm::legacy::FunctionPassManager func_pm(&module); + llvm::legacy::PassManager module_pm; + populate_pms(func_pm, module_pm, opt_level, /*size_level=*/0, tm); + run_optimisation_passes(module, func_pm, module_pm); +} + +/// Sets the target triple and the data layout of the module. +static void set_triple_and_data_layout(llvm::Module& module, const std::string& features) { + // Get the default target triple for the host. + auto target_triple = llvm::sys::getDefaultTargetTriple(); + std::string error_msg; + auto* target = llvm::TargetRegistry::lookupTarget(target_triple, error_msg); + if (!target) + throw std::runtime_error("Error " + error_msg + "\n"); + + // Get the CPU information and set a target machine to create the data layout. + std::string cpu(llvm::sys::getHostCPUName()); + std::unique_ptr tm( + target->createTargetMachine(target_triple, cpu, features, {}, {})); + if (!tm) + throw std::runtime_error("Error: could not create the target machine\n"); + + // Set data layout and the target triple to the module. + module.setDataLayout(tm->createDataLayout()); + module.setTargetTriple(target_triple); +} + +/// Creates llvm::TargetMachine with certain CPU features turned on/off. +static std::unique_ptr create_target( + llvm::orc::JITTargetMachineBuilder* tm_builder, + const std::string& features, + int opt_level) { + // First, look up the target. + std::string error_msg; + auto target_triple = tm_builder->getTargetTriple().getTriple(); + auto* target = llvm::TargetRegistry::lookupTarget(target_triple, error_msg); + if (!target) + throw std::runtime_error("Error " + error_msg + "\n"); + + // Create default target machine with provided features. 
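+ // (The --opt-level-codegen value is forwarded as llvm::CodeGenOpt::Level,
+ // so values 0-3 map onto None/Less/Default/Aggressive machine-code
+ // optimisation.)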
+ auto tm = target->createTargetMachine(target_triple, + llvm::sys::getHostCPUName().str(), + features, + tm_builder->getOptions(), + tm_builder->getRelocationModel(), + tm_builder->getCodeModel(), + static_cast(opt_level), + /*JIT=*/true); + if (!tm) + throw std::runtime_error("Error: could not create the target machine\n"); + + return std::unique_ptr(tm); +} + +/****************************************************************************************/ +/* JIT driver */ +/****************************************************************************************/ + void JITDriver::init(std::string features, std::vector lib_paths, - ObjDumpInfo* dump_info) { + BenchmarkInfo* benchmark_info) { llvm::InitializeNativeTarget(); llvm::InitializeNativeTargetAsmPrinter(); + initialise_optimisation_passes(); // Set the target triple and the data layout for the module. - set_triple_and_data_layout(features); + set_triple_and_data_layout(*module, features); auto data_layout = module->getDataLayout(); // Create object linking function callback. @@ -67,11 +186,31 @@ void JITDriver::init(std::string features, auto compile_function_creator = [&](llvm::orc::JITTargetMachineBuilder tm_builder) -> llvm::Expected> { // Create target machine with some features possibly turned off. - auto tm = create_target(&tm_builder, features); + auto tm = create_target(&tm_builder, features, benchmark_info->opt_level_codegen); + + // Optimise the LLVM IR module. + optimise_module(*module, benchmark_info->opt_level_ir, tm.get()); + + // Save optimised module to .ll file if benchmarking. + if (benchmark_info) { + std::error_code error_code; + std::unique_ptr out = + std::make_unique(benchmark_info->output_dir + "/" + + benchmark_info->filename + "_opt.ll", + error_code, + llvm::sys::fs::OF_Text); + if (error_code) + throw std::runtime_error("Error: " + error_code.message()); + + std::unique_ptr annotator; + module->print(out->os(), annotator.get()); + out->keep(); + } + return std::make_unique(std::move(tm)); }; - // Set JIT instance and extract the data layout from the module. + // Set the JIT instance. auto jit_instance = cantFail(llvm::orc::LLJITBuilder() .setCompileFunctionCreator(compile_function_creator) .setObjectLinkingLayerCreator(object_linking_layer_creator) @@ -88,56 +227,10 @@ void JITDriver::init(std::string features, data_layout.getGlobalPrefix()))); // Optionally, dump the binary to the object file. - if (dump_info) { + if (benchmark_info) { jit->getObjTransformLayer().setTransform( - llvm::orc::DumpObjects(dump_info->output_dir, dump_info->filename)); + llvm::orc::DumpObjects(benchmark_info->output_dir, benchmark_info->filename)); } } - -std::unique_ptr JITDriver::create_target( - llvm::orc::JITTargetMachineBuilder* builder, - const std::string& features) { - // First, look up the target. - std::string error_msg; - auto target_triple = builder->getTargetTriple().getTriple(); - auto* target = llvm::TargetRegistry::lookupTarget(target_triple, error_msg); - if (!target) - throw std::runtime_error("Error " + error_msg + "\n"); - - // Create default target machine with provided features. 
- auto tm = target->createTargetMachine(target_triple, - llvm::sys::getHostCPUName().str(), - features, - builder->getOptions(), - builder->getRelocationModel(), - builder->getCodeModel(), - /*OL=*/llvm::CodeGenOpt::Default, - /*JIT=*/true); - if (!tm) - throw std::runtime_error("Error: could not create the target machine\n"); - - return std::unique_ptr(tm); -} - -void JITDriver::set_triple_and_data_layout(const std::string& features) { - // Get the default target triple for the host. - auto target_triple = llvm::sys::getDefaultTargetTriple(); - std::string error_msg; - auto* target = llvm::TargetRegistry::lookupTarget(target_triple, error_msg); - if (!target) - throw std::runtime_error("Error " + error_msg + "\n"); - - // Get the CPU information and set a target machine to create the data layout. - std::string cpu(llvm::sys::getHostCPUName()); - - std::unique_ptr tm( - target->createTargetMachine(target_triple, cpu, features, {}, {})); - if (!tm) - throw std::runtime_error("Error: could not create the target machine\n"); - - // Set data layout and the target triple to the module. - module->setDataLayout(tm->createDataLayout()); - module->setTargetTriple(target_triple); -} } // namespace runner } // namespace nmodl diff --git a/src/codegen/llvm/jit_driver.hpp b/src/codegen/llvm/jit_driver.hpp index dfd06ca7ee..151ec177d8 100644 --- a/src/codegen/llvm/jit_driver.hpp +++ b/src/codegen/llvm/jit_driver.hpp @@ -20,13 +20,19 @@ namespace nmodl { namespace runner { -/// A struct to hold the information for dumping object file. -struct ObjDumpInfo { - /// Object file name. +/// A struct to hold the information for benchmarking. +struct BenchmarkInfo { + /// Object filename to dump. std::string filename; /// Object file output directory. std::string output_dir; + + /// Optimisation level for generated IR. + int opt_level_ir; + + /// Optimisation level for machine code generation. + int opt_level_codegen; }; /** @@ -45,10 +51,10 @@ class JITDriver { explicit JITDriver(std::unique_ptr m) : module(std::move(m)) {} - /// Initializes the JIT. + /// Initializes the JIT driver. void init(std::string features = "", std::vector lib_paths = {}, - ObjDumpInfo* dump_info = nullptr); + BenchmarkInfo* benchmark_info = nullptr); /// Lookups the entry-point without arguments in the JIT and executes it, returning the result. template @@ -73,14 +79,6 @@ class JITDriver { ReturnType result = res(arg); return result; } - - private: - /// Creates llvm::TargetMachine with certain CPU features turned on/off. - std::unique_ptr create_target(llvm::orc::JITTargetMachineBuilder* builder, - const std::string& features); - - /// Sets the triple and the data layout for the module. - void set_triple_and_data_layout(const std::string& features); }; /** @@ -133,8 +131,8 @@ class TestRunner: public BaseRunner { */ class BenchmarkRunner: public BaseRunner { private: - /// Information on dumping object file generated from LLVM IR. - ObjDumpInfo dump_info; + /// Benchmarking information passed to JIT driver. + BenchmarkInfo benchmark_info; /// CPU features specified by the user. 
std::string features; @@ -147,14 +145,16 @@ class BenchmarkRunner: public BaseRunner { std::string filename, std::string output_dir, std::string features = "", - std::vector lib_paths = {}) + std::vector lib_paths = {}, + int opt_level_ir = 0, + int opt_level_codegen = 0) : BaseRunner(std::move(m)) - , dump_info{filename, output_dir} + , benchmark_info{filename, output_dir, opt_level_ir, opt_level_codegen} , features(features) , shared_lib_paths(lib_paths) {} virtual void initialize_driver() { - driver->init(features, shared_lib_paths, &dump_info); + driver->init(features, shared_lib_paths, &benchmark_info); } }; diff --git a/src/codegen/llvm/llvm_benchmark.cpp b/src/codegen/llvm/llvm_benchmark.cpp index 87e36ec822..df0c54517d 100644 --- a/src/codegen/llvm/llvm_benchmark.cpp +++ b/src/codegen/llvm/llvm_benchmark.cpp @@ -19,10 +19,24 @@ namespace nmodl { namespace benchmark { - /// Precision for the timing measurements. static constexpr int PRECISION = 9; +/// Get the host CPU features in the format: +/// +feature,+feature,-feature,+feature,... +/// where `+` indicates that the feature is enabled. +static std::vector get_cpu_features() { + std::string cpu(llvm::sys::getHostCPUName()); + + llvm::SubtargetFeatures features; + llvm::StringMap host_features; + if (llvm::sys::getHostCPUFeatures(host_features)) { + for (auto& f: host_features) + features.AddFeature(f.first(), f.second); + } + return features.getFeatures(); +} + void LLVMBenchmark::disable(const std::string& feature, std::vector& host_features) { for (auto& host_feature: host_features) { @@ -34,7 +48,7 @@ void LLVMBenchmark::disable(const std::string& feature, std::vector } } -void LLVMBenchmark::benchmark(const std::shared_ptr& node) { +void LLVMBenchmark::run(const std::shared_ptr& node) { // First, set the output stream for the logs. set_log_output(); @@ -65,18 +79,6 @@ void LLVMBenchmark::generate_llvm(codegen::CodegenLLVMVisitor& visitor, << diff.count() << "\n\n"; } -std::vector LLVMBenchmark::get_cpu_features() { - std::string cpu(llvm::sys::getHostCPUName()); - - llvm::SubtargetFeatures features; - llvm::StringMap host_features; - if (llvm::sys::getHostCPUFeatures(host_features)) { - for (auto& f: host_features) - features.AddFeature(f.first(), f.second); - } - return features.getFeatures(); -} - void LLVMBenchmark::run_benchmark(codegen::CodegenLLVMVisitor& visitor, const std::shared_ptr& node) { // Set the codegen data helper and find the kernels. @@ -105,9 +107,15 @@ void LLVMBenchmark::run_benchmark(codegen::CodegenLLVMVisitor& visitor, std::string features_str = llvm::join(features.begin(), features.end(), ","); std::unique_ptr m = visitor.get_module(); - // Create the benchmark runner and intialize it. + // Create the benchmark runner and initialize it. std::string filename = "v" + std::to_string(llvm_build_info.vector_width) + "_" + mod_filename; - runner::BenchmarkRunner runner(std::move(m), filename, output_dir, features_str, shared_libs); + runner::BenchmarkRunner runner(std::move(m), + filename, + output_dir, + features_str, + shared_libs, + opt_level_ir, + opt_level_codegen); runner.initialize_driver(); // Benchmark every kernel. diff --git a/src/codegen/llvm/llvm_benchmark.hpp b/src/codegen/llvm/llvm_benchmark.hpp index 646912c253..c2c781d7f0 100644 --- a/src/codegen/llvm/llvm_benchmark.hpp +++ b/src/codegen/llvm/llvm_benchmark.hpp @@ -30,43 +30,39 @@ struct LLVMBuildInfo { */ class LLVMBenchmark { private: + /// Source MOD file name. 
std::string mod_filename; + /// The output directory for logs and other files. std::string output_dir; + /// Paths to shared libraries. std::vector shared_libs; + /// The number of experiments to repeat. int num_experiments; + /// The size of the instance struct for benchmarking. int instance_size; + /// Benchmarking backend std::string backend; + /// Optimisation level for LLVM IR transformations. + int opt_level_ir; + + /// Optimisation level for machine code generation. + int opt_level_codegen; + + /// LLVM visitor information. LLVMBuildInfo llvm_build_info; + /// The log output stream (file or stdout). std::shared_ptr log_stream; + /// Filestream for dumping logs to the file. std::ofstream ofs; - /// Disable the specified feature. - void disable(const std::string& feature, std::vector& host_features); - - /// Visits the AST to construct the LLVM IR module. - void generate_llvm(codegen::CodegenLLVMVisitor& visitor, - const std::shared_ptr& node); - - /// Get the host CPU features in the format: - /// +feature,+feature,-feature,+feature,... - /// where `+` indicates that the feature is enabled. - std::vector get_cpu_features(); - - /// Runs the main body of the benchmark, executing the compute kernels. - void run_benchmark(codegen::CodegenLLVMVisitor& visitor, - const std::shared_ptr& node); - - /// Sets the log output stream (file or console). - void set_log_output(); - public: LLVMBenchmark(const std::string& mod_filename, const std::string& output_dir, @@ -74,17 +70,36 @@ class LLVMBenchmark { LLVMBuildInfo info, int num_experiments, int instance_size, - const std::string& backend) + const std::string& backend, + int opt_level_ir, + int opt_level_codegen) : mod_filename(mod_filename) , output_dir(output_dir) , shared_libs(shared_libs) , num_experiments(num_experiments) , instance_size(instance_size) , backend(backend) - , llvm_build_info(info) {} + , llvm_build_info(info) + , opt_level_ir(opt_level_ir) + , opt_level_codegen(opt_level_codegen) {} /// Runs the benchmark. - void benchmark(const std::shared_ptr& node); + void run(const std::shared_ptr& node); + + private: + /// Disables the specified feature in the target. + void disable(const std::string& feature, std::vector& host_features); + + /// Visits the AST to construct the LLVM IR module. + void generate_llvm(codegen::CodegenLLVMVisitor& visitor, + const std::shared_ptr& node); + + /// Runs the main body of the benchmark, executing the compute kernels. + void run_benchmark(codegen::CodegenLLVMVisitor& visitor, + const std::shared_ptr& node); + + /// Sets the log output stream (file or console). 
+ void set_log_output(); }; diff --git a/src/main.cpp b/src/main.cpp index 26958ae129..8e6e219df1 100644 --- a/src/main.cpp +++ b/src/main.cpp @@ -178,19 +178,25 @@ int main(int argc, const char* argv[]) { int llvm_vec_width = 1; /// vector library name - std::string vec_lib("none"); - - /// list of shared libraries to link - std::vector libs; + std::string vector_library("none"); /// run llvm benchmark - bool run_benchmark(false); + bool run_llvm_benchmark(false); + + /// optimisation level for IR generation + int llvm_opt_level_ir = 0; + + /// optimisation level for machine code generation + int llvm_opt_level_codegen = 0; + + /// list of shared libraries to link against in JIT + std::vector shared_lib_paths; /// the size of the instance struct for the benchmark int instance_size = 10000; - /// the number of experiments to run for the benchmarking - int repeat = 100; + /// the number of repeated experiments for the benchmarking + int num_experiments = 100; /// specify the backend for LLVM IR to target std::string backend = "default"; @@ -316,23 +322,29 @@ int main(int argc, const char* argv[]) { llvm_vec_width, "LLVM explicit vectorisation width ({})"_format(llvm_vec_width))->ignore_case(); llvm_opt->add_option("--veclib", - vec_lib, - "Vector library for maths functions ({})"_format(vec_lib))->check(CLI::IsMember({"Accelerate", "libmvec", "MASSV", "SVML", "none"})); + vector_library, + "Vector library for maths functions ({})"_format(vector_library))->check(CLI::IsMember({"Accelerate", "libmvec", "MASSV", "SVML", "none"})); // LLVM IR benchmark options. auto benchmark_opt = app.add_subcommand("benchmark", "LLVM benchmark option")->ignore_case(); benchmark_opt->add_flag("--run", - run_benchmark, - "Run LLVM benchmark ({})"_format(run_benchmark))->ignore_case(); - benchmark_opt->add_option("--libs", libs, "Shared libraries to link IR against") + run_llvm_benchmark, + "Run LLVM benchmark ({})"_format(run_llvm_benchmark))->ignore_case(); + benchmark_opt->add_option("--opt-level-ir", + llvm_opt_level_ir, + "LLVM IR optimisation level (O{})"_format(llvm_opt_level_ir))->ignore_case()->check(CLI::IsMember({"0", "1", "2", "3"})); + benchmark_opt->add_option("--opt-level-codegen", + llvm_opt_level_codegen, + "Machine code optimisation level (O{})"_format(llvm_opt_level_codegen))->ignore_case()->check(CLI::IsMember({"0", "1", "2", "3"})); + benchmark_opt->add_option("--libs", shared_lib_paths, "Shared libraries to link IR against") ->ignore_case() ->check(CLI::ExistingFile); benchmark_opt->add_option("--instance-size", instance_size, "Instance struct size ({})"_format(instance_size))->ignore_case(); benchmark_opt->add_option("--repeat", - repeat, - "Number of experiments for benchmarking ({})"_format(repeat))->ignore_case(); + num_experiments, + "Number of experiments for benchmarking ({})"_format(num_experiments))->ignore_case(); benchmark_opt->add_option("--backend", backend, "Target's backend ({})"_format(backend))->ignore_case()->check(CLI::IsMember({"avx2", "default", "sse2"})); @@ -634,15 +646,22 @@ int main(int argc, const char* argv[]) { #ifdef NMODL_LLVM_BACKEND - if (run_benchmark) { + if (run_llvm_benchmark) { logger->info("Running LLVM benchmark"); benchmark::LLVMBuildInfo info{llvm_vec_width, llvm_ir_opt_passes, llvm_float_type, - vec_lib}; - benchmark::LLVMBenchmark bench( - modfile, output_dir, libs, info, repeat, instance_size, backend); - bench.benchmark(ast); + vector_library}; + benchmark::LLVMBenchmark benchmark(modfile, + output_dir, + shared_lib_paths, + info, + 
num_experiments, + instance_size, + backend, + llvm_opt_level_ir, + llvm_opt_level_codegen); + benchmark.run(ast); } else if (llvm_ir) { @@ -652,7 +671,7 @@ llvm_ir_opt_passes, llvm_float_type, llvm_vec_width, - vec_lib); + vector_library); visitor.visit_program(*ast); ast_to_nmodl(*ast, filepath("llvm", "mod")); ast_to_json(*ast, filepath("llvm", "json")); From 150943c424019112cadccd37e23dcd402a8fd438 Mon Sep 17 00:00:00 2001 From: George Mitenkov Date: Sat, 8 May 2021 02:44:01 -0700 Subject: [PATCH 050/331] Adding function debug information (#628) Added debug support to the LLVM code generation pipeline. Currently, only basic support has been added: 1. Debug information about functions (name) 2. Debug information about the module **What has been changed and added** 1. A new class `DebugBuilder` was created. It is used as a wrapper around LLVM's `DIBuilder` and holds important information such as the `LLVMContext`, debug file and compile unit. It also wraps `DIBuilder`'s functionality into a more suitable API. 2. A temporary `Location` struct has been added. It encapsulates the location of the source AST construct and reflects `ModToken` at the LLVM code generation level. It is only used if the location of the source NMODL function is known. 3. The LLVM visitor now takes an extra `add_debug_information` flag and handles debug information creation. For readability, `IRBuilder` was renamed to `ir_builder`. 4. The JIT runner is now able to listen for GDB, perf (build LLVM with `-DLLVM_USE_PERF=ON`) and VTune (build LLVM with `-DLLVM_USE_INTEL_JITEVENTS=ON`) events. 5. Necessary cmake changes were added to optionally support JIT event listeners (`-DNMODL_HAVE_JIT_EVENT_LISTENERS`). **How to generate debug information** Debug information is attached to every function, procedure or artificially created kernel (and corresponding wrappers). Debug information is enabled by default; to turn it off, use the `--disable-debug-info` flag. For example, the given NMODL ```nmodl 1 FUNCTION func(x) { 2 func = x 3 } 4 5 PROCEDURE proc() {} ``` is transformed (running `./bin/nmodl <file>.mod llvm --ir`) into ```llvm define double @func(double %x1) !dbg !4 { ; ... } define i32 @proc() !dbg !6 { ; ...
} !llvm.dbg.cu = !{!0} !llvm.module.flags = !{!3} !0 = distinct !DICompileUnit(language: DW_LANG_C, file: !1, producer: "NMODL-LLVM", isOptimized: false, runtimeVersion: 0, emissionKind: FullDebug, enums: !2) !1 = !DIFile(filename: "foo", directory: ".") !2 = !{} !3 = !{i32 2, !"Debug Version", i32 3} !4 = distinct !DISubprogram(name: "func", linkageName: "func", scope: null, file: !1, line: 1, type: !5, scopeLine: 1, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0, retainedNodes: !2) !5 = !DISubroutineType(types: !2) !6 = distinct !DISubprogram(name: "proc", linkageName: "proc", scope: null, file: !1, line: 5, type: !5, scopeLine: 5, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0, retainedNodes: !2) ``` fixes #592 #612 Co-authored-by: Pramod Kumbhar --- CMakeLists.txt | 2 + ci/bb5-pr.sh | 3 +- cmake/LLVMHelper.cmake | 38 ++-- src/codegen/llvm/CMakeLists.txt | 9 +- .../llvm/codegen_llvm_helper_visitor.cpp | 3 + src/codegen/llvm/codegen_llvm_visitor.cpp | 201 ++++++++++-------- src/codegen/llvm/codegen_llvm_visitor.hpp | 18 +- src/codegen/llvm/jit_driver.cpp | 19 ++ src/codegen/llvm/jit_driver.hpp | 11 + src/codegen/llvm/llvm_benchmark.cpp | 3 +- src/codegen/llvm/llvm_debug_builder.cpp | 63 ++++++ src/codegen/llvm/llvm_debug_builder.hpp | 70 ++++++ src/main.cpp | 9 +- 13 files changed, 339 insertions(+), 110 deletions(-) create mode 100644 src/codegen/llvm/llvm_debug_builder.cpp create mode 100644 src/codegen/llvm/llvm_debug_builder.hpp diff --git a/CMakeLists.txt b/CMakeLists.txt index 2a0b47a599..86da8140b0 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -25,6 +25,7 @@ set(CMAKE_RUNTIME_OUTPUT_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}/bin) option(NMODL_ENABLE_PYTHON_BINDINGS "Enable pybind11 based python bindings" OFF) option(NMODL_ENABLE_LEGACY_UNITS "Use original faraday, R, etc. 
instead of 2019 nist constants" OFF) option(NMODL_ENABLE_LLVM "Enable LLVM based code generation" ON) +option(NMODL_ENABLE_JIT_EVENT_LISTENERS "Enable JITEventListener for Perf and Vtune" OFF) if(NMODL_ENABLE_LEGACY_UNITS) add_definitions(-DUSE_LEGACY_UNITS) @@ -267,6 +268,7 @@ if(cmake_generator_tolower MATCHES "makefile") message(STATUS " VERSION | ${LLVM_PACKAGE_VERSION}") message(STATUS " INCLUDE | ${LLVM_INCLUDE_DIRS}") message(STATUS " CMAKE | ${LLVM_CMAKE_DIR}") + message(STATUS " JIT LISTENERS | ${NMODL_ENABLE_JIT_EVENT_LISTENERS}") endif() if(NMODL_CLANG_FORMAT) message(STATUS "Clang Format | ${ClangFormat_EXECUTABLE}") diff --git a/ci/bb5-pr.sh b/ci/bb5-pr.sh index a840d38e17..b35f2b20d9 100755 --- a/ci/bb5-pr.sh +++ b/ci/bb5-pr.sh @@ -41,7 +41,8 @@ function build_with() { -DPYTHON_EXECUTABLE=$(which python3) \ -DNMODL_FORMATTING:BOOL=ON \ -DClangFormat_EXECUTABLE=$clang_format_exe \ - -DLLVM_DIR=/gpfs/bbp.cscs.ch/data/project/proj16/software/llvm/install/0421/lib/cmake/llvm + -DNMODL_ENABLE_JIT_EVENT_LISTENERS=ON \ + -DLLVM_DIR=/gpfs/bbp.cscs.ch/data/project/proj16/software/llvm/install/0521/lib/cmake/llvm make -j6 popd } diff --git a/cmake/LLVMHelper.cmake b/cmake/LLVMHelper.cmake index b0c8b2a48b..780ae29cfa 100644 --- a/cmake/LLVMHelper.cmake +++ b/cmake/LLVMHelper.cmake @@ -4,22 +4,28 @@ find_package(LLVM REQUIRED CONFIG) -# include LLVM header and core library -llvm_map_components_to_libnames( - LLVM_LIBS_TO_LINK - analysis - codegen - core - executionengine - instcombine - ipo - mc - native - orcjit - target - transformutils - scalaropts - support) +# include LLVM libraries +set(NMODL_LLVM_COMPONENTS + analysis + codegen + core + executionengine + instcombine + ipo + mc + native + orcjit + target + transformutils + scalaropts + support) + +if(NMODL_ENABLE_JIT_EVENT_LISTENERS) + list(APPEND NMODL_LLVM_COMPONENTS inteljitevents perfjitevents) +endif() + +llvm_map_components_to_libnames(LLVM_LIBS_TO_LINK ${NMODL_LLVM_COMPONENTS}) + set(CMAKE_REQUIRED_INCLUDES ${LLVM_INCLUDE_DIRS}) set(CMAKE_REQUIRED_LIBRARIES ${LLVM_LIBS_TO_LINK}) diff --git a/src/codegen/llvm/CMakeLists.txt b/src/codegen/llvm/CMakeLists.txt index 8c2a295598..7814b502a3 100644 --- a/src/codegen/llvm/CMakeLists.txt +++ b/src/codegen/llvm/CMakeLists.txt @@ -9,7 +9,9 @@ set(LLVM_CODEGEN_SOURCE_FILES ${CMAKE_CURRENT_SOURCE_DIR}/jit_driver.cpp ${CMAKE_CURRENT_SOURCE_DIR}/jit_driver.hpp ${CMAKE_CURRENT_SOURCE_DIR}/llvm_benchmark.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/llvm_benchmark.hpp) + ${CMAKE_CURRENT_SOURCE_DIR}/llvm_benchmark.hpp + ${CMAKE_CURRENT_SOURCE_DIR}/llvm_debug_builder.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/llvm_debug_builder.hpp) # ============================================================================= # LLVM codegen library and executable @@ -20,8 +22,11 @@ add_library(runner_obj OBJECT ${LLVM_CODEGEN_SOURCE_FILES}) add_dependencies(runner_obj lexer_obj) set_property(TARGET runner_obj PROPERTY POSITION_INDEPENDENT_CODE ON) -add_library(llvm_codegen STATIC $) +if(NMODL_ENABLE_JIT_EVENT_LISTENERS) + target_compile_definitions(runner_obj PUBLIC NMODL_HAVE_JIT_EVENT_LISTENERS) +endif() +add_library(llvm_codegen STATIC $) add_dependencies(llvm_codegen lexer util visitor) if(NOT NMODL_AS_SUBPROJECT) diff --git a/src/codegen/llvm/codegen_llvm_helper_visitor.cpp b/src/codegen/llvm/codegen_llvm_helper_visitor.cpp index 0df364e649..de64e16bd3 100644 --- a/src/codegen/llvm/codegen_llvm_helper_visitor.cpp +++ b/src/codegen/llvm/codegen_llvm_helper_visitor.cpp @@ -193,6 +193,9 @@ void 
CodegenLLVMHelperVisitor::create_function_for_node(ast::Block& node) { /// we have all information for code generation function, create a new node /// which will be inserted later into AST auto function = std::make_shared(fun_ret_type, name, arguments, block); + if (node.get_token()) { + function->set_token(*node.get_token()->clone()); + } codegen_functions.push_back(function); } /** diff --git a/src/codegen/llvm/codegen_llvm_visitor.cpp b/src/codegen/llvm/codegen_llvm_visitor.cpp index 1738d4139e..830814286e 100644 --- a/src/codegen/llvm/codegen_llvm_visitor.cpp +++ b/src/codegen/llvm/codegen_llvm_visitor.cpp @@ -16,7 +16,6 @@ #include "llvm/IR/BasicBlock.h" #include "llvm/IR/Constants.h" #include "llvm/IR/Function.h" -#include "llvm/IR/LLVMContext.h" #include "llvm/IR/Type.h" #include "llvm/IR/ValueSymbolTable.h" #include "llvm/Support/FileSystem.h" @@ -69,7 +68,7 @@ llvm::Value* CodegenLLVMVisitor::create_gep(const std::string& name, llvm::Value indices.push_back(llvm::ConstantInt::get(index_type, 0)); indices.push_back(index); - return builder.CreateInBoundsGEP(lookup(name), indices); + return ir_builder.CreateInBoundsGEP(lookup(name), indices); } llvm::Value* CodegenLLVMVisitor::codegen_indexed_name(const ast::IndexedName& node) { @@ -86,7 +85,7 @@ llvm::Value* CodegenLLVMVisitor::codegen_instance_var(const ast::CodegenInstance throw std::runtime_error("Error: " + member_name + " is not a member of the instance!"); // Load the instance struct given its name from the ValueSymbolTable. - llvm::Value* instance_ptr = builder.CreateLoad(lookup(instance_name)); + llvm::Value* instance_ptr = ir_builder.CreateLoad(lookup(instance_name)); // Create a GEP instruction to get a pointer to the member. int member_index = instance_var_helper.get_variable_index(member_name); @@ -95,7 +94,7 @@ llvm::Value* CodegenLLVMVisitor::codegen_instance_var(const ast::CodegenInstance std::vector indices; indices.push_back(llvm::ConstantInt::get(index_type, 0)); indices.push_back(llvm::ConstantInt::get(index_type, member_index)); - llvm::Value* member_ptr = builder.CreateInBoundsGEP(instance_ptr, indices); + llvm::Value* member_ptr = ir_builder.CreateInBoundsGEP(instance_ptr, indices); // Get the member AST node from the instance AST node, for which we proceed with the code // generation. If the member is scalar, return the pointer to it straight away. @@ -122,25 +121,25 @@ llvm::Value* CodegenLLVMVisitor::codegen_instance_var(const ast::CodegenInstance // load the member which would be indexed later. llvm::Type* type = get_codegen_var_type(*codegen_var_with_type->get_type()); llvm::Value* instance_member = - builder.CreateLoad(llvm::PointerType::get(type, /*AddressSpace=*/0), member_ptr); + ir_builder.CreateLoad(llvm::PointerType::get(type, /*AddressSpace=*/0), member_ptr); // Check if the code is vectorised and the index is indirect. std::string id = member_indexed_name->get_length()->get_node_name(); if (id != kernel_id && is_kernel_code && vector_width > 1) { // Calculate a vector of addresses via GEP instruction, and then created a gather to load // indirectly. 
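// (A masked gather loads one element per vector lane from its own address,
// which is what an indirect, lane-varying index requires.)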
- llvm::Value* addresses = builder.CreateInBoundsGEP(instance_member, {i64_index}); - return builder.CreateMaskedGather(addresses, llvm::Align()); + llvm::Value* addresses = ir_builder.CreateInBoundsGEP(instance_member, {i64_index}); + return ir_builder.CreateMaskedGather(addresses, llvm::Align()); } - llvm::Value* member_addr = builder.CreateInBoundsGEP(instance_member, {i64_index}); + llvm::Value* member_addr = ir_builder.CreateInBoundsGEP(instance_member, {i64_index}); // If the code is vectorised, then bitcast to a vector pointer. if (is_kernel_code && vector_width > 1) { llvm::Type* vector_type = llvm::PointerType::get(llvm::FixedVectorType::get(type, vector_width), /*AddressSpace=*/0); - return builder.CreateBitCast(member_addr, vector_type); + return ir_builder.CreateBitCast(member_addr, vector_type); } return member_addr; } @@ -152,7 +151,7 @@ llvm::Value* CodegenLLVMVisitor::get_array_index(const ast::IndexedName& node) { llvm::Value* index_value; if (node.get_length()->is_name()) { llvm::Value* ptr = lookup(node.get_length()->get_node_name()); - index_value = builder.CreateLoad(ptr); + index_value = ir_builder.CreateLoad(ptr); } else { node.get_length()->accept(*this); index_value = values.back(); @@ -169,15 +168,15 @@ llvm::Value* CodegenLLVMVisitor::get_array_index(const ast::IndexedName& node) { if (auto index_type = llvm::dyn_cast(index_value->getType())) { if (index_type->getBitWidth() == i64_type->getIntegerBitWidth()) return index_value; - return builder.CreateSExtOrTrunc(index_value, i64_type); + return ir_builder.CreateSExtOrTrunc(index_value, i64_type); } auto vector_type = llvm::cast(index_value->getType()); auto element_type = llvm::cast(vector_type->getElementType()); if (element_type->getBitWidth() == i64_type->getIntegerBitWidth()) return index_value; - return builder.CreateSExtOrTrunc(index_value, - llvm::FixedVectorType::get(i64_type, vector_width)); + return ir_builder.CreateSExtOrTrunc(index_value, + llvm::FixedVectorType::get(i64_type, vector_width)); } int CodegenLLVMVisitor::get_array_length(const ast::IndexedName& node) { @@ -334,11 +333,12 @@ void CodegenLLVMVisitor::create_external_method_call(const std::string& name, argument_values.push_back(value); } -#define DISPATCH(method_name, intrinsic) \ - if (name == (method_name)) { \ - llvm::Value* result = builder.CreateIntrinsic(intrinsic, argument_types, argument_values); \ - values.push_back(result); \ - return; \ +#define DISPATCH(method_name, intrinsic) \ + if (name == (method_name)) { \ + llvm::Value* result = \ + ir_builder.CreateIntrinsic(intrinsic, argument_types, argument_values); \ + values.push_back(result); \ + return; \ } DISPATCH("exp", llvm::Intrinsic::exp); @@ -360,7 +360,7 @@ void CodegenLLVMVisitor::create_function_call(llvm::Function* func, std::vector argument_values; argument_values.reserve(arguments.size()); pack_function_call_arguments(arguments, argument_values); - llvm::Value* call = builder.CreateCall(func, argument_values); + llvm::Value* call = ir_builder.CreateCall(func, argument_values); values.push_back(call); } @@ -382,7 +382,7 @@ void CodegenLLVMVisitor::create_printf_call(const ast::ExpressionVector& argumen std::vector argument_values; argument_values.reserve(arguments.size()); pack_function_call_arguments(arguments, argument_values); - builder.CreateCall(printf, argument_values); + ir_builder.CreateCall(printf, argument_values); } void CodegenLLVMVisitor::emit_procedure_or_function_declaration(const ast::CodegenFunction& node) { @@ -397,10 +397,21 @@ void 
CodegenLLVMVisitor::emit_procedure_or_function_declaration(const ast::Codeg llvm::Type* return_type = get_codegen_var_type(*node.get_return_type()); // Create a function that is automatically inserted into module's symbol table. - llvm::Function::Create(llvm::FunctionType::get(return_type, arg_types, /*isVarArg=*/false), - llvm::Function::ExternalLinkage, - name, - *module); + auto func = + llvm::Function::Create(llvm::FunctionType::get(return_type, arg_types, /*isVarArg=*/false), + llvm::Function::ExternalLinkage, + name, + *module); + + // Add function debug information, with location information if it exists. + if (add_debug_information) { + if (node.get_token()) { + Location loc{node.get_token()->start_line(), node.get_token()->start_column()}; + debug_builder.add_function_debug_info(func, &loc); + } else { + debug_builder.add_function_debug_info(func); + } + } } llvm::Value* CodegenLLVMVisitor::lookup(const std::string& name) { @@ -416,7 +427,7 @@ void CodegenLLVMVisitor::pack_function_call_arguments(const ast::ExpressionVecto if (arg->is_string()) { // If the argument is a string, create a global i8* variable with it. auto string_arg = std::dynamic_pointer_cast(arg); - llvm::Value* str = builder.CreateGlobalStringPtr(string_arg->get_value()); + llvm::Value* str = ir_builder.CreateGlobalStringPtr(string_arg->get_value()); arg_values.push_back(str); } else { arg->accept(*this); @@ -443,10 +454,10 @@ llvm::Value* CodegenLLVMVisitor::visit_arithmetic_bin_op(llvm::Value* lhs, result = llvm_fp_op(lhs, rhs); \ return result; - DISPATCH(ast::BinaryOp::BOP_ADDITION, builder.CreateFAdd, builder.CreateAdd); - DISPATCH(ast::BinaryOp::BOP_DIVISION, builder.CreateFDiv, builder.CreateSDiv); - DISPATCH(ast::BinaryOp::BOP_MULTIPLICATION, builder.CreateFMul, builder.CreateMul); - DISPATCH(ast::BinaryOp::BOP_SUBTRACTION, builder.CreateFSub, builder.CreateSub); + DISPATCH(ast::BinaryOp::BOP_ADDITION, ir_builder.CreateFAdd, ir_builder.CreateAdd); + DISPATCH(ast::BinaryOp::BOP_DIVISION, ir_builder.CreateFDiv, ir_builder.CreateSDiv); + DISPATCH(ast::BinaryOp::BOP_MULTIPLICATION, ir_builder.CreateFMul, ir_builder.CreateMul); + DISPATCH(ast::BinaryOp::BOP_SUBTRACTION, ir_builder.CreateFSub, ir_builder.CreateSub); #undef DISPATCH @@ -461,15 +472,15 @@ void CodegenLLVMVisitor::visit_assign_op(const ast::BinaryExpression& node, llvm throw std::runtime_error("Error: only VarName assignment is supported!"); llvm::Value* ptr = get_variable_ptr(*var); - builder.CreateStore(rhs, ptr); + ir_builder.CreateStore(rhs, ptr); } llvm::Value* CodegenLLVMVisitor::visit_logical_bin_op(llvm::Value* lhs, llvm::Value* rhs, unsigned op) { const auto& bin_op = static_cast(op); - return bin_op == ast::BinaryOp::BOP_AND ? builder.CreateAnd(lhs, rhs) - : builder.CreateOr(lhs, rhs); + return bin_op == ast::BinaryOp::BOP_AND ? 
ir_builder.CreateAnd(lhs, rhs) + : ir_builder.CreateOr(lhs, rhs); } llvm::Value* CodegenLLVMVisitor::visit_comparison_bin_op(llvm::Value* lhs, @@ -488,12 +499,14 @@ llvm::Value* CodegenLLVMVisitor::visit_comparison_bin_op(llvm::Value* lhs, result = i_llvm_op(lhs, rhs); \ return result; - DISPATCH(ast::BinaryOp::BOP_EXACT_EQUAL, builder.CreateICmpEQ, builder.CreateFCmpOEQ); - DISPATCH(ast::BinaryOp::BOP_GREATER, builder.CreateICmpSGT, builder.CreateFCmpOGT); - DISPATCH(ast::BinaryOp::BOP_GREATER_EQUAL, builder.CreateICmpSGE, builder.CreateFCmpOGE); - DISPATCH(ast::BinaryOp::BOP_LESS, builder.CreateICmpSLT, builder.CreateFCmpOLT); - DISPATCH(ast::BinaryOp::BOP_LESS_EQUAL, builder.CreateICmpSLE, builder.CreateFCmpOLE); - DISPATCH(ast::BinaryOp::BOP_NOT_EQUAL, builder.CreateICmpNE, builder.CreateFCmpONE); + DISPATCH(ast::BinaryOp::BOP_EXACT_EQUAL, ir_builder.CreateICmpEQ, ir_builder.CreateFCmpOEQ); + DISPATCH(ast::BinaryOp::BOP_GREATER, ir_builder.CreateICmpSGT, ir_builder.CreateFCmpOGT); + DISPATCH(ast::BinaryOp::BOP_GREATER_EQUAL, + ir_builder.CreateICmpSGE, + ir_builder.CreateFCmpOGE); + DISPATCH(ast::BinaryOp::BOP_LESS, ir_builder.CreateICmpSLT, ir_builder.CreateFCmpOLT); + DISPATCH(ast::BinaryOp::BOP_LESS_EQUAL, ir_builder.CreateICmpSLE, ir_builder.CreateFCmpOLE); + DISPATCH(ast::BinaryOp::BOP_NOT_EQUAL, ir_builder.CreateICmpNE, ir_builder.CreateFCmpONE); #undef DISPATCH @@ -602,7 +615,7 @@ void CodegenLLVMVisitor::visit_codegen_for_statement(const ast::CodegenForStatem is_kernel_code = false; // Get the current and the next blocks within the function. - llvm::BasicBlock* curr_block = builder.GetInsertBlock(); + llvm::BasicBlock* curr_block = ir_builder.GetInsertBlock(); llvm::BasicBlock* next = curr_block->getNextNode(); llvm::Function* func = curr_block->getParent(); @@ -633,31 +646,31 @@ void CodegenLLVMVisitor::visit_codegen_for_statement(const ast::CodegenForStatem } // Branch to condition basic block and insert condition code there. - builder.CreateBr(for_cond); - builder.SetInsertPoint(for_cond); + ir_builder.CreateBr(for_cond); + ir_builder.SetInsertPoint(for_cond); node.get_condition()->accept(*this); // Extract the condition to decide whether to branch to the loop body or loop exit. llvm::Value* cond = values.back(); values.pop_back(); - builder.CreateCondBr(cond, for_body, exit); + ir_builder.CreateCondBr(cond, for_body, exit); // Generate code for the loop body and create the basic block for the increment. - builder.SetInsertPoint(for_body); + ir_builder.SetInsertPoint(for_body); is_kernel_code = true; const auto& statement_block = node.get_statement_block(); statement_block->accept(*this); is_kernel_code = false; - builder.CreateBr(for_inc); + ir_builder.CreateBr(for_inc); // Process increment. - builder.SetInsertPoint(for_inc); + ir_builder.SetInsertPoint(for_inc); node.get_increment()->accept(*this); // Create a branch to condition block, then generate exit code out of the loop. Restore the // vector width. - builder.CreateBr(for_cond); - builder.SetInsertPoint(exit); + ir_builder.CreateBr(for_cond); + ir_builder.SetInsertPoint(exit); vector_width = tmp_vector_width; is_kernel_code = true; } @@ -672,7 +685,7 @@ void CodegenLLVMVisitor::visit_codegen_function(const ast::CodegenFunction& node // Create the entry basic block of the function/procedure and point the local named values table // to the symbol table. 
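// (Allocas are emitted in the entry block, following the usual LLVM
// convention that lets passes such as mem2reg promote them to registers.)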
llvm::BasicBlock* body = llvm::BasicBlock::Create(*context, /*Name=*/"", func); - builder.SetInsertPoint(body); + ir_builder.SetInsertPoint(body); // When processing a function, it returns a value named in NMODL. Therefore, we // first run RenameVisitor to rename it into ret_. This will aid in avoiding @@ -687,9 +700,10 @@ void CodegenLLVMVisitor::visit_codegen_function(const ast::CodegenFunction& node unsigned i = 0; for (auto& arg: func->args()) { std::string arg_name = arguments[i++].get()->get_node_name(); - llvm::Value* alloca = builder.CreateAlloca(arg.getType(), /*ArraySize=*/nullptr, arg_name); + llvm::Type* arg_type = arg.getType(); + llvm::Value* alloca = ir_builder.CreateAlloca(arg_type, /*ArraySize=*/nullptr, arg_name); arg.setName(arg_name); - builder.CreateStore(&arg, alloca); + ir_builder.CreateStore(&arg, alloca); } // Process function or procedure body. If the function is a compute kernel, then set the @@ -705,7 +719,7 @@ void CodegenLLVMVisitor::visit_codegen_function(const ast::CodegenFunction& node // If function has a void return type, add a terminator not handled by CodegenReturnVar. if (has_void_ret_type) - builder.CreateRetVoid(); + ir_builder.CreateRetVoid(); // Clear local values stack and remove the pointer to the local symbol table. values.clear(); @@ -717,8 +731,8 @@ void CodegenLLVMVisitor::visit_codegen_return_statement(const ast::CodegenReturn throw std::runtime_error("Error: CodegenReturnStatement must contain a name node\n"); std::string ret = "ret_" + current_func->getName().str(); - llvm::Value* ret_value = builder.CreateLoad(lookup(ret)); - builder.CreateRet(ret_value); + llvm::Value* ret_value = ir_builder.CreateLoad(lookup(ret)); + ir_builder.CreateRet(ret_value); } void CodegenLLVMVisitor::visit_codegen_var_list_statement( @@ -750,7 +764,7 @@ void CodegenLLVMVisitor::visit_codegen_var_list_statement( } else { throw std::runtime_error("Error: Unsupported local variable type"); } - builder.CreateAlloca(var_type, /*ArraySize=*/nullptr, name); + ir_builder.CreateAlloca(var_type, /*ArraySize=*/nullptr, name); } } @@ -785,7 +799,7 @@ void CodegenLLVMVisitor::visit_function_call(const ast::FunctionCall& node) { void CodegenLLVMVisitor::visit_if_statement(const ast::IfStatement& node) { // Get the current and the next blocks within the function. - llvm::BasicBlock* curr_block = builder.GetInsertBlock(); + llvm::BasicBlock* curr_block = ir_builder.GetInsertBlock(); llvm::BasicBlock* next = curr_block->getNextNode(); llvm::Function* func = curr_block->getParent(); @@ -799,9 +813,9 @@ void CodegenLLVMVisitor::visit_if_statement(const ast::IfStatement& node) { values.pop_back(); // Process the true block. - builder.SetInsertPoint(true_block); + ir_builder.SetInsertPoint(true_block); node.get_statement_block()->accept(*this); - builder.CreateBr(merge_block); + ir_builder.CreateBr(merge_block); // Save the merge block and proceed with codegen for `else if` statements. llvm::BasicBlock* exit = merge_block; @@ -809,11 +823,11 @@ void CodegenLLVMVisitor::visit_if_statement(const ast::IfStatement& node) { // Link the current block to the true and else blocks. llvm::BasicBlock* else_block = llvm::BasicBlock::Create(*context, /*Name=*/"", func, merge_block); - builder.SetInsertPoint(curr_block); - builder.CreateCondBr(cond, true_block, else_block); + ir_builder.SetInsertPoint(curr_block); + ir_builder.CreateCondBr(cond, true_block, else_block); // Process else block. 
- builder.SetInsertPoint(else_block); + ir_builder.SetInsertPoint(else_block); else_if->get_condition()->accept(*this); cond = values.back(); values.pop_back(); @@ -823,13 +837,13 @@ void CodegenLLVMVisitor::visit_if_statement(const ast::IfStatement& node) { true_block = llvm::BasicBlock::Create(*context, /*Name=*/"", func, merge_block); llvm::BasicBlock* tmp = merge_block; merge_block = llvm::BasicBlock::Create(*context, /*Name=*/"", func, merge_block); - builder.SetInsertPoint(merge_block); - builder.CreateBr(tmp); + ir_builder.SetInsertPoint(merge_block); + ir_builder.CreateBr(tmp); // Process true block. - builder.SetInsertPoint(true_block); + ir_builder.SetInsertPoint(true_block); else_if->get_statement_block()->accept(*this); - builder.CreateBr(merge_block); + ir_builder.CreateBr(merge_block); curr_block = else_block; } @@ -838,15 +852,15 @@ void CodegenLLVMVisitor::visit_if_statement(const ast::IfStatement& node) { llvm::BasicBlock* else_block; if (elses) { else_block = llvm::BasicBlock::Create(*context, /*Name=*/"", func, merge_block); - builder.SetInsertPoint(else_block); + ir_builder.SetInsertPoint(else_block); elses->get_statement_block()->accept(*this); - builder.CreateBr(merge_block); + ir_builder.CreateBr(merge_block); } else { else_block = merge_block; } - builder.SetInsertPoint(curr_block); - builder.CreateCondBr(cond, true_block, else_block); - builder.SetInsertPoint(exit); + ir_builder.SetInsertPoint(curr_block); + ir_builder.CreateCondBr(cond, true_block, else_block); + ir_builder.SetInsertPoint(exit); } void CodegenLLVMVisitor::visit_integer(const ast::Integer& node) { @@ -867,9 +881,13 @@ void CodegenLLVMVisitor::visit_program(const ast::Program& node) { CodegenLLVMHelperVisitor v{vector_width}; const auto& functions = v.get_codegen_functions(node); instance_var_helper = v.get_instance_var_helper(); - kernel_id = v.get_kernel_id(); + // Create compile unit if adding debug information to the module. + if (add_debug_information) { + debug_builder.create_compile_unit(*module, module->getModuleIdentifier(), output_dir); + } + // For every function, generate its declaration. Thus, we can look up // `llvm::Function` in the symbol table in the module. for (const auto& func: functions) { @@ -889,6 +907,11 @@ void CodegenLLVMVisitor::visit_program(const ast::Program& node) { visit_codegen_function(*func); } + // Finalize the debug information. + if (add_debug_information) { + debug_builder.finalize(); + } + // Verify the generated LLVM IR module. std::string error; llvm::raw_string_ostream ostream(error); @@ -958,9 +981,9 @@ void CodegenLLVMVisitor::visit_unary_expression(const ast::UnaryExpression& node llvm::Value* value = values.back(); values.pop_back(); if (op == ast::UOP_NEGATION) { - values.push_back(builder.CreateFNeg(value)); + values.push_back(ir_builder.CreateFNeg(value)); } else if (op == ast::UOP_NOT) { - values.push_back(builder.CreateNot(value)); + values.push_back(ir_builder.CreateNot(value)); } else { throw std::runtime_error("Error: unsupported unary operator\n"); } @@ -971,7 +994,7 @@ void CodegenLLVMVisitor::visit_var_name(const ast::VarName& node) { // Finally, load the variable from the pointer value unless it has already been loaded (e.g. via // gather instruction). - llvm::Value* var = ptr->getType()->isPointerTy() ? builder.CreateLoad(ptr) : ptr; + llvm::Value* var = ptr->getType()->isPointerTy() ? ir_builder.CreateLoad(ptr) : ptr; // If the value should not be vectorised, or it is already a vector, add it to the stack. 
if (!is_kernel_code || vector_width <= 1 || var->getType()->isVectorTy()) { @@ -981,13 +1004,13 @@ void CodegenLLVMVisitor::visit_var_name(const ast::VarName& node) { // Otherwise, if we are generating vectorised inside the loop, replicate the value to form a // vector of `vector_width`. - llvm::Value* vector_var = builder.CreateVectorSplat(vector_width, var); + llvm::Value* vector_var = ir_builder.CreateVectorSplat(vector_width, var); values.push_back(vector_var); } void CodegenLLVMVisitor::visit_while_statement(const ast::WhileStatement& node) { // Get the current and the next blocks within the function. - llvm::BasicBlock* curr_block = builder.GetInsertBlock(); + llvm::BasicBlock* curr_block = ir_builder.GetInsertBlock(); llvm::BasicBlock* next = curr_block->getNextNode(); llvm::Function* func = curr_block->getParent(); @@ -996,20 +1019,20 @@ void CodegenLLVMVisitor::visit_while_statement(const ast::WhileStatement& node) llvm::BasicBlock* body = llvm::BasicBlock::Create(*context, /*Name=*/"", func, next); llvm::BasicBlock* exit = llvm::BasicBlock::Create(*context, /*Name=*/"", func, next); - builder.CreateBr(header); - builder.SetInsertPoint(header); + ir_builder.CreateBr(header); + ir_builder.SetInsertPoint(header); // Generate code for condition and create branch to the body block. node.get_condition()->accept(*this); llvm::Value* condition = values.back(); values.pop_back(); - builder.CreateCondBr(condition, body, exit); + ir_builder.CreateCondBr(condition, body, exit); - builder.SetInsertPoint(body); + ir_builder.SetInsertPoint(body); node.get_statement_block()->accept(*this); - builder.CreateBr(header); + ir_builder.CreateBr(header); - builder.SetInsertPoint(exit); + ir_builder.SetInsertPoint(exit); } void CodegenLLVMVisitor::find_kernel_names(std::vector& container) { @@ -1050,17 +1073,23 @@ void CodegenLLVMVisitor::wrap_kernel_functions() { llvm::Function::ExternalLinkage, "__" + kernel_name + "_wrapper", *module); + + // Optionally, add debug information for the wrapper function. + if (add_debug_information) { + debug_builder.add_function_debug_info(wrapper_func); + } + llvm::BasicBlock* body = llvm::BasicBlock::Create(*context, /*Name=*/"", wrapper_func); - builder.SetInsertPoint(body); + ir_builder.SetInsertPoint(body); // Proceed with bitcasting the void pointer to the struct pointer type, calling the kernel // and adding a terminator. 
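// (Each wrapper has the uniform signature i32(void*), so the JIT driver can
// invoke any kernel without knowing its concrete instance struct type.)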
- llvm::Value* bitcasted = builder.CreateBitCast(wrapper_func->getArg(0), - instance_struct_ptr_type); + llvm::Value* bitcasted = ir_builder.CreateBitCast(wrapper_func->getArg(0), + instance_struct_ptr_type); std::vector args; args.push_back(bitcasted); - builder.CreateCall(kernel, args); - builder.CreateRet(llvm::ConstantInt::get(i32_type, 0)); + ir_builder.CreateCall(kernel, args); + ir_builder.CreateRet(llvm::ConstantInt::get(i32_type, 0)); } } diff --git a/src/codegen/llvm/codegen_llvm_visitor.hpp b/src/codegen/llvm/codegen_llvm_visitor.hpp index 099613f8d4..450e1872a4 100644 --- a/src/codegen/llvm/codegen_llvm_visitor.hpp +++ b/src/codegen/llvm/codegen_llvm_visitor.hpp @@ -19,11 +19,13 @@ #include #include "codegen/llvm/codegen_llvm_helper_visitor.hpp" +#include "codegen/llvm/llvm_debug_builder.hpp" #include "symtab/symbol_table.hpp" #include "utils/logger.hpp" #include "visitors/ast_visitor.hpp" #include "llvm/Analysis/TargetLibraryInfo.h" +#include "llvm/IR/DIBuilder.h" #include "llvm/IR/IRBuilder.h" #include "llvm/IR/LLVMContext.h" #include "llvm/IR/LegacyPassManager.h" @@ -74,7 +76,14 @@ class CodegenLLVMVisitor: public visitor::ConstAstVisitor { std::unique_ptr module = std::make_unique(mod_filename, *context); - llvm::IRBuilder<> builder; + // LLVM IR builder. + llvm::IRBuilder<> ir_builder; + + // Debug information builder. + DebugBuilder debug_builder; + + // Add debug information to the module. + bool add_debug_information; // Pass manager for optimisation passes that are used for target code generation. llvm::legacy::FunctionPassManager codegen_pm; @@ -129,14 +138,17 @@ class CodegenLLVMVisitor: public visitor::ConstAstVisitor { bool opt_passes, bool use_single_precision = false, int vector_width = 1, - std::string vec_lib = "none") + std::string vec_lib = "none", + bool add_debug_information = false) : mod_filename(mod_filename) , output_dir(output_dir) , opt_passes(opt_passes) , use_single_precision(use_single_precision) , vector_width(vector_width) , vector_library(veclib_map.at(vec_lib)) - , builder(*context) + , add_debug_information(add_debug_information) + , ir_builder(*context) + , debug_builder(*module) , codegen_pm(module.get()) , opt_pm(module.get()) {} diff --git a/src/codegen/llvm/jit_driver.cpp b/src/codegen/llvm/jit_driver.cpp index 1e8eb4bfd0..532cd20b8f 100644 --- a/src/codegen/llvm/jit_driver.cpp +++ b/src/codegen/llvm/jit_driver.cpp @@ -154,6 +154,16 @@ void JITDriver::init(std::string features, set_triple_and_data_layout(*module, features); auto data_layout = module->getDataLayout(); + // If benchmarking, enable listeners to use GDB, perf or VTune. Note that LLVM should be built + // with listeners on (e.g. -DLLVM_USE_PERF=ON). + if (benchmark_info) { + gdb_event_listener = llvm::JITEventListener::createGDBRegistrationListener(); +#if defined(NMODL_HAVE_JIT_EVENT_LISTENERS) + perf_event_listener = llvm::JITEventListener::createPerfJITEventListener(); + intel_event_listener = llvm::JITEventListener::createIntelJITEventListener(); +#endif + } + // Create object linking function callback. auto object_linking_layer_creator = [&](llvm::orc::ExecutionSession& session, const llvm::Triple& triple) { @@ -161,6 +171,15 @@ void JITDriver::init(std::string features, auto layer = std::make_unique(session, []() { return std::make_unique(); }); + + // Register event listeners if they exist. 
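+ // (A registered listener is notified whenever the JIT emits an object
+ // file, which lets GDB, perf and VTune resolve JIT-compiled symbols.)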
+ if (gdb_event_listener) + layer->registerJITEventListener(*gdb_event_listener); + if (perf_event_listener) + layer->registerJITEventListener(*perf_event_listener); + if (intel_event_listener) + layer->registerJITEventListener(*intel_event_listener); + for (const auto& lib_path: lib_paths) { // For every library path, create a corresponding memory buffer. auto memory_buffer = llvm::MemoryBuffer::getFile(lib_path); diff --git a/src/codegen/llvm/jit_driver.hpp b/src/codegen/llvm/jit_driver.hpp index 151ec177d8..afb1317cd8 100644 --- a/src/codegen/llvm/jit_driver.hpp +++ b/src/codegen/llvm/jit_driver.hpp @@ -15,6 +15,7 @@ * \brief \copybrief nmodl::runner::JITDriver */ +#include "llvm/ExecutionEngine/JITEventListener.h" #include "llvm/ExecutionEngine/Orc/LLJIT.h" namespace nmodl { @@ -45,8 +46,18 @@ class JITDriver { std::unique_ptr jit; + /// LLVM IR module to execute. std::unique_ptr module; + /// GDB event listener. + llvm::JITEventListener* gdb_event_listener = nullptr; + + /// perf event listener. + llvm::JITEventListener* perf_event_listener = nullptr; + + /// Intel event listener. + llvm::JITEventListener* intel_event_listener = nullptr; + public: explicit JITDriver(std::unique_ptr m) : module(std::move(m)) {} diff --git a/src/codegen/llvm/llvm_benchmark.cpp b/src/codegen/llvm/llvm_benchmark.cpp index df0c54517d..adbe653f1e 100644 --- a/src/codegen/llvm/llvm_benchmark.cpp +++ b/src/codegen/llvm/llvm_benchmark.cpp @@ -58,7 +58,8 @@ void LLVMBenchmark::run(const std::shared_ptr& node) { llvm_build_info.opt_passes, llvm_build_info.use_single_precision, llvm_build_info.vector_width, - llvm_build_info.vec_lib); + llvm_build_info.vec_lib, + /*add_debug_information=*/true); generate_llvm(visitor, node); // Finally, run the benchmark and log the measurements. diff --git a/src/codegen/llvm/llvm_debug_builder.cpp b/src/codegen/llvm/llvm_debug_builder.cpp new file mode 100644 index 0000000000..5682a6e904 --- /dev/null +++ b/src/codegen/llvm/llvm_debug_builder.cpp @@ -0,0 +1,63 @@ +/************************************************************************* + * Copyright (C) 2018-2020 Blue Brain Project + * + * This file is part of NMODL distributed under the terms of the GNU + * Lesser General Public License. See top-level LICENSE file for details. + *************************************************************************/ + +#include "codegen/llvm/llvm_debug_builder.hpp" + +namespace nmodl { +namespace codegen { + + +static constexpr const char debug_version_key[] = "Debug Version"; + + +void DebugBuilder::add_function_debug_info(llvm::Function* function, Location* loc) { + // Create the function debug type (subroutine type). We are not interested in parameters and + // types, and therefore passing llvm::None as argument suffices for now. + llvm::DISubroutineType* subroutine_type = di_builder.createSubroutineType( + di_builder.getOrCreateTypeArray(llvm::None)); + llvm::DISubprogram::DISPFlags sp_flags = llvm::DISubprogram::SPFlagDefinition | + llvm::DISubprogram::SPFlagOptimized; + // If there is no location associated with the function, just use 0. + int line = loc ? 
loc->line : 0; + llvm::DISubprogram* program = di_builder.createFunction(compile_unit, + function->getName(), + function->getName(), + file, + line, + subroutine_type, + line, + llvm::DINode::FlagZero, + sp_flags); + function->setSubprogram(program); + di_builder.finalizeSubprogram(program); +} + +void DebugBuilder::create_compile_unit(llvm::Module& module, + const std::string& debug_filename, + const std::string& debug_output_dir) { + // Create the debug file and compile unit for the module. + file = di_builder.createFile(debug_filename, debug_output_dir); + compile_unit = di_builder.createCompileUnit(llvm::dwarf::DW_LANG_C, + file, + /*Producer=*/"NMODL-LLVM", + /*isOptimized=*/false, + /*Flags=*/"", + /*RV=*/0); + + // Add a flag to the module to specify that it has debug information. + if (!module.getModuleFlag(debug_version_key)) { + module.addModuleFlag(llvm::Module::Warning, + debug_version_key, + llvm::DEBUG_METADATA_VERSION); + } +} + +void DebugBuilder::finalize() { + di_builder.finalize(); +} +} // namespace codegen +} // namespace nmodl diff --git a/src/codegen/llvm/llvm_debug_builder.hpp b/src/codegen/llvm/llvm_debug_builder.hpp new file mode 100644 index 0000000000..9322cd461a --- /dev/null +++ b/src/codegen/llvm/llvm_debug_builder.hpp @@ -0,0 +1,70 @@ +/************************************************************************* + * Copyright (C) 2018-2020 Blue Brain Project + * + * This file is part of NMODL distributed under the terms of the GNU + * Lesser General Public License. See top-level LICENSE file for details. + *************************************************************************/ + +#pragma once + +#include + +#include "llvm/IR/DIBuilder.h" +#include "llvm/IR/IRBuilder.h" +#include "llvm/IR/LLVMContext.h" +#include "llvm/IR/Module.h" + +namespace nmodl { +namespace codegen { + +/// A struct to store AST location information. +/// \todo Currently, not all AST nodes have location information. Moreover, +/// some may not have it as they were artificially introduced (e.g. +/// CodegenForStatement). This simple wrapper suffices for now, but in future +/// we may want to handle this properly. +struct Location { + /// Line in the file. + int line; + + /// Column in the file. + int column; +}; + + +/** + * \class DebugBuilder + * \brief A helper class to create debug information for LLVM IR module. + * \todo Only function debug information is supported. + */ +class DebugBuilder { + private: + /// Debug information builder. + llvm::DIBuilder di_builder; + + /// LLVM context. + llvm::LLVMContext& context; + + /// Debug compile unit for the module. + llvm::DICompileUnit* compile_unit = nullptr; + + /// Debug file pointer. + llvm::DIFile* file = nullptr; + + public: + DebugBuilder(llvm::Module& module) + : di_builder(module) + , context(module.getContext()) {} + + /// Adds function debug information with an optional location. + void add_function_debug_info(llvm::Function* function, Location* loc = nullptr); + + /// Creates the compile unit for and sets debug flags for the module. + void create_compile_unit(llvm::Module& module, + const std::string& debug_filename, + const std::string& debug_output_dir); + + /// Finalizes the debug information. 
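+ /// (Called by the visitor once all functions have been generated and
+ /// before the module is verified.)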
+ void finalize(); +}; +} // namespace codegen +} // namespace nmodl diff --git a/src/main.cpp b/src/main.cpp index 8e6e219df1..da3cb9d7dd 100644 --- a/src/main.cpp +++ b/src/main.cpp @@ -180,6 +180,9 @@ int main(int argc, const char* argv[]) { /// vector library name std::string vector_library("none"); + /// disable debug information generation for the IR + bool disable_debug_information(false); + /// run llvm benchmark bool run_llvm_benchmark(false); @@ -312,6 +315,9 @@ int main(int argc, const char* argv[]) { llvm_opt->add_flag("--ir", llvm_ir, "Generate LLVM IR ({})"_format(llvm_ir))->ignore_case(); + llvm_opt->add_flag("--disable-debug-info", + disable_debug_information, + "Disable debug information ({})"_format(disable_debug_information))->ignore_case(); llvm_opt->add_flag("--opt", llvm_ir_opt_passes, "Run LLVM optimisation passes ({})"_format(llvm_ir_opt_passes))->ignore_case(); @@ -671,7 +677,8 @@ int main(int argc, const char* argv[]) { llvm_ir_opt_passes, llvm_float_type, llvm_vec_width, - vector_library); + vector_library, + !disable_debug_information); visitor.visit_program(*ast); ast_to_nmodl(*ast, filepath("llvm", "mod")); ast_to_json(*ast, filepath("llvm", "json")); From 1802b74f28cca883e3b7f39fb6b2d8eaff33414d Mon Sep 17 00:00:00 2001 From: George Mitenkov Date: Sat, 8 May 2021 03:22:06 -0700 Subject: [PATCH 051/331] Fixed using benchmarking_info in TestRunner (#631) --- src/codegen/llvm/jit_driver.cpp | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/src/codegen/llvm/jit_driver.cpp b/src/codegen/llvm/jit_driver.cpp index 532cd20b8f..2a6842d0fb 100644 --- a/src/codegen/llvm/jit_driver.cpp +++ b/src/codegen/llvm/jit_driver.cpp @@ -205,13 +205,13 @@ void JITDriver::init(std::string features, auto compile_function_creator = [&](llvm::orc::JITTargetMachineBuilder tm_builder) -> llvm::Expected> { // Create target machine with some features possibly turned off. - auto tm = create_target(&tm_builder, features, benchmark_info->opt_level_codegen); + int opt_level_codegen = benchmark_info ? benchmark_info->opt_level_codegen : 0; + auto tm = create_target(&tm_builder, features, opt_level_codegen); - // Optimise the LLVM IR module. - optimise_module(*module, benchmark_info->opt_level_ir, tm.get()); - - // Save optimised module to .ll file if benchmarking. + // Optimise the LLVM IR module and save it to .ll file if benchmarking. 
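+ // (benchmark_info may be null, e.g. when invoked from TestRunner, so the
+ // module is now optimised only inside the guard below.)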
if (benchmark_info) { + optimise_module(*module, benchmark_info->opt_level_ir, tm.get()); + + std::error_code error_code; std::unique_ptr out = std::make_unique(benchmark_info->output_dir + "/" + From 3359ea3d3c45edf1fb228c543d1fbf2af6aa93d9 Mon Sep 17 00:00:00 2001 From: Pramod Kumbhar Date: Sat, 8 May 2021 10:24:22 +0200 Subject: [PATCH 052/331] Fixes to run CI with NVHPC/PGI compiler * In the new deployment, the pgi module is changed to nvhpc * Fix the GitLab CI script accordingly --- ci/bb5-pr.sh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/ci/bb5-pr.sh b/ci/bb5-pr.sh index b35f2b20d9..6ecff76144 100755 --- a/ci/bb5-pr.sh +++ b/ci/bb5-pr.sh @@ -80,7 +80,7 @@ function bb5_pr_build_intel() { } function bb5_pr_build_pgi() { - build_with pgi + build_with nvhpc } function bb5_pr_test_gcc() { @@ -92,7 +92,7 @@ function bb5_pr_test_intel() { } function bb5_pr_test_pgi() { - test_with pgi + test_with nvhpc } function bb5_pr_build_llvm() { From 524b2920db1437944800fb380dbb8fe0d7655c01 Mon Sep 17 00:00:00 2001 From: George Mitenkov Date: Tue, 11 May 2021 16:14:34 -0700 Subject: [PATCH 053/331] Fixed addition of SOLVE block to kernel's FOR loop (#636) * Fix `append_statements_from_block` function in LLVM helper visitor. * Before, if a nonspecific current was not specified, the whole `BREAKPOINT` block would be added to the kernel body. * This led to cases where the `SOLVE` statement ended up in the kernel together with the actual solution of the `DERIVATIVE` block. --- src/codegen/llvm/codegen_llvm_helper_visitor.cpp | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/src/codegen/llvm/codegen_llvm_helper_visitor.cpp b/src/codegen/llvm/codegen_llvm_helper_visitor.cpp index de64e16bd3..c9968df8ee 100644 --- a/src/codegen/llvm/codegen_llvm_helper_visitor.cpp +++ b/src/codegen/llvm/codegen_llvm_helper_visitor.cpp @@ -248,7 +248,12 @@ std::shared_ptr CodegenLLVMHelperVisitor::create_instance_s static void append_statements_from_block(ast::StatementVector& statements, const std::shared_ptr& block) { const auto& block_statements = block->get_statements(); - statements.insert(statements.end(), block_statements.begin(), block_statements.end()); + for (const auto& statement: block_statements) { + const auto& expression_statement = std::dynamic_pointer_cast( statement); + if (!expression_statement->get_expression()->is_solve_block()) + statements.push_back(statement); + } } static std::shared_ptr create_atomic_statement(std::string& lhs_str, @@ -638,7 +643,6 @@ void CodegenLLVMHelperVisitor::visit_nrn_state_block(ast::NrnStateBlock& node) { /// add breakpoint block if no current if (info.currents.empty() && info.breakpoint_node != nullptr) { auto block = info.breakpoint_node->get_statement_block(); - // \todo this automatically adds `SOLVE states METHOD ...` append_statements_from_block(loop_body_statements, block); } From 68639a71d0e7716462eeb96cd281271601efa55b Mon Sep 17 00:00:00 2001 From: George Mitenkov Date: Thu, 13 May 2021 00:49:23 -0700 Subject: [PATCH 054/331] IR builder redesign for LLVM IR code generation pipeline (#634) Improves the code structure of the LLVM code generation pipeline. The following changes were added: 1. New IR builder class. Before, the LLVM visitor simply used the `llvm::IRBuilder<>` class to generate instructions. Recently, this (as well as adding functionality to the visitor on the go) had led to code duplication, and it became hard to introduce new features cleanly. Hence, a special `IRBuilder` class is now used. This class is a wrapper around `llvm::IRBuilder<>` that keeps track of certain IR-generation-specific fields (that are unrelated to the visitor) and defines an API that the visitor can use to generate LLVM IR. Also, this IR builder has been designed to be nearly fully independent of NMODL AST nodes. This allows it to be more generic and more extensible.
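For illustration, this is how the visitor now reads a scalar member of the instance struct through the wrapper (the three calls below are taken from `read_from_or_write_to_instance` in this patch; the surrounding declarations are omitted):

```cpp
// Visitor side: only high-level wrapper calls, no llvm::IRBuilder<> details.
llvm::Value* instance_ptr = ir_builder.create_load(instance_name);
llvm::Value* member_ptr = ir_builder.get_struct_member_ptr(instance_ptr, member_index);
llvm::Value* value = ir_builder.create_load(member_ptr);
```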
2. Visitor clean-up. The LLVM visitor has been refactored to take the new IR builder class into account. Also, the functions were reordered, refactored and renamed to better reflect the intended use and provide encapsulation. 3. Scatter preparation. The functionality for generating code for the `CodegenInstanceVar` node has been extended with the `read_from_or_write_to_instance(...)` function. Now, an optional `value_to_store` is passed to indicate whether the code needs to be generated for reading the instance variable or writing to it. fixes #538 --- src/codegen/llvm/CMakeLists.txt | 4 +- src/codegen/llvm/codegen_llvm_visitor.cpp | 926 ++++++++-------------- src/codegen/llvm/codegen_llvm_visitor.hpp | 342 +++----- src/codegen/llvm/llvm_ir_builder.cpp | 427 ++++++++++ src/codegen/llvm/llvm_ir_builder.hpp | 272 +++++++ 5 files changed, 1134 insertions(+), 837 deletions(-) create mode 100644 src/codegen/llvm/llvm_ir_builder.cpp create mode 100644 src/codegen/llvm/llvm_ir_builder.hpp diff --git a/src/codegen/llvm/CMakeLists.txt b/src/codegen/llvm/CMakeLists.txt index 7814b502a3..5ebf9c7acd 100644 --- a/src/codegen/llvm/CMakeLists.txt +++ b/src/codegen/llvm/CMakeLists.txt @@ -11,7 +11,9 @@ set(LLVM_CODEGEN_SOURCE_FILES ${CMAKE_CURRENT_SOURCE_DIR}/llvm_benchmark.cpp ${CMAKE_CURRENT_SOURCE_DIR}/llvm_benchmark.hpp ${CMAKE_CURRENT_SOURCE_DIR}/llvm_debug_builder.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/llvm_debug_builder.hpp) + ${CMAKE_CURRENT_SOURCE_DIR}/llvm_debug_builder.hpp + ${CMAKE_CURRENT_SOURCE_DIR}/llvm_ir_builder.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/llvm_ir_builder.hpp) # ============================================================================= # LLVM codegen library and executable diff --git a/src/codegen/llvm/codegen_llvm_visitor.cpp b/src/codegen/llvm/codegen_llvm_visitor.cpp index 830814286e..a86a5cd8b5 100644 --- a/src/codegen/llvm/codegen_llvm_visitor.cpp +++ b/src/codegen/llvm/codegen_llvm_visitor.cpp @@ -14,10 +14,8 @@ #include "llvm/Analysis/TargetLibraryInfo.h" #include "llvm/IR/AssemblyAnnotationWriter.h" #include "llvm/IR/BasicBlock.h" -#include "llvm/IR/Constants.h" #include "llvm/IR/Function.h" #include "llvm/IR/Type.h" -#include "llvm/IR/ValueSymbolTable.h" #include "llvm/Support/FileSystem.h" #include "llvm/Support/Host.h" #include "llvm/Support/ToolOutputFile.h" @@ -34,7 +32,7 @@ static constexpr const char instance_struct_type_name[] = "__instance_var__type" /****************************************************************************************/ -/* Helper routines */ +/* Helper routines */ /****************************************************************************************/ /// A utility to check for supported Statement AST nodes. static bool is_supported_statement(const ast::Statement& statement) { statement.is_if_statement() || statement.is_while_statement(); } -/// A utility to check of the kernel body can be vectorised.
+static bool can_vectorize(const ast::CodegenForStatement& statement, symtab::SymbolTable* sym_tab) { // Check that function calls are made to external methods only. const auto& function_calls = collect_nodes(statement, {ast::AstNodeType::FUNCTION_CALL}); for (const auto& call: function_calls) { @@ -62,458 +60,352 @@ static bool can_vectorise(const ast::CodegenForStatement& statement, symtab::Sym return collected.empty(); } -llvm::Value* CodegenLLVMVisitor::create_gep(const std::string& name, llvm::Value* index) { - llvm::Type* index_type = llvm::Type::getInt64Ty(*context); - std::vector indices; - indices.push_back(llvm::ConstantInt::get(index_type, 0)); - indices.push_back(index); - - return ir_builder.CreateInBoundsGEP(lookup(name), indices); -} - -llvm::Value* CodegenLLVMVisitor::codegen_indexed_name(const ast::IndexedName& node) { - llvm::Value* index = get_array_index(node); - return create_gep(node.get_node_name(), index); +llvm::Value* CodegenLLVMVisitor::accept_and_get(const std::shared_ptr& node) { + node->accept(*this); + return ir_builder.pop_last_value(); } -llvm::Value* CodegenLLVMVisitor::codegen_instance_var(const ast::CodegenInstanceVar& node) { - const auto& member_node = node.get_member_var(); - const auto& instance_name = node.get_instance_var()->get_node_name(); - const auto& member_name = member_node->get_node_name(); - - if (!instance_var_helper.is_an_instance_variable(member_name)) - throw std::runtime_error("Error: " + member_name + " is not a member of the instance!"); +void CodegenLLVMVisitor::create_external_function_call(const std::string& name, + const ast::ExpressionVector& arguments) { + if (name == "printf") { + create_printf_call(arguments); + return; + } - // Load the instance struct given its name from the ValueSymbolTable. - llvm::Value* instance_ptr = ir_builder.CreateLoad(lookup(instance_name)); + ValueVector argument_values; + TypeVector argument_types; + for (const auto& arg: arguments) { + llvm::Value* value = accept_and_get(arg); + llvm::Type* type = value->getType(); + argument_types.push_back(type); + argument_values.push_back(value); + } + ir_builder.create_intrinsic(name, argument_values, argument_types); +} - // Create a GEP instruction to get a pointer to the member. - int member_index = instance_var_helper.get_variable_index(member_name); - llvm::Type* index_type = llvm::Type::getInt32Ty(*context); +void CodegenLLVMVisitor::create_function_call(llvm::Function* func, + const std::string& name, + const ast::ExpressionVector& arguments) { + // Check that function is called with the expected number of arguments. + if (!func->isVarArg() && arguments.size() != func->arg_size()) { + throw std::runtime_error("Error: Incorrect number of arguments passed"); + } - std::vector indices; - indices.push_back(llvm::ConstantInt::get(index_type, 0)); - indices.push_back(llvm::ConstantInt::get(index_type, member_index)); - llvm::Value* member_ptr = ir_builder.CreateInBoundsGEP(instance_ptr, indices); + // Pack function call arguments to vector and create a call instruction. + ValueVector argument_values; + argument_values.reserve(arguments.size()); + create_function_call_arguments(arguments, argument_values); + ir_builder.create_function_call(func, argument_values); +} - // Get the member AST node from the instance AST node, for which we proceed with the code - // generation. If the member is scalar, return the pointer to it straight away. 
- auto codegen_var_with_type = instance_var_helper.get_variable(member_name); - if (!codegen_var_with_type->get_is_pointer()) { - return member_ptr; +void CodegenLLVMVisitor::create_function_call_arguments(const ast::ExpressionVector& arguments, + ValueVector& arg_values) { + for (const auto& arg: arguments) { + if (arg->is_string()) { + // If the argument is a string, create a global i8* variable with it. + auto string_arg = std::dynamic_pointer_cast(arg); + arg_values.push_back(ir_builder.create_global_string(*string_arg)); + } else { + llvm::Value* value = accept_and_get(arg); + arg_values.push_back(value); + } } +} - // Otherwise, the codegen variable is a pointer, and the member AST node must be an IndexedName. - auto member_var_name = std::dynamic_pointer_cast(member_node); - if (!member_var_name->get_name()->is_indexed_name()) - throw std::runtime_error("Error: " + member_name + " is not an IndexedName!"); - - // Proceed to creating a GEP instruction to get the pointer to the member's element. - auto member_indexed_name = std::dynamic_pointer_cast( - member_var_name->get_name()); +void CodegenLLVMVisitor::create_function_declaration(const ast::CodegenFunction& node) { + const auto& name = node.get_node_name(); + const auto& arguments = node.get_arguments(); - if (!member_indexed_name->get_length()->is_name()) - throw std::runtime_error("Error: " + member_name + " must be indexed with a variable!"); + // Procedure or function parameters are doubles by default. + TypeVector arg_types; + for (size_t i = 0; i < arguments.size(); ++i) + arg_types.push_back(get_codegen_var_type(*arguments[i]->get_type())); - llvm::Value* i64_index = get_array_index(*member_indexed_name); - - // The codegen variable type is always a scalar, so we need to transform it to a pointer. Then - // load the member which would be indexed later. - llvm::Type* type = get_codegen_var_type(*codegen_var_with_type->get_type()); - llvm::Value* instance_member = - ir_builder.CreateLoad(llvm::PointerType::get(type, /*AddressSpace=*/0), member_ptr); - - // Check if the code is vectorised and the index is indirect. - std::string id = member_indexed_name->get_length()->get_node_name(); - if (id != kernel_id && is_kernel_code && vector_width > 1) { - // Calculate a vector of addresses via GEP instruction, and then created a gather to load - // indirectly. - llvm::Value* addresses = ir_builder.CreateInBoundsGEP(instance_member, {i64_index}); - return ir_builder.CreateMaskedGather(addresses, llvm::Align()); - } + llvm::Type* return_type = get_codegen_var_type(*node.get_return_type()); - llvm::Value* member_addr = ir_builder.CreateInBoundsGEP(instance_member, {i64_index}); + // Create a function that is automatically inserted into module's symbol table. + auto func = + llvm::Function::Create(llvm::FunctionType::get(return_type, arg_types, /*isVarArg=*/false), + llvm::Function::ExternalLinkage, + name, + *module); - // If the code is vectorised, then bitcast to a vector pointer. - if (is_kernel_code && vector_width > 1) { - llvm::Type* vector_type = - llvm::PointerType::get(llvm::FixedVectorType::get(type, vector_width), - /*AddressSpace=*/0); - return ir_builder.CreateBitCast(member_addr, vector_type); + // Add function debug information, with location information if it exists. 
+ if (add_debug_information) { + if (node.get_token()) { + Location loc{node.get_token()->start_line(), node.get_token()->start_column()}; + debug_builder.add_function_debug_info(func, &loc); + } else { + debug_builder.add_function_debug_info(func); + } } - return member_addr; } -llvm::Value* CodegenLLVMVisitor::get_array_index(const ast::IndexedName& node) { - // Process the index expression. It can either be a Name node: - // k[id] // id is an integer - // or an integer expression. - llvm::Value* index_value; - if (node.get_length()->is_name()) { - llvm::Value* ptr = lookup(node.get_length()->get_node_name()); - index_value = ir_builder.CreateLoad(ptr); - } else { - node.get_length()->accept(*this); - index_value = values.back(); - values.pop_back(); - } +void CodegenLLVMVisitor::create_printf_call(const ast::ExpressionVector& arguments) { + // First, create printf declaration or insert it if it does not exit. + std::string name = "printf"; + llvm::Function* printf = module->getFunction(name); + if (!printf) { + llvm::FunctionType* printf_type = llvm::FunctionType::get(ir_builder.get_i32_type(), + ir_builder.get_i8_ptr_type(), + /*isVarArg=*/true); - // Check if index is a double. While it is possible to use casting from double to integer - // values, we choose not to support these cases. - if (!index_value->getType()->isIntOrIntVectorTy()) - throw std::runtime_error("Error: only integer indexing is supported!"); - - // Conventionally, in LLVM array indices are 64 bit. - llvm::Type* i64_type = llvm::Type::getInt64Ty(*context); - if (auto index_type = llvm::dyn_cast(index_value->getType())) { - if (index_type->getBitWidth() == i64_type->getIntegerBitWidth()) - return index_value; - return ir_builder.CreateSExtOrTrunc(index_value, i64_type); + printf = + llvm::Function::Create(printf_type, llvm::Function::ExternalLinkage, name, *module); } - auto vector_type = llvm::cast(index_value->getType()); - auto element_type = llvm::cast(vector_type->getElementType()); - if (element_type->getBitWidth() == i64_type->getIntegerBitWidth()) - return index_value; - return ir_builder.CreateSExtOrTrunc(index_value, - llvm::FixedVectorType::get(i64_type, vector_width)); + // Create a call instruction. + ValueVector argument_values; + argument_values.reserve(arguments.size()); + create_function_call_arguments(arguments, argument_values); + ir_builder.create_function_call(printf, argument_values, /*use_result=*/false); } -int CodegenLLVMVisitor::get_array_length(const ast::IndexedName& node) { - auto integer = std::dynamic_pointer_cast(node.get_length()); - if (!integer) - throw std::runtime_error("Error: only integer length is supported!"); - - // Check if integer value is taken from a macro. - if (!integer->get_macro()) - return integer->get_value(); - const auto& macro = sym_tab->lookup(integer->get_macro()->get_node_name()); - return static_cast(*macro->get_value()); +void CodegenLLVMVisitor::find_kernel_names(std::vector& container) { + // By convention, only kernel functions have a return type of void. 
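+ // (Kernels write their results through pointers held in the instance
+ // struct, so they have nothing to return by value.)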
+ const auto& functions = module->getFunctionList(); + for (const auto& func: functions) { + if (func.getReturnType()->isVoidTy()) { + container.push_back(func.getName().str()); + } + } } llvm::Type* CodegenLLVMVisitor::get_codegen_var_type(const ast::CodegenVarType& node) { switch (node.get_type()) { case ast::AstNodeType::BOOLEAN: - return llvm::Type::getInt1Ty(*context); + return ir_builder.get_boolean_type(); case ast::AstNodeType::DOUBLE: - return get_default_fp_type(); + return ir_builder.get_fp_type(); case ast::AstNodeType::INSTANCE_STRUCT: return get_instance_struct_type(); case ast::AstNodeType::INTEGER: - return llvm::Type::getInt32Ty(*context); + return ir_builder.get_i32_type(); case ast::AstNodeType::VOID: - return llvm::Type::getVoidTy(*context); + return ir_builder.get_void_type(); default: throw std::runtime_error("Error: expecting a type in CodegenVarType node\n"); } } -llvm::Value* CodegenLLVMVisitor::get_constant_int_vector(int value) { - llvm::Type* i32_type = llvm::Type::getInt32Ty(*context); - std::vector constants; - for (unsigned i = 0; i < vector_width; ++i) { - const auto& element = llvm::ConstantInt::get(i32_type, value); - constants.push_back(element); - } - return llvm::ConstantVector::get(constants); +llvm::Value* CodegenLLVMVisitor::get_index(const ast::IndexedName& node) { + // In NMODL, the index is either an integer expression or a named constant, such as "id". + llvm::Value* index_value = node.get_length()->is_name() + ? ir_builder.create_load(node.get_length()->get_node_name()) + : accept_and_get(node.get_length()); + return ir_builder.create_index(index_value); } -llvm::Value* CodegenLLVMVisitor::get_constant_fp_vector(const std::string& value) { - llvm::Type* fp_type = get_default_fp_type(); - std::vector constants; - for (unsigned i = 0; i < vector_width; ++i) { - const auto& element = llvm::ConstantFP::get(fp_type, value); - constants.push_back(element); +llvm::Type* CodegenLLVMVisitor::get_instance_struct_type() { + TypeVector member_types; + for (const auto& variable: instance_var_helper.instance->get_codegen_vars()) { + // Get the type information of the codegen variable. + const auto& is_pointer = variable->get_is_pointer(); + const auto& nmodl_type = variable->get_type()->get_type(); + + // Create the corresponding LLVM type. + switch (nmodl_type) { + case ast::AstNodeType::DOUBLE: + member_types.push_back(is_pointer ? ir_builder.get_fp_ptr_type() + : ir_builder.get_fp_type()); + break; + case ast::AstNodeType::INTEGER: + member_types.push_back(is_pointer ? ir_builder.get_i32_ptr_type() + : ir_builder.get_i32_type()); + break; + default: + throw std::runtime_error("Error: unsupported type found in instance struct\n"); + } } - return llvm::ConstantVector::get(constants); -} -llvm::Type* CodegenLLVMVisitor::get_default_fp_type() { - if (use_single_precision) - return llvm::Type::getFloatTy(*context); - return llvm::Type::getDoubleTy(*context); + return ir_builder.get_struct_ptr_type(mod_filename + instance_struct_type_name, member_types); } -llvm::Type* CodegenLLVMVisitor::get_default_fp_ptr_type() { - if (use_single_precision) - return llvm::Type::getFloatPtrTy(*context); - return llvm::Type::getDoublePtrTy(*context); +int CodegenLLVMVisitor::get_num_elements(const ast::IndexedName& node) { + // First, verify if the length is an integer value. 
+    const auto& integer = std::dynamic_pointer_cast<ast::Integer>(node.get_length());
+    if (!integer)
+        throw std::runtime_error("Error: only integer length is supported\n");
+
+    // Check if the length value is a constant.
+    if (!integer->get_macro())
+        return integer->get_value();
+
+    // Otherwise, the length is taken from the macro.
+    const auto& macro = sym_tab->lookup(integer->get_macro()->get_node_name());
+    return static_cast<int>(*macro->get_value());
 }
 
-llvm::Type* CodegenLLVMVisitor::get_instance_struct_type() {
-    std::vector<llvm::Type*> members;
-    for (const auto& variable: instance_var_helper.instance->get_codegen_vars()) {
-        auto is_pointer = variable->get_is_pointer();
-        auto nmodl_type = variable->get_type()->get_type();
+llvm::Value* CodegenLLVMVisitor::read_from_or_write_to_instance(const ast::CodegenInstanceVar& node,
+                                                                llvm::Value* maybe_value_to_store) {
+    const auto& instance_name = node.get_instance_var()->get_node_name();
+    const auto& member_node = node.get_member_var();
+    const auto& member_name = member_node->get_node_name();
 
-        llvm::Type* i32_type = llvm::Type::getInt32Ty(*context);
-        llvm::Type* i32ptr_type = llvm::Type::getInt32PtrTy(*context);
+    if (!instance_var_helper.is_an_instance_variable(member_name))
+        throw std::runtime_error("Error: " + member_name +
+                                 " is not a member of the instance variable\n");
 
-        switch (nmodl_type) {
-#define DISPATCH(type, llvm_ptr_type, llvm_type) \
-    case type: \
-        members.push_back(is_pointer ? (llvm_ptr_type) : (llvm_type)); \
-        break;
+    // Load the instance struct by its name.
+    llvm::Value* instance_ptr = ir_builder.create_load(instance_name);
 
-            DISPATCH(ast::AstNodeType::DOUBLE, get_default_fp_ptr_type(), get_default_fp_type());
-            DISPATCH(ast::AstNodeType::INTEGER, i32ptr_type, i32_type);
+    // Get the pointer to the specified member.
+    int member_index = instance_var_helper.get_variable_index(member_name);
+    llvm::Value* member_ptr = ir_builder.get_struct_member_ptr(instance_ptr, member_index);
 
-#undef DISPATCH
-        default:
-            throw std::runtime_error("Error: unsupported type found in instance struct");
+    // Check if the member is scalar. Load the value or store to it straight away. Otherwise, we
+    // need some extra handling.
+    auto codegen_var_with_type = instance_var_helper.get_variable(member_name);
+    if (!codegen_var_with_type->get_is_pointer()) {
+        if (maybe_value_to_store) {
+            ir_builder.create_store(member_ptr, maybe_value_to_store);
+            return nullptr;
+        } else {
+            return ir_builder.create_load(member_ptr);
         }
     }
 
-    llvm::StructType* llvm_struct_type =
-        llvm::StructType::create(*context, mod_filename + instance_struct_type_name);
-    llvm_struct_type->setBody(members);
-    return llvm::PointerType::get(llvm_struct_type, /*AddressSpace=*/0);
+    // Check that the member is an indexed name indeed, and that it is indexed by a named constant
+    // (e.g. "id").
+    const auto& member_var_name = std::dynamic_pointer_cast<ast::VarName>(member_node);
+    if (!member_var_name->get_name()->is_indexed_name())
+        throw std::runtime_error("Error: " + member_name + " is not an IndexedName\n");
+
+    const auto& member_indexed_name = std::dynamic_pointer_cast<ast::IndexedName>(
+        member_var_name->get_name());
+    if (!member_indexed_name->get_length()->is_name())
+        throw std::runtime_error("Error: " + member_name + " must be indexed with a variable!");
+
+    // Get the index to the member and the id used to index it.
+    llvm::Value* i64_index = get_index(*member_indexed_name);
+    const std::string id = member_indexed_name->get_length()->get_node_name();
+
+    // Load the member of the instance struct.
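    // At the IR level, the member access above and the element access below are two GEPs,
    // roughly (type and value names are illustrative):
    //
    //     %member = getelementptr inbounds %ty, %ty* %inst, i32 0, i32 <member_index>
    //     %elem   = getelementptr inbounds double, double* %arr, i64 %id
    //
    // i.e. the struct field is addressed first, and the pointer loaded from it is then
    // indexed with the 64-bit id.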
+    llvm::Value* instance_member = ir_builder.create_load(member_ptr);
+
+    // Create a pointer to the specified element of the struct member.
+    return ir_builder.load_to_or_store_from_array(id,
+                                                  i64_index,
+                                                  instance_member,
+                                                  maybe_value_to_store);
 }
 
-llvm::Value* CodegenLLVMVisitor::get_variable_ptr(const ast::VarName& node) {
+llvm::Value* CodegenLLVMVisitor::read_variable(const ast::VarName& node) {
     const auto& identifier = node.get_name();
-    if (!identifier->is_name() && !identifier->is_indexed_name() &&
-        !identifier->is_codegen_instance_var()) {
-        throw std::runtime_error("Error: Unsupported variable type - " + node.get_node_name());
-    }
 
-    llvm::Value* ptr;
-    if (identifier->is_name())
-        ptr = lookup(node.get_node_name());
+    if (identifier->is_name()) {
+        return ir_builder.create_load(node.get_node_name());
+    }
 
     if (identifier->is_indexed_name()) {
-        auto indexed_name = std::dynamic_pointer_cast<ast::IndexedName>(identifier);
-        ptr = codegen_indexed_name(*indexed_name);
+        const auto& indexed_name = std::dynamic_pointer_cast<ast::IndexedName>(identifier);
+        llvm::Value* index = get_index(*indexed_name);
+        return ir_builder.create_load_from_array(node.get_node_name(), index);
     }
 
     if (identifier->is_codegen_instance_var()) {
-        auto instance_var = std::dynamic_pointer_cast<ast::CodegenInstanceVar>(identifier);
-        ptr = codegen_instance_var(*instance_var);
+        const auto& instance_var = std::dynamic_pointer_cast<ast::CodegenInstanceVar>(identifier);
+        return read_from_or_write_to_instance(*instance_var);
     }
 
-    return ptr;
-}
-
-std::shared_ptr<ast::InstanceStruct> CodegenLLVMVisitor::get_instance_struct_ptr() {
-    return instance_var_helper.instance;
+    throw std::runtime_error("Error: the type of '" + node.get_node_name() +
+                             "' is not supported\n");
 }
 
 void CodegenLLVMVisitor::run_ir_opt_passes() {
-    /// run some common optimisation passes that are commonly suggested
+    // Run some optimisation passes that are commonly suggested.
     opt_pm.add(llvm::createInstructionCombiningPass());
     opt_pm.add(llvm::createReassociatePass());
     opt_pm.add(llvm::createGVNPass());
     opt_pm.add(llvm::createCFGSimplificationPass());
 
-    /// initialize pass manager
+    // Initialize pass manager.
     opt_pm.doInitialization();
 
-    /// iterate over all functions and run the optimisation passes
+    // Iterate over all functions and run the optimisation passes.
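    // (In short: instruction combining folds and canonicalises instruction patterns,
    // reassociation reorders commutative expression chains to expose such patterns, GVN
    // eliminates redundant computations and loads, and CFG simplification merges or removes
    // trivial basic blocks left behind by codegen.)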
 auto& functions = module->getFunctionList();
     for (auto& function: functions) {
         llvm::verifyFunction(function);
         opt_pm.run(function);
     }
+    opt_pm.doFinalization();
 }
 
-void CodegenLLVMVisitor::create_external_method_call(const std::string& name,
-                                                     const ast::ExpressionVector& arguments) {
-    if (name == "printf") {
-        create_printf_call(arguments);
-        return;
+void CodegenLLVMVisitor::write_to_variable(const ast::VarName& node, llvm::Value* value) {
+    const auto& identifier = node.get_name();
+    if (!identifier->is_name() && !identifier->is_indexed_name() &&
+        !identifier->is_codegen_instance_var()) {
+        throw std::runtime_error("Error: the type of '" + node.get_node_name() +
+                                 "' is not supported\n");
     }
 
-    std::vector<llvm::Value*> argument_values;
-    std::vector<llvm::Type*> argument_types;
-    for (const auto& arg: arguments) {
-        arg->accept(*this);
-        llvm::Value* value = values.back();
-        llvm::Type* type = value->getType();
-        values.pop_back();
-        argument_types.push_back(type);
-        argument_values.push_back(value);
+    if (identifier->is_name()) {
+        ir_builder.create_store(node.get_node_name(), value);
     }
 
-#define DISPATCH(method_name, intrinsic) \
-    if (name == (method_name)) { \
-        llvm::Value* result = \
-            ir_builder.CreateIntrinsic(intrinsic, argument_types, argument_values); \
-        values.push_back(result); \
-        return; \
+    if (identifier->is_indexed_name()) {
+        const auto& indexed_name = std::dynamic_pointer_cast<ast::IndexedName>(identifier);
+        llvm::Value* index = get_index(*indexed_name);
+        ir_builder.create_store_to_array(node.get_node_name(), index, value);
     }
 
-    DISPATCH("exp", llvm::Intrinsic::exp);
-    DISPATCH("pow", llvm::Intrinsic::pow);
-#undef DISPATCH
-
-    throw std::runtime_error("Error: External method " + name + " is not currently supported");
-}
-
-void CodegenLLVMVisitor::create_function_call(llvm::Function* func,
-                                              const std::string& name,
-                                              const ast::ExpressionVector& arguments) {
-    // Check that function is called with the expected number of arguments.
-    if (!func->isVarArg() && arguments.size() != func->arg_size()) {
-        throw std::runtime_error("Error: Incorrect number of arguments passed");
+    if (identifier->is_codegen_instance_var()) {
+        const auto& instance_var = std::dynamic_pointer_cast<ast::CodegenInstanceVar>(identifier);
+        read_from_or_write_to_instance(*instance_var, value);
     }
-
-    // Pack function call arguments to vector and create a call instruction.
-    std::vector<llvm::Value*> argument_values;
-    argument_values.reserve(arguments.size());
-    pack_function_call_arguments(arguments, argument_values);
-    llvm::Value* call = ir_builder.CreateCall(func, argument_values);
-    values.push_back(call);
 }
 
-void CodegenLLVMVisitor::create_printf_call(const ast::ExpressionVector& arguments) {
-    // First, create printf declaration or insert it if it does not exist.
-    std::string name = "printf";
-    llvm::Function* printf = module->getFunction(name);
-    if (!printf) {
-        llvm::Type* ptr_type = llvm::Type::getInt8PtrTy(*context);
-        llvm::Type* i32_type = llvm::Type::getInt32Ty(*context);
-        llvm::FunctionType* printf_type =
-            llvm::FunctionType::get(i32_type, ptr_type, /*isVarArg=*/true);
-
-        printf =
-            llvm::Function::Create(printf_type, llvm::Function::ExternalLinkage, name, *module);
-    }
-
-    // Create a call instruction.
-    std::vector<llvm::Value*> argument_values;
-    argument_values.reserve(arguments.size());
-    pack_function_call_arguments(arguments, argument_values);
-    ir_builder.CreateCall(printf, argument_values);
-}
+void CodegenLLVMVisitor::wrap_kernel_functions() {
+    // First, identify all kernels.
+    std::vector<std::string> kernel_names;
+    find_kernel_names(kernel_names);
 
-void CodegenLLVMVisitor::emit_procedure_or_function_declaration(const ast::CodegenFunction& node) {
-    const auto& name = node.get_node_name();
-    const auto& arguments = node.get_arguments();
+    for (const auto& kernel_name: kernel_names) {
+        // Get the kernel function and the instance struct type.
+        auto kernel = module->getFunction(kernel_name);
+        if (!kernel)
+            throw std::runtime_error("Error: kernel " + kernel_name + " is not found\n");
 
-    // Procedure or function parameters are doubles by default.
-    std::vector<llvm::Type*> arg_types;
-    for (size_t i = 0; i < arguments.size(); ++i)
-        arg_types.push_back(get_codegen_var_type(*arguments[i]->get_type()));
+        if (std::distance(kernel->args().begin(), kernel->args().end()) != 1)
+            throw std::runtime_error("Error: kernel " + kernel_name +
+                                     " must have a single argument\n");
 
-    llvm::Type* return_type = get_codegen_var_type(*node.get_return_type());
+        auto instance_struct_ptr_type = llvm::dyn_cast<llvm::PointerType>(
+            kernel->getArg(0)->getType());
+        if (!instance_struct_ptr_type)
+            throw std::runtime_error("Error: kernel " + kernel_name +
+                                     " does not have an instance struct pointer as an argument\n");
 
-    // Create a function that is automatically inserted into module's symbol table.
-    auto func =
-        llvm::Function::Create(llvm::FunctionType::get(return_type, arg_types, /*isVarArg=*/false),
-                               llvm::Function::ExternalLinkage,
-                               name,
-                               *module);
+        // Create a wrapper function that takes a void pointer as a single argument.
+        llvm::Type* i32_type = ir_builder.get_i32_type();
+        llvm::Type* void_ptr_type = ir_builder.get_i8_ptr_type();
+        llvm::Function* wrapper_func = llvm::Function::Create(
+            llvm::FunctionType::get(i32_type, {void_ptr_type}, /*isVarArg=*/false),
+            llvm::Function::ExternalLinkage,
+            "__" + kernel_name + "_wrapper",
+            *module);
 
-    // Add function debug information, with location information if it exists.
-    if (add_debug_information) {
-        if (node.get_token()) {
-            Location loc{node.get_token()->start_line(), node.get_token()->start_column()};
-            debug_builder.add_function_debug_info(func, &loc);
-        } else {
-            debug_builder.add_function_debug_info(func);
+        // Optionally, add debug information for the wrapper function.
+        if (add_debug_information) {
+            debug_builder.add_function_debug_info(wrapper_func);
         }
-    }
-}
 
-llvm::Value* CodegenLLVMVisitor::lookup(const std::string& name) {
-    auto val = current_func->getValueSymbolTable()->lookup(name);
-    if (!val)
-        throw std::runtime_error("Error: variable " + name + " is not in scope\n");
-    return val;
-}
+        ir_builder.create_block_and_set_insertion_point(wrapper_func);
 
-void CodegenLLVMVisitor::pack_function_call_arguments(const ast::ExpressionVector& arguments,
-                                                      std::vector<llvm::Value*>& arg_values) {
-    for (const auto& arg: arguments) {
-        if (arg->is_string()) {
-            // If the argument is a string, create a global i8* variable with it.
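            // For instance, for a generated call like printf("id = %d\n", id), the format
            // string becomes a private global constant array and the i8* pointer returned by
            // CreateGlobalStringPtr below is what is actually passed to the callee.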
-            auto string_arg = std::dynamic_pointer_cast<ast::String>(arg);
-            llvm::Value* str = ir_builder.CreateGlobalStringPtr(string_arg->get_value());
-            arg_values.push_back(str);
-        } else {
-            arg->accept(*this);
-            llvm::Value* value = values.back();
-            values.pop_back();
-            arg_values.push_back(value);
-        }
-    }
-}
-
-llvm::Value* CodegenLLVMVisitor::visit_arithmetic_bin_op(llvm::Value* lhs,
-                                                         llvm::Value* rhs,
-                                                         unsigned op) {
-    const auto& bin_op = static_cast<ast::BinaryOp>(op);
-    llvm::Type* lhs_type = lhs->getType();
-    llvm::Value* result;
-
-    switch (bin_op) {
-#define DISPATCH(binary_op, llvm_fp_op, llvm_int_op) \
-    case binary_op: \
-        if (lhs_type->isIntOrIntVectorTy()) \
-            result = llvm_int_op(lhs, rhs); \
-        else \
-            result = llvm_fp_op(lhs, rhs); \
-        return result;
-
-        DISPATCH(ast::BinaryOp::BOP_ADDITION, ir_builder.CreateFAdd, ir_builder.CreateAdd);
-        DISPATCH(ast::BinaryOp::BOP_DIVISION, ir_builder.CreateFDiv, ir_builder.CreateSDiv);
-        DISPATCH(ast::BinaryOp::BOP_MULTIPLICATION, ir_builder.CreateFMul, ir_builder.CreateMul);
-        DISPATCH(ast::BinaryOp::BOP_SUBTRACTION, ir_builder.CreateFSub, ir_builder.CreateSub);
-
-#undef DISPATCH
+        // Proceed with bitcasting the void pointer to the struct pointer type, calling the kernel
+        // and adding a terminator.
+        llvm::Value* bitcasted = ir_builder.create_bitcast(wrapper_func->getArg(0),
+                                                           instance_struct_ptr_type);
+        ValueVector args;
+        args.push_back(bitcasted);
+        ir_builder.create_function_call(kernel, args, /*use_result=*/false);
 
-    default:
-        return nullptr;
+        // Create a 0 return value and a return instruction.
+        ir_builder.create_i32_constant(0);
+        ir_builder.create_return(ir_builder.pop_last_value());
     }
 }
 
-void CodegenLLVMVisitor::visit_assign_op(const ast::BinaryExpression& node, llvm::Value* rhs) {
-    auto var = dynamic_cast<ast::VarName*>(node.get_lhs().get());
-    if (!var)
-        throw std::runtime_error("Error: only VarName assignment is supported!");
-
-    llvm::Value* ptr = get_variable_ptr(*var);
-    ir_builder.CreateStore(rhs, ptr);
-}
-
-llvm::Value* CodegenLLVMVisitor::visit_logical_bin_op(llvm::Value* lhs,
-                                                      llvm::Value* rhs,
-                                                      unsigned op) {
-    const auto& bin_op = static_cast<ast::BinaryOp>(op);
-    return bin_op == ast::BinaryOp::BOP_AND ?
-               ir_builder.CreateAnd(lhs, rhs)
-             : ir_builder.CreateOr(lhs, rhs);
-}
-
-llvm::Value* CodegenLLVMVisitor::visit_comparison_bin_op(llvm::Value* lhs,
-                                                         llvm::Value* rhs,
-                                                         unsigned op) {
-    const auto& bin_op = static_cast<ast::BinaryOp>(op);
-    llvm::Type* lhs_type = lhs->getType();
-    llvm::Value* result;
-
-    switch (bin_op) {
-#define DISPATCH(binary_op, i_llvm_op, f_llvm_op) \
-    case binary_op: \
-        if (lhs_type->isDoubleTy() || lhs_type->isFloatTy()) \
-            result = f_llvm_op(lhs, rhs); \
-        else \
-            result = i_llvm_op(lhs, rhs); \
-        return result;
-
-        DISPATCH(ast::BinaryOp::BOP_EXACT_EQUAL, ir_builder.CreateICmpEQ, ir_builder.CreateFCmpOEQ);
-        DISPATCH(ast::BinaryOp::BOP_GREATER, ir_builder.CreateICmpSGT, ir_builder.CreateFCmpOGT);
-        DISPATCH(ast::BinaryOp::BOP_GREATER_EQUAL,
-                 ir_builder.CreateICmpSGE,
-                 ir_builder.CreateFCmpOGE);
-        DISPATCH(ast::BinaryOp::BOP_LESS, ir_builder.CreateICmpSLT, ir_builder.CreateFCmpOLT);
-        DISPATCH(ast::BinaryOp::BOP_LESS_EQUAL, ir_builder.CreateICmpSLE, ir_builder.CreateFCmpOLE);
-        DISPATCH(ast::BinaryOp::BOP_NOT_EQUAL, ir_builder.CreateICmpNE, ir_builder.CreateFCmpONE);
-
-#undef DISPATCH
-
-    default:
-        return nullptr;
-    }
-}
 
 /****************************************************************************************/
 /*                            Overloaded visitor routines                               */
 /****************************************************************************************/
 
@@ -525,43 +417,18 @@ void CodegenLLVMVisitor::visit_binary_expression(const ast::BinaryExpression& no
 
     // Process rhs first, since lhs is handled differently for assignment and binary
    // operators.
-    node.get_rhs()->accept(*this);
-    llvm::Value* rhs = values.back();
-    values.pop_back();
+    llvm::Value* rhs = accept_and_get(node.get_rhs());
     if (op == ast::BinaryOp::BOP_ASSIGN) {
-        visit_assign_op(node, rhs);
-        return;
-    }
+        auto var = dynamic_cast<ast::VarName*>(node.get_lhs().get());
+        if (!var)
+            throw std::runtime_error("Error: only 'VarName' assignment is supported\n");
 
-    node.get_lhs()->accept(*this);
-    llvm::Value* lhs = values.back();
-    values.pop_back();
-
-    llvm::Value* result;
-    switch (op) {
-    case ast::BOP_ADDITION:
-    case ast::BOP_DIVISION:
-    case ast::BOP_MULTIPLICATION:
-    case ast::BOP_SUBTRACTION:
-        result = visit_arithmetic_bin_op(lhs, rhs, op);
-        break;
-    case ast::BOP_AND:
-    case ast::BOP_OR:
-        result = visit_logical_bin_op(lhs, rhs, op);
-        break;
-    case ast::BOP_EXACT_EQUAL:
-    case ast::BOP_GREATER:
-    case ast::BOP_GREATER_EQUAL:
-    case ast::BOP_LESS:
-    case ast::BOP_LESS_EQUAL:
-    case ast::BOP_NOT_EQUAL:
-        result = visit_comparison_bin_op(lhs, rhs, op);
-        break;
-    default:
-        throw std::runtime_error("Error: binary operator is not supported\n");
+        write_to_variable(*var, rhs);
+        return;
     }
 
-    values.push_back(result);
+    llvm::Value* lhs = accept_and_get(node.get_lhs());
+    ir_builder.create_binary_op(lhs, rhs, op);
 }
 
 void CodegenLLVMVisitor::visit_statement_block(const ast::StatementBlock& node) {
@@ -573,9 +440,7 @@ void CodegenLLVMVisitor::visit_statement_block(const ast::StatementBlock& node)
 }
 
 void CodegenLLVMVisitor::visit_boolean(const ast::Boolean& node) {
-    const auto& constant = llvm::ConstantInt::get(llvm::Type::getInt1Ty(*context),
-                                                  node.get_value());
-    values.push_back(constant);
+    ir_builder.create_boolean_constant(node.get_value());
 }
 
 // Generating FOR loop in LLVM IR creates the following structure:
@@ -612,10 +477,10 @@ void CodegenLLVMVisitor::visit_boolean(const ast::Boolean& node)
 // +---------------------------+
 void CodegenLLVMVisitor::visit_codegen_for_statement(const ast::CodegenForStatement& node) {
     // Disable vector code generation for condition and increment blocks.
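    // The condition and the increment always manipulate the scalar induction variable: a
    // vectorised kernel loop has, roughly, the shape
    //
    //     for (id = 0; id < node_count; id = id + vector_width) { <vectorised body> }
    //
    // (names illustrative), so only statements inside the body are emitted with vector types.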
-    is_kernel_code = false;
+    ir_builder.stop_vectorization();
 
     // Get the current and the next blocks within the function.
-    llvm::BasicBlock* curr_block = ir_builder.GetInsertBlock();
+    llvm::BasicBlock* curr_block = ir_builder.get_current_block();
     llvm::BasicBlock* next = curr_block->getNextNode();
     llvm::Function* func = curr_block->getParent();
 
@@ -631,10 +496,12 @@
     int tmp_vector_width = vector_width;
 
     // Check if the kernel can be vectorised. If not, generate scalar code.
-    if (!can_vectorise(node, sym_tab)) {
-        logger->info("Cannot vectorise the for loop in '" + current_func->getName().str() + "'");
+    if (!can_vectorize(node, sym_tab)) {
+        logger->info("Cannot vectorise the for loop in '" + ir_builder.get_current_function_name() +
+                     "'");
         logger->info("Generating scalar code...");
         vector_width = 1;
+        ir_builder.generate_scalar_code();
     }
 
     // First, initialise the loop in the same basic block. This block is optional. Also, reset
@@ -643,36 +510,33 @@
         node.get_initialization()->accept(*this);
     } else {
         vector_width = 1;
+        ir_builder.generate_scalar_code();
     }
 
     // Branch to condition basic block and insert condition code there.
-    ir_builder.CreateBr(for_cond);
-    ir_builder.SetInsertPoint(for_cond);
-    node.get_condition()->accept(*this);
+    ir_builder.create_br_and_set_insertion_point(for_cond);
 
     // Extract the condition to decide whether to branch to the loop body or loop exit.
-    llvm::Value* cond = values.back();
-    values.pop_back();
-    ir_builder.CreateCondBr(cond, for_body, exit);
+    llvm::Value* cond = accept_and_get(node.get_condition());
+    ir_builder.create_cond_br(cond, for_body, exit);
 
     // Generate code for the loop body and create the basic block for the increment.
-    ir_builder.SetInsertPoint(for_body);
-    is_kernel_code = true;
+    ir_builder.set_insertion_point(for_body);
+    ir_builder.start_vectorization();
     const auto& statement_block = node.get_statement_block();
     statement_block->accept(*this);
-    is_kernel_code = false;
-    ir_builder.CreateBr(for_inc);
-
+    ir_builder.stop_vectorization();
+    ir_builder.create_br_and_set_insertion_point(for_inc);
 
     // Process increment.
-    ir_builder.SetInsertPoint(for_inc);
     node.get_increment()->accept(*this);
 
     // Create a branch to condition block, then generate exit code out of the loop. Restore the
     // vector width.
-    ir_builder.CreateBr(for_cond);
-    ir_builder.SetInsertPoint(exit);
+    ir_builder.create_br(for_cond);
+    ir_builder.set_insertion_point(exit);
 
     vector_width = tmp_vector_width;
-    is_kernel_code = true;
+    ir_builder.generate_vectorized_code();
+    ir_builder.start_vectorization();
 }
 
 
@@ -680,12 +544,11 @@ void CodegenLLVMVisitor::visit_codegen_function(const ast::CodegenFunction& node
     const auto& name = node.get_node_name();
     const auto& arguments = node.get_arguments();
     llvm::Function* func = module->getFunction(name);
-    current_func = func;
+    ir_builder.set_function(func);
 
     // Create the entry basic block of the function/procedure and point the local named values table
     // to the symbol table.
-    llvm::BasicBlock* body = llvm::BasicBlock::Create(*context, /*Name=*/"", func);
-    ir_builder.SetInsertPoint(body);
+    llvm::BasicBlock* body = ir_builder.create_block_and_set_insertion_point(func);
 
     // When processing a function, it returns a value named <function_name> in NMODL. Therefore, we
    // first run RenameVisitor to rename it into ret_<function_name>. This will aid in avoiding
@@ -697,84 +560,59 @@ void CodegenLLVMVisitor::visit_codegen_function(const ast::CodegenFunction& node
 
     // Allocate parameters on the stack and add them to the symbol table.
-    unsigned i = 0;
-    for (auto& arg: func->args()) {
-        std::string arg_name = arguments[i++].get()->get_node_name();
-        llvm::Type* arg_type = arg.getType();
-        llvm::Value* alloca = ir_builder.CreateAlloca(arg_type, /*ArraySize=*/nullptr, arg_name);
-        arg.setName(arg_name);
-        ir_builder.CreateStore(&arg, alloca);
-    }
+    ir_builder.allocate_function_arguments(func, arguments);
 
     // Process function or procedure body. If the function is a compute kernel, then set the
     // corresponding flags. The return statement is handled in a separate visitor.
     bool has_void_ret_type = node.get_return_type()->get_type() == ast::AstNodeType::VOID;
     if (has_void_ret_type) {
-        is_kernel_code = true;
+        ir_builder.start_vectorization();
         block->accept(*this);
-        is_kernel_code = false;
+        ir_builder.stop_vectorization();
     } else {
         block->accept(*this);
     }
 
     // If function has a void return type, add a terminator not handled by CodegenReturnVar.
     if (has_void_ret_type)
-        ir_builder.CreateRetVoid();
+        ir_builder.create_return();
 
     // Clear local values stack and remove the pointer to the local symbol table.
-    values.clear();
-    current_func = nullptr;
+    ir_builder.clear_function();
 }
 
 void CodegenLLVMVisitor::visit_codegen_return_statement(const ast::CodegenReturnStatement& node) {
     if (!node.get_statement()->is_name())
         throw std::runtime_error("Error: CodegenReturnStatement must contain a name node\n");
 
-    std::string ret = "ret_" + current_func->getName().str();
-    llvm::Value* ret_value = ir_builder.CreateLoad(lookup(ret));
-    ir_builder.CreateRet(ret_value);
+    std::string ret = "ret_" + ir_builder.get_current_function_name();
+    llvm::Value* ret_value = ir_builder.create_load(ret);
+    ir_builder.create_return(ret_value);
 }
 
 void CodegenLLVMVisitor::visit_codegen_var_list_statement(
     const ast::CodegenVarListStatement& node) {
-    llvm::Type* scalar_var_type = get_codegen_var_type(*node.get_var_type());
+    llvm::Type* scalar_type = get_codegen_var_type(*node.get_var_type());
     for (const auto& variable: node.get_variables()) {
-        std::string name = variable->get_node_name();
         const auto& identifier = variable->get_name();
+        std::string name = variable->get_node_name();
+
         // Local variable can be a scalar (Node AST class) or an array (IndexedName AST class). For
-        // each case, create memory allocations with the corresponding LLVM type.
-        llvm::Type* var_type;
+        // each case, create memory allocations.
         if (identifier->is_indexed_name()) {
-            auto indexed_name = std::dynamic_pointer_cast<ast::IndexedName>(identifier);
-            int length = get_array_length(*indexed_name);
-            var_type = llvm::ArrayType::get(scalar_var_type, length);
+            const auto& indexed_name = std::dynamic_pointer_cast<ast::IndexedName>(identifier);
+            int length = get_num_elements(*indexed_name);
+            ir_builder.create_array_alloca(name, scalar_type, length);
        } else if (identifier->is_name()) {
-            // This case corresponds to a scalar or vector local variable.
-            const auto& identifier_name = identifier->get_node_name();
-
-            // Even if generating vectorised code, some variables still need to be scalar.
-            // Particularly, the induction variable "id" and remainder loop variables (that start
-            // with "epilogue").
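            // Concretely, with vector_width = 4 and double precision, a local variable "v"
            // would be allocated as <4 x double>, while "id" and any "epilogue"-prefixed
            // remainder-loop variables keep their scalar types.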
- if (is_kernel_code && vector_width > 1 && identifier_name != kernel_id && - identifier_name.rfind("epilogue", 0)) { - var_type = llvm::FixedVectorType::get(scalar_var_type, vector_width); - } else { - var_type = scalar_var_type; - } + ir_builder.create_scalar_or_vector_alloca(name, scalar_type); } else { - throw std::runtime_error("Error: Unsupported local variable type"); + throw std::runtime_error("Error: unsupported local variable type\n"); } - ir_builder.CreateAlloca(var_type, /*ArraySize=*/nullptr, name); } } void CodegenLLVMVisitor::visit_double(const ast::Double& node) { - if (is_kernel_code && vector_width > 1) { - values.push_back(get_constant_fp_vector(node.get_value())); - return; - } - const auto& constant = llvm::ConstantFP::get(get_default_fp_type(), node.get_value()); - values.push_back(constant); + ir_builder.create_fp_constant(node.get_value()); } void CodegenLLVMVisitor::visit_function_block(const ast::FunctionBlock& node) { @@ -783,23 +621,22 @@ void CodegenLLVMVisitor::visit_function_block(const ast::FunctionBlock& node) { void CodegenLLVMVisitor::visit_function_call(const ast::FunctionCall& node) { const auto& name = node.get_node_name(); - auto func = module->getFunction(name); + llvm::Function* func = module->getFunction(name); if (func) { create_function_call(func, name, node.get_arguments()); } else { auto symbol = sym_tab->lookup(name); if (symbol && symbol->has_any_property(symtab::syminfo::NmodlType::extern_method)) { - create_external_method_call(name, node.get_arguments()); + create_external_function_call(name, node.get_arguments()); } else { - throw std::runtime_error("Error: Unknown function name: " + name + - ". (External functions references are not supported)"); + throw std::runtime_error("Error: unknown function name: " + name + "\n"); } } } void CodegenLLVMVisitor::visit_if_statement(const ast::IfStatement& node) { // Get the current and the next blocks within the function. - llvm::BasicBlock* curr_block = ir_builder.GetInsertBlock(); + llvm::BasicBlock* curr_block = ir_builder.get_current_block(); llvm::BasicBlock* next = curr_block->getNextNode(); llvm::Function* func = curr_block->getParent(); @@ -808,14 +645,12 @@ void CodegenLLVMVisitor::visit_if_statement(const ast::IfStatement& node) { llvm::BasicBlock* merge_block = llvm::BasicBlock::Create(*context, /*Name=*/"", func, next); // Add condition to the current block. - node.get_condition()->accept(*this); - llvm::Value* cond = values.back(); - values.pop_back(); + llvm::Value* cond = accept_and_get(node.get_condition()); // Process the true block. - ir_builder.SetInsertPoint(true_block); + ir_builder.set_insertion_point(true_block); node.get_statement_block()->accept(*this); - ir_builder.CreateBr(merge_block); + ir_builder.create_br(merge_block); // Save the merge block and proceed with codegen for `else if` statements. llvm::BasicBlock* exit = merge_block; @@ -823,27 +658,25 @@ void CodegenLLVMVisitor::visit_if_statement(const ast::IfStatement& node) { // Link the current block to the true and else blocks. llvm::BasicBlock* else_block = llvm::BasicBlock::Create(*context, /*Name=*/"", func, merge_block); - ir_builder.SetInsertPoint(curr_block); - ir_builder.CreateCondBr(cond, true_block, else_block); + ir_builder.set_insertion_point(curr_block); + ir_builder.create_cond_br(cond, true_block, else_block); // Process else block. 
- ir_builder.SetInsertPoint(else_block); - else_if->get_condition()->accept(*this); - cond = values.back(); - values.pop_back(); + ir_builder.set_insertion_point(else_block); + cond = accept_and_get(else_if->get_condition()); // Reassign true and merge blocks respectively. Note that the new merge block has to be // connected to the old merge block (tmp). true_block = llvm::BasicBlock::Create(*context, /*Name=*/"", func, merge_block); llvm::BasicBlock* tmp = merge_block; merge_block = llvm::BasicBlock::Create(*context, /*Name=*/"", func, merge_block); - ir_builder.SetInsertPoint(merge_block); - ir_builder.CreateBr(tmp); + ir_builder.set_insertion_point(merge_block); + ir_builder.create_br(tmp); // Process true block. - ir_builder.SetInsertPoint(true_block); + ir_builder.set_insertion_point(true_block); else_if->get_statement_block()->accept(*this); - ir_builder.CreateBr(merge_block); + ir_builder.create_br(merge_block); curr_block = else_block; } @@ -852,25 +685,19 @@ void CodegenLLVMVisitor::visit_if_statement(const ast::IfStatement& node) { llvm::BasicBlock* else_block; if (elses) { else_block = llvm::BasicBlock::Create(*context, /*Name=*/"", func, merge_block); - ir_builder.SetInsertPoint(else_block); + ir_builder.set_insertion_point(else_block); elses->get_statement_block()->accept(*this); - ir_builder.CreateBr(merge_block); + ir_builder.create_br(merge_block); } else { else_block = merge_block; } - ir_builder.SetInsertPoint(curr_block); - ir_builder.CreateCondBr(cond, true_block, else_block); - ir_builder.SetInsertPoint(exit); + ir_builder.set_insertion_point(curr_block); + ir_builder.create_cond_br(cond, true_block, else_block); + ir_builder.set_insertion_point(exit); } void CodegenLLVMVisitor::visit_integer(const ast::Integer& node) { - if (is_kernel_code && vector_width > 1) { - values.push_back(get_constant_int_vector(node.get_value())); - return; - } - const auto& constant = llvm::ConstantInt::get(llvm::Type::getInt32Ty(*context), - node.get_value()); - values.push_back(constant); + ir_builder.create_i32_constant(node.get_value()); } void CodegenLLVMVisitor::visit_program(const ast::Program& node) { @@ -881,7 +708,11 @@ void CodegenLLVMVisitor::visit_program(const ast::Program& node) { CodegenLLVMHelperVisitor v{vector_width}; const auto& functions = v.get_codegen_functions(node); instance_var_helper = v.get_instance_var_helper(); - kernel_id = v.get_kernel_id(); + sym_tab = node.get_symbol_table(); + std::string kernel_id = v.get_kernel_id(); + + // Initialize the builder for this NMODL program. + ir_builder.initialize(*sym_tab, kernel_id); // Create compile unit if adding debug information to the module. if (add_debug_information) { @@ -891,12 +722,9 @@ void CodegenLLVMVisitor::visit_program(const ast::Program& node) { // For every function, generate its declaration. Thus, we can look up // `llvm::Function` in the symbol table in the module. for (const auto& func: functions) { - emit_procedure_or_function_declaration(*func); + create_function_declaration(*func); } - // Set the AST symbol table. - sym_tab = node.get_symbol_table(); - // Proceed with code generation. 
    // Right now, we do not do
    // node.visit_children(*this);
    // The reason is that the node may contain AST nodes for which the visitor functions have been
@@ -977,40 +805,18 @@ void CodegenLLVMVisitor::visit_procedure_block(const ast::ProcedureBlock& node)
 
 void CodegenLLVMVisitor::visit_unary_expression(const ast::UnaryExpression& node) {
     ast::UnaryOp op = node.get_op().get_value();
-    node.get_expression()->accept(*this);
-    llvm::Value* value = values.back();
-    values.pop_back();
-
-    if (op == ast::UOP_NEGATION) {
-        values.push_back(ir_builder.CreateFNeg(value));
-    } else if (op == ast::UOP_NOT) {
-        values.push_back(ir_builder.CreateNot(value));
-    } else {
-        throw std::runtime_error("Error: unsupported unary operator\n");
-    }
+    llvm::Value* value = accept_and_get(node.get_expression());
+    ir_builder.create_unary_op(value, op);
 }
 
 void CodegenLLVMVisitor::visit_var_name(const ast::VarName& node) {
-    llvm::Value* ptr = get_variable_ptr(node);
-
-    // Finally, load the variable from the pointer value unless it has already been loaded (e.g. via
-    // gather instruction).
-    llvm::Value* var = ptr->getType()->isPointerTy() ? ir_builder.CreateLoad(ptr) : ptr;
-
-    // If the value should not be vectorised, or it is already a vector, add it to the stack.
-    if (!is_kernel_code || vector_width <= 1 || var->getType()->isVectorTy()) {
-        values.push_back(var);
-        return;
-    }
-
-    // Otherwise, if we are generating vectorised inside the loop, replicate the value to form a
-    // vector of `vector_width`.
-    llvm::Value* vector_var = ir_builder.CreateVectorSplat(vector_width, var);
-    values.push_back(vector_var);
+    llvm::Value* value = read_variable(node);
+    ir_builder.maybe_replicate_value(value);
 }
 
 void CodegenLLVMVisitor::visit_while_statement(const ast::WhileStatement& node) {
     // Get the current and the next blocks within the function.
-    llvm::BasicBlock* curr_block = ir_builder.GetInsertBlock();
+    llvm::BasicBlock* curr_block = ir_builder.get_current_block();
     llvm::BasicBlock* next = curr_block->getNextNode();
     llvm::Function* func = curr_block->getParent();
 
@@ -1019,78 +825,18 @@ void CodegenLLVMVisitor::visit_while_statement(const ast::WhileStatement& node)
     llvm::BasicBlock* body = llvm::BasicBlock::Create(*context, /*Name=*/"", func, next);
     llvm::BasicBlock* exit = llvm::BasicBlock::Create(*context, /*Name=*/"", func, next);
 
-    ir_builder.CreateBr(header);
-    ir_builder.SetInsertPoint(header);
+    ir_builder.create_br_and_set_insertion_point(header);
+
     // Generate code for condition and create branch to the body block.
-    node.get_condition()->accept(*this);
-    llvm::Value* condition = values.back();
-    values.pop_back();
-    ir_builder.CreateCondBr(condition, body, exit);
+    llvm::Value* condition = accept_and_get(node.get_condition());
+    ir_builder.create_cond_br(condition, body, exit);
 
-    ir_builder.SetInsertPoint(body);
+    ir_builder.set_insertion_point(body);
     node.get_statement_block()->accept(*this);
-    ir_builder.CreateBr(header);
+    ir_builder.create_br(header);
 
-    ir_builder.SetInsertPoint(exit);
-}
-
-void CodegenLLVMVisitor::find_kernel_names(std::vector<std::string>& container) {
-    // By convention, only the kernel functions return void type.
-    const auto& functions = module->getFunctionList();
-    for (const auto& func: functions) {
-        if (func.getReturnType()->isVoidTy()) {
-            container.push_back(func.getName().str());
-        }
-    }
-}
-
-void CodegenLLVMVisitor::wrap_kernel_functions() {
-    // First, identify all kernels.
-    std::vector<std::string> kernel_names;
-    find_kernel_names(kernel_names);
-
-    for (const auto& kernel_name: kernel_names) {
-        // Get the kernel function and the instance struct type.
-        auto kernel = module->getFunction(kernel_name);
-        if (!kernel)
-            throw std::runtime_error("Kernel " + kernel_name + " is not found!");
-
-        if (std::distance(kernel->args().begin(), kernel->args().end()) != 1)
-            throw std::runtime_error("Kernel " + kernel_name + " must have a single argument!");
-
-        auto instance_struct_ptr_type = llvm::dyn_cast<llvm::PointerType>(
-            kernel->getArg(0)->getType());
-        if (!instance_struct_ptr_type)
-            throw std::runtime_error("Kernel " + kernel_name +
-                                     " does not have an instance struct pointer argument!");
-
-        // Create a wrapper void function that takes a void pointer as a single argument.
-        llvm::Type* i32_type = llvm::Type::getInt32Ty(*context);
-        llvm::Type* void_ptr_type = llvm::Type::getInt8PtrTy(*context);
-        llvm::Function* wrapper_func = llvm::Function::Create(
-            llvm::FunctionType::get(i32_type, {void_ptr_type}, /*isVarArg=*/false),
-            llvm::Function::ExternalLinkage,
-            "__" + kernel_name + "_wrapper",
-            *module);
-
-        // Optionally, add debug information for the wrapper function.
-        if (add_debug_information) {
-            debug_builder.add_function_debug_info(wrapper_func);
-        }
-
-        llvm::BasicBlock* body = llvm::BasicBlock::Create(*context, /*Name=*/"", wrapper_func);
-        ir_builder.SetInsertPoint(body);
-
-        // Proceed with bitcasting the void pointer to the struct pointer type, calling the kernel
-        // and adding a terminator.
-        llvm::Value* bitcasted = ir_builder.CreateBitCast(wrapper_func->getArg(0),
-                                                          instance_struct_ptr_type);
-        std::vector<llvm::Value*> args;
-        args.push_back(bitcasted);
-        ir_builder.CreateCall(kernel, args);
-        ir_builder.CreateRet(llvm::ConstantInt::get(i32_type, 0));
-    }
+    ir_builder.set_insertion_point(exit);
 }
 
 }  // namespace codegen
diff --git a/src/codegen/llvm/codegen_llvm_visitor.hpp b/src/codegen/llvm/codegen_llvm_visitor.hpp
index 450e1872a4..0ada7b8097 100644
--- a/src/codegen/llvm/codegen_llvm_visitor.hpp
+++ b/src/codegen/llvm/codegen_llvm_visitor.hpp
@@ -20,13 +20,13 @@
 
 #include "codegen/llvm/codegen_llvm_helper_visitor.hpp"
 #include "codegen/llvm/llvm_debug_builder.hpp"
+#include "codegen/llvm/llvm_ir_builder.hpp"
 #include "symtab/symbol_table.hpp"
 #include "utils/logger.hpp"
 #include "visitors/ast_visitor.hpp"
 
 #include "llvm/Analysis/TargetLibraryInfo.h"
 #include "llvm/IR/DIBuilder.h"
-#include "llvm/IR/IRBuilder.h"
 #include "llvm/IR/LLVMContext.h"
 #include "llvm/IR/LegacyPassManager.h"
 #include "llvm/IR/Module.h"
@@ -63,76 +63,50 @@ static const std::map<std::string, llvm::TargetLibraryInfoImpl::VectorLibrary> v
  * \brief %Visitor for transforming NMODL AST to LLVM IR
  */
 class CodegenLLVMVisitor: public visitor::ConstAstVisitor {
-    // Name of mod file (without .mod suffix)
+    /// Name of mod file (without .mod suffix).
     std::string mod_filename;
 
-    // Output directory for code generation
+    /// Output directory for code generation.
     std::string output_dir;
 
   private:
-    InstanceVarHelper instance_var_helper;
-
+    /// Underlying LLVM context.
     std::unique_ptr<llvm::LLVMContext> context = std::make_unique<llvm::LLVMContext>();
 
+    /// Underlying LLVM module.
     std::unique_ptr<llvm::Module> module = std::make_unique<llvm::Module>(mod_filename, *context);
 
-    // LLVM IR builder.
-    llvm::IRBuilder<> ir_builder;
+    /// LLVM IR builder.
+    IRBuilder ir_builder;
 
-    // Debug information builder.
+    /// Debug information builder.
     DebugBuilder debug_builder;
 
-    // Add debug information to the module.
+    /// Add debug information to the module.
     bool add_debug_information;
 
-    // Pass manager for optimisation passes that are used for target code generation.
-    llvm::legacy::FunctionPassManager codegen_pm;
-
-    // Vector library used for maths functions.
-    llvm::TargetLibraryInfoImpl::VectorLibrary vector_library;
-
-    // Pass manager for optimisation passes that are run on IR and are not related to target.
-    llvm::legacy::FunctionPassManager opt_pm;
-
-    // Stack to hold visited values
-    std::vector<llvm::Value*> values;
-
-    // Pointer to the current function.
-    llvm::Function* current_func = nullptr;
-
-    // Pointer to AST symbol table.
+    /// Pointer to AST symbol table.
     symtab::SymbolTable* sym_tab;
 
-    // Run optimisation passes if true.
-    bool opt_passes;
+    /// Instance variable helper.
+    InstanceVarHelper instance_var_helper;
 
-    // Use 32-bit floating-point type if true. Otherwise, use default 64-bit.
-    bool use_single_precision;
+    /// Run optimisation passes if true.
+    bool opt_passes;
 
-    // Explicit vectorisation width.
-    int vector_width;
+    /// Pass manager for optimisation passes that are run on IR and are not related to target.
+    llvm::legacy::FunctionPassManager opt_pm;
 
-    // The name of induction variable used in the kernel functions.
-    std::string kernel_id;
+    /// Pass manager for optimisation passes that are used for target code generation.
+    llvm::legacy::FunctionPassManager codegen_pm;
 
-    // A flag to indicate that the code is generated for the kernel.
-    bool is_kernel_code = false;
+    /// Vector library used for maths functions.
+    llvm::TargetLibraryInfoImpl::VectorLibrary vector_library;
 
-    /**
-     *\brief Run LLVM optimisation passes on generated IR
-     *
-     * LLVM provides a number of optimisation passes that can be run on the generated IR.
-     * Here we run common LLVM passes that benefit code optimisation.
-     */
-    void run_ir_opt_passes();
+    /// Explicit vectorisation width.
+    int vector_width;
 
   public:
-    /**
-     * \brief Constructs the LLVM code generator visitor
-     *
-     * This constructor instantiates an NMODL LLVM code generator. This is
-     * just a template to work with the initial implementation.
-     */
     CodegenLLVMVisitor(const std::string& mod_filename,
                        const std::string& output_dir,
                        bool opt_passes,
@@ -143,202 +117,44 @@ class CodegenLLVMVisitor: public visitor::ConstAstVisitor {
         : mod_filename(mod_filename)
         , output_dir(output_dir)
         , opt_passes(opt_passes)
-        , use_single_precision(use_single_precision)
         , vector_width(vector_width)
         , vector_library(veclib_map.at(vec_lib))
         , add_debug_information(add_debug_information)
-        , ir_builder(*context)
+        , ir_builder(*context, use_single_precision, vector_width)
        , debug_builder(*module)
         , codegen_pm(module.get())
         , opt_pm(module.get()) {}
 
+    /// Dumps the generated LLVM IR module to string.
+    std::string dump_module() const {
+        std::string str;
+        llvm::raw_string_ostream os(str);
+        os << *module;
+        os.flush();
+        return str;
+    }
 
-    /**
-     * Generates LLVM code for the given IndexedName
-     * \param node IndexedName NMODL AST node
-     * \return LLVM code generated for this AST node
-     */
-    llvm::Value* codegen_indexed_name(const ast::IndexedName& node);
-
-    /**
-     * Generates LLVM code for the given Instance variable
-     * \param node CodegenInstanceVar NMODL AST node
-     * \return LLVM code generated for this AST node
-     */
-    llvm::Value* codegen_instance_var(const ast::CodegenInstanceVar& node);
-
-    /**
-     * Returns GEP instruction to 1D array
-     * \param name 1D array name
-     * \param index element index
-     * \return GEP instruction value
-     */
-    llvm::Value* create_gep(const std::string& name, llvm::Value* index);
-
-    /**
-     * Returns array index from given IndexedName
-     * \param node IndexedName representing array
-     * \return array index
-     */
-    llvm::Value* get_array_index(const ast::IndexedName& node);
-
-    /**
-     * Returns array length from given IndexedName
-     * \param node IndexedName representing array
-     * \return array length
-     */
-    int get_array_length(const ast::IndexedName& node);
-
-    /**
-     * Returns LLVM type for the given CodegenVarType node
-     * \param node CodegenVarType
-     * \return LLVM type
-     */
-    llvm::Type* get_codegen_var_type(const ast::CodegenVarType& node);
+    /// Fills the container with the names of kernel functions from the MOD file.
+    void find_kernel_names(std::vector<std::string>& container);
 
-    /**
-     * Returns LLVM vector with `vector_width` int values.
-     * \param int value to replicate
-     * \return LLVM value
-     */
-    llvm::Value* get_constant_int_vector(int value);
-
-    /**
-     * Returns LLVM vector with `vector_width` double values.
-     * \param string a double value to replicate
-     * \return LLVM value
-     */
-    llvm::Value* get_constant_fp_vector(const std::string& value);
-
-    /**
-     * Returns 64-bit or 32-bit LLVM floating type
-     * \return \c LLVM floating point type according to `use_single_precision` flag
-     */
-    llvm::Type* get_default_fp_type();
-
-    /**
-     * Returns pointer to 64-bit or 32-bit LLVM floating type
-     * \return \c LLVM pointer to floating point type according to `use_single_precision` flag
-     */
-    llvm::Type* get_default_fp_ptr_type();
-
-    /**
-     * Returns a pointer to LLVM struct type
-     * \return LLVM pointer type
-     */
-    llvm::Type* get_instance_struct_type();
 
+    /// Returns underlying module.
+    std::unique_ptr<llvm::Module> get_module() {
+        return std::move(module);
+    }
 
-    /**
-     * Returns a LLVM value corresponding to the VarName node
-     * \return LLVM value
-     */
-    llvm::Value* get_variable_ptr(const ast::VarName& node);
-
-    /**
-     * Returns shared_ptr to generated ast::InstanceStruct
-     * \return std::shared_ptr<ast::InstanceStruct>
-     */
-    std::shared_ptr<ast::InstanceStruct> get_instance_struct_ptr();
-
-    /**
-     * Create a function call to an external method
-     * \param name external method name
-     * \param arguments expressions passed as arguments to the given external method
-     */
-    void create_external_method_call(const std::string& name,
-                                     const ast::ExpressionVector& arguments);
-
-    /**
-     * Create a function call to NMODL function or procedure in the same mod file
-     * \param func LLVM function corresponding to this call
-     * \param name function name
-     * \param arguments expressions passed as arguments to the function call
-     */
-    void create_function_call(llvm::Function* func,
-                              const std::string& name,
-                              const ast::ExpressionVector& arguments);
-    /**
-     * Create a function call to printf function
-     * \param arguments expressions passed as arguments to the printf call
-     */
-    void create_printf_call(const ast::ExpressionVector& arguments);
+    /// Returns shared_ptr to generated ast::InstanceStruct.
+    std::shared_ptr<ast::InstanceStruct> get_instance_struct_ptr() {
+        return instance_var_helper.instance;
+    }
 
-    /**
-     * Emit function or procedure declaration in LLVM given the node
-     *
-     * \param node the AST node representing the function or procedure in NMODL
-     */
-    void emit_procedure_or_function_declaration(const ast::CodegenFunction& node);
-
-    /**
-     * Return InstanceVarHelper
-     * \return InstanceVarHelper
-     */
+    /// Returns InstanceVarHelper for the given MOD file.
     InstanceVarHelper get_instance_var_helper() {
         return instance_var_helper;
     }
 
-    /**
-     * Return module pointer
-     * \return LLVM IR module pointer
-     */
-    std::unique_ptr<llvm::Module> get_module() {
-        return std::move(module);
-    }
-
-    /**
-     * Lookup the given name in the current function's symbol table
-     * \return LLVM value
-     */
-    llvm::Value* lookup(const std::string& name);
-
-    /**
-     * Fills values vector with processed NMODL function call arguments
-     * \param arguments expression vector
-     * \param arg_values vector of LLVM IR values to fill
-     */
-    void pack_function_call_arguments(const ast::ExpressionVector& arguments,
-                                      std::vector<llvm::Value*>& arg_values);
-
-    /**
-     * Visit nmodl arithmetic binary operator
-     * \param lhs LLVM value of evaluated lhs expression
-     * \param rhs LLVM value of evaluated rhs expression
-     * \param op the AST binary operator (ADD, DIV, MUL, SUB)
-     * \return LLVM IR value result
-     */
-    llvm::Value* visit_arithmetic_bin_op(llvm::Value* lhs, llvm::Value* rhs, unsigned op);
-
-    /**
-     * Visit nmodl assignment operator (ASSIGN)
-     * \param node the AST node representing the binary expression in NMODL
-     * \param rhs LLVM value of evaluated rhs expression
-     */
-    void visit_assign_op(const ast::BinaryExpression& node, llvm::Value* rhs);
-
-    /**
-     * Visit nmodl logical binary operator
-     * \param lhs LLVM value of evaluated lhs expression
-     * \param rhs LLVM value of evaluated rhs expression
-     * \param op the AST binary operator (AND, OR)
-     * \return LLVM IR value result
-     */
-    llvm::Value* visit_logical_bin_op(llvm::Value* lhs, llvm::Value* rhs, unsigned op);
-
-    /**
-     * Visit nmodl comparison binary operator
-     * \param lhs LLVM value of evaluated lhs expression
-     * \param rhs LLVM value of evaluated rhs expression
-     * \param op the AST binary operator (EXACT_EQUAL, GREATER, GREATER_EQUAL, LESS,
-     *           LESS_EQUAL, NOT_EQUAL)
-     * \return LLVM IR value result
-     */
-    llvm::Value* visit_comparison_bin_op(llvm::Value* lhs, llvm::Value* rhs, unsigned op);
-
-
-    // Visitors
+    // Visitors.
     void visit_binary_expression(const ast::BinaryExpression& node) override;
     void visit_boolean(const ast::Boolean& node) override;
-    void visit_statement_block(const ast::StatementBlock& node) override;
     void visit_codegen_for_statement(const ast::CodegenForStatement& node) override;
     void visit_codegen_function(const ast::CodegenFunction& node) override;
     void visit_codegen_return_statement(const ast::CodegenReturnStatement& node) override;
@@ -350,31 +166,65 @@ class CodegenLLVMVisitor: public visitor::ConstAstVisitor {
     void visit_integer(const ast::Integer& node) override;
     void visit_procedure_block(const ast::ProcedureBlock& node) override;
     void visit_program(const ast::Program& node) override;
+    void visit_statement_block(const ast::StatementBlock& node) override;
     void visit_unary_expression(const ast::UnaryExpression& node) override;
     void visit_var_name(const ast::VarName& node) override;
     void visit_while_statement(const ast::WhileStatement& node) override;
 
-    /**
-     * Dumps the generated LLVM IR module to string.
-     */
-    std::string dump_module() const {
-        std::string str;
-        llvm::raw_string_ostream os(str);
-        os << *module;
-        os.flush();
-        return str;
-    }
+    /// Wraps all kernel function calls into wrapper functions that use `void*` to pass the data to
+    /// the kernel.
+    void wrap_kernel_functions();
 
-    /**
-     * Fills the container with the names of kernel functions from the MOD file.
-     */
-    void find_kernel_names(std::vector<std::string>& container);
+  private:
+    /// Accepts the given AST node and returns the processed value.
+    llvm::Value* accept_and_get(const std::shared_ptr<ast::Node>& node);
 
-    /**
-     * Wraps all kernel function calls into wrapper functions that use void* to pass the data to the
-     * kernel.
-     */
-    void wrap_kernel_functions();
+    /// Creates a call to an external function (e.g. pow, exp, etc.)
+    void create_external_function_call(const std::string& name,
+                                       const ast::ExpressionVector& arguments);
+
+    /// Creates a call to NMODL function or procedure in the same MOD file.
+    void create_function_call(llvm::Function* func,
+                              const std::string& name,
+                              const ast::ExpressionVector& arguments);
+
+    /// Fills values vector with processed NMODL function call arguments.
+    void create_function_call_arguments(const ast::ExpressionVector& arguments,
+                                        ValueVector& arg_values);
+
+    /// Creates the function declaration for the given AST node.
+    void create_function_declaration(const ast::CodegenFunction& node);
+
+    /// Creates a call to `printf` function.
+    void create_printf_call(const ast::ExpressionVector& arguments);
+
+    /// Returns LLVM type for the given CodegenVarType AST node.
+    llvm::Type* get_codegen_var_type(const ast::CodegenVarType& node);
+
+    /// Returns the index value from the IndexedName AST node.
+    llvm::Value* get_index(const ast::IndexedName& node);
+
+    /// Returns an instance struct type.
+    llvm::Type* get_instance_struct_type();
+
+    /// Returns the number of elements in the array specified by the IndexedName AST node.
+    int get_num_elements(const ast::IndexedName& node);
+
+    /// If the value to store is specified, writes it to the instance. Otherwise, returns the
+    /// instance variable.
+    llvm::Value* read_from_or_write_to_instance(const ast::CodegenInstanceVar& node,
+                                                llvm::Value* maybe_value_to_store = nullptr);
+
+    /// Reads the given variable and returns the processed value.
+    llvm::Value* read_variable(const ast::VarName& node);
+
+
+    /// Run multiple LLVM optimisation passes on generated IR.
+    /// TODO: this can be moved to a dedicated file or deprecated.
+    void run_ir_opt_passes();
+
+    /// Writes the value to the given variable.
+    void write_to_variable(const ast::VarName& node, llvm::Value* value);
 };
 
 /** \} */  // end of llvm_backends
diff --git a/src/codegen/llvm/llvm_ir_builder.cpp b/src/codegen/llvm/llvm_ir_builder.cpp
new file mode 100644
index 0000000000..2773e6929b
--- /dev/null
+++ b/src/codegen/llvm/llvm_ir_builder.cpp
@@ -0,0 +1,427 @@
+/*************************************************************************
+ * Copyright (C) 2018-2020 Blue Brain Project
+ *
+ * This file is part of NMODL distributed under the terms of the GNU
+ * Lesser General Public License. See top-level LICENSE file for details.
+ *************************************************************************/
+
+#include "codegen/llvm/llvm_ir_builder.hpp"
+#include "ast/all.hpp"
+
+#include "llvm/ADT/StringSwitch.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/ValueSymbolTable.h"
+
+namespace nmodl {
+namespace codegen {
+
+
+/****************************************************************************************/
+/*                                 LLVM type utilities                                  */
+/****************************************************************************************/
+
+llvm::Type* IRBuilder::get_boolean_type() {
+    return llvm::Type::getInt1Ty(builder.getContext());
+}
+
+llvm::Type* IRBuilder::get_i8_ptr_type() {
+    return llvm::Type::getInt8PtrTy(builder.getContext());
+}
+
+llvm::Type* IRBuilder::get_i32_type() {
+    return llvm::Type::getInt32Ty(builder.getContext());
+}
+
+llvm::Type* IRBuilder::get_i32_ptr_type() {
+    return llvm::Type::getInt32PtrTy(builder.getContext());
+}
+
+llvm::Type* IRBuilder::get_i64_type() {
+    return llvm::Type::getInt64Ty(builder.getContext());
+}
+
+llvm::Type* IRBuilder::get_fp_type() {
+    if (fp_precision == single_precision)
+        return llvm::Type::getFloatTy(builder.getContext());
+    return llvm::Type::getDoubleTy(builder.getContext());
+}
+
+llvm::Type* IRBuilder::get_fp_ptr_type() {
+    if (fp_precision == single_precision)
+        return llvm::Type::getFloatPtrTy(builder.getContext());
+    return llvm::Type::getDoublePtrTy(builder.getContext());
+}
+
+llvm::Type* IRBuilder::get_void_type() {
+    return llvm::Type::getVoidTy(builder.getContext());
+}
+
+llvm::Type* IRBuilder::get_struct_ptr_type(const std::string& struct_type_name,
+                                           TypeVector& member_types) {
+    llvm::StructType* llvm_struct_type = llvm::StructType::create(builder.getContext(),
+                                                                  struct_type_name);
+    llvm_struct_type->setBody(member_types);
+    return llvm::PointerType::get(llvm_struct_type, /*AddressSpace=*/0);
+}
+
+
+/****************************************************************************************/
+/*                                 LLVM value utilities                                 */
+/****************************************************************************************/
+
+llvm::Value* IRBuilder::lookup_value(const std::string& value_name) {
+    auto value = current_function->getValueSymbolTable()->lookup(value_name);
+    if (!value)
+        throw std::runtime_error("Error: variable " + value_name + " is not in the scope\n");
+    return value;
+}
+
+llvm::Value* IRBuilder::pop_last_value() {
+    // Check if the stack is empty.
+    if (value_stack.empty())
+        throw std::runtime_error("Error: popping a value from the empty stack\n");
+
+    // Return the last added value and delete it from the stack.
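    // (The value stack is the hand-off point between the visitor and this builder: every
    // routine that produces a result pushes exactly one llvm::Value* and its consumer pops it.
    // For example, create_i32_constant(0) followed by pop_last_value() yields the i32 zero
    // that wrap_kernel_functions returns from the generated wrapper.)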
+    llvm::Value* last = value_stack.back();
+    value_stack.pop_back();
+    return last;
+}
+
+/****************************************************************************************/
+/*                               LLVM constants utilities                               */
+/****************************************************************************************/
+
+void IRBuilder::create_boolean_constant(int value) {
+    value_stack.push_back(get_vector_constant<llvm::ConstantInt>(get_boolean_type(), value));
+}
+
+void IRBuilder::create_fp_constant(const std::string& value) {
+    if (instruction_width > 1 && vectorize) {
+        value_stack.push_back(get_vector_constant<llvm::ConstantFP>(get_fp_type(), value));
+    } else {
+        value_stack.push_back(get_scalar_constant<llvm::ConstantFP>(get_fp_type(), value));
+    }
+}
+
+llvm::Value* IRBuilder::create_global_string(const ast::String& node) {
+    return builder.CreateGlobalStringPtr(node.get_value());
+}
+
+void IRBuilder::create_i32_constant(int value) {
+    if (instruction_width > 1 && vectorize) {
+        value_stack.push_back(get_vector_constant<llvm::ConstantInt>(get_i32_type(), value));
+    } else {
+        value_stack.push_back(get_scalar_constant<llvm::ConstantInt>(get_i32_type(), value));
+    }
+}
+
+template <typename C, typename V>
+llvm::Value* IRBuilder::get_scalar_constant(llvm::Type* type, V value) {
+    return C::get(type, value);
+}
+
+template <typename C, typename V>
+llvm::Value* IRBuilder::get_vector_constant(llvm::Type* type, V value) {
+    ConstantVector constants;
+    for (unsigned i = 0; i < instruction_width; ++i) {
+        const auto& element = C::get(type, value);
+        constants.push_back(element);
+    }
+    return llvm::ConstantVector::get(constants);
+}
+
+/****************************************************************************************/
+/*                               LLVM function utilities                                */
+/****************************************************************************************/
+
+void IRBuilder::allocate_function_arguments(llvm::Function* function,
+                                            const ast::CodegenVarWithTypeVector& nmodl_arguments) {
+    unsigned i = 0;
+    for (auto& arg: function->args()) {
+        std::string arg_name = nmodl_arguments[i++].get()->get_node_name();
+        llvm::Type* arg_type = arg.getType();
+        llvm::Value* alloca = builder.CreateAlloca(arg_type, /*ArraySize=*/nullptr, arg_name);
+        arg.setName(arg_name);
+        builder.CreateStore(&arg, alloca);
+    }
+}
+
+std::string IRBuilder::get_current_function_name() {
+    return current_function->getName().str();
+}
+
+void IRBuilder::create_function_call(llvm::Function* callee,
+                                     ValueVector& arguments,
+                                     bool use_result) {
+    llvm::Value* call_instruction = builder.CreateCall(callee, arguments);
+    if (use_result)
+        value_stack.push_back(call_instruction);
+}
+
+void IRBuilder::create_intrinsic(const std::string& name,
+                                 ValueVector& argument_values,
+                                 TypeVector& argument_types) {
+    unsigned intrinsic_id = llvm::StringSwitch<llvm::Intrinsic::ID>(name)
+                                .Case("exp", llvm::Intrinsic::exp)
+                                .Case("pow", llvm::Intrinsic::pow)
+                                .Default(llvm::Intrinsic::not_intrinsic);
+    if (intrinsic_id) {
+        llvm::Value* intrinsic =
+            builder.CreateIntrinsic(intrinsic_id, argument_types, argument_values);
+        value_stack.push_back(intrinsic);
+    } else {
+        throw std::runtime_error("Error: calls to " + name + " are not valid or not supported\n");
+    }
+}
+
+/****************************************************************************************/
+/*                              LLVM instruction utilities                              */
+/****************************************************************************************/
+
+void IRBuilder::create_array_alloca(const std::string& name,
+                                    llvm::Type* element_type,
+                                    int num_elements) {
+    llvm::Type* array_type = llvm::ArrayType::get(element_type, num_elements);
+    builder.CreateAlloca(array_type,
/*ArraySize=*/nullptr, name); +} + +void IRBuilder::create_binary_op(llvm::Value* lhs, llvm::Value* rhs, ast::BinaryOp op) { + // Check that both lhs and rhs have the same types. + if (lhs->getType() != rhs->getType()) + throw std::runtime_error( + "Error: lhs and rhs of the binary operator have different types\n"); + + llvm::Value* result; + switch (op) { +#define DISPATCH(binary_op, fp_instruction, integer_instruction) \ + case binary_op: \ + if (lhs->getType()->isIntOrIntVectorTy()) \ + result = integer_instruction(lhs, rhs); \ + else \ + result = fp_instruction(lhs, rhs); \ + break; + + // Arithmetic instructions. + DISPATCH(ast::BinaryOp::BOP_ADDITION, builder.CreateFAdd, builder.CreateAdd); + DISPATCH(ast::BinaryOp::BOP_DIVISION, builder.CreateFDiv, builder.CreateSDiv); + DISPATCH(ast::BinaryOp::BOP_MULTIPLICATION, builder.CreateFMul, builder.CreateMul); + DISPATCH(ast::BinaryOp::BOP_SUBTRACTION, builder.CreateFSub, builder.CreateSub); + + // Comparison instructions. + DISPATCH(ast::BinaryOp::BOP_EXACT_EQUAL, builder.CreateFCmpOEQ, builder.CreateICmpEQ); + DISPATCH(ast::BinaryOp::BOP_GREATER, builder.CreateFCmpOGT, builder.CreateICmpSGT); + DISPATCH(ast::BinaryOp::BOP_GREATER_EQUAL, builder.CreateFCmpOGE, builder.CreateICmpSGE); + DISPATCH(ast::BinaryOp::BOP_LESS, builder.CreateFCmpOLT, builder.CreateICmpSLT); + DISPATCH(ast::BinaryOp::BOP_LESS_EQUAL, builder.CreateFCmpOLE, builder.CreateICmpSLE); + DISPATCH(ast::BinaryOp::BOP_NOT_EQUAL, builder.CreateFCmpONE, builder.CreateICmpNE); + +#undef DISPATCH + + // Logical instructions. + case ast::BinaryOp::BOP_AND: + result = builder.CreateAnd(lhs, rhs); + break; + case ast::BinaryOp::BOP_OR: + result = builder.CreateOr(lhs, rhs); + break; + + default: + throw std::runtime_error("Error: unsupported binary operator\n"); + } + value_stack.push_back(result); +} + +llvm::Value* IRBuilder::create_bitcast(llvm::Value* value, llvm::Type* dst_type) { + return builder.CreateBitCast(value, dst_type); +} + +llvm::Value* IRBuilder::create_inbounds_gep(const std::string& var_name, llvm::Value* index) { + llvm::Value* variable_ptr = lookup_value(var_name); + + // Since we index through the pointer, we need an extra 0 index in the indices list for GEP. + ValueVector indices{llvm::ConstantInt::get(get_i64_type(), 0), index}; + return builder.CreateInBoundsGEP(variable_ptr, indices); +} + +llvm::Value* IRBuilder::create_inbounds_gep(llvm::Value* variable, llvm::Value* index) { + return builder.CreateInBoundsGEP(variable, {index}); +} + +llvm::Value* IRBuilder::create_index(llvm::Value* value) { + // Check if index is a double. While it is possible to use casting from double to integer + // values, we choose not to support these cases. + llvm::Type* value_type = value->getType(); + if (!value_type->isIntOrIntVectorTy()) + throw std::runtime_error("Error: only integer indexing is supported\n"); + + // Conventionally, in LLVM array indices are 64 bit. 
+    llvm::Type* i64_type = get_i64_type();
+    if (auto index_type = llvm::dyn_cast<llvm::IntegerType>(value_type)) {
+        if (index_type->getBitWidth() == i64_type->getIntegerBitWidth())
+            return value;
+        return builder.CreateSExtOrTrunc(value, i64_type);
+    }
+
+    const auto& vector_type = llvm::cast<llvm::FixedVectorType>(value_type);
+    const auto& element_type = llvm::cast<llvm::IntegerType>(vector_type->getElementType());
+    if (element_type->getBitWidth() == i64_type->getIntegerBitWidth())
+        return value;
+    return builder.CreateSExtOrTrunc(value,
+                                     llvm::FixedVectorType::get(i64_type, instruction_width));
+}
+
+llvm::Value* IRBuilder::create_load(const std::string& name) {
+    llvm::Value* ptr = lookup_value(name);
+    llvm::Type* loaded_type = ptr->getType()->getPointerElementType();
+    return builder.CreateLoad(loaded_type, ptr);
+}
+
+llvm::Value* IRBuilder::create_load(llvm::Value* ptr) {
+    llvm::Type* loaded_type = ptr->getType()->getPointerElementType();
+    return builder.CreateLoad(loaded_type, ptr);
+}
+
+llvm::Value* IRBuilder::create_load_from_array(const std::string& name, llvm::Value* index) {
+    llvm::Value* element_ptr = create_inbounds_gep(name, index);
+    return create_load(element_ptr);
+}
+
+void IRBuilder::create_store(const std::string& name, llvm::Value* value) {
+    llvm::Value* ptr = lookup_value(name);
+    builder.CreateStore(value, ptr);
+}
+
+void IRBuilder::create_store(llvm::Value* ptr, llvm::Value* value) {
+    builder.CreateStore(value, ptr);
+}
+
+void IRBuilder::create_store_to_array(const std::string& name,
+                                      llvm::Value* index,
+                                      llvm::Value* value) {
+    llvm::Value* element_ptr = create_inbounds_gep(name, index);
+    create_store(element_ptr, value);
+}
+
+void IRBuilder::create_return(llvm::Value* return_value) {
+    if (return_value)
+        builder.CreateRet(return_value);
+    else
+        builder.CreateRetVoid();
+}
+
+void IRBuilder::create_scalar_or_vector_alloca(const std::string& name,
+                                               llvm::Type* element_or_scalar_type) {
+    // Even if generating vectorised code, some variables still need to be scalar. Particularly,
+    // the induction variable "id" and remainder loop variables (that start with the "epilogue"
+    // prefix).
+    llvm::Type* type;
+    if (instruction_width > 1 && vectorize && name != kernel_id && name.rfind("epilogue", 0)) {
+        type = llvm::FixedVectorType::get(element_or_scalar_type, instruction_width);
+    } else {
+        type = element_or_scalar_type;
+    }
+    builder.CreateAlloca(type, /*ArraySize=*/nullptr, name);
+}
+
+void IRBuilder::create_unary_op(llvm::Value* value, ast::UnaryOp op) {
+    if (op == ast::UOP_NEGATION) {
+        value_stack.push_back(builder.CreateFNeg(value));
+    } else if (op == ast::UOP_NOT) {
+        value_stack.push_back(builder.CreateNot(value));
+    } else {
+        throw std::runtime_error("Error: unsupported unary operator\n");
+    }
+}
+
+llvm::Value* IRBuilder::get_struct_member_ptr(llvm::Value* struct_variable, int member_index) {
+    ValueVector indices;
+    indices.push_back(llvm::ConstantInt::get(get_i32_type(), 0));
+    indices.push_back(llvm::ConstantInt::get(get_i32_type(), member_index));
+    return builder.CreateInBoundsGEP(struct_variable, indices);
+}
+
+llvm::Value* IRBuilder::load_to_or_store_from_array(const std::string& id_name,
+                                                    llvm::Value* id_value,
+                                                    llvm::Value* array,
+                                                    llvm::Value* maybe_value_to_store) {
+    // First, calculate the address of the element in the array.
+    llvm::Value* element_ptr = create_inbounds_gep(array, id_value);
+
+    // If the vector code is generated, we need to distinguish between two cases. If the array is
+    // indexed indirectly (i.e.
not by an induction variable `kernel_id`), create a gather + // instruction. + if (id_name != kernel_id && vectorize && instruction_width > 1) + return builder.CreateMaskedGather(element_ptr, llvm::Align()); + + llvm::Value* ptr; + if (vectorize && instruction_width > 1) { + // If direct indexing is used during the vectorization, we simply bitcast the scalar pointer + // to a vector pointer + llvm::Type* vector_type = llvm::PointerType::get( + llvm::FixedVectorType::get(element_ptr->getType()->getPointerElementType(), + instruction_width), + /*AddressSpace=*/0); + ptr = builder.CreateBitCast(element_ptr, vector_type); + } else { + // Otherwise, scalar code is generated and hence return the element pointer. + ptr = element_ptr; + } + + if (maybe_value_to_store) { + create_store(ptr, maybe_value_to_store); + return nullptr; + } else { + return create_load(ptr); + } +} + +void IRBuilder::maybe_replicate_value(llvm::Value* value) { + // If the value should not be vectorised, or it is already a vector, add it to the stack. + if (!vectorize || instruction_width == 1 || value->getType()->isVectorTy()) { + value_stack.push_back(value); + } else { + // Otherwise, we generate vectorized code inside the loop, so replicate the value to form a + // vector. + llvm::Value* vector_value = builder.CreateVectorSplat(instruction_width, value); + value_stack.push_back(vector_value); + } +} + + +/****************************************************************************************/ +/* LLVM block utilities */ +/****************************************************************************************/ + +llvm::BasicBlock* IRBuilder::create_block_and_set_insertion_point(llvm::Function* function, + llvm::BasicBlock* insert_before, + std::string name) { + llvm::BasicBlock* block = + llvm::BasicBlock::Create(builder.getContext(), name, function, insert_before); + builder.SetInsertPoint(block); + return block; +} + +void IRBuilder::create_br(llvm::BasicBlock* block) { + builder.CreateBr(block); +} + +void IRBuilder::create_br_and_set_insertion_point(llvm::BasicBlock* block) { + builder.CreateBr(block); + builder.SetInsertPoint(block); +} + +void IRBuilder::create_cond_br(llvm::Value* condition, + llvm::BasicBlock* true_block, + llvm::BasicBlock* false_block) { + builder.CreateCondBr(condition, true_block, false_block); +} + +llvm::BasicBlock* IRBuilder::get_current_block() { + return builder.GetInsertBlock(); +} + +void IRBuilder::set_insertion_point(llvm::BasicBlock* block) { + builder.SetInsertPoint(block); +} + +} // namespace codegen +} // namespace nmodl diff --git a/src/codegen/llvm/llvm_ir_builder.hpp b/src/codegen/llvm/llvm_ir_builder.hpp new file mode 100644 index 0000000000..b1b23ff0cf --- /dev/null +++ b/src/codegen/llvm/llvm_ir_builder.hpp @@ -0,0 +1,272 @@ +/************************************************************************* + * Copyright (C) 2018-2020 Blue Brain Project + * + * This file is part of NMODL distributed under the terms of the GNU + * Lesser General Public License. See top-level LICENSE file for details. + *************************************************************************/ + +#pragma once + +#include + +#include "codegen/llvm/codegen_llvm_helper_visitor.hpp" +#include "symtab/symbol_table.hpp" + +#include "llvm/IR/IRBuilder.h" +#include "llvm/IR/LLVMContext.h" + +namespace nmodl { +namespace codegen { + +/// Floating point bit widths. +static constexpr const unsigned single_precision = 32; +static constexpr const unsigned double_precision = 64; + +/// Some typedefs. 
+using ConstantVector = std::vector<llvm::Constant*>;
+using TypeVector = std::vector<llvm::Type*>;
+using ValueVector = std::vector<llvm::Value*>;
+
+/**
+ * \class IRBuilder
+ * \brief A helper class to generate LLVM IR for NMODL AST.
+ */
+class IRBuilder {
+  private:
+    /// Underlying LLVM IR builder.
+    llvm::IRBuilder<> builder;
+
+    /// Stack to hold visited and processed values.
+    ValueVector value_stack;
+
+    /// Pointer to the current function for which the code is generated.
+    llvm::Function* current_function;
+
+    /// Symbol table of the NMODL AST.
+    symtab::SymbolTable* symbol_table;
+
+    /// Flag to indicate that the generated IR should be vectorized.
+    bool vectorize;
+
+    /// Precision of the floating-point numbers (32 or 64 bit).
+    unsigned fp_precision;
+
+    /// If 1, indicates that the scalar code is generated. Otherwise, the current vectorization
+    /// width.
+    unsigned instruction_width;
+
+    /// The vector width used for the vectorized code.
+    unsigned vector_width;
+
+    /// The name of the induction variable used in kernel loops.
+    std::string kernel_id;
+
+  public:
+    IRBuilder(llvm::LLVMContext& context,
+              bool use_single_precision = false,
+              unsigned vector_width = 1)
+        : builder(context)
+        , symbol_table(nullptr)
+        , current_function(nullptr)
+        , vectorize(false)
+        , fp_precision(use_single_precision ? single_precision : double_precision)
+        , vector_width(vector_width)
+        , instruction_width(vector_width)
+        , kernel_id("") {}
+
+    /// Initializes the builder with the symbol table and the kernel induction variable id.
+    void initialize(symtab::SymbolTable& symbol_table, std::string& kernel_id) {
+        this->symbol_table = &symbol_table;
+        this->kernel_id = kernel_id;
+    }
+
+    /// Explicitly sets the builder to produce scalar code (even during vectorization).
+    void generate_scalar_code() {
+        instruction_width = 1;
+    }
+
+    /// Explicitly sets the builder to produce vectorized code.
+    void generate_vectorized_code() {
+        instruction_width = vector_width;
+    }
+
+    /// Turns on vectorization mode.
+    void start_vectorization() {
+        vectorize = true;
+    }
+
+    /// Turns off vectorization mode.
+    void stop_vectorization() {
+        vectorize = false;
+    }
+
+    /// Sets the current function for which LLVM IR is generated.
+    void set_function(llvm::Function* function) {
+        current_function = function;
+    }
+
+    /// Clears the stack of the values and unsets the current function.
+    void clear_function() {
+        value_stack.clear();
+        current_function = nullptr;
+    }
+
+    /// Generates LLVM IR to allocate the arguments of the function on the stack.
+    void allocate_function_arguments(llvm::Function* function,
+                                     const ast::CodegenVarWithTypeVector& nmodl_arguments);
+
+    /// Generates IR for allocating an array.
+    void create_array_alloca(const std::string& name, llvm::Type* element_type, int num_elements);
+
+    /// Generates LLVM IR for the given binary operator.
+    void create_binary_op(llvm::Value* lhs, llvm::Value* rhs, ast::BinaryOp op);
+
+    /// Generates LLVM IR for the bitcast instruction.
+    llvm::Value* create_bitcast(llvm::Value* value, llvm::Type* dst_type);
+
+    /// Create a basic block and set the builder's insertion point to it.
+    llvm::BasicBlock* create_block_and_set_insertion_point(
+        llvm::Function* function,
+        llvm::BasicBlock* insert_before = nullptr,
+        std::string name = "");
+
+    /// Generates LLVM IR for unconditional branch.
+    void create_br(llvm::BasicBlock* block);
+
+    /// Generates LLVM IR for unconditional branch and sets the insertion point to this block.
+ void create_br_and_set_insertion_point(llvm::BasicBlock* block); + + /// Generates LLVM IR for conditional branch. + void create_cond_br(llvm::Value* condition, + llvm::BasicBlock* true_block, + llvm::BasicBlock* false_block); + + /// Generates LLVM IR for the boolean constant. + void create_boolean_constant(int value); + + /// Generates LLVM IR for the floating-point constant. + void create_fp_constant(const std::string& value); + + /// Generates LLVM IR for a call to the function. + void create_function_call(llvm::Function* callee, + ValueVector& arguments, + bool use_result = true); + + /// Generates LLVM IR for the string value. + llvm::Value* create_global_string(const ast::String& node); + + /// Generates LLVM IR to transform the value into an index by possibly sign-extending it. + llvm::Value* create_index(llvm::Value* value); + + /// Generates an intrinsic that corresponds to the given name. + void create_intrinsic(const std::string& name, + ValueVector& argument_values, + TypeVector& argument_types); + + /// Generates LLVM IR for the integer constant. + void create_i32_constant(int value); + + /// Generates LLVM IR to load the value specified by its name and returns it. + llvm::Value* create_load(const std::string& name); + + /// Generates LLVM IR to load the value from the pointer and returns it. + llvm::Value* create_load(llvm::Value* ptr); + + /// Generates LLVM IR to load the element at the specified index from the given array name and + /// returns it. + llvm::Value* create_load_from_array(const std::string& name, llvm::Value* index); + + /// Generates LLVM IR to store the value to the location specified by the name. + void create_store(const std::string& name, llvm::Value* value); + + /// Generates LLVM IR to store the value to the location specified by the pointer. + void create_store(llvm::Value* ptr, llvm::Value* value); + + /// Generates LLVM IR to store the value to the array element, where array is specified by the + /// name. + void create_store_to_array(const std::string& name, llvm::Value* index, llvm::Value* value); + + /// Generates LLVM IR return instructions. + void create_return(llvm::Value* return_value = nullptr); + + /// Generates IR for allocating a scalar or vector variable. + void create_scalar_or_vector_alloca(const std::string& name, + llvm::Type* element_or_scalar_type); + + /// Generates LLVM IR for the given unary operator. + void create_unary_op(llvm::Value* value, ast::UnaryOp op); + + /// Creates a boolean (1-bit integer) type. + llvm::Type* get_boolean_type(); + + /// Returns current basic block. + llvm::BasicBlock* get_current_block(); + + /// Returns the name of the function for which LLVM IR is generated. + std::string get_current_function_name(); + + /// Creates a pointer to 8-bit integer type. + llvm::Type* get_i8_ptr_type(); + + /// Creates a 32-bit integer type. + llvm::Type* get_i32_type(); + + /// Creates a pointer to 32-bit integer type. + llvm::Type* get_i32_ptr_type(); + + /// Creates a 64-bit integer type. + llvm::Type* get_i64_type(); + + /// Creates a floating-point type. + llvm::Type* get_fp_type(); + + /// Creates a pointer to floating-point type. + llvm::Type* get_fp_ptr_type(); + + /// Creates a void type. + llvm::Type* get_void_type(); + + /// Generates LLVM IR to get the address of the struct's member at given index. Returns the + /// calculated value. + llvm::Value* get_struct_member_ptr(llvm::Value* struct_variable, int member_index); + + /// Creates a pointer to struct type with the given name and given members. 
+    llvm::Type* get_struct_ptr_type(const std::string& struct_type_name, TypeVector& member_types);
+
+    /// Generates IR that loads the elements of the array even during vectorization. If the value
+    /// is specified, then it is stored to the array at the given index.
+    llvm::Value* load_to_or_store_from_array(const std::string& id_name,
+                                             llvm::Value* id_value,
+                                             llvm::Value* array,
+                                             llvm::Value* maybe_value_to_store = nullptr);
+
+    /// Looks up the value by its name in the current function's symbol table.
+    llvm::Value* lookup_value(const std::string& value_name);
+
+    /// Generates IR to replicate the value if vectorizing the code.
+    void maybe_replicate_value(llvm::Value* value);
+
+    /// Sets builder's insertion point to the given block.
+    void set_insertion_point(llvm::BasicBlock* block);
+
+    /// Pops the last visited value from the value stack.
+    llvm::Value* pop_last_value();
+
+  private:
+    /// Generates an inbounds GEP instruction for the given name and returns calculated address.
+    llvm::Value* create_inbounds_gep(const std::string& variable_name, llvm::Value* index);
+
+    /// Generates an inbounds GEP instruction for the given value and returns calculated address.
+    llvm::Value* create_inbounds_gep(llvm::Value* variable, llvm::Value* index);
+
+    /// Returns a scalar constant of the provided type.
+    template <typename C, typename V>
+    llvm::Value* get_scalar_constant(llvm::Type* type, V value);
+
+    /// Returns a vector constant of the provided type.
+    template <typename C, typename V>
+    llvm::Value* get_vector_constant(llvm::Type* type, V value);
+};
+
+}  // namespace codegen
+}  // namespace nmodl
From 454a18fe38d4e91e42bb05f9fc3bda5dab9b2b15 Mon Sep 17 00:00:00 2001
From: George Mitenkov
Date: Thu, 13 May 2021 05:46:12 -0700
Subject: [PATCH 055/331] Fixed initialisation of `CodegenAtomicStatement` (#642)

* Fixed CodegenAtomicStatement initialisation
* Removed unused variable and changed comment
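For illustration, the statement shape this fix produces can be sketched outside
NMODL. This is a standalone toy in plain C++ (the string inputs are hypothetical
and these are not the real `ast::` classes used in the diff below):

```cpp
#include <iostream>
#include <string>
#include <vector>

int main() {
    // Hypothetical inputs, mirroring ion_write_statements() in the diff below.
    std::string ion_varname = "ion_cai";
    std::string index_varname = ion_varname + "_id";  // "{}_id"_format(ion_varname)
    std::string op = "+=";
    std::string rhs = "cai";

    std::vector<std::string> body;
    // Load the indirection index first ...
    body.push_back(index_varname + " = " + ion_varname + "_index[id]");
    // ... then write through it. The lhs is now VarName(IndexedName(ion_cai, ion_cai_id))
    // rather than a flat name holding the text "ion_cai[ion_cai_id]".
    body.push_back(ion_varname + "[" + index_varname + "] " + op + " " + rhs);

    for (const auto& statement: body)
        std::cout << statement << "\n";
}
```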
---
 .../llvm/codegen_llvm_helper_visitor.cpp      | 21 ++++++++++++-------
 1 file changed, 14 insertions(+), 7 deletions(-)

diff --git a/src/codegen/llvm/codegen_llvm_helper_visitor.cpp b/src/codegen/llvm/codegen_llvm_helper_visitor.cpp
index c9968df8ee..10aee780ce 100644
--- a/src/codegen/llvm/codegen_llvm_helper_visitor.cpp
+++ b/src/codegen/llvm/codegen_llvm_helper_visitor.cpp
@@ -256,10 +256,18 @@ static void append_statements_from_block(ast::StatementVector& statements,
     }
 }
 
-static std::shared_ptr<ast::CodegenAtomicStatement> create_atomic_statement(std::string& lhs_str,
-                                                                            std::string& op_str,
-                                                                            std::string& rhs_str) {
-    auto lhs = std::make_shared<ast::Name>(new ast::String(lhs_str));
+static std::shared_ptr<ast::CodegenAtomicStatement> create_atomic_statement(
+    std::string& ion_varname,
+    std::string& index_varname,
+    std::string& op_str,
+    std::string& rhs_str) {
+    // create lhs expression
+    auto varname = new ast::Name(new ast::String(ion_varname));
+    auto index = new ast::Name(new ast::String(index_varname));
+    auto lhs = std::make_shared<ast::VarName>(new ast::IndexedName(varname, index),
+                                              /*at=*/nullptr,
+                                              /*index=*/nullptr);
+
     auto op = ast::BinaryOperator(ast::string_to_binaryop(op_str));
     auto rhs = create_expression(rhs_str);
     return std::make_shared<ast::CodegenAtomicStatement>(lhs, op, rhs);
@@ -362,12 +370,11 @@ void CodegenLLVMHelperVisitor::ion_write_statements(BlockType type,
         std::string index_varname = "{}_id"_format(ion_varname);
         // load index
         std::string index_statement = "{} = {}_index[id]"_format(index_varname, ion_varname);
-        // ion variable to write (with index)
-        std::string ion_to_write = "{}[{}]"_format(ion_varname, index_varname);
         // push index definition, index statement and actual write statement
         int_variables.push_back(index_varname);
         index_statements.push_back(visitor::create_statement(index_statement));
-        body_statements.push_back(create_atomic_statement(ion_to_write, op, rhs));
+        // pass ion variable to write and its index
+        body_statements.push_back(create_atomic_statement(ion_varname, index_varname, op, rhs));
     };
 
     /// iterate over all ions and create write ion statements for given block type
From 53727df5b76fd3c3e763adbb373122d1cd61c7cc Mon Sep 17 00:00:00 2001
From: Pramod Kumbhar
Date: Thu, 13 May 2021 15:25:24 +0200
Subject: [PATCH 056/331] Fix instance struct data generation for testing/benchmarking (#641)

* Instance data structure initialization had the following bug
  - instance struct has int member variables which act as offsets
    to other vectors (e.g. node_index, na_ion_index)
  - these variables were initialized from 1 to N, where N was always
    incremented without considering the upper bound of the offset.

* With this fix
  - index / integer variables are always initialized from 0 to N-1.
  - Variables are initialised with 1e-5 precision so that we have
    reasonably bigger values.
  - Update tests to check offsets from 0 to N-1.
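A self-contained sketch of this scheme (mirroring `generate_dummy_data` from the
diff below; the helper name and printout here are illustrative only) shows why
the bounds work out: integer offsets stay within [0, N-1], and a 1e-5 increment
over a million doubles spans only ten units above the base value:

```cpp
#include <cstddef>
#include <cstdio>
#include <type_traits>
#include <vector>

template <typename T>
std::vector<T> dummy_data(std::size_t initial_value, std::size_t num_elements) {
    std::vector<T> data(num_elements);
    T increment;
    if (std::is_same<T, int>::value) {
        increment = 1;
    } else {
        increment = 1e-5;
    }
    for (std::size_t i = 0; i < num_elements; i++) {
        data[i] = initial_value + increment * i;
    }
    return data;
}

int main() {
    // int offsets: 0, 1, ..., N-1 -- always valid indices into other vectors.
    auto offsets = dummy_data<int>(0, 5);
    // doubles: 1e6 elements * 1e-5 increment -> values in [3.0, 13.0).
    auto values = dummy_data<double>(3, 1000000);
    std::printf("offsets.back() = %d, values.back() = %f\n", offsets.back(), values.back());
}
```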
---
 test/unit/codegen/codegen_data_helper.cpp     |  9 +++++++-
 test/unit/codegen/codegen_data_helper.hpp     | 23 +++++++++----------
 .../codegen/codegen_llvm_instance_struct.cpp  |  9 +++++---
 3 files changed, 25 insertions(+), 16 deletions(-)

diff --git a/test/unit/codegen/codegen_data_helper.cpp b/test/unit/codegen/codegen_data_helper.cpp
index 4bf94f583d..a0ee6ec957 100644
--- a/test/unit/codegen/codegen_data_helper.cpp
+++ b/test/unit/codegen/codegen_data_helper.cpp
@@ -115,7 +115,14 @@ CodegenInstanceData CodegenDataHelper::create_data(size_t num_elements, size_t s
     // allocate memory and setup a pointer
     void* member;
     posix_memalign(&member, NBYTE_ALIGNMENT, member_size * num_elements);
-    initialize_variable(var, member, variable_index, num_elements);
+
+    // integer values are often offsets so they must start from
+    // 0 to num_elements-1 to avoid out of bound accesses.
+    int initial_value = variable_index;
+    if (type == ast::AstNodeType::INTEGER) {
+        initial_value = 0;
+    }
+    initialize_variable(var, member, initial_value, num_elements);
     data.num_bytes += member_size * num_elements;
 
     // copy address at specific location in the struct
diff --git a/test/unit/codegen/codegen_data_helper.hpp b/test/unit/codegen/codegen_data_helper.hpp
index ef8e869366..76c4f422d9 100644
--- a/test/unit/codegen/codegen_data_helper.hpp
+++ b/test/unit/codegen/codegen_data_helper.hpp
@@ -57,11 +57,12 @@ struct CodegenInstanceData {
 /**
  * Generate vector of dummy data according to the template type specified
  *
- * For double type: generate vector starting from (initial_value + 1e-15)
- * with increments of 1e-15
- * For float type: generate vector starting from (initial_value + 1e-6)
- * with increments of 1e-6
- * For int type: generate vector starting from (initial_value + 1) with
+ * For double or float type: generate vector starting from `initial_value`
+ *   with an increment of 1e-5. The increment can be any other
+ *   value but 1e-5 is chosen because when we benchmark with
+ *   a million elements then the values are in the range of
+ *   <initial_value, initial_value + 10>.
+ * For int type: generate vector starting from initial_value with an
+ *   increment of 1
  *
 * \param initial_value Base value for initializing the data
 *
@@ -71,16 +72,14 @@
 template <typename T>
 std::vector<T> generate_dummy_data(size_t initial_value, size_t num_elements) {
     std::vector<T> data(num_elements);
-    T precision;
-    if (std::is_same<T, double>::value) {
-        precision = 1e-15;
-    } else if (std::is_same<T, float>::value) {
-        precision = 1e-6;
+    T increment;
+    if (std::is_same<T, int>::value) {
+        increment = 1;
     } else {
-        precision = 1;
+        increment = 1e-5;
     }
     for (size_t i = 0; i < num_elements; i++) {
-        data[i] = initial_value + precision * (i + 1);
+        data[i] = initial_value + increment * i;
     }
     return data;
 }
diff --git a/test/unit/codegen/codegen_llvm_instance_struct.cpp b/test/unit/codegen/codegen_llvm_instance_struct.cpp
index 52b9bb9868..e77b6844ae 100644
--- a/test/unit/codegen/codegen_llvm_instance_struct.cpp
+++ b/test/unit/codegen/codegen_llvm_instance_struct.cpp
@@ -132,8 +132,12 @@ SCENARIO("Instance Struct creation", "[visitor][llvm][instance_struct]") {
                         generate_dummy_data<double>(ena_index, num_elements)));
         REQUIRE(compare(instance_data.members[ion_ena_index],
                         generate_dummy_data<double>(ion_ena_index, num_elements)));
+        // index variables are offsets, they start from 0
+        REQUIRE(compare(instance_data.members[ion_ena_index_index],
+                        generate_dummy_data<int>(0, num_elements)));
         REQUIRE(compare(instance_data.members[node_index_index],
-                        generate_dummy_data<int>(node_index_index, num_elements)));
+                        generate_dummy_data<int>(0, num_elements)));
+
         REQUIRE(*static_cast<double*>(instance_data.members[t_index]) == default_nthread_t_value);
         REQUIRE(*static_cast<int*>(instance_data.members[node_count_index]) == num_elements);
@@ -164,8 +168,7 @@ SCENARIO("Instance Struct creation", "[visitor][llvm][instance_struct]") {
         REQUIRE(compare(instance->ena, generate_dummy_data<double>(ena_index, num_elements)));
         REQUIRE(compare(instance->ion_ena,
                         generate_dummy_data<double>(ion_ena_index, num_elements)));
-        REQUIRE(compare(instance->node_index,
-                        generate_dummy_data<int>(node_index_index, num_elements)));
+        REQUIRE(compare(instance->node_index, generate_dummy_data<int>(0, num_elements)));
         REQUIRE(instance->t == default_nthread_t_value);
         REQUIRE(instance->celsius == default_celsius_value);
         REQUIRE(instance->secondorder == default_second_order_value);
From 23100e92493ac47d5197fae5ceb10ecf26f7b979 Mon Sep 17 00:00:00 2001
From: George Mitenkov
Date: Thu, 13 May 2021 13:14:13 -0700
Subject: [PATCH 057/331] Basic scatter support (#643)

Added basic support to transform indirect writes into the
`llvm.masked.scatter` intrinsic. Currently, the scatter functionality is
limited to non-atomic writes and assignment (e.g. the `+=` operator is not
yet supported), so a warning indicating these limitations is logged to the
console. Corresponding IR and execution tests were also added.

fixes #539
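A minimal standalone sketch of the two builder calls this support relies on
(assuming the LLVM 12-era C++ API, where `CreateMaskedGather`/`CreateMaskedScatter`
take the pointer vector directly and a null mask means all-true; newer LLVM
versions also require the element type):

```cpp
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/LLVMContext.h"
#include "llvm/IR/Module.h"
#include "llvm/Support/raw_ostream.h"

int main() {
    llvm::LLVMContext ctx;
    llvm::Module m("scatter_sketch", ctx);
    llvm::IRBuilder<> b(ctx);

    // void f(<4 x double*> %ptrs, <4 x double> %vals)
    llvm::Type* f64 = llvm::Type::getDoubleTy(ctx);
    llvm::Type* ptrs_ty = llvm::FixedVectorType::get(f64->getPointerTo(), 4);
    llvm::Type* vals_ty = llvm::FixedVectorType::get(f64, 4);
    auto* fn_ty =
        llvm::FunctionType::get(llvm::Type::getVoidTy(ctx), {ptrs_ty, vals_ty}, false);
    auto* fn = llvm::Function::Create(fn_ty, llvm::Function::ExternalLinkage, "f", &m);
    b.SetInsertPoint(llvm::BasicBlock::Create(ctx, "entry", fn));

    // An indirect write becomes llvm.masked.scatter, an indirect read
    // llvm.masked.gather; with no mask given, an all-true mask is emitted.
    b.CreateMaskedScatter(fn->getArg(1), fn->getArg(0), llvm::Align(1));
    b.CreateMaskedGather(fn->getArg(0), llvm::Align(1));
    b.CreateRetVoid();

    m.print(llvm::outs(), nullptr);
}
```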
---
 src/codegen/llvm/codegen_llvm_visitor.cpp    | 42 +++++++++++--
 src/codegen/llvm/codegen_llvm_visitor.hpp    |  1 +
 src/codegen/llvm/llvm_ir_builder.cpp         |  7 +-
 test/unit/codegen/codegen_llvm_execution.cpp | 76 ++++++++++++++++++++
 test/unit/codegen/codegen_llvm_ir.cpp        | 47 ++++++++++++
 5 files changed, 165 insertions(+), 8 deletions(-)

diff --git a/src/codegen/llvm/codegen_llvm_visitor.cpp b/src/codegen/llvm/codegen_llvm_visitor.cpp
index a86a5cd8b5..39594169f4 100644
--- a/src/codegen/llvm/codegen_llvm_visitor.cpp
+++ b/src/codegen/llvm/codegen_llvm_visitor.cpp
@@ -37,9 +37,10 @@ static constexpr const char instance_struct_type_name[] = "__instance_var__type"
 
 /// A utility to check for supported Statement AST nodes.
 static bool is_supported_statement(const ast::Statement& statement) {
-    return statement.is_codegen_var_list_statement() || statement.is_expression_statement() ||
-           statement.is_codegen_for_statement() || statement.is_codegen_return_statement() ||
-           statement.is_if_statement() || statement.is_while_statement();
+    return statement.is_codegen_atomic_statement() || statement.is_codegen_for_statement() ||
+           statement.is_if_statement() || statement.is_codegen_return_statement() ||
+           statement.is_codegen_var_list_statement() || statement.is_expression_statement() ||
+           statement.is_while_statement();
 }
 
 /// A utility to check that the kernel body can be vectorised.
@@ -162,10 +163,12 @@ void CodegenLLVMVisitor::create_printf_call(const ast::ExpressionVector& argumen
 }
 
 void CodegenLLVMVisitor::find_kernel_names(std::vector<std::string>& container) {
-    // By convention, only kernel functions have a return type of void.
+    // By convention, only kernel functions have a return type of void and a single argument. The
+    // number of arguments check is needed to avoid LLVM void intrinsics being considered as
+    // kernels.
     const auto& functions = module->getFunctionList();
     for (const auto& func: functions) {
-        if (func.getReturnType()->isVoidTy()) {
+        if (func.getReturnType()->isVoidTy() && llvm::hasSingleElement(func.args())) {
             container.push_back(func.getName().str());
         }
     }
@@ -366,7 +369,7 @@ void CodegenLLVMVisitor::wrap_kernel_functions() {
         if (!kernel)
             throw std::runtime_error("Error: kernel " + kernel_name + " is not found\n");
 
-        if (std::distance(kernel->args().begin(), kernel->args().end()) != 1)
+        if (!llvm::hasSingleElement(kernel->args()))
             throw std::runtime_error("Error: kernel " + kernel_name +
                                      " must have a single argument\n");
 
@@ -443,6 +446,33 @@ void CodegenLLVMVisitor::visit_boolean(const ast::Boolean& node) {
     ir_builder.create_boolean_constant(node.get_value());
 }
 
+/**
+ * Currently, this function is very similar to visiting the binary operator. However, the
+ * difference here is that the writes to the LHS variable must be atomic. This has a particular
+ * use case in synapse kernels. For simplicity, we choose not to support atomic writes at this
+ * stage and emit a warning.
+ *
+ * \todo support this properly.
+ */
+void CodegenLLVMVisitor::visit_codegen_atomic_statement(const ast::CodegenAtomicStatement& node) {
+    if (vector_width > 1)
+        logger->warn("Atomic operations are not supported");
+
+    // Support only assignment for now.
+    llvm::Value* rhs = accept_and_get(node.get_rhs());
+    if (node.get_atomic_op().get_value() != ast::BinaryOp::BOP_ASSIGN)
+        throw std::runtime_error(
+            "Error: only assignment is supported for CodegenAtomicStatement\n");
+    const auto& var = dynamic_cast<const ast::VarName*>(node.get_lhs().get());
+    if (!var)
+        throw std::runtime_error("Error: only 'VarName' assignment is supported\n");
+
+    // Process the assignment as if it was non-atomic.
+    if (vector_width > 1)
+        logger->warn("Treating write as non-atomic");
+    write_to_variable(*var, rhs);
+}
+
 // Generating FOR loop in LLVM IR creates the following structure:
 //
 // +---------------------------+
diff --git a/src/codegen/llvm/codegen_llvm_visitor.hpp b/src/codegen/llvm/codegen_llvm_visitor.hpp
index 0ada7b8097..14a608d3ca 100644
--- a/src/codegen/llvm/codegen_llvm_visitor.hpp
+++ b/src/codegen/llvm/codegen_llvm_visitor.hpp
@@ -155,6 +155,7 @@ class CodegenLLVMVisitor: public visitor::ConstAstVisitor {
     // Visitors.
void visit_binary_expression(const ast::BinaryExpression& node) override; void visit_boolean(const ast::Boolean& node) override; + void visit_codegen_atomic_statement(const ast::CodegenAtomicStatement& node) override; void visit_codegen_for_statement(const ast::CodegenForStatement& node) override; void visit_codegen_function(const ast::CodegenFunction& node) override; void visit_codegen_return_statement(const ast::CodegenReturnStatement& node) override; diff --git a/src/codegen/llvm/llvm_ir_builder.cpp b/src/codegen/llvm/llvm_ir_builder.cpp index 2773e6929b..04e36e50cd 100644 --- a/src/codegen/llvm/llvm_ir_builder.cpp +++ b/src/codegen/llvm/llvm_ir_builder.cpp @@ -349,8 +349,11 @@ llvm::Value* IRBuilder::load_to_or_store_from_array(const std::string& id_name, // If the vector code is generated, we need to distinguish between two cases. If the array is // indexed indirectly (i.e. not by an induction variable `kernel_id`), create a gather // instruction. - if (id_name != kernel_id && vectorize && instruction_width > 1) - return builder.CreateMaskedGather(element_ptr, llvm::Align()); + if (id_name != kernel_id && vectorize && instruction_width > 1) { + return maybe_value_to_store + ? builder.CreateMaskedScatter(maybe_value_to_store, element_ptr, llvm::Align()) + : builder.CreateMaskedGather(element_ptr, llvm::Align()); + } llvm::Value* ptr; if (vectorize && instruction_width > 1) { diff --git a/test/unit/codegen/codegen_llvm_execution.cpp b/test/unit/codegen/codegen_llvm_execution.cpp index cec4e5017b..296417c5f3 100644 --- a/test/unit/codegen/codegen_llvm_execution.cpp +++ b/test/unit/codegen/codegen_llvm_execution.cpp @@ -432,3 +432,79 @@ SCENARIO("Simple vectorised kernel", "[llvm][runner]") { } } } + +//============================================================================= +// Vectorised kernel with ion writes. +//============================================================================= + +SCENARIO("Vectorised kernel with scatter instruction", "[llvm][runner]") { + GIVEN("Simple MOD file with ion writes") { + std::string nmodl_text = R"( + NEURON { + SUFFIX test + USEION ca WRITE cai + } + + BREAKPOINT { + SOLVE states METHOD cnexp + } + + DERIVATIVE states { + : increment cai to test scatter + cai = cai + 1 + } + )"; + + + NmodlDriver driver; + const auto& ast = driver.parse_string(nmodl_text); + + // Run passes on the AST to generate LLVM. + SymtabVisitor().visit_program(*ast); + NeuronSolveVisitor().visit_program(*ast); + SolveBlockVisitor().visit_program(*ast); + codegen::CodegenLLVMVisitor llvm_visitor(/*mod_filename=*/"unknown", + /*output_dir=*/".", + /*opt_passes=*/false, + /*use_single_precision=*/false, + /*vector_width=*/2); + llvm_visitor.visit_program(*ast); + llvm_visitor.wrap_kernel_functions(); + + // Create the instance struct data. + int num_elements = 5; + const auto& generated_instance_struct = llvm_visitor.get_instance_struct_ptr(); + auto codegen_data = codegen::CodegenDataHelper(ast, generated_instance_struct); + auto instance_data = codegen_data.create_data(num_elements, /*seed=*/1); + + // Fill the instance struct data with some values. 
+        std::vector<double> cai = {1.0, 2.0, 3.0, 4.0, 5.0};
+        std::vector<double> ion_cai = {1.0, 2.0, 3.0, 4.0, 5.0};
+        std::vector<int> ion_cai_index = {4, 2, 3, 0, 1};
+
+        InstanceTestInfo instance_info{&instance_data,
+                                       llvm_visitor.get_instance_var_helper(),
+                                       num_elements};
+        initialise_instance_variable(instance_info, cai, "cai");
+        initialise_instance_variable(instance_info, ion_cai, "ion_cai");
+        initialise_instance_variable(instance_info, ion_cai_index, "ion_cai_index");
+
+        // Set up the JIT runner.
+        std::unique_ptr<llvm::Module> module = llvm_visitor.get_module();
+        TestRunner runner(std::move(module));
+        runner.initialize_driver();
+
+        THEN("Ion values in struct have been updated correctly") {
+            runner.run_with_argument<int, void*>("__nrn_state_test_wrapper",
+                                                 instance_data.base_ptr);
+            // cai[id] = ion_cai[ion_cai_index[id]]
+            // cai[id] += 1
+            std::vector<double> cai_expected = {6.0, 4.0, 5.0, 2.0, 3.0};
+            REQUIRE(check_instance_variable(instance_info, cai_expected, "cai"));
+
+            // ion_cai[ion_cai_index[id]] = cai[id]
+            std::vector<double> ion_cai_expected = {2.0, 3.0, 4.0, 5.0, 6.0};
+            REQUIRE(check_instance_variable(instance_info, ion_cai_expected, "ion_cai"));
+        }
+    }
+}
diff --git a/test/unit/codegen/codegen_llvm_ir.cpp b/test/unit/codegen/codegen_llvm_ir.cpp
index 93fd269b8e..11f2faf99b 100644
--- a/test/unit/codegen/codegen_llvm_ir.cpp
+++ b/test/unit/codegen/codegen_llvm_ir.cpp
@@ -956,6 +956,53 @@ SCENARIO("Vectorised simple kernel", "[visitor][llvm]") {
     }
 }
 
+//=============================================================================
+// Scatter for vectorised kernel
+//=============================================================================
+
+SCENARIO("Vectorised simple kernel with ion writes", "[visitor][llvm]") {
+    GIVEN("An indirect indexing of ca ion") {
+        std::string nmodl_text = R"(
+            NEURON {
+                SUFFIX hh
+                USEION ca WRITE cai
+            }
+
+            BREAKPOINT {
+                SOLVE states METHOD cnexp
+            }
+
+            DERIVATIVE states {}
+        )";
+
+        THEN("a scatter instruction is created") {
+            std::string module_string = run_llvm_visitor(nmodl_text,
+                                                         /*opt=*/false,
+                                                         /*use_single_precision=*/false,
+                                                         /*vector_width=*/4);
+            std::smatch m;
+
+            // Check scatter intrinsic is correctly declared.
+            std::regex declaration(
+                R"(declare void @llvm\.masked\.scatter\.v4f64\.v4p0f64\(<4 x double>, <4 x double\*>, i32 immarg, <4 x i1>\))");
+            REQUIRE(std::regex_search(module_string, m, declaration));
+
+            // Check that the indices vector is created correctly and extended to i64.
+            std::regex index_load(R"(load <4 x i32>, <4 x i32>\* %ion_cai_id)");
+            std::regex sext(R"(sext <4 x i32> %.* to <4 x i64>)");
+            REQUIRE(std::regex_search(module_string, m, index_load));
+            REQUIRE(std::regex_search(module_string, m, sext));
+
+            // Check that store to `ion_cai` is performed via scatter instruction.
+            // ion_cai[ion_cai_id] = cai[id]
+            std::regex scatter(
+                "call void @llvm\\.masked\\.scatter\\.v4f64\\.v4p0f64\\(<4 x double> %.*, <4 x "
+                "double\\*> %.*, i32 1, <4 x i1> <i1 true, i1 true, i1 true, i1 true>\\)");
+            REQUIRE(std::regex_search(module_string, m, scatter));
+        }
+    }
+}
+
 //=============================================================================
 // Derivative block : test optimization
 //=============================================================================
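The expected vectors in the execution test of the patch above can be verified by
replaying the kernel's data flow on the host. A standalone sketch in plain C++,
independent of the JIT runner (since `ion_cai_index` is a permutation here, the
iteration order does not matter):

```cpp
#include <cstddef>
#include <cstdio>
#include <vector>

int main() {
    std::vector<double> cai = {1.0, 2.0, 3.0, 4.0, 5.0};
    std::vector<double> ion_cai = {1.0, 2.0, 3.0, 4.0, 5.0};
    std::vector<int> ion_cai_index = {4, 2, 3, 0, 1};

    for (std::size_t id = 0; id < cai.size(); ++id) {
        cai[id] = ion_cai[ion_cai_index[id]];  // gather
        cai[id] += 1.0;                        // cai = cai + 1
        ion_cai[ion_cai_index[id]] = cai[id];  // scatter
    }

    // Prints 6 4 5 2 3 / 2 3 4 5 6, matching cai_expected and ion_cai_expected.
    for (double v: cai)
        std::printf("%g ", v);
    std::printf("\n");
    for (double v: ion_cai)
        std::printf("%g ", v);
    std::printf("\n");
}
```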
From bd054798ae120ad4202648aa50e5f8f05918ca7f Mon Sep 17 00:00:00 2001
From: Pramod Kumbhar
Date: Sun, 16 May 2021 18:07:01 +0200
Subject: [PATCH 058/331] Benchmarking code re-organisation and minor improvements (#647)

* Move benchmark + JIT related code from src/codegen/llvm to test/benchmark
* Common execution of CodegenLLVMVisitor for the llvm --ir and benchmark
  options. With this, the AST transformed for LLVM code generation is dumped
  to a file.
* The previous object file is removed (if it exists) so that the output file
  name is the same / deterministic
* Benchmark output is always printed to stdout via the common logger object
* Remove unnecessary LLVMBuildInfo struct
---
 CMakeLists.txt                                |  1 +
 src/CMakeLists.txt                            |  3 +-
 src/codegen/llvm/CMakeLists.txt               | 10 +--
 src/codegen/llvm/codegen_llvm_visitor.hpp     |  5 ++
 src/codegen/llvm/main.cpp                     |  2 +-
 src/main.cpp                                  | 39 +++++-----
 test/benchmark/CMakeLists.txt                 | 17 +++++
 .../llvm => test/benchmark}/jit_driver.cpp    |  8 ++
 .../llvm => test/benchmark}/jit_driver.hpp    |  0
 .../benchmark}/llvm_benchmark.cpp             | 73 +++++--------------
 .../benchmark}/llvm_benchmark.hpp             | 33 +++------
 test/unit/CMakeLists.txt                      |  5 +-
 test/unit/codegen/codegen_llvm_execution.cpp  |  2 +-
 13 files changed, 85 insertions(+), 113 deletions(-)
 create mode 100644 test/benchmark/CMakeLists.txt
 rename {src/codegen/llvm => test/benchmark}/jit_driver.cpp (97%)
 rename {src/codegen/llvm => test/benchmark}/jit_driver.hpp (100%)
 rename {src/codegen/llvm => test/benchmark}/llvm_benchmark.cpp (61%)
 rename {src/codegen/llvm => test/benchmark}/llvm_benchmark.hpp (76%)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 86da8140b0..dd11e2be8d 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -188,6 +188,7 @@ set(MEMORYCHECK_COMMAND_OPTIONS
 # do not enable tests if nmodl is used as submodule
 if(NOT NMODL_AS_SUBPROJECT)
   include(CTest)
+  add_subdirectory(test/benchmark)
   add_subdirectory(test/unit)
 endif()
 
diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt
index bda007c3a0..e4da0b713c 100644
--- a/src/CMakeLists.txt
+++ b/src/CMakeLists.txt
@@ -28,9 +28,8 @@ target_link_libraries(
   util
   lexer
   ${NMODL_WRAPPER_LIBS})
-
 if(NMODL_ENABLE_LLVM)
-  target_link_libraries(nmodl llvm_codegen llvm_benchmark ${LLVM_LIBS_TO_LINK})
+  target_link_libraries(nmodl llvm_codegen llvm_benchmark benchmark_data ${LLVM_LIBS_TO_LINK})
 endif()
 
 # =============================================================================
diff --git a/src/codegen/llvm/CMakeLists.txt b/src/codegen/llvm/CMakeLists.txt
index 5ebf9c7acd..b927475f15 100644
--- a/src/codegen/llvm/CMakeLists.txt
+++ b/src/codegen/llvm/CMakeLists.txt
@@ -6,10 +6,6 @@ set(LLVM_CODEGEN_SOURCE_FILES
     ${CMAKE_CURRENT_SOURCE_DIR}/codegen_llvm_visitor.hpp
     ${CMAKE_CURRENT_SOURCE_DIR}/codegen_llvm_helper_visitor.cpp
     ${CMAKE_CURRENT_SOURCE_DIR}/codegen_llvm_helper_visitor.hpp
-    ${CMAKE_CURRENT_SOURCE_DIR}/jit_driver.cpp
-    ${CMAKE_CURRENT_SOURCE_DIR}/jit_driver.hpp
-    ${CMAKE_CURRENT_SOURCE_DIR}/llvm_benchmark.cpp
-    ${CMAKE_CURRENT_SOURCE_DIR}/llvm_benchmark.hpp
     ${CMAKE_CURRENT_SOURCE_DIR}/llvm_debug_builder.cpp
    ${CMAKE_CURRENT_SOURCE_DIR}/llvm_debug_builder.hpp
${CMAKE_CURRENT_SOURCE_DIR}/llvm_ir_builder.cpp @@ -24,10 +20,6 @@ add_library(runner_obj OBJECT ${LLVM_CODEGEN_SOURCE_FILES}) add_dependencies(runner_obj lexer_obj) set_property(TARGET runner_obj PROPERTY POSITION_INDEPENDENT_CODE ON) -if(NMODL_ENABLE_JIT_EVENT_LISTENERS) - target_compile_definitions(runner_obj PUBLIC NMODL_HAVE_JIT_EVENT_LISTENERS) -endif() - add_library(llvm_codegen STATIC $) add_dependencies(llvm_codegen lexer util visitor) @@ -36,9 +28,9 @@ if(NOT NMODL_AS_SUBPROJECT) target_link_libraries( nmodl_llvm_runner + llvm_benchmark llvm_codegen codegen - llvm_benchmark visitor symtab lexer diff --git a/src/codegen/llvm/codegen_llvm_visitor.hpp b/src/codegen/llvm/codegen_llvm_visitor.hpp index 14a608d3ca..990485d8e2 100644 --- a/src/codegen/llvm/codegen_llvm_visitor.hpp +++ b/src/codegen/llvm/codegen_llvm_visitor.hpp @@ -152,6 +152,11 @@ class CodegenLLVMVisitor: public visitor::ConstAstVisitor { return instance_var_helper; } + /// Returns vector width + int get_vector_width() const { + return vector_width; + } + // Visitors. void visit_binary_expression(const ast::BinaryExpression& node) override; void visit_boolean(const ast::Boolean& node) override; diff --git a/src/codegen/llvm/main.cpp b/src/codegen/llvm/main.cpp index b700f5ad59..2f4e1f653d 100644 --- a/src/codegen/llvm/main.cpp +++ b/src/codegen/llvm/main.cpp @@ -9,8 +9,8 @@ #include "ast/program.hpp" #include "codegen/llvm/codegen_llvm_visitor.hpp" -#include "jit_driver.hpp" #include "parser/nmodl_driver.hpp" +#include "test/benchmark/jit_driver.hpp" #include "utils/logger.hpp" #include "visitors/symtab_visitor.hpp" diff --git a/src/main.cpp b/src/main.cpp index da3cb9d7dd..ee781444c8 100644 --- a/src/main.cpp +++ b/src/main.cpp @@ -20,7 +20,7 @@ #ifdef NMODL_LLVM_BACKEND #include "codegen/llvm/codegen_llvm_visitor.hpp" -#include "codegen/llvm/llvm_benchmark.hpp" +#include "test/benchmark/llvm_benchmark.hpp" #endif #include "config/config.h" @@ -320,7 +320,7 @@ int main(int argc, const char* argv[]) { "Disable debug information ({})"_format(disable_debug_information))->ignore_case(); llvm_opt->add_flag("--opt", llvm_ir_opt_passes, - "Run LLVM optimisation passes ({})"_format(llvm_ir_opt_passes))->ignore_case(); + "Run few common LLVM IR optimisation passes ({})"_format(llvm_ir_opt_passes))->ignore_case(); llvm_opt->add_flag("--single-precision", llvm_float_type, "Use single precision floating-point types ({})"_format(llvm_float_type))->ignore_case(); @@ -651,26 +651,7 @@ int main(int argc, const char* argv[]) { } #ifdef NMODL_LLVM_BACKEND - - if (run_llvm_benchmark) { - logger->info("Running LLVM benchmark"); - benchmark::LLVMBuildInfo info{llvm_vec_width, - llvm_ir_opt_passes, - llvm_float_type, - vector_library}; - benchmark::LLVMBenchmark benchmark(modfile, - output_dir, - shared_lib_paths, - info, - num_experiments, - instance_size, - backend, - llvm_opt_level_ir, - llvm_opt_level_codegen); - benchmark.run(ast); - } - - else if (llvm_ir) { + if (llvm_ir || run_llvm_benchmark) { logger->info("Running LLVM backend code generator"); CodegenLLVMVisitor visitor(modfile, output_dir, @@ -682,6 +663,20 @@ int main(int argc, const char* argv[]) { visitor.visit_program(*ast); ast_to_nmodl(*ast, filepath("llvm", "mod")); ast_to_json(*ast, filepath("llvm", "json")); + + if (run_llvm_benchmark) { + logger->info("Running LLVM benchmark"); + benchmark::LLVMBenchmark benchmark(visitor, + modfile, + output_dir, + shared_lib_paths, + num_experiments, + instance_size, + backend, + llvm_opt_level_ir, + llvm_opt_level_codegen); + 
benchmark.run(ast); + } } #endif } diff --git a/test/benchmark/CMakeLists.txt b/test/benchmark/CMakeLists.txt new file mode 100644 index 0000000000..4441d53251 --- /dev/null +++ b/test/benchmark/CMakeLists.txt @@ -0,0 +1,17 @@ +# ============================================================================= +# llvm benchmark sources +# ============================================================================= +set(LLVM_BENCHMARK_SOURCE_FILES + ${CMAKE_CURRENT_SOURCE_DIR}/jit_driver.cpp ${CMAKE_CURRENT_SOURCE_DIR}/jit_driver.hpp + ${CMAKE_CURRENT_SOURCE_DIR}/llvm_benchmark.cpp ${CMAKE_CURRENT_SOURCE_DIR}/llvm_benchmark.hpp) + +# ============================================================================= +# LLVM benchmark library +# ============================================================================= +include_directories(${LLVM_INCLUDE_DIRS}) +add_library(llvm_benchmark STATIC ${LLVM_BENCHMARK_SOURCE_FILES}) +add_dependencies(llvm_benchmark lexer util visitor) + +if(NMODL_ENABLE_JIT_EVENT_LISTENERS) + target_compile_definitions(llvm_benchmark PUBLIC NMODL_HAVE_JIT_EVENT_LISTENERS) +endif() diff --git a/src/codegen/llvm/jit_driver.cpp b/test/benchmark/jit_driver.cpp similarity index 97% rename from src/codegen/llvm/jit_driver.cpp rename to test/benchmark/jit_driver.cpp index 2a6842d0fb..a2d8df63f4 100644 --- a/src/codegen/llvm/jit_driver.cpp +++ b/test/benchmark/jit_driver.cpp @@ -7,6 +7,7 @@ #include "jit_driver.hpp" #include "codegen/llvm/codegen_llvm_visitor.hpp" +#include "utils/common_utils.hpp" #include "llvm/Analysis/TargetTransformInfo.h" #include "llvm/ExecutionEngine/JITEventListener.h" @@ -247,6 +248,13 @@ void JITDriver::init(std::string features, // Optionally, dump the binary to the object file. if (benchmark_info) { + std::string object_file = benchmark_info->filename + ".o"; + if (utils::file_exists(object_file)) { + int status = remove(object_file.c_str()); + if (status) { + throw std::runtime_error("Can not remove object file " + object_file); + } + } jit->getObjTransformLayer().setTransform( llvm::orc::DumpObjects(benchmark_info->output_dir, benchmark_info->filename)); } diff --git a/src/codegen/llvm/jit_driver.hpp b/test/benchmark/jit_driver.hpp similarity index 100% rename from src/codegen/llvm/jit_driver.hpp rename to test/benchmark/jit_driver.hpp diff --git a/src/codegen/llvm/llvm_benchmark.cpp b/test/benchmark/llvm_benchmark.cpp similarity index 61% rename from src/codegen/llvm/llvm_benchmark.cpp rename to test/benchmark/llvm_benchmark.cpp index adbe653f1e..f6811fd664 100644 --- a/src/codegen/llvm/llvm_benchmark.cpp +++ b/test/benchmark/llvm_benchmark.cpp @@ -9,8 +9,8 @@ #include #include "codegen/llvm/codegen_llvm_visitor.hpp" -#include "codegen/llvm/jit_driver.hpp" #include "llvm_benchmark.hpp" +#include "test/benchmark/jit_driver.hpp" #include "llvm/Support/Host.h" #include "test/unit/codegen/codegen_data_helper.hpp" @@ -42,57 +42,42 @@ void LLVMBenchmark::disable(const std::string& feature, std::vector for (auto& host_feature: host_features) { if (feature == host_feature.substr(1)) { host_feature[0] = '-'; - *log_stream << host_feature << "\n"; + logger->info("{}", host_feature); return; } } } void LLVMBenchmark::run(const std::shared_ptr& node) { - // First, set the output stream for the logs. - set_log_output(); - - // Then, record the time taken for building the LLVM IR module. 
- codegen::CodegenLLVMVisitor visitor(mod_filename, - output_dir, - llvm_build_info.opt_passes, - llvm_build_info.use_single_precision, - llvm_build_info.vector_width, - llvm_build_info.vec_lib, - /*add_debug_information=*/true); - generate_llvm(visitor, node); - + // create functions + generate_llvm(node); // Finally, run the benchmark and log the measurements. - run_benchmark(visitor, node); + run_benchmark(node); } -void LLVMBenchmark::generate_llvm(codegen::CodegenLLVMVisitor& visitor, - const std::shared_ptr& node) { +void LLVMBenchmark::generate_llvm(const std::shared_ptr& node) { // First, visit the AST to build the LLVM IR module and wrap the kernel function calls. auto start = std::chrono::high_resolution_clock::now(); - visitor.visit_program(*node); - visitor.wrap_kernel_functions(); + llvm_visitor.wrap_kernel_functions(); auto end = std::chrono::high_resolution_clock::now(); // Log the time taken to visit the AST and build LLVM IR. std::chrono::duration diff = end - start; - *log_stream << "Created LLVM IR module from NMODL AST in " << std::setprecision(PRECISION) - << diff.count() << "\n\n"; + logger->info("Created LLVM IR module from NMODL AST in {} sec", diff.count()); } -void LLVMBenchmark::run_benchmark(codegen::CodegenLLVMVisitor& visitor, - const std::shared_ptr& node) { +void LLVMBenchmark::run_benchmark(const std::shared_ptr& node) { // Set the codegen data helper and find the kernels. - auto codegen_data = codegen::CodegenDataHelper(node, visitor.get_instance_struct_ptr()); + auto codegen_data = codegen::CodegenDataHelper(node, llvm_visitor.get_instance_struct_ptr()); std::vector kernel_names; - visitor.find_kernel_names(kernel_names); + llvm_visitor.find_kernel_names(kernel_names); // Get feature's string and turn them off depending on the backend. std::vector features = get_cpu_features(); - *log_stream << "Backend: " << backend << "\n"; + logger->info("Backend: {}", backend); if (backend == "avx2") { // Disable SSE. - *log_stream << "Disabling features:\n"; + logger->info("Disabling features:"); disable("sse", features); disable("sse2", features); disable("sse3", features); @@ -100,16 +85,17 @@ void LLVMBenchmark::run_benchmark(codegen::CodegenLLVMVisitor& visitor, disable("sse4.2", features); } else if (backend == "sse2") { // Disable AVX. - *log_stream << "Disabling features:\n"; + logger->info("Disabling features:"); disable("avx", features); disable("avx2", features); } std::string features_str = llvm::join(features.begin(), features.end(), ","); - std::unique_ptr m = visitor.get_module(); + std::unique_ptr m = llvm_visitor.get_module(); // Create the benchmark runner and initialize it. - std::string filename = "v" + std::to_string(llvm_build_info.vector_width) + "_" + mod_filename; + std::string filename = "v" + std::to_string(llvm_visitor.get_vector_width()) + "_" + + mod_filename; runner::BenchmarkRunner runner(std::move(m), filename, output_dir, @@ -125,7 +111,7 @@ void LLVMBenchmark::run_benchmark(codegen::CodegenLLVMVisitor& visitor, auto instance_data = codegen_data.create_data(instance_size, /*seed=*/1); double size_mbs = instance_data.num_bytes / (1024.0 * 1024.0); - *log_stream << "Benchmarking kernel '" << kernel_name << ", with " << size_mbs << " MBs\n"; + logger->info("Benchmarking kernel '{}' with {} MBs dataset", kernel_name, size_mbs); // For every kernel run the benchmark `num_experiments` times. 
double time_sum = 0.0; @@ -138,32 +124,13 @@ void LLVMBenchmark::run_benchmark(codegen::CodegenLLVMVisitor& visitor, std::chrono::duration diff = end - start; // Log the time taken for each run. - *log_stream << "Experiment " << i << ": compute time = " << std::setprecision(9) - << diff.count() << "\n"; + logger->info("Experiment {} compute time = {:.6f} sec", i, diff.count()); time_sum += diff.count(); } // Log the average time taken for the kernel. - *log_stream << "Average compute time = " << std::setprecision(PRECISION) - << time_sum / num_experiments << "\n\n"; - } -} - -void LLVMBenchmark::set_log_output() { - // If the output directory is not specified, dump logs to the console. - if (output_dir == ".") { - log_stream = std::make_shared(std::cout.rdbuf()); - return; + logger->info("Average compute time = {:.6f} \n", time_sum / num_experiments); } - - // Otherwise, dump logs to the specified file. - std::string filename = output_dir + "/" + mod_filename + ".log"; - ofs.open(filename.c_str()); - - if (ofs.fail()) - throw std::runtime_error("Error while opening a file '" + filename + "'"); - - log_stream = std::make_shared(ofs.rdbuf()); } } // namespace benchmark diff --git a/src/codegen/llvm/llvm_benchmark.hpp b/test/benchmark/llvm_benchmark.hpp similarity index 76% rename from src/codegen/llvm/llvm_benchmark.hpp rename to test/benchmark/llvm_benchmark.hpp index c2c781d7f0..9696191172 100644 --- a/src/codegen/llvm/llvm_benchmark.hpp +++ b/test/benchmark/llvm_benchmark.hpp @@ -10,19 +10,11 @@ #include #include "codegen/llvm/codegen_llvm_visitor.hpp" - +#include "utils/logger.hpp" namespace nmodl { namespace benchmark { -/// A struct to hold LLVM visitor information. -struct LLVMBuildInfo { - int vector_width; - bool opt_passes; - bool use_single_precision; - std::string vec_lib; -}; - /** * \class LLVMBenchmark * \brief A wrapper to execute MOD file kernels via LLVM IR backend, and @@ -30,6 +22,9 @@ struct LLVMBuildInfo { */ class LLVMBenchmark { private: + /// LLVM visitor. + codegen::CodegenLLVMVisitor& llvm_visitor; + /// Source MOD file name. std::string mod_filename; @@ -54,32 +49,26 @@ class LLVMBenchmark { /// Optimisation level for machine code generation. int opt_level_codegen; - /// LLVM visitor information. - LLVMBuildInfo llvm_build_info; - - /// The log output stream (file or stdout). - std::shared_ptr log_stream; - /// Filestream for dumping logs to the file. std::ofstream ofs; public: - LLVMBenchmark(const std::string& mod_filename, + LLVMBenchmark(codegen::CodegenLLVMVisitor& llvm_visitor, + const std::string& mod_filename, const std::string& output_dir, std::vector shared_libs, - LLVMBuildInfo info, int num_experiments, int instance_size, const std::string& backend, int opt_level_ir, int opt_level_codegen) - : mod_filename(mod_filename) + : llvm_visitor(llvm_visitor) + , mod_filename(mod_filename) , output_dir(output_dir) , shared_libs(shared_libs) , num_experiments(num_experiments) , instance_size(instance_size) , backend(backend) - , llvm_build_info(info) , opt_level_ir(opt_level_ir) , opt_level_codegen(opt_level_codegen) {} @@ -91,12 +80,10 @@ class LLVMBenchmark { void disable(const std::string& feature, std::vector& host_features); /// Visits the AST to construct the LLVM IR module. - void generate_llvm(codegen::CodegenLLVMVisitor& visitor, - const std::shared_ptr& node); + void generate_llvm(const std::shared_ptr& node); /// Runs the main body of the benchmark, executing the compute kernels. 
-    void run_benchmark(codegen::CodegenLLVMVisitor& visitor,
-                       const std::shared_ptr<ast::Program>& node);
+    void run_benchmark(const std::shared_ptr<ast::Program>& node);
 
     /// Sets the log output stream (file or console).
     void set_log_output();
diff --git a/test/unit/CMakeLists.txt b/test/unit/CMakeLists.txt
index b4fa2f7837..91721010e9 100644
--- a/test/unit/CMakeLists.txt
+++ b/test/unit/CMakeLists.txt
@@ -99,8 +99,8 @@ target_link_libraries(
 
 if(NMODL_ENABLE_LLVM)
   include_directories(${LLVM_INCLUDE_DIRS} codegen)
-  add_library(llvm_benchmark STATIC codegen/codegen_data_helper.cpp)
-  add_dependencies(llvm_benchmark lexer)
+  add_library(benchmark_data STATIC codegen/codegen_data_helper.cpp)
+  add_dependencies(benchmark_data lexer)
   add_executable(testllvm visitor/main.cpp codegen/codegen_llvm_ir.cpp
                  codegen/codegen_data_helper.cpp codegen/codegen_llvm_instance_struct.cpp)
@@ -121,6 +121,7 @@ if(NMODL_ENABLE_LLVM)
   target_link_libraries(
     test_llvm_runner
     llvm_codegen
+    llvm_benchmark
     codegen
     visitor
     symtab
diff --git a/test/unit/codegen/codegen_llvm_execution.cpp b/test/unit/codegen/codegen_llvm_execution.cpp
index 296417c5f3..baa370143b 100644
--- a/test/unit/codegen/codegen_llvm_execution.cpp
+++ b/test/unit/codegen/codegen_llvm_execution.cpp
@@ -9,9 +9,9 @@
 
 #include "ast/program.hpp"
 #include "codegen/llvm/codegen_llvm_visitor.hpp"
-#include "codegen/llvm/jit_driver.hpp"
 #include "codegen_data_helper.hpp"
 #include "parser/nmodl_driver.hpp"
+#include "test/benchmark/jit_driver.hpp"
 #include "visitors/checkparent_visitor.hpp"
 #include "visitors/neuron_solve_visitor.hpp"
 #include "visitors/solve_block_visitor.hpp"
From 5d126aa9397003951ea0b8e61ff2027de43ed9a5 Mon Sep 17 00:00:00 2001
From: George Mitenkov
Date: Sun, 16 May 2021 22:04:31 -0700
Subject: [PATCH 059/331] Added attributes and metadata to LLVM IR compute kernels (#648)

Previously, no metadata or attributes were associated with the instance struct
pointer, compute kernels or loops. This commit fixes that.

- New instance struct attributes

Since the pointers contained in the instance struct do not alias, we add a
`noalias` attribute (LLVM's analogue of `__restrict`) to it. In addition, we
add `nocapture` (no capturing occurs in the function) and `readonly` (the
struct pointer itself is not written to) attributes. This means that some load
instructions can be moved out of the loop body.

Example:
```llvm
; BEFORE
for.body.lr.ph:                       ; preds = %0
  %5 = getelementptr inbounds %avx__instance_var__type, %avx__instance_var__type* %mech1, i64 0, i32 1
  br label %for.body

for.body:                             ; preds = %for.body.lr.ph, %for.body
  %15 = load double*, double** %5, align 8
  ; ...

; AFTER
for.body.lr.ph:                       ; preds = %0
  %5 = getelementptr inbounds %avx__instance_var__type, %avx__instance_var__type* %mech1, i64 0, i32 1
  %6 = load double*, double** %5, align 8
  br label %for.body
```

- New function attributes

Now, compute kernels are marked with `nofree` and `nounwind` attributes.

- Loop metadata

Also, loop metadata is added to scalar kernels, specifying that no
vectorization is needed. The reason is that we want to benchmark truly scalar
kernels, disabling LLVM's own vectorization if necessary. Note that for the
vector loop epilogue there is no metadata that disables vectorization.
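An illustrative sketch of how such annotations can be attached through the LLVM
C++ API (not the exact helper code of this patch; the function names here are
placeholders):

```cpp
#include "llvm/IR/Constants.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/LLVMContext.h"
#include "llvm/IR/Metadata.h"

// Mark the kernel and its instance-struct argument.
void annotate_kernel(llvm::Function* kernel) {
    kernel->addFnAttr(llvm::Attribute::NoFree);
    kernel->addFnAttr(llvm::Attribute::NoUnwind);
    kernel->addParamAttr(0, llvm::Attribute::NoAlias);
    kernel->addParamAttr(0, llvm::Attribute::NoCapture);
    kernel->addParamAttr(0, llvm::Attribute::ReadOnly);
}

// Attach "do not vectorise" metadata to a loop's branch instruction.
void disable_vectorization(llvm::BranchInst* loop_branch) {
    llvm::LLVMContext& ctx = loop_branch->getContext();
    llvm::MDNode* disable = llvm::MDNode::get(
        ctx,
        {llvm::MDString::get(ctx, "llvm.loop.vectorize.enable"),
         llvm::ConstantAsMetadata::get(
             llvm::ConstantInt::get(llvm::Type::getInt1Ty(ctx), 0))});
    // By convention, the first operand of an llvm.loop node is the node itself.
    llvm::MDNode* loop_id = llvm::MDNode::getDistinct(ctx, {nullptr, disable});
    loop_id->replaceOperandWith(0, loop_id);
    loop_branch->setMetadata("llvm.loop", loop_id);
}
```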
fixes #607 --- src/codegen/llvm/codegen_llvm_visitor.cpp | 93 +++++++++++++---------- src/codegen/llvm/codegen_llvm_visitor.hpp | 3 + src/codegen/llvm/llvm_ir_builder.cpp | 54 ++++++++++++- src/codegen/llvm/llvm_ir_builder.hpp | 13 +++- test/unit/codegen/codegen_llvm_ir.cpp | 21 ++++- 5 files changed, 134 insertions(+), 50 deletions(-) diff --git a/src/codegen/llvm/codegen_llvm_visitor.cpp b/src/codegen/llvm/codegen_llvm_visitor.cpp index 39594169f4..2124ad82c9 100644 --- a/src/codegen/llvm/codegen_llvm_visitor.cpp +++ b/src/codegen/llvm/codegen_llvm_visitor.cpp @@ -163,13 +163,11 @@ void CodegenLLVMVisitor::create_printf_call(const ast::ExpressionVector& argumen } void CodegenLLVMVisitor::find_kernel_names(std::vector& container) { - // By convention, only kernel functions have a return type of void and single argument. The - // number of arguments check is needed to avoid LLVM void intrinsics to be considered as - // kernels. - const auto& functions = module->getFunctionList(); - for (const auto& func: functions) { - if (func.getReturnType()->isVoidTy() && llvm::hasSingleElement(func.args())) { - container.push_back(func.getName().str()); + auto& functions = module->getFunctionList(); + for (auto& func: functions) { + const std::string name = func.getName().str(); + if (is_kernel_function(name)) { + container.push_back(name); } } } @@ -239,6 +237,36 @@ int CodegenLLVMVisitor::get_num_elements(const ast::IndexedName& node) { return static_cast(*macro->get_value()); } +/** + * Currently, functions are identified as compute kernels if they satisfy the following: + * 1. They have a void return type + * 2. They have a single argument + * 3. The argument is a struct type pointer + * This is not robust, and hence it would be better to find what functions are kernels on the NMODL + * AST side (e.g. via a flag, or via names list). + * + * \todo identify kernels on NMODL AST side. + */ +bool CodegenLLVMVisitor::is_kernel_function(const std::string& function_name) { + llvm::Function* function = module->getFunction(function_name); + if (!function) + throw std::runtime_error("Error: function " + function_name + " does not exist\n"); + + // By convention, only kernel functions have a return type of void and single argument. The + // number of arguments check is needed to avoid LLVM void intrinsics to be considered as + // kernels. + if (!function->getReturnType()->isVoidTy() || !llvm::hasSingleElement(function->args())) + return false; + + // Kernel's argument is a pointer to the instance struct type. + llvm::Type* arg_type = function->getArg(0)->getType(); + if (auto pointer_type = llvm::dyn_cast(arg_type)) { + if (pointer_type->getElementType()->isStructTy()) + return true; + } + return false; +} + llvm::Value* CodegenLLVMVisitor::read_from_or_write_to_instance(const ast::CodegenInstanceVar& node, llvm::Value* maybe_value_to_store) { const auto& instance_name = node.get_instance_var()->get_node_name(); @@ -364,20 +392,8 @@ void CodegenLLVMVisitor::wrap_kernel_functions() { find_kernel_names(kernel_names); for (const auto& kernel_name: kernel_names) { - // Get the kernel function and the instance struct type. + // Get the kernel function. 
auto kernel = module->getFunction(kernel_name); - if (!kernel) - throw std::runtime_error("Error: kernel " + kernel_name + " is not found\n"); - - if (!llvm::hasSingleElement(kernel->args())) - throw std::runtime_error("Error: kernel " + kernel_name + - " must have a single argument\n"); - - auto instance_struct_ptr_type = llvm::dyn_cast( - kernel->getArg(0)->getType()); - if (!instance_struct_ptr_type) - throw std::runtime_error("Error: kernel " + kernel_name + - " does not have an instance struct pointer as an argument\n"); // Create a wrapper void function that takes a void pointer as a single argument. llvm::Type* i32_type = ir_builder.get_i32_type(); @@ -398,7 +414,7 @@ void CodegenLLVMVisitor::wrap_kernel_functions() { // Proceed with bitcasting the void pointer to the struct pointer type, calling the kernel // and adding a terminator. llvm::Value* bitcasted = ir_builder.create_bitcast(wrapper_func->getArg(0), - instance_struct_ptr_type); + kernel->getArg(0)->getType()); ValueVector args; args.push_back(bitcasted); ir_builder.create_function_call(kernel, args, /*use_result=*/false); @@ -522,9 +538,6 @@ void CodegenLLVMVisitor::visit_codegen_for_statement(const ast::CodegenForStatem llvm::BasicBlock* for_inc = llvm::BasicBlock::Create(*context, /*Name=*/"for.inc", func, next); llvm::BasicBlock* exit = llvm::BasicBlock::Create(*context, /*Name=*/"for.exit", func, next); - // Save the vector width. - int tmp_vector_width = vector_width; - // Check if the kernel can be vectorised. If not, generate scalar code. if (!can_vectorize(node, sym_tab)) { logger->info("Cannot vectorise the for loop in '" + ir_builder.get_current_function_name() + @@ -534,21 +547,20 @@ void CodegenLLVMVisitor::visit_codegen_for_statement(const ast::CodegenForStatem ir_builder.generate_scalar_code(); } - // First, initialise the loop in the same basic block. This block is optional. Also, reset - // vector width to 1 if processing the remainder of the loop. - if (node.get_initialization()) { + // First, initialise the loop in the same basic block. This block is optional. Also, generate + // scalar code if processing the remainder of the loop. + if (node.get_initialization()) node.get_initialization()->accept(*this); - } else { - vector_width = 1; + else ir_builder.generate_scalar_code(); - } // Branch to condition basic block and insert condition code there. ir_builder.create_br_and_set_insertion_point(for_cond); // Extract the condition to decide whether to branch to the loop body or loop exit. llvm::Value* cond = accept_and_get(node.get_condition()); - ir_builder.create_cond_br(cond, for_body, exit); + llvm::BranchInst* loop_br = ir_builder.create_cond_br(cond, for_body, exit); + ir_builder.set_loop_metadata(loop_br); // Generate code for the loop body and create the basic block for the increment. ir_builder.set_insertion_point(for_body); @@ -560,11 +572,9 @@ void CodegenLLVMVisitor::visit_codegen_for_statement(const ast::CodegenForStatem // Process increment. node.get_increment()->accept(*this); - // Create a branch to condition block, then generate exit code out of the loop. Restore the - // vector width. + // Create a branch to condition block, then generate exit code out of the loop. 
    ir_builder.create_br(for_cond);
    ir_builder.set_insertion_point(exit);
-    vector_width = tmp_vector_width;
    ir_builder.generate_vectorized_code();
    ir_builder.start_vectorization();
}
@@ -578,7 +588,7 @@ void CodegenLLVMVisitor::visit_codegen_function(const ast::CodegenFunction& node
    // Create the entry basic block of the function/procedure and point the local named values table
    // to the symbol table.
-    llvm::BasicBlock* body = ir_builder.create_block_and_set_insertion_point(func);
+    ir_builder.create_block_and_set_insertion_point(func);
    // When processing a function, it returns a value named in NMODL. Therefore, we
    // first run RenameVisitor to rename it into ret_. This will aid in avoiding
@@ -588,14 +598,12 @@ void CodegenLLVMVisitor::visit_codegen_function(const ast::CodegenFunction& node
    visitor::RenameVisitor v(name, return_var_name);
    block->accept(v);
-
    // Allocate parameters on the stack and add them to the symbol table.
    ir_builder.allocate_function_arguments(func, arguments);

    // Process function or procedure body. If the function is a compute kernel, then set the
-    // corresponding flags. The return statement is handled in a separate visitor.
-    bool has_void_ret_type = node.get_return_type()->get_type() == ast::AstNodeType::VOID;
-    if (has_void_ret_type) {
+    // corresponding flags. If so, the return statement is handled in a separate visitor.
+    if (is_kernel_function(name)) {
        ir_builder.start_vectorization();
        block->accept(*this);
        ir_builder.stop_vectorization();
@@ -603,9 +611,12 @@ void CodegenLLVMVisitor::visit_codegen_function(const ast::CodegenFunction& node
        block->accept(*this);
    }

-    // If function has a void return type, add a terminator not handled by CodegenReturnVar.
-    if (has_void_ret_type)
+    // If function is a compute kernel, add a void terminator explicitly, since there is no
+    // `CodegenReturnVar` node. Also, set the necessary attributes.
+    if (is_kernel_function(name)) {
+        ir_builder.set_kernel_attributes();
        ir_builder.create_return();
+    }

    // Clear local values stack and remove the pointer to the local symbol table.
    ir_builder.clear_function();
diff --git a/src/codegen/llvm/codegen_llvm_visitor.hpp b/src/codegen/llvm/codegen_llvm_visitor.hpp
index 990485d8e2..22505a304c 100644
--- a/src/codegen/llvm/codegen_llvm_visitor.hpp
+++ b/src/codegen/llvm/codegen_llvm_visitor.hpp
@@ -216,6 +216,9 @@ class CodegenLLVMVisitor: public visitor::ConstAstVisitor {
    /// Returns the number of elements in the array specified by the IndexedName AST node.
    int get_num_elements(const ast::IndexedName& node);

+    /// Returns whether the function is an NMODL compute kernel.
+    bool is_kernel_function(const std::string& function_name);
+
    /// If the value to store is specified, writes it to the instance. Otherwise, returns the
    /// instance variable.
    llvm::Value* read_from_or_write_to_instance(const ast::CodegenInstanceVar& node,
diff --git a/src/codegen/llvm/llvm_ir_builder.cpp b/src/codegen/llvm/llvm_ir_builder.cpp
index 04e36e50cd..06ba8d00ef 100644
--- a/src/codegen/llvm/llvm_ir_builder.cpp
+++ b/src/codegen/llvm/llvm_ir_builder.cpp
@@ -174,6 +174,52 @@ void IRBuilder::create_intrinsic(const std::string& name,
    }
}

+void IRBuilder::set_kernel_attributes() {
+    // By convention, the compute kernel does not free memory and does not throw exceptions.
+    current_function->setDoesNotFreeMemory();
+    current_function->setDoesNotThrow();
+
+    // We also want to specify that the pointers held by the instance struct do not alias. In
+    // order to do that, we add a `noalias` attribute to the argument. As per LLVM's language
+    // reference:
+    // > This indicates that memory locations accessed via pointer values based on the argument
+    // > are not also accessed, during the execution of the function, via pointer values not
+    // > based on the argument.
+    current_function->addParamAttr(0, llvm::Attribute::NoAlias);
+
+    // Finally, specify that the struct pointer does not capture and is read-only.
+    current_function->addParamAttr(0, llvm::Attribute::NoCapture);
+    current_function->addParamAttr(0, llvm::Attribute::ReadOnly);
+}
+
+/****************************************************************************************/
+/*                               LLVM metadata utilities                                */
+/****************************************************************************************/
+
+void IRBuilder::set_loop_metadata(llvm::BranchInst* branch) {
+    llvm::LLVMContext& context = builder.getContext();
+    MetadataVector loop_metadata;
+
+    // Add nullptr to reserve the first place for loop's metadata self-reference.
+    loop_metadata.push_back(nullptr);
+
+    // If `vector_width` is 1, explicitly disable vectorization for benchmarking purposes.
+    if (vector_width == 1) {
+        llvm::MDString* name = llvm::MDString::get(context, "llvm.loop.vectorize.enable");
+        llvm::Value* false_value = llvm::ConstantInt::get(get_boolean_type(), 0);
+        llvm::ValueAsMetadata* value = llvm::ValueAsMetadata::get(false_value);
+        loop_metadata.push_back(llvm::MDNode::get(context, {name, value}));
+    }
+
+    // No metadata to add.
+    if (loop_metadata.size() <= 1)
+        return;
+
+    // Add loop's metadata self-reference and attach it to the branch.
+    llvm::MDNode* metadata = llvm::MDNode::get(context, loop_metadata);
+    metadata->replaceOperandWith(0, metadata);
+    branch->setMetadata(llvm::LLVMContext::MD_loop, metadata);
+}
+
/****************************************************************************************/
/*                              LLVM instruction utilities                              */
/****************************************************************************************/
@@ -412,10 +458,10 @@ void IRBuilder::create_br_and_set_insertion_point(llvm::BasicBlock* block) {
    builder.SetInsertPoint(block);
}

-void IRBuilder::create_cond_br(llvm::Value* condition,
-                               llvm::BasicBlock* true_block,
-                               llvm::BasicBlock* false_block) {
-    builder.CreateCondBr(condition, true_block, false_block);
+llvm::BranchInst* IRBuilder::create_cond_br(llvm::Value* condition,
+                                            llvm::BasicBlock* true_block,
+                                            llvm::BasicBlock* false_block) {
+    return builder.CreateCondBr(condition, true_block, false_block);
}

llvm::BasicBlock* IRBuilder::get_current_block() {
diff --git a/src/codegen/llvm/llvm_ir_builder.hpp b/src/codegen/llvm/llvm_ir_builder.hpp
index b1b23ff0cf..e0cda2cf93 100644
--- a/src/codegen/llvm/llvm_ir_builder.hpp
+++ b/src/codegen/llvm/llvm_ir_builder.hpp
@@ -24,6 +24,7 @@ static constexpr const unsigned double_precision = 64;

/// Some typedefs.
using ConstantVector = std::vector;
+using MetadataVector = std::vector;
using TypeVector = std::vector;
using ValueVector = std::vector;

@@ -137,9 +138,9 @@ class IRBuilder {
    void create_br_and_set_insertion_point(llvm::BasicBlock* block);

    /// Generates LLVM IR for conditional branch.
-    void create_cond_br(llvm::Value* condition,
-                        llvm::BasicBlock* true_block,
-                        llvm::BasicBlock* false_block);
+    llvm::BranchInst* create_cond_br(llvm::Value* condition,
+                                     llvm::BasicBlock* true_block,
+                                     llvm::BasicBlock* false_block);

    /// Generates LLVM IR for the boolean constant.
    void create_boolean_constant(int value);

@@ -249,6 +250,12 @@ class IRBuilder {
    /// Sets builder's insertion point to the given block.
void set_insertion_point(llvm::BasicBlock* block); + /// Sets the necessary attributes for the kernel and its arguments. + void set_kernel_attributes(); + + /// Sets the loop metadata for the given branch from the loop. + void set_loop_metadata(llvm::BranchInst* branch); + /// Pops the last visited value from the value stack. llvm::Value* pop_last_value(); diff --git a/test/unit/codegen/codegen_llvm_ir.cpp b/test/unit/codegen/codegen_llvm_ir.cpp index 11f2faf99b..3295411f7a 100644 --- a/test/unit/codegen/codegen_llvm_ir.cpp +++ b/test/unit/codegen/codegen_llvm_ir.cpp @@ -838,15 +838,19 @@ SCENARIO("Scalar state kernel", "[visitor][llvm]") { std::string module_string = run_llvm_visitor(nmodl_text); std::smatch m; - // Check the struct type and the kernel declaration. + // Check the struct type with correct attributes and the kernel declaration. std::regex struct_type( "%.*__instance_var__type = type \\{ double\\*, double\\*, double\\*, double\\*, " "double\\*, double\\*, double\\*, i32\\*, double, double, double, i32, i32 \\}"); std::regex kernel_declaration( - R"(define void @nrn_state_hh\(%.*__instance_var__type\* .*\))"); + R"(define void @nrn_state_hh\(%.*__instance_var__type\* noalias nocapture readonly .*\) #0)"); REQUIRE(std::regex_search(module_string, m, struct_type)); REQUIRE(std::regex_search(module_string, m, kernel_declaration)); + // Check kernel attributes. + std::regex kernel_attributes(R"(attributes #0 = \{ nofree nounwind \})"); + REQUIRE(std::regex_search(module_string, m, kernel_attributes)); + // Check for correct variables initialisation and a branch to condition block. std::regex id_initialisation(R"(%id = alloca i32)"); std::regex node_id_initialisation(R"(%node_id = alloca i32)"); @@ -871,6 +875,15 @@ SCENARIO("Scalar state kernel", "[visitor][llvm]") { REQUIRE(std::regex_search(module_string, m, condition)); REQUIRE(std::regex_search(module_string, m, cond_br)); + // Check that loop metadata is attached to the scalar kernel. + std::regex loop_metadata(R"(!llvm\.loop !0)"); + std::regex loop_metadata_self_reference(R"(!0 = distinct !\{!0, !1\})"); + std::regex loop_metadata_disable_vectorization( + R"(!1 = !\{!\"llvm\.loop\.vectorize\.enable\", i1 false\})"); + REQUIRE(std::regex_search(module_string, m, loop_metadata)); + REQUIRE(std::regex_search(module_string, m, loop_metadata_self_reference)); + REQUIRE(std::regex_search(module_string, m, loop_metadata_disable_vectorization)); + // Check for correct loads from the struct with GEPs. std::regex load_from_struct( " %.* = load %.*__instance_var__type\\*, %.*__instance_var__type\\*\\* %.*\n" @@ -934,6 +947,10 @@ SCENARIO("Vectorised simple kernel", "[visitor][llvm]") { /*vector_width=*/4); std::smatch m; + // Check that no loop metadata is attached. + std::regex loop_metadata(R"(!llvm\.loop !.*)"); + REQUIRE(!std::regex_search(module_string, m, loop_metadata)); + // Check gather intrinsic is correctly declared. 
            std::regex declaration(
                R"(declare <4 x double> @llvm\.masked\.gather\.v4f64\.v4p0f64\(<4 x double\*>, i32 immarg, <4 x i1>, <4 x double>\) )");

From ee8bbdba59ec593cbec98cf2f06beff4f8064604 Mon Sep 17 00:00:00 2001
From: George Mitenkov
Date: Tue, 18 May 2021 03:14:08 -0700
Subject: [PATCH 060/331] Added loaded value to the stack (#655)

- fixes a case where the loaded value was taken from the stack but was
  never actually put there
---
 src/codegen/llvm/llvm_ir_builder.cpp | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

diff --git a/src/codegen/llvm/llvm_ir_builder.cpp b/src/codegen/llvm/llvm_ir_builder.cpp
index 06ba8d00ef..8828aa83c5 100644
--- a/src/codegen/llvm/llvm_ir_builder.cpp
+++ b/src/codegen/llvm/llvm_ir_builder.cpp
@@ -319,12 +319,16 @@ llvm::Value* IRBuilder::create_index(llvm::Value* value) {
 llvm::Value* IRBuilder::create_load(const std::string& name) {
     llvm::Value* ptr = lookup_value(name);
     llvm::Type* loaded_type = ptr->getType()->getPointerElementType();
-    return builder.CreateLoad(loaded_type, ptr);
+    llvm::Value* loaded = builder.CreateLoad(loaded_type, ptr);
+    value_stack.push_back(loaded);
+    return loaded;
 }
 
 llvm::Value* IRBuilder::create_load(llvm::Value* ptr) {
     llvm::Type* loaded_type = ptr->getType()->getPointerElementType();
-    return builder.CreateLoad(loaded_type, ptr);
+    llvm::Value* loaded = builder.CreateLoad(loaded_type, ptr);
+    value_stack.push_back(loaded);
+    return loaded;
 }
 
 llvm::Value* IRBuilder::create_load_from_array(const std::string& name, llvm::Value* index) {

From 5ee761b501020657ca1c734d81ce4c42fc258496 Mon Sep 17 00:00:00 2001
From: George Mitenkov
Date: Thu, 20 May 2021 00:32:40 -0700
Subject: [PATCH 061/331] Basic predication support for LLVM backend (#652)

Added support for vector predication. Currently, we support a very basic
predication pattern (that will be extended in the future):

```c++
IF (/*condition*/) {
    // code here, no nested conditionals
} ELSE {
    // code here, no nested conditionals
}
```

**What has been changed and added**

1. Removed vectorization check

Before, in the `FOR` statement visitor we were checking whether the code can
be vectorized. After refactoring `llvm::IRBuilder<>` into a separate class,
there is no interface to reset the builder's vector width. Hence, this check
would leave the visitor with a scalar vector width of 1 and the builder
stuck with the same width.

```c++
if (!can_vectorize(node, sym_tab)) {
    vector_width = 1;
    ir_builder.generate_scalar_code();
}
```

To avoid any issues, this check is simply removed and will be re-added in a
separate PR.

2. Predication support

- `can_vectorize` has been changed to support a single `IF` or `IF/ELSE`
  pair.
- A special vectorized `IF` AST node visitor has been added.
- When generating code within an `IF` AST node, instructions are masked.

3.
Added execution and IR tests

fixes #539
---
 .../llvm/codegen_llvm_helper_visitor.cpp      |   2 +-
 src/codegen/llvm/codegen_llvm_visitor.cpp     |  90 ++++++++++------
 src/codegen/llvm/codegen_llvm_visitor.hpp     |   3 +
 src/codegen/llvm/llvm_ir_builder.cpp          |  83 ++++++++++----
 src/codegen/llvm/llvm_ir_builder.hpp          |  56 ++++++----
 test/unit/codegen/codegen_llvm_execution.cpp  | 101 ++++++++++++++++++
 test/unit/codegen/codegen_llvm_ir.cpp         |  69 ++++++++++++
 7 files changed, 326 insertions(+), 78 deletions(-)

diff --git a/src/codegen/llvm/codegen_llvm_helper_visitor.cpp b/src/codegen/llvm/codegen_llvm_helper_visitor.cpp
index 10aee780ce..5974edc623 100644
--- a/src/codegen/llvm/codegen_llvm_helper_visitor.cpp
+++ b/src/codegen/llvm/codegen_llvm_helper_visitor.cpp
@@ -251,7 +251,7 @@ static void append_statements_from_block(ast::StatementVector& statements,
     for (const auto& statement: block_statements) {
         const auto& expression_statement = std::dynamic_pointer_cast(
             statement);
-        if (!expression_statement->get_expression()->is_solve_block())
+        if (!expression_statement || !expression_statement->get_expression()->is_solve_block())
             statements.push_back(statement);
     }
 }
diff --git a/src/codegen/llvm/codegen_llvm_visitor.cpp b/src/codegen/llvm/codegen_llvm_visitor.cpp
index 2124ad82c9..ec41008da0 100644
--- a/src/codegen/llvm/codegen_llvm_visitor.cpp
+++ b/src/codegen/llvm/codegen_llvm_visitor.cpp
@@ -54,11 +54,15 @@ static bool can_vectorize(const ast::CodegenForStatement& statement, symtab::Sym
         return false;
     }
 
-    // Check there is no control flow in the kernel.
-    const std::vector unsupported_nodes = {ast::AstNodeType::IF_STATEMENT};
-    const auto& collected = collect_nodes(statement, unsupported_nodes);
+    // Check for simple supported control flow in the kernel (single if/else statement).
+    const std::vector supported_control_flow = {ast::AstNodeType::IF_STATEMENT};
+    const auto& supported = collect_nodes(statement, supported_control_flow);
 
-    return collected.empty();
+    // Check for unsupported control flow statements.
+    const std::vector unsupported_nodes = {ast::AstNodeType::ELSE_IF_STATEMENT};
+    const auto& unsupported = collect_nodes(statement, unsupported_nodes);
+
+    return unsupported.empty() && supported.size() <= 1;
 }
 
 llvm::Value* CodegenLLVMVisitor::accept_and_get(const std::shared_ptr& node) {
@@ -162,6 +166,27 @@ void CodegenLLVMVisitor::create_printf_call(const ast::ExpressionVector& argumen
     ir_builder.create_function_call(printf, argument_values, /*use_result=*/false);
 }
 
+void CodegenLLVMVisitor::create_vectorized_control_flow_block(const ast::IfStatement& node) {
+    // Get the true mask from the condition statement.
+    llvm::Value* true_mask = accept_and_get(node.get_condition());
+
+    // Process the true block.
+    ir_builder.set_mask(true_mask);
+    node.get_statement_block()->accept(*this);
+
+    // Note: by default, we do not support kernels with complicated control flow. This is checked
+    // prior to visiting `CodegenForStatement`.
+    const auto& elses = node.get_elses();
+    if (elses) {
+        // If `else` statement exists, invert the mask and proceed with code generation.
+        ir_builder.invert_mask();
+        elses->get_statement_block()->accept(*this);
+    }
+
+    // Clear the mask value.
+ ir_builder.clear_mask(); +} + void CodegenLLVMVisitor::find_kernel_names(std::vector& container) { auto& functions = module->getFunctionList(); for (auto& func: functions) { @@ -325,7 +350,8 @@ llvm::Value* CodegenLLVMVisitor::read_variable(const ast::VarName& node) { const auto& identifier = node.get_name(); if (identifier->is_name()) { - return ir_builder.create_load(node.get_node_name()); + return ir_builder.create_load(node.get_node_name(), + /*masked=*/ir_builder.generates_predicated_ir()); } if (identifier->is_indexed_name()) { @@ -522,8 +548,8 @@ void CodegenLLVMVisitor::visit_codegen_atomic_statement(const ast::CodegenAtomic // | | // +---------------------------+ void CodegenLLVMVisitor::visit_codegen_for_statement(const ast::CodegenForStatement& node) { - // Disable vector code generation for condition and increment blocks. - ir_builder.stop_vectorization(); + // Condition and increment blocks must be scalar. + ir_builder.generate_scalar_ir(); // Get the current and the next blocks within the function. llvm::BasicBlock* curr_block = ir_builder.get_current_block(); @@ -538,21 +564,11 @@ void CodegenLLVMVisitor::visit_codegen_for_statement(const ast::CodegenForStatem llvm::BasicBlock* for_inc = llvm::BasicBlock::Create(*context, /*Name=*/"for.inc", func, next); llvm::BasicBlock* exit = llvm::BasicBlock::Create(*context, /*Name=*/"for.exit", func, next); - // Check if the kernel can be vectorised. If not, generate scalar code. - if (!can_vectorize(node, sym_tab)) { - logger->info("Cannot vectorise the for loop in '" + ir_builder.get_current_function_name() + - "'"); - logger->info("Generating scalar code..."); - vector_width = 1; - ir_builder.generate_scalar_code(); - } - - // First, initialise the loop in the same basic block. This block is optional. Also, generate - // scalar code if processing the remainder of the loop. - if (node.get_initialization()) - node.get_initialization()->accept(*this); - else - ir_builder.generate_scalar_code(); + // First, initialize the loop in the same basic block. If processing the remainder of the loop, + // no initialization happens. + const auto& main_loop_initialization = node.get_initialization(); + if (main_loop_initialization) + main_loop_initialization->accept(*this); // Branch to condition basic block and insert condition code there. ir_builder.create_br_and_set_insertion_point(for_cond); @@ -561,22 +577,24 @@ void CodegenLLVMVisitor::visit_codegen_for_statement(const ast::CodegenForStatem llvm::Value* cond = accept_and_get(node.get_condition()); llvm::BranchInst* loop_br = ir_builder.create_cond_br(cond, for_body, exit); ir_builder.set_loop_metadata(loop_br); + ir_builder.set_insertion_point(for_body); + + // If not processing remainder of the loop, start vectorization. + if (vector_width > 1 && main_loop_initialization) + ir_builder.generate_vector_ir(); // Generate code for the loop body and create the basic block for the increment. - ir_builder.set_insertion_point(for_body); - ir_builder.start_vectorization(); const auto& statement_block = node.get_statement_block(); statement_block->accept(*this); - ir_builder.stop_vectorization(); + ir_builder.generate_scalar_ir(); ir_builder.create_br_and_set_insertion_point(for_inc); - // Process increment. + + // Process the increment. node.get_increment()->accept(*this); // Create a branch to condition block, then generate exit code out of the loop. 
ir_builder.create_br(for_cond); ir_builder.set_insertion_point(exit); - ir_builder.generate_vectorized_code(); - ir_builder.start_vectorization(); } @@ -601,12 +619,12 @@ void CodegenLLVMVisitor::visit_codegen_function(const ast::CodegenFunction& node // Allocate parameters on the stack and add them to the symbol table. ir_builder.allocate_function_arguments(func, arguments); - // Process function or procedure body. If the function is a compute kernel, then set the - // corresponding flags. If so, the return statement is handled in a separate visitor. - if (is_kernel_function(name)) { - ir_builder.start_vectorization(); + // Process function or procedure body. If the function is a compute kernel, enable + // vectorization. If so, the return statement is handled in a separate visitor. + if (vector_width > 1 && is_kernel_function(name)) { + ir_builder.generate_vector_ir(); block->accept(*this); - ir_builder.stop_vectorization(); + ir_builder.generate_scalar_ir(); } else { block->accept(*this); } @@ -676,6 +694,12 @@ void CodegenLLVMVisitor::visit_function_call(const ast::FunctionCall& node) { } void CodegenLLVMVisitor::visit_if_statement(const ast::IfStatement& node) { + // If vectorizing the compute kernel with control flow, process it separately. + if (vector_width > 1 && ir_builder.vectorizing()) { + create_vectorized_control_flow_block(node); + return; + } + // Get the current and the next blocks within the function. llvm::BasicBlock* curr_block = ir_builder.get_current_block(); llvm::BasicBlock* next = curr_block->getNextNode(); diff --git a/src/codegen/llvm/codegen_llvm_visitor.hpp b/src/codegen/llvm/codegen_llvm_visitor.hpp index 22505a304c..384c20c2c7 100644 --- a/src/codegen/llvm/codegen_llvm_visitor.hpp +++ b/src/codegen/llvm/codegen_llvm_visitor.hpp @@ -204,6 +204,9 @@ class CodegenLLVMVisitor: public visitor::ConstAstVisitor { /// Creates a call to `printf` function. void create_printf_call(const ast::ExpressionVector& arguments); + /// Creates a vectorized version of the LLVM IR for the simple control flow statement. + void create_vectorized_control_flow_block(const ast::IfStatement& node); + /// Returns LLVM type for the given CodegenVarType AST node. 
llvm::Type* get_codegen_var_type(const ast::CodegenVarType& node); diff --git a/src/codegen/llvm/llvm_ir_builder.cpp b/src/codegen/llvm/llvm_ir_builder.cpp index 8828aa83c5..90e7456e33 100644 --- a/src/codegen/llvm/llvm_ir_builder.cpp +++ b/src/codegen/llvm/llvm_ir_builder.cpp @@ -92,11 +92,15 @@ llvm::Value* IRBuilder::pop_last_value() { /****************************************************************************************/ void IRBuilder::create_boolean_constant(int value) { - value_stack.push_back(get_vector_constant(get_boolean_type(), value)); + if (vector_width > 1 && vectorize) { + value_stack.push_back(get_vector_constant(get_boolean_type(), value)); + } else { + value_stack.push_back(get_scalar_constant(get_boolean_type(), value)); + } } void IRBuilder::create_fp_constant(const std::string& value) { - if (instruction_width > 1 && vectorize) { + if (vector_width > 1 && vectorize) { value_stack.push_back(get_vector_constant(get_fp_type(), value)); } else { value_stack.push_back(get_scalar_constant(get_fp_type(), value)); @@ -108,7 +112,7 @@ llvm::Value* IRBuilder::create_global_string(const ast::String& node) { } void IRBuilder::create_i32_constant(int value) { - if (instruction_width > 1 && vectorize) { + if (vector_width > 1 && vectorize) { value_stack.push_back(get_vector_constant(get_i32_type(), value)); } else { value_stack.push_back(get_scalar_constant(get_i32_type(), value)); @@ -123,7 +127,7 @@ llvm::Value* IRBuilder::get_scalar_constant(llvm::Type* type, V value) { template llvm::Value* IRBuilder::get_vector_constant(llvm::Type* type, V value) { ConstantVector constants; - for (unsigned i = 0; i < instruction_width; ++i) { + for (unsigned i = 0; i < vector_width; ++i) { const auto& element = C::get(type, value); constants.push_back(element); } @@ -312,19 +316,27 @@ llvm::Value* IRBuilder::create_index(llvm::Value* value) { const auto& element_type = llvm::cast(vector_type->getElementType()); if (element_type->getBitWidth() == i64_type->getIntegerBitWidth()) return value; - return builder.CreateSExtOrTrunc(value, - llvm::FixedVectorType::get(i64_type, instruction_width)); + return builder.CreateSExtOrTrunc(value, llvm::FixedVectorType::get(i64_type, vector_width)); } -llvm::Value* IRBuilder::create_load(const std::string& name) { +llvm::Value* IRBuilder::create_load(const std::string& name, bool masked) { llvm::Value* ptr = lookup_value(name); + + // Check if the generated IR is vectorized and masked. + if (masked) { + return builder.CreateMaskedLoad(ptr, llvm::Align(), mask); + } llvm::Type* loaded_type = ptr->getType()->getPointerElementType(); llvm::Value* loaded = builder.CreateLoad(loaded_type, ptr); value_stack.push_back(loaded); return loaded; } -llvm::Value* IRBuilder::create_load(llvm::Value* ptr) { +llvm::Value* IRBuilder::create_load(llvm::Value* ptr, bool masked) { + // Check if the generated IR is vectorized and masked. + if (masked) { + return builder.CreateMaskedLoad(ptr, llvm::Align(), mask); + } llvm::Type* loaded_type = ptr->getType()->getPointerElementType(); llvm::Value* loaded = builder.CreateLoad(loaded_type, ptr); value_stack.push_back(loaded); @@ -336,12 +348,23 @@ llvm::Value* IRBuilder::create_load_from_array(const std::string& name, llvm::Va return create_load(element_ptr); } -void IRBuilder::create_store(const std::string& name, llvm::Value* value) { +void IRBuilder::create_store(const std::string& name, llvm::Value* value, bool masked) { llvm::Value* ptr = lookup_value(name); + + // Check if the generated IR is vectorized and masked. 
+ if (masked) { + builder.CreateMaskedStore(value, ptr, llvm::Align(), mask); + return; + } builder.CreateStore(value, ptr); } -void IRBuilder::create_store(llvm::Value* ptr, llvm::Value* value) { +void IRBuilder::create_store(llvm::Value* ptr, llvm::Value* value, bool masked) { + // Check if the generated IR is vectorized and masked. + if (masked) { + builder.CreateMaskedStore(value, ptr, llvm::Align(), mask); + return; + } builder.CreateStore(value, ptr); } @@ -364,8 +387,8 @@ void IRBuilder::create_scalar_or_vector_alloca(const std::string& name, // Even if generating vectorised code, some variables still need to be scalar. Particularly, the // induction variable "id" and remainder loop variables (that start with "epilogue" prefix). llvm::Type* type; - if (instruction_width > 1 && vectorize && name != kernel_id && name.rfind("epilogue", 0)) { - type = llvm::FixedVectorType::get(element_or_scalar_type, instruction_width); + if (vector_width > 1 && vectorize && name != kernel_id && name.rfind("epilogue", 0)) { + type = llvm::FixedVectorType::get(element_or_scalar_type, vector_width); } else { type = element_or_scalar_type; } @@ -389,6 +412,17 @@ llvm::Value* IRBuilder::get_struct_member_ptr(llvm::Value* struct_variable, int return builder.CreateInBoundsGEP(struct_variable, indices); } +void IRBuilder::invert_mask() { + if (!mask) + throw std::runtime_error("Error: mask is not set\n"); + + // Create the vector with all `true` values. + create_boolean_constant(1); + llvm::Value* one = pop_last_value(); + + mask = builder.CreateXor(mask, one); +} + llvm::Value* IRBuilder::load_to_or_store_from_array(const std::string& id_name, llvm::Value* id_value, llvm::Value* array, @@ -396,22 +430,27 @@ llvm::Value* IRBuilder::load_to_or_store_from_array(const std::string& id_name, // First, calculate the address of the element in the array. llvm::Value* element_ptr = create_inbounds_gep(array, id_value); + // Find out if the vector code is generated. + bool generating_vector_ir = vector_width > 1 && vectorize; + // If the vector code is generated, we need to distinguish between two cases. If the array is // indexed indirectly (i.e. not by an induction variable `kernel_id`), create a gather // instruction. - if (id_name != kernel_id && vectorize && instruction_width > 1) { - return maybe_value_to_store - ? builder.CreateMaskedScatter(maybe_value_to_store, element_ptr, llvm::Align()) - : builder.CreateMaskedGather(element_ptr, llvm::Align()); + if (id_name != kernel_id && generating_vector_ir) { + return maybe_value_to_store ? 
builder.CreateMaskedScatter(maybe_value_to_store, + element_ptr, + llvm::Align(), + mask) + : builder.CreateMaskedGather(element_ptr, llvm::Align(), mask); } llvm::Value* ptr; - if (vectorize && instruction_width > 1) { + if (generating_vector_ir) { // If direct indexing is used during the vectorization, we simply bitcast the scalar pointer // to a vector pointer llvm::Type* vector_type = llvm::PointerType::get( llvm::FixedVectorType::get(element_ptr->getType()->getPointerElementType(), - instruction_width), + vector_width), /*AddressSpace=*/0); ptr = builder.CreateBitCast(element_ptr, vector_type); } else { @@ -420,21 +459,21 @@ llvm::Value* IRBuilder::load_to_or_store_from_array(const std::string& id_name, } if (maybe_value_to_store) { - create_store(ptr, maybe_value_to_store); + create_store(ptr, maybe_value_to_store, /*masked=*/mask && generating_vector_ir); return nullptr; } else { - return create_load(ptr); + return create_load(ptr, /*masked=*/mask && generating_vector_ir); } } void IRBuilder::maybe_replicate_value(llvm::Value* value) { // If the value should not be vectorised, or it is already a vector, add it to the stack. - if (!vectorize || instruction_width == 1 || value->getType()->isVectorTy()) { + if (!vectorize || vector_width == 1 || value->getType()->isVectorTy()) { value_stack.push_back(value); } else { // Otherwise, we generate vectorized code inside the loop, so replicate the value to form a // vector. - llvm::Value* vector_value = builder.CreateVectorSplat(instruction_width, value); + llvm::Value* vector_value = builder.CreateVectorSplat(vector_width, value); value_stack.push_back(vector_value); } } diff --git a/src/codegen/llvm/llvm_ir_builder.hpp b/src/codegen/llvm/llvm_ir_builder.hpp index e0cda2cf93..ba3800fc66 100644 --- a/src/codegen/llvm/llvm_ir_builder.hpp +++ b/src/codegen/llvm/llvm_ir_builder.hpp @@ -52,13 +52,12 @@ class IRBuilder { /// Precision of the floating-point numbers (32 or 64 bit). unsigned fp_precision; - /// If 1, indicates that the scalar code is generated. Otherwise, the current vectorization - /// width. - unsigned instruction_width; - /// The vector width used for the vectorized code. unsigned vector_width; + /// Masked value used to predicate vector instructions. + llvm::Value* mask; + /// The name of induction variable used in kernel loops. std::string kernel_id; @@ -72,7 +71,7 @@ class IRBuilder { , vectorize(false) , fp_precision(use_single_precision ? single_precision : double_precision) , vector_width(vector_width) - , instruction_width(vector_width) + , mask(nullptr) , kernel_id("") {} /// Initializes the builder with the symbol table and the kernel induction variable id. @@ -81,26 +80,21 @@ class IRBuilder { this->kernel_id = kernel_id; } - /// Explicitly sets the builder to produce scalar code (even during vectorization). - void generate_scalar_code() { - instruction_width = 1; + /// Explicitly sets the builder to produce scalar IR. + void generate_scalar_ir() { + vectorize = false; } - /// Explicitly sets the builder to produce vectorized code. - void generate_vectorized_code() { - instruction_width = vector_width; + /// Indicates whether the builder generates vectorized IR. + bool vectorizing() { + return vectorize; } - /// Turns on vectorization mode. - void start_vectorization() { + /// Explicitly sets the builder to produce vectorized IR. + void generate_vector_ir() { vectorize = true; } - /// Turns off vectorization mode. 
- void stop_vectorization() { - vectorize = false; - } - /// Sets the current function for which LLVM IR is generated. void set_function(llvm::Function* function) { current_function = function; @@ -112,6 +106,21 @@ class IRBuilder { current_function = nullptr; } + /// Sets the value to be the mask for vector code generation. + void set_mask(llvm::Value* value) { + mask = value; + } + + /// Clears the mask for vector code generation. + void clear_mask() { + mask = nullptr; + } + + /// Indicates whether the vectorized IR is predicated. + bool generates_predicated_ir() { + return vectorize && mask; + } + /// Generates LLVM IR to allocate the arguments of the function on the stack. void allocate_function_arguments(llvm::Function* function, const ast::CodegenVarWithTypeVector& nmodl_arguments); @@ -168,20 +177,20 @@ class IRBuilder { void create_i32_constant(int value); /// Generates LLVM IR to load the value specified by its name and returns it. - llvm::Value* create_load(const std::string& name); + llvm::Value* create_load(const std::string& name, bool masked = false); /// Generates LLVM IR to load the value from the pointer and returns it. - llvm::Value* create_load(llvm::Value* ptr); + llvm::Value* create_load(llvm::Value* ptr, bool masked = false); /// Generates LLVM IR to load the element at the specified index from the given array name and /// returns it. llvm::Value* create_load_from_array(const std::string& name, llvm::Value* index); /// Generates LLVM IR to store the value to the location specified by the name. - void create_store(const std::string& name, llvm::Value* value); + void create_store(const std::string& name, llvm::Value* value, bool masked = false); /// Generates LLVM IR to store the value to the location specified by the pointer. - void create_store(llvm::Value* ptr, llvm::Value* value); + void create_store(llvm::Value* ptr, llvm::Value* value, bool masked = false); /// Generates LLVM IR to store the value to the array element, where array is specified by the /// name. @@ -234,6 +243,9 @@ class IRBuilder { /// Creates a pointer to struct type with the given name and given members. llvm::Type* get_struct_ptr_type(const std::string& struct_type_name, TypeVector& member_types); + /// Inverts the mask for vector code generation by xoring it. + void invert_mask(); + /// Generates IR that loads the elements of the array even during vectorization. If the value is /// specified, then it is stored to the array at the given index. llvm::Value* load_to_or_store_from_array(const std::string& id_name, diff --git a/test/unit/codegen/codegen_llvm_execution.cpp b/test/unit/codegen/codegen_llvm_execution.cpp index baa370143b..aa77a4e493 100644 --- a/test/unit/codegen/codegen_llvm_execution.cpp +++ b/test/unit/codegen/codegen_llvm_execution.cpp @@ -508,3 +508,104 @@ SCENARIO("Vectorised kernel with scatter instruction", "[llvm][runner]") { } } } + +//============================================================================= +// Vectorised kernel with control flow. 
+//============================================================================= + +SCENARIO("Vectorised kernel with simple control flow", "[llvm][runner]") { + GIVEN("Simple MOD file with if statement") { + std::string nmodl_text = R"( + NEURON { + SUFFIX test + } + + STATE { + w x y z + } + + BREAKPOINT { + SOLVE states METHOD cnexp + } + + DERIVATIVE states { + IF (v > 0) { + w = v * w + } + + IF (x < 0) { + x = 7 + } + + IF (0 <= y && y < 10 || z == 0) { + y = 2 * y + } ELSE { + z = z - y + } + + } + )"; + + + NmodlDriver driver; + const auto& ast = driver.parse_string(nmodl_text); + + // Run passes on the AST to generate LLVM. + SymtabVisitor().visit_program(*ast); + NeuronSolveVisitor().visit_program(*ast); + SolveBlockVisitor().visit_program(*ast); + codegen::CodegenLLVMVisitor llvm_visitor(/*mod_filename=*/"unknown", + /*output_dir=*/".", + /*opt_passes=*/false, + /*use_single_precision=*/false, + /*vector_width=*/2); + llvm_visitor.visit_program(*ast); + llvm_visitor.wrap_kernel_functions(); + + // Create the instance struct data. + int num_elements = 5; + const auto& generated_instance_struct = llvm_visitor.get_instance_struct_ptr(); + auto codegen_data = codegen::CodegenDataHelper(ast, generated_instance_struct); + auto instance_data = codegen_data.create_data(num_elements, /*seed=*/1); + + // Fill the instance struct data with some values. + std::vector x = {-1.0, 2.0, -3.0, 4.0, -5.0}; + std::vector y = {11.0, 2.0, -3.0, 4.0, 100.0}; + std::vector z = {0.0, 1.0, 20.0, 0.0, 40.0}; + + std::vector w = {10.0, 20.0, 30.0, 40.0, 50.0}; + std::vector voltage = {-1.0, 2.0, -1.0, 2.0, -1.0}; + std::vector node_index = {1, 2, 3, 4, 0}; + + InstanceTestInfo instance_info{&instance_data, + llvm_visitor.get_instance_var_helper(), + num_elements}; + initialise_instance_variable(instance_info, w, "w"); + initialise_instance_variable(instance_info, voltage, "voltage"); + initialise_instance_variable(instance_info, node_index, "node_index"); + + initialise_instance_variable(instance_info, x, "x"); + initialise_instance_variable(instance_info, y, "y"); + initialise_instance_variable(instance_info, z, "z"); + + // Set up the JIT runner. 
+ std::unique_ptr module = llvm_visitor.get_module(); + TestRunner runner(std::move(module)); + runner.initialize_driver(); + + THEN("Masked instructions are generated") { + runner.run_with_argument("__nrn_state_test_wrapper", + instance_data.base_ptr); + std::vector w_expected = {20.0, 20.0, 60.0, 40.0, 50.0}; + REQUIRE(check_instance_variable(instance_info, w_expected, "w")); + + std::vector x_expected = {7.0, 2.0, 7.0, 4.0, 7.0}; + REQUIRE(check_instance_variable(instance_info, x_expected, "x")); + + std::vector y_expected = {22.0, 4.0, -3.0, 8.0, 100.0}; + std::vector z_expected = {0.0, 1.0, 23.0, 0.0, -60.0}; + REQUIRE(check_instance_variable(instance_info, y_expected, "y")); + REQUIRE(check_instance_variable(instance_info, z_expected, "z")); + } + } +} diff --git a/test/unit/codegen/codegen_llvm_ir.cpp b/test/unit/codegen/codegen_llvm_ir.cpp index 3295411f7a..4920a26c4c 100644 --- a/test/unit/codegen/codegen_llvm_ir.cpp +++ b/test/unit/codegen/codegen_llvm_ir.cpp @@ -1020,6 +1020,75 @@ SCENARIO("Vectorised simple kernel with ion writes", "[visitor][llvm]") { } } +//============================================================================= +// Vectorised kernel with simple control flow +//============================================================================= + +SCENARIO("Vectorised simple kernel with control flow", "[visitor][llvm]") { + GIVEN("A single if/else statement") { + std::string nmodl_text = R"( + NEURON { + SUFFIX test + } + + STATE { + y + } + + BREAKPOINT { + SOLVE states METHOD cnexp + } + + DERIVATIVE states { + IF (y < 0) { + y = y + 7 + } ELSE { + y = v + } + } + )"; + + THEN("masked load and stores are created") { + std::string module_string = run_llvm_visitor(nmodl_text, + /*opt=*/false, + /*use_single_precision=*/true, + /*vector_width=*/8); + std::smatch m; + + // Check masked load/store intrinsics are correctly declared. + std::regex masked_load( + R"(declare <8 x float> @llvm\.masked\.load\.v8f32\.p0v8f32\(<8 x float>\*, i32 immarg, <8 x i1>, <8 x float>\))"); + std::regex masked_store( + R"(declare void @llvm.masked\.store\.v8f32\.p0v8f32\(<8 x float>, <8 x float>\*, i32 immarg, <8 x i1>\))"); + REQUIRE(std::regex_search(module_string, m, masked_load)); + REQUIRE(std::regex_search(module_string, m, masked_store)); + + // Check true direction instructions are predicated with mask. + // IF (mech->y[id] < 0) { + // mech->y[id] = mech->y[id] + 7 + std::regex mask(R"(%30 = fcmp olt <8 x float> %.*, zeroinitializer)"); + std::regex true_load( + R"(call <8 x float> @llvm\.masked\.load\.v8f32\.p0v8f32\(<8 x float>\* %.*, i32 1, <8 x i1> %30, <8 x float> undef\))"); + std::regex true_store( + R"(call void @llvm\.masked\.store\.v8f32\.p0v8f32\(<8 x float> %.*, <8 x float>\* %.*, i32 1, <8 x i1> %30\))"); + REQUIRE(std::regex_search(module_string, m, mask)); + REQUIRE(std::regex_search(module_string, m, true_load)); + REQUIRE(std::regex_search(module_string, m, true_store)); + + // Check false direction instructions are predicated with inverted mask. 
+            // } ELSE {
+            //     mech->y[id] = v
+            // }
+            std::regex inverted_mask(
+                R"(%47 = xor <8 x i1> %30, )");
+            std::regex false_load(
+                R"(call <8 x float> @llvm\.masked\.load\.v8f32\.p0v8f32\(<8 x float>\* %v, i32 1, <8 x i1> %47, <8 x float> undef\))");
+            std::regex false_store(
+                R"(call void @llvm\.masked\.store\.v8f32\.p0v8f32\(<8 x float> %.*, <8 x float>\* %.*, i32 1, <8 x i1> %47\))");
+            REQUIRE(std::regex_search(module_string, m, inverted_mask));
+            REQUIRE(std::regex_search(module_string, m, false_load));
+            REQUIRE(std::regex_search(module_string, m, false_store));
+        }
+    }
+}
+
 //=============================================================================
 // Derivative block : test optimization
 //=============================================================================

From 8bee7de4596cef9ffc25ed486aaf7400ec0f911e Mon Sep 17 00:00:00 2001
From: George Mitenkov
Date: Thu, 20 May 2021 07:32:50 -0700
Subject: [PATCH 062/331] Improvements for LLVM code generation and
 benchmarking (#661)

* Improved CMake versioning of LLVM
* Added `^` (power operator) support
* Added more math function intrinsics with tests
* Added compute time variance and min/max times in benchmarking output
---
 CMakeLists.txt                            |   3 -
 src/codegen/llvm/codegen_llvm_visitor.cpp |   4 +-
 src/codegen/llvm/codegen_llvm_visitor.hpp |   2 +-
 src/codegen/llvm/llvm_ir_builder.cpp      |  24 ++++-
 test/benchmark/llvm_benchmark.cpp         |  29 ++++--
 test/unit/codegen/codegen_llvm_ir.cpp     | 117 ++++++++++++++++++++--
 6 files changed, 154 insertions(+), 25 deletions(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index dd11e2be8d..07bfde7ba5 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -150,9 +150,6 @@ if(NMODL_ENABLE_LLVM)
   include(LLVMHelper)
   include_directories(${LLVM_INCLUDE_DIRS})
   add_definitions(-DNMODL_LLVM_BACKEND)
-  if(LLVM_VERSION VERSION_LESS_EQUAL 12)
-    add_definitions(-DLLVM_VERSION_LESS_THAN_13)
-  endif()
 endif()
 
 # =============================================================================
diff --git a/src/codegen/llvm/codegen_llvm_visitor.cpp b/src/codegen/llvm/codegen_llvm_visitor.cpp
index ec41008da0..ba28361e09 100644
--- a/src/codegen/llvm/codegen_llvm_visitor.cpp
+++ b/src/codegen/llvm/codegen_llvm_visitor.cpp
@@ -20,7 +20,7 @@
 #include "llvm/Support/Host.h"
 #include "llvm/Support/ToolOutputFile.h"
 
-#ifndef LLVM_VERSION_LESS_THAN_13
+#if LLVM_VERSION_MAJOR >= 13
 #include "llvm/CodeGen/ReplaceWithVeclib.h"
 #endif
 
@@ -819,7 +819,7 @@ void CodegenLLVMVisitor::visit_program(const ast::Program& node) {
 
     // Optionally, replace LLVM's maths intrinsics with vector library calls.
     if (vector_width > 1 && vector_library != llvm::TargetLibraryInfoImpl::NoLibrary) {
-#ifdef LLVM_VERSION_LESS_THAN_13
+#if LLVM_VERSION_MAJOR < 13
         logger->warn(
             "This version of LLVM does not support replacement of LLVM intrinsics with vector "
             "library calls");
diff --git a/src/codegen/llvm/codegen_llvm_visitor.hpp b/src/codegen/llvm/codegen_llvm_visitor.hpp
index 384c20c2c7..a97e73030a 100644
--- a/src/codegen/llvm/codegen_llvm_visitor.hpp
+++ b/src/codegen/llvm/codegen_llvm_visitor.hpp
@@ -51,7 +51,7 @@ namespace codegen {
 
 /// A map to query vector library by its string value.
static const std::map veclib_map = { {"Accelerate", llvm::TargetLibraryInfoImpl::Accelerate}, -#ifndef LLVM_VERSION_LESS_THAN_13 +#if LLVM_VERSION_MAJOR >= 13 {"libmvec", llvm::TargetLibraryInfoImpl::LIBMVEC_X86}, #endif {"MASSV", llvm::TargetLibraryInfoImpl::MASSV}, diff --git a/src/codegen/llvm/llvm_ir_builder.cpp b/src/codegen/llvm/llvm_ir_builder.cpp index 90e7456e33..c67941df3e 100644 --- a/src/codegen/llvm/llvm_ir_builder.cpp +++ b/src/codegen/llvm/llvm_ir_builder.cpp @@ -165,9 +165,26 @@ void IRBuilder::create_function_call(llvm::Function* callee, void IRBuilder::create_intrinsic(const std::string& name, ValueVector& argument_values, TypeVector& argument_types) { + // Process 'pow' call separately. + if (name == "pow") { + llvm::Value* pow_intrinsic = builder.CreateIntrinsic(llvm::Intrinsic::pow, + {argument_types.front()}, + argument_values); + value_stack.push_back(pow_intrinsic); + return; + } + + // Create other intrinsics. unsigned intrinsic_id = llvm::StringSwitch(name) + .Case("ceil", llvm::Intrinsic::ceil) + .Case("cos", llvm::Intrinsic::cos) .Case("exp", llvm::Intrinsic::exp) - .Case("pow", llvm::Intrinsic::pow) + .Case("fabs", llvm::Intrinsic::fabs) + .Case("floor", llvm::Intrinsic::floor) + .Case("log", llvm::Intrinsic::log) + .Case("log10", llvm::Intrinsic::log10) + .Case("sin", llvm::Intrinsic::sin) + .Case("sqrt", llvm::Intrinsic::sqrt) .Default(llvm::Intrinsic::not_intrinsic); if (intrinsic_id) { llvm::Value* intrinsic = @@ -267,6 +284,11 @@ void IRBuilder::create_binary_op(llvm::Value* lhs, llvm::Value* rhs, ast::Binary #undef DISPATCH + // Separately replace ^ with the `pow` intrinsic. + case ast::BinaryOp::BOP_POWER: + result = builder.CreateIntrinsic(llvm::Intrinsic::pow, {lhs->getType()}, {lhs, rhs}); + break; + // Logical instructions. case ast::BinaryOp::BOP_AND: result = builder.CreateAnd(lhs, rhs); diff --git a/test/benchmark/llvm_benchmark.cpp b/test/benchmark/llvm_benchmark.cpp index f6811fd664..b9f2fdeced 100644 --- a/test/benchmark/llvm_benchmark.cpp +++ b/test/benchmark/llvm_benchmark.cpp @@ -107,15 +107,21 @@ void LLVMBenchmark::run_benchmark(const std::shared_ptr& node) { // Benchmark every kernel. for (const auto& kernel_name: kernel_names) { - // Initialise the data. - auto instance_data = codegen_data.create_data(instance_size, /*seed=*/1); - - double size_mbs = instance_data.num_bytes / (1024.0 * 1024.0); - logger->info("Benchmarking kernel '{}' with {} MBs dataset", kernel_name, size_mbs); - // For every kernel run the benchmark `num_experiments` times. + double time_min = std::numeric_limits::max(); + double time_max = 0.0; double time_sum = 0.0; + double time_squared_sum = 0.0; for (int i = 0; i < num_experiments; ++i) { + // Initialise the data. + auto instance_data = codegen_data.create_data(instance_size, /*seed=*/1); + + // Log instance size once. + if (i == 0) { + double size_mbs = instance_data.num_bytes / (1024.0 * 1024.0); + logger->info("Benchmarking kernel '{}' with {} MBs dataset", kernel_name, size_mbs); + } + // Record the execution time of the kernel. std::string wrapper_name = "__" + kernel_name + "_wrapper"; auto start = std::chrono::high_resolution_clock::now(); @@ -126,10 +132,19 @@ void LLVMBenchmark::run_benchmark(const std::shared_ptr& node) { // Log the time taken for each run. logger->info("Experiment {} compute time = {:.6f} sec", i, diff.count()); + // Update statistics. 
            time_sum += diff.count();
+            time_squared_sum += diff.count() * diff.count();
+            time_min = std::min(time_min, diff.count());
+            time_max = std::max(time_max, diff.count());
         }
         // Log the average time taken for the kernel.
-        logger->info("Average compute time = {:.6f} \n", time_sum / num_experiments);
+        double time_mean = time_sum / num_experiments;
+        logger->info("Average compute time = {:.6f} sec", time_mean);
+        logger->info("Compute time variance = {:g}",
+                     time_squared_sum / num_experiments - time_mean * time_mean);
+        logger->info("Minimum compute time = {:.6f} sec", time_min);
+        logger->info("Maximum compute time = {:.6f} sec\n", time_max);
     }
 }
diff --git a/test/unit/codegen/codegen_llvm_ir.cpp b/test/unit/codegen/codegen_llvm_ir.cpp
index 4920a26c4c..0a3facf6fc 100644
--- a/test/unit/codegen/codegen_llvm_ir.cpp
+++ b/test/unit/codegen/codegen_llvm_ir.cpp
@@ -97,7 +97,7 @@ SCENARIO("Binary expression", "[visitor][llvm]") {
             std::regex lhs(R"(%2 = load float, float\* %a)");
             std::regex res(R"(%3 = fadd float %2, %1)");
 
-            // Check the float values are loaded correctly and added
+            // Check the float values are loaded correctly and added.
             REQUIRE(std::regex_search(module_string, m, rhs));
             REQUIRE(std::regex_search(module_string, m, lhs));
             REQUIRE(std::regex_search(module_string, m, res));
@@ -116,7 +116,7 @@ SCENARIO("Binary expression", "[visitor][llvm]") {
             std::string module_string = run_llvm_visitor(nmodl_text);
             std::smatch m;
 
-            // Check rhs
+            // Check rhs.
             std::regex rr(R"(%1 = load double, double\* %b)");
             std::regex rl(R"(%2 = load double, double\* %a)");
             std::regex x(R"(%3 = fadd double %2, %1)");
             REQUIRE(std::regex_search(module_string, m, rr));
             REQUIRE(std::regex_search(module_string, m, rl));
             REQUIRE(std::regex_search(module_string, m, x));
 
-            // Check lhs
+            // Check lhs.
             std::regex lr(R"(%4 = load double, double\* %b)");
             std::regex ll(R"(%5 = load double, double\* %a)");
             std::regex y(R"(%6 = fsub double %5, %4)");
             REQUIRE(std::regex_search(module_string, m, lr));
             REQUIRE(std::regex_search(module_string, m, ll));
             REQUIRE(std::regex_search(module_string, m, y));
 
-            // Check result
+            // Check result.
             std::regex res(R"(%7 = fdiv double %6, %3)");
             REQUIRE(std::regex_search(module_string, m, res));
         }
@@ -150,13 +150,36 @@ SCENARIO("Binary expression", "[visitor][llvm]") {
             std::string module_string = run_llvm_visitor(nmodl_text);
             std::smatch m;
 
-            // Check store immediate is created
+            // Check store immediate is created.
             std::regex allocation(R"(%i = alloca double)");
             std::regex assignment(R"(store double 2.0*e\+00, double\* %i)");
             REQUIRE(std::regex_search(module_string, m, allocation));
             REQUIRE(std::regex_search(module_string, m, assignment));
         }
     }
+
+    GIVEN("Function with power operator") {
+        std::string nmodl_text = R"(
+            FUNCTION power() {
+                LOCAL i, j
+                i = 2
+                j = 4
+                power = i ^ j
+            }
+        )";
+
+        THEN("'pow' intrinsic is created") {
+            std::string module_string =
+                run_llvm_visitor(nmodl_text, /*opt=*/false, /*use_single_precision=*/true);
+            std::smatch m;
+
+            // Check 'pow' intrinsic.
+ std::regex declaration(R"(declare float @llvm\.pow\.f32\(float, float\))"); + std::regex pow(R"(call float @llvm\.pow\.f32\(float %.*, float %.*\))"); + REQUIRE(std::regex_search(module_string, m, declaration)); + REQUIRE(std::regex_search(module_string, m, pow)); + } + } } //============================================================================= @@ -492,8 +515,44 @@ SCENARIO("Function call", "[visitor][llvm]") { GIVEN("A call to external method") { std::string nmodl_text = R"( - FUNCTION bar(i) { - bar = exp(i) + FUNCTION nmodl_ceil(x) { + nmodl_ceil = ceil(x) + } + + FUNCTION nmodl_cos(x) { + nmodl_cos = cos(x) + } + + FUNCTION nmodl_exp(x) { + nmodl_exp = exp(x) + } + + FUNCTION nmodl_fabs(x) { + nmodl_fabs = fabs(x) + } + + FUNCTION nmodl_floor(x) { + nmodl_floor = floor(x) + } + + FUNCTION nmodl_log(x) { + nmodl_log = log(x) + } + + FUNCTION nmodl_log10(x) { + nmodl_log10 = log10(x) + } + + FUNCTION nmodl_pow(x, y) { + nmodl_pow = pow(x, y) + } + + FUNCTION nmodl_sin(x) { + nmodl_sin = sin(x) + } + + FUNCTION nmodl_sqrt(x) { + nmodl_sqrt = sqrt(x) } )"; @@ -501,13 +560,49 @@ SCENARIO("Function call", "[visitor][llvm]") { std::string module_string = run_llvm_visitor(nmodl_text); std::smatch m; - // Check for intrinsic declaration. + // Check for intrinsic declarations. + std::regex ceil(R"(declare double @llvm\.ceil\.f64\(double\))"); + std::regex cos(R"(declare double @llvm\.cos\.f64\(double\))"); std::regex exp(R"(declare double @llvm\.exp\.f64\(double\))"); + std::regex fabs(R"(declare double @llvm\.fabs\.f64\(double\))"); + std::regex floor(R"(declare double @llvm\.floor\.f64\(double\))"); + std::regex log(R"(declare double @llvm\.log\.f64\(double\))"); + std::regex log10(R"(declare double @llvm\.log10\.f64\(double\))"); + std::regex pow(R"(declare double @llvm\.pow\.f64\(double, double\))"); + std::regex sin(R"(declare double @llvm\.sin\.f64\(double\))"); + std::regex sqrt(R"(declare double @llvm\.sqrt\.f64\(double\))"); + REQUIRE(std::regex_search(module_string, m, ceil)); + REQUIRE(std::regex_search(module_string, m, cos)); REQUIRE(std::regex_search(module_string, m, exp)); + REQUIRE(std::regex_search(module_string, m, fabs)); + REQUIRE(std::regex_search(module_string, m, floor)); + REQUIRE(std::regex_search(module_string, m, log)); + REQUIRE(std::regex_search(module_string, m, log10)); + REQUIRE(std::regex_search(module_string, m, pow)); + REQUIRE(std::regex_search(module_string, m, sin)); + REQUIRE(std::regex_search(module_string, m, sqrt)); // Check the correct call is made. 
- std::regex call(R"(call double @llvm\.exp\.f64\(double %[0-9]+\))");
- REQUIRE(std::regex_search(module_string, m, call));
+ std::regex ceil_call(R"(call double @llvm\.ceil\.f64\(double %[0-9]+\))");
+ std::regex cos_call(R"(call double @llvm\.cos\.f64\(double %[0-9]+\))");
+ std::regex exp_call(R"(call double @llvm\.exp\.f64\(double %[0-9]+\))");
+ std::regex fabs_call(R"(call double @llvm\.fabs\.f64\(double %[0-9]+\))");
+ std::regex floor_call(R"(call double @llvm\.floor\.f64\(double %[0-9]+\))");
+ std::regex log_call(R"(call double @llvm\.log\.f64\(double %[0-9]+\))");
+ std::regex log10_call(R"(call double @llvm\.log10\.f64\(double %[0-9]+\))");
+ std::regex pow_call(R"(call double @llvm\.pow\.f64\(double %[0-9]+, double %[0-9]+\))");
+ std::regex sin_call(R"(call double @llvm\.sin\.f64\(double %[0-9]+\))");
+ std::regex sqrt_call(R"(call double @llvm\.sqrt\.f64\(double %[0-9]+\))");
+ REQUIRE(std::regex_search(module_string, m, ceil_call));
+ REQUIRE(std::regex_search(module_string, m, cos_call));
+ REQUIRE(std::regex_search(module_string, m, exp_call));
+ REQUIRE(std::regex_search(module_string, m, fabs_call));
+ REQUIRE(std::regex_search(module_string, m, floor_call));
+ REQUIRE(std::regex_search(module_string, m, log_call));
+ REQUIRE(std::regex_search(module_string, m, log10_call));
+ REQUIRE(std::regex_search(module_string, m, pow_call));
+ REQUIRE(std::regex_search(module_string, m, sin_call));
+ REQUIRE(std::regex_search(module_string, m, sqrt_call));
 }
 }
@@ -1230,7 +1325,7 @@ SCENARIO("Vector library calls", "[visitor][llvm][vector_lib]") {
 REQUIRE(std::regex_search(no_library_module_str, m, exp_decl));
 REQUIRE(std::regex_search(no_library_module_str, m, exp_call));

-#ifndef LLVM_VERSION_LESS_THAN_13
+#if LLVM_VERSION_MAJOR >= 13
 // Check exponential calls are replaced with calls to SVML library.
 std::string svml_library_module_str = run_llvm_visitor(nmodl_text,
 /*opt=*/false,

From 1461d8f3cd013fa23f4ece6979f6780bbc9c6bdd Mon Sep 17 00:00:00 2001
From: George Mitenkov
Date: Thu, 20 May 2021 14:19:41 -0700
Subject: [PATCH 063/331] Fixed `alloca`s insertion point for LLVM backend (#663)

* With this PR alloca instructions are always inserted in the beginning of
  the function entry block. This is done to avoid placing them inside while
  or for loops, where allocations per iteration cause a stack overflow (if
  the IR is not optimized).
* Insertion point for allocas is the entry block now

See #653
---
 src/codegen/llvm/codegen_llvm_visitor.cpp |  4 +--
 src/codegen/llvm/llvm_ir_builder.cpp      | 38 +++++++++++++++++++++--
 src/codegen/llvm/llvm_ir_builder.hpp      |  7 +++++
 3 files changed, 44 insertions(+), 5 deletions(-)

diff --git a/src/codegen/llvm/codegen_llvm_visitor.cpp b/src/codegen/llvm/codegen_llvm_visitor.cpp
index ba28361e09..6df5820d42 100644
--- a/src/codegen/llvm/codegen_llvm_visitor.cpp
+++ b/src/codegen/llvm/codegen_llvm_visitor.cpp
@@ -601,12 +601,12 @@ void CodegenLLVMVisitor::visit_codegen_for_statement(const ast::CodegenForStatem
 void CodegenLLVMVisitor::visit_codegen_function(const ast::CodegenFunction& node) {
 const auto& name = node.get_node_name();
 const auto& arguments = node.get_arguments();
- llvm::Function* func = module->getFunction(name);
- ir_builder.set_function(func);

 // Create the entry basic block of the function/procedure and point the local named values table
 // to the symbol table.
+ llvm::Function* func = module->getFunction(name);
 ir_builder.create_block_and_set_insertion_point(func);
+ ir_builder.set_function(func);

 // When processing a function, it returns a value named <function_name> in NMODL. Therefore, we
 // first run RenameVisitor to rename it into ret_<function_name>. This will aid in avoiding
diff --git a/src/codegen/llvm/llvm_ir_builder.cpp b/src/codegen/llvm/llvm_ir_builder.cpp
index c67941df3e..004f28d857 100644
--- a/src/codegen/llvm/llvm_ir_builder.cpp
+++ b/src/codegen/llvm/llvm_ir_builder.cpp
@@ -144,7 +144,7 @@ void IRBuilder::allocate_function_arguments(llvm::Function* function,
 for (auto& arg: function->args()) {
 std::string arg_name = nmodl_arguments[i++].get()->get_node_name();
 llvm::Type* arg_type = arg.getType();
- llvm::Value* alloca = builder.CreateAlloca(arg_type, /*ArraySize=*/nullptr, arg_name);
+ llvm::Value* alloca = create_alloca(arg_name, arg_type);
 arg.setName(arg_name);
 builder.CreateStore(&arg, alloca);
 }
@@ -245,11 +245,43 @@ void IRBuilder::set_loop_metadata(llvm::BranchInst* branch) {
 /* LLVM instruction utilities */
 /****************************************************************************************/

+llvm::Value* IRBuilder::create_alloca(const std::string& name, llvm::Type* type) {
+ // If insertion point for `alloca` instructions is not set, then create the instruction in the
+ // entry block and set it to be the insertion point.
+ if (!alloca_ip) {
+ // Get the entry block and insert the `alloca` instruction there.
+ llvm::BasicBlock* current_block = builder.GetInsertBlock();
+ llvm::BasicBlock& entry_block = current_block->getParent()->getEntryBlock();
+ builder.SetInsertPoint(&entry_block);
+ llvm::Value* alloca = builder.CreateAlloca(type, /*ArraySize=*/nullptr, name);
+
+ // Set the `alloca` instruction insertion point and restore the insertion point for the next
+ // set of instructions.
+ alloca_ip = llvm::cast<llvm::AllocaInst>(alloca);
+ builder.SetInsertPoint(current_block);
+ return alloca;
+ }
+
+ // Create `alloca` instruction.
+ llvm::BasicBlock* alloca_block = alloca_ip->getParent();
+ const auto& data_layout = alloca_block->getModule()->getDataLayout();
+ auto* alloca = new llvm::AllocaInst(type,
+ data_layout.getAllocaAddrSpace(),
+ /*ArraySize=*/nullptr,
+ data_layout.getPrefTypeAlign(type),
+ name);
+
+ // Insert `alloca` at the specified insertion point and reset it for the next instructions.
+ alloca_block->getInstList().insertAfter(alloca_ip->getIterator(), alloca);
+ alloca_ip = alloca;
+ return alloca;
+}
+
 void IRBuilder::create_array_alloca(const std::string& name,
 llvm::Type* element_type,
 int num_elements) {
 llvm::Type* array_type = llvm::ArrayType::get(element_type, num_elements);
- builder.CreateAlloca(array_type, /*ArraySize=*/nullptr, name);
+ create_alloca(name, array_type);
 }

 void IRBuilder::create_binary_op(llvm::Value* lhs, llvm::Value* rhs, ast::BinaryOp op) {
@@ -414,7 +446,7 @@ void IRBuilder::create_scalar_or_vector_alloca(const std::string& name,
 } else {
 type = element_or_scalar_type;
 }
- builder.CreateAlloca(type, /*ArraySize=*/nullptr, name);
+ create_alloca(name, type);
 }

 void IRBuilder::create_unary_op(llvm::Value* value, ast::UnaryOp op) {
diff --git a/src/codegen/llvm/llvm_ir_builder.hpp b/src/codegen/llvm/llvm_ir_builder.hpp
index ba3800fc66..744b737392 100644
--- a/src/codegen/llvm/llvm_ir_builder.hpp
+++ b/src/codegen/llvm/llvm_ir_builder.hpp
@@ -46,6 +46,9 @@ class IRBuilder {
 /// Symbol table of the NMODL AST.
 symtab::SymbolTable* symbol_table;

+ /// Insertion point for `alloca` instructions.
+ llvm::Instruction* alloca_ip;
+
 /// Flag to indicate that the generated IR should be vectorized.
 bool vectorize;
@@ -69,6 +72,7 @@ class IRBuilder {
 , symbol_table(nullptr)
 , current_function(nullptr)
 , vectorize(false)
+ , alloca_ip(nullptr)
 , fp_precision(use_single_precision ? single_precision : double_precision)
 , vector_width(vector_width)
 , mask(nullptr)
@@ -104,6 +108,7 @@ class IRBuilder {
 void clear_function() {
 value_stack.clear();
 current_function = nullptr;
+ alloca_ip = nullptr;
 }

 /// Sets the value to be the mask for vector code generation.
@@ -125,6 +130,8 @@ class IRBuilder {
 void allocate_function_arguments(llvm::Function* function,
 const ast::CodegenVarWithTypeVector& nmodl_arguments);

+ llvm::Value* create_alloca(const std::string& name, llvm::Type* type);
+
 /// Generates IR for allocating an array.
 void create_array_alloca(const std::string& name, llvm::Type* element_type, int num_elements);

From 908779ee2daae7cff95b46a745d4b5d1c58516b2 Mon Sep 17 00:00:00 2001
From: George Mitenkov
Date: Thu, 20 May 2021 23:53:04 -0700
Subject: [PATCH 064/331] Fast math flags for LLVM backend (#662)

Added support for fast math flags in LLVM backend. Currently, the user can
specify them via command-line (this approach was chosen for easier
benchmarking). The specified flags are named exactly the same as in LLVM.
This feature is useful to enable previously unsafe FP-math optimizations.
For example, fused-multiply-add instructions can now be generated when
lowering LLVM IR to assembly or executing via JIT.

Example:
```c++
// fma.mod
FUNCTION fma(a, b, c) {
    fma = (a * b) + c
}
```
```bash
$ ./nmodl fma.mod --verbose debug llvm --ir --fmf nnan contract afn --opt
```
```llvm
define double @fma(double %a, double %b, double %c) {
  %1 = fmul nnan contract afn double %a, %b
  %2 = fadd nnan contract afn double %1, %c
  ret double %2
}
```
---
 src/codegen/llvm/codegen_llvm_visitor.hpp |  5 +--
 src/codegen/llvm/llvm_ir_builder.hpp      | 28 +++++++++++++++--
 src/main.cpp                              |  9 +++++-
 test/unit/codegen/codegen_llvm_ir.cpp     | 38 +++++++++++++++++++++--
 4 files changed, 73 insertions(+), 7 deletions(-)

diff --git a/src/codegen/llvm/codegen_llvm_visitor.hpp b/src/codegen/llvm/codegen_llvm_visitor.hpp
index a97e73030a..c3beb53640 100644
--- a/src/codegen/llvm/codegen_llvm_visitor.hpp
+++ b/src/codegen/llvm/codegen_llvm_visitor.hpp
@@ -113,14 +113,15 @@ class CodegenLLVMVisitor: public visitor::ConstAstVisitor {
 bool use_single_precision = false,
 int vector_width = 1,
 std::string vec_lib = "none",
- bool add_debug_information = false)
+ bool add_debug_information = false,
+ std::vector<std::string> fast_math_flags = {})
 : mod_filename(mod_filename)
 , output_dir(output_dir)
 , opt_passes(opt_passes)
 , vector_width(vector_width)
 , vector_library(veclib_map.at(vec_lib))
 , add_debug_information(add_debug_information)
- , ir_builder(*context, use_single_precision, vector_width)
+ , ir_builder(*context, use_single_precision, vector_width, fast_math_flags)
 , debug_builder(*module)
 , codegen_pm(module.get())
 , opt_pm(module.get()) {}
diff --git a/src/codegen/llvm/llvm_ir_builder.hpp b/src/codegen/llvm/llvm_ir_builder.hpp
index 744b737392..b9736e2846 100644
--- a/src/codegen/llvm/llvm_ir_builder.hpp
+++ b/src/codegen/llvm/llvm_ir_builder.hpp
@@ -64,10 +64,14 @@ class IRBuilder {
 /// The name of induction variable used in kernel loops.
 std::string kernel_id;

+ /// Fast math flags for floating-point IR instructions.
+ std::vector<std::string> fast_math_flags;
+
 public:
 IRBuilder(llvm::LLVMContext& context,
 bool use_single_precision = false,
- unsigned vector_width = 1)
+ unsigned vector_width = 1,
+ std::vector<std::string> fast_math_flags = {})
 : builder(context)
 , symbol_table(nullptr)
 , current_function(nullptr)
 , vectorize(false)
 , fp_precision(use_single_precision ? single_precision : double_precision)
 , vector_width(vector_width)
 , mask(nullptr)
- , kernel_id("") {}
+ , kernel_id("")
+ , fast_math_flags(fast_math_flags) {}
+
+ /// Transforms the fast math flags provided to the builder into LLVM's representation.
+ llvm::FastMathFlags transform_to_fmf(std::vector<std::string>& flags) {
+ static const std::map<std::string, void (llvm::FastMathFlags::*)(bool)> set_flag = {
+ {"nnan", &llvm::FastMathFlags::setNoNaNs},
+ {"ninf", &llvm::FastMathFlags::setNoInfs},
+ {"nsz", &llvm::FastMathFlags::setNoSignedZeros},
+ {"contract", &llvm::FastMathFlags::setAllowContract},
+ {"afn", &llvm::FastMathFlags::setApproxFunc},
+ {"reassoc", &llvm::FastMathFlags::setAllowReassoc},
+ {"fast", &llvm::FastMathFlags::setFast}};
+ llvm::FastMathFlags fmf;
+ for (const auto& flag: flags) {
+ (fmf.*(set_flag.at(flag)))(true);
+ }
+ return fmf;
+ }

 /// Initializes the builder with the symbol table and the kernel induction variable id.
 void initialize(symtab::SymbolTable& symbol_table, std::string& kernel_id) {
+ if (!fast_math_flags.empty())
+ builder.setFastMathFlags(transform_to_fmf(fast_math_flags));
 this->symbol_table = &symbol_table;
 this->kernel_id = kernel_id;
 }
diff --git a/src/main.cpp b/src/main.cpp
index ee781444c8..b3f86c9e0b 100644
--- a/src/main.cpp
+++ b/src/main.cpp
@@ -183,6 +183,9 @@ int main(int argc, const char* argv[]) {
 /// disable debug information generation for the IR
 bool disable_debug_information(false);

+ /// fast math flags for LLVM backend
+ std::vector<std::string> llvm_fast_math_flags;
+
 /// run llvm benchmark
 bool run_llvm_benchmark(false);

@@ -330,6 +333,9 @@ int main(int argc, const char* argv[]) {
 llvm_opt->add_option("--veclib",
 vector_library,
 "Vector library for maths functions ({})"_format(vector_library))->check(CLI::IsMember({"Accelerate", "libmvec", "MASSV", "SVML", "none"}));
+ llvm_opt->add_option("--fmf",
+ llvm_fast_math_flags,
+ "Fast math flags for floating-point optimizations (none)")->check(CLI::IsMember({"afn", "arcp", "contract", "ninf", "nnan", "nsz", "reassoc", "fast"}));

 // LLVM IR benchmark options.
 auto benchmark_opt = app.add_subcommand("benchmark", "LLVM benchmark option")->ignore_case();
@@ -659,7 +665,8 @@ int main(int argc, const char* argv[]) {
 llvm_float_type,
 llvm_vec_width,
 vector_library,
- !disable_debug_information);
+ !disable_debug_information,
+ llvm_fast_math_flags);
 visitor.visit_program(*ast);
 ast_to_nmodl(*ast, filepath("llvm", "mod"));
 ast_to_json(*ast, filepath("llvm", "json"));
diff --git a/test/unit/codegen/codegen_llvm_ir.cpp b/test/unit/codegen/codegen_llvm_ir.cpp
index 0a3facf6fc..f338e13234 100644
--- a/test/unit/codegen/codegen_llvm_ir.cpp
+++ b/test/unit/codegen/codegen_llvm_ir.cpp
@@ -37,7 +37,8 @@ std::string run_llvm_visitor(const std::string& text,
 bool opt = false,
 bool use_single_precision = false,
 int vector_width = 1,
- std::string vec_lib = "none") {
+ std::string vec_lib = "none",
+ std::vector<std::string> fast_math_flags = {}) {
 NmodlDriver driver;
 const auto& ast = driver.parse_string(text);

@@ -50,7 +51,9 @@ std::string run_llvm_visitor(const std::string& text,
 opt,
 use_single_precision,
 vector_width,
- vec_lib);
+ vec_lib,
+ /*add_debug_information=*/false,
+ fast_math_flags);
 llvm_visitor.visit_program(*ast);
 return llvm_visitor.dump_module();
 }
@@ -1378,6 +1381,37 @@ SCENARIO("Vector library calls", "[visitor][llvm][vector_lib]") {
 }
 }

+//=============================================================================
+// Fast math flags
+//=============================================================================
+
+SCENARIO("Fast math flags", "[visitor][llvm]") {
+ GIVEN("A function to produce fma and specified math flags") {
+ std::string nmodl_text = R"(
+ FUNCTION foo(a, b, c) {
+ foo = (a * b) + c
+ }
+ )";
+
+ THEN("instructions are generated with the flags set") {
+ std::string module_string =
+ run_llvm_visitor(nmodl_text,
+ /*opt=*/true,
+ /*use_single_precision=*/false,
+ /*vector_width=*/1,
+ /*vec_lib=*/"none",
+ /*fast_math_flags=*/{"nnan", "contract", "afn"});
+ std::smatch m;
+
+ // Check flags for produced 'fmul' and 'fadd' instructions.
+ std::regex fmul(R"(fmul nnan contract afn double %.*, %.*)");
+ std::regex fadd(R"(fadd nnan contract afn double %.*, %.*)");
+ REQUIRE(std::regex_search(module_string, m, fmul));
+ REQUIRE(std::regex_search(module_string, m, fadd));
+ }
+ }
+}
+
 //=============================================================================
 // Optimization : dead code removal
 //=============================================================================

From 2ca85e55e18b8eb2d381041aaf8df80ccbda012b Mon Sep 17 00:00:00 2001
From: Ioannis Magkanaris
Date: Fri, 21 May 2021 14:04:39 +0200
Subject: [PATCH 065/331] Avoid generating LLVM IR for Functions and Procedures
 if inlined (#664)

---
 CMakeLists.txt                            | 71 +++++++++----------
 .../llvm/codegen_llvm_helper_visitor.cpp   | 18 +++++
 .../llvm/codegen_llvm_helper_visitor.hpp   | 10 ++-
 src/codegen/llvm/codegen_llvm_visitor.cpp  |  2 +-
 src/codegen/llvm/codegen_llvm_visitor.hpp  |  7 +-
 src/main.cpp                               |  3 +-
 test/unit/codegen/codegen_llvm_ir.cpp      | 66 ++++++++++++++++-
 7 files changed, 132 insertions(+), 45 deletions(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 07bfde7ba5..8e47a4bd76 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -241,41 +241,40 @@ endif()
 message(STATUS "")
 message(STATUS "Configured NMODL ${PROJECT_VERSION} (${GIT_REVISION})")
 message(STATUS "")
-string(TOLOWER "${CMAKE_GENERATOR}" cmake_generator_tolower)
-if(cmake_generator_tolower MATCHES "makefile")
-  message(STATUS "Some things you can do now:")
-  message(STATUS "--------------------+--------------------------------------------------------")
-  message(STATUS "Command             | Description")
-  message(STATUS "--------------------+--------------------------------------------------------")
-  message(STATUS "make                | Build the project")
-  message(STATUS "make test           | Run unit tests")
-  message(STATUS "make install        | Will install NMODL to: ${CMAKE_INSTALL_PREFIX}")
-  message(STATUS "--------------------+--------------------------------------------------------")
-  message(STATUS " Build option       | Status")
-  message(STATUS "--------------------+--------------------------------------------------------")
-  message(STATUS "CXX COMPILER        | ${CMAKE_CXX_COMPILER}")
-  message(STATUS "COMPILE FLAGS       | ${COMPILER_FLAGS}")
-  message(STATUS "Build Type          | ${CMAKE_BUILD_TYPE}")
-  message(STATUS "Legacy Units        | ${NMODL_ENABLE_LEGACY_UNITS}")
-  message(STATUS "Python Bindings     | ${NMODL_ENABLE_PYTHON_BINDINGS}")
-  message(STATUS "Flex                | ${FLEX_EXECUTABLE}")
-  message(STATUS "Bison               | ${BISON_EXECUTABLE}")
-  message(STATUS "Python              | ${PYTHON_EXECUTABLE}")
-  message(STATUS "LLVM Codegen        | ${NMODL_ENABLE_LLVM}")
-  if(NMODL_ENABLE_LLVM)
-    message(STATUS "  VERSION           | ${LLVM_PACKAGE_VERSION}")
-    message(STATUS "  INCLUDE           | ${LLVM_INCLUDE_DIRS}")
-    message(STATUS "  CMAKE             | ${LLVM_CMAKE_DIR}")
-    message(STATUS "  JIT LISTENERS     | ${NMODL_ENABLE_JIT_EVENT_LISTENERS}")
-  endif()
-  if(NMODL_CLANG_FORMAT)
-    message(STATUS "Clang Format        | ${ClangFormat_EXECUTABLE}")
-  endif()
-  if(NMODL_CMAKE_FORMAT)
-    message(STATUS "Cmake Format        | ${CMakeFormat_EXECUTABLE}")
-  endif()
-  message(STATUS "--------------+--------------------------------------------------------------")
-  message(STATUS " See documentation : https://github.com/BlueBrain/nmodl/")
-  message(STATUS "--------------+--------------------------------------------------------------")
+
+message(STATUS "Some things you can do now:")
+message(STATUS "--------------------+--------------------------------------------------------")
+message(STATUS "Command             | Description")
+message(STATUS "--------------------+--------------------------------------------------------")
+message(STATUS "make                | Build the project")
+message(STATUS "make test           | Run unit tests")
+message(STATUS "make install        | Will install NMODL to: ${CMAKE_INSTALL_PREFIX}")
+message(STATUS "--------------------+--------------------------------------------------------")
+message(STATUS " Build option       | Status")
+message(STATUS "--------------------+--------------------------------------------------------")
+message(STATUS "CXX COMPILER        | ${CMAKE_CXX_COMPILER}")
+message(STATUS "COMPILE FLAGS       | ${COMPILER_FLAGS}")
+message(STATUS "Build Type          | ${CMAKE_BUILD_TYPE}")
+message(STATUS "Legacy Units        | ${NMODL_ENABLE_LEGACY_UNITS}")
+message(STATUS "Python Bindings     | ${NMODL_ENABLE_PYTHON_BINDINGS}")
+message(STATUS "Flex                | ${FLEX_EXECUTABLE}")
+message(STATUS "Bison               | ${BISON_EXECUTABLE}")
+message(STATUS "Python              | ${PYTHON_EXECUTABLE}")
+message(STATUS "LLVM Codegen        | ${NMODL_ENABLE_LLVM}")
+if(NMODL_ENABLE_LLVM)
+  message(STATUS "  VERSION           | ${LLVM_PACKAGE_VERSION}")
+  message(STATUS "  INCLUDE           | ${LLVM_INCLUDE_DIRS}")
+  message(STATUS "  CMAKE             | ${LLVM_CMAKE_DIR}")
+  message(STATUS "  JIT LISTENERS     | ${NMODL_ENABLE_JIT_EVENT_LISTENERS}")
+endif()
+if(NMODL_CLANG_FORMAT)
+  message(STATUS "Clang Format        | ${ClangFormat_EXECUTABLE}")
endif()
+if(NMODL_CMAKE_FORMAT)
+  message(STATUS "Cmake Format        | ${CMakeFormat_EXECUTABLE}")
+endif()
+message(STATUS "--------------+--------------------------------------------------------------")
+message(STATUS " See documentation : https://github.com/BlueBrain/nmodl/")
+message(STATUS "--------------+--------------------------------------------------------------")
+
 message(STATUS "")
diff --git a/src/codegen/llvm/codegen_llvm_helper_visitor.cpp b/src/codegen/llvm/codegen_llvm_helper_visitor.cpp
index 5974edc623..ee9387be94 100644
--- a/src/codegen/llvm/codegen_llvm_helper_visitor.cpp
+++ b/src/codegen/llvm/codegen_llvm_helper_visitor.cpp
@@ -198,6 +198,7 @@ void CodegenLLVMHelperVisitor::create_function_for_node(ast::Block& node) {
 }
 codegen_functions.push_back(function);
 }
+
 /**
 * \note : Order of variables is not important but we assume all pointers
 * are added first and then scalar variables like t, dt, second_order etc.
@@ -536,11 +537,17 @@ void CodegenLLVMHelperVisitor::rename_local_variables(ast::StatementBlock& node)

 void CodegenLLVMHelperVisitor::visit_procedure_block(ast::ProcedureBlock& node) {
+ // if the Procedure block is already inlined, there is no reason to generate the LLVM IR code
+ if (nmodl_inline)
+ return;
 node.visit_children(*this);
 create_function_for_node(node);
 }

 void CodegenLLVMHelperVisitor::visit_function_block(ast::FunctionBlock& node) {
+ // if the Function block is already inlined, there is no reason to generate the LLVM IR code
+ if (nmodl_inline)
+ return;
 node.visit_children(*this);
 create_function_for_node(node);
 }
@@ -786,6 +793,17 @@ void CodegenLLVMHelperVisitor::visit_program(ast::Program& node) {
 for (auto& fun: codegen_functions) {
 node.emplace_back_node(fun);
 }
+ // Remove Function and Procedure blocks from the Program since they are already inlined
+ if (nmodl_inline) {
+ const auto& func_proc_nodes =
+ collect_nodes(node,
+ {ast::AstNodeType::FUNCTION_BLOCK, ast::AstNodeType::PROCEDURE_BLOCK});
+ std::unordered_set<ast::Node*> nodes_to_erase;
+ for (const auto& ast_node: func_proc_nodes) {
+ nodes_to_erase.insert(static_cast<ast::Node*>(ast_node.get()));
+ }
+ node.erase_node(nodes_to_erase);
+ }
 }

diff --git a/src/codegen/llvm/codegen_llvm_helper_visitor.hpp b/src/codegen/llvm/codegen_llvm_helper_visitor.hpp
index bbff588675..3619cbc32e 100644
--- a/src/codegen/llvm/codegen_llvm_helper_visitor.hpp
+++ b/src/codegen/llvm/codegen_llvm_helper_visitor.hpp
@@ -100,9 +100,12 @@ struct InstanceVarHelper {
 * these will be common across all backends.
 */
 class CodegenLLVMHelperVisitor: public visitor::AstVisitor {
- // explicit vectorisation width
+ /// explicit vectorisation width
 int vector_width;

+ /// variable to check whether Function and Procedure blocks are inlined by NMODL passes
+ bool nmodl_inline;
+
 /// newly generated code generation specific functions
 CodegenFunctionVector codegen_functions;

@@ -134,8 +137,9 @@ class CodegenLLVMHelperVisitor: public visitor::AstVisitor {
 static const std::string VOLTAGE_VAR;
 static const std::string NODE_INDEX_VAR;

- CodegenLLVMHelperVisitor(int vector_width)
- : vector_width(vector_width){};
+ CodegenLLVMHelperVisitor(int vector_width, bool nmodl_inline)
+ : vector_width(vector_width)
+ , nmodl_inline(nmodl_inline) {}

 const InstanceVarHelper& get_instance_var_helper() {
 return instance_var_helper;
diff --git a/src/codegen/llvm/codegen_llvm_visitor.cpp b/src/codegen/llvm/codegen_llvm_visitor.cpp
index 6df5820d42..515949e329 100644
--- a/src/codegen/llvm/codegen_llvm_visitor.cpp
+++ b/src/codegen/llvm/codegen_llvm_visitor.cpp
@@ -770,7 +770,7 @@ void CodegenLLVMVisitor::visit_program(const ast::Program& node) {
 // - convert function and procedure blocks into CodegenFunctions
 // - gather information about AST. For now, information about functions
 // and procedures is used only.
- CodegenLLVMHelperVisitor v{vector_width};
+ CodegenLLVMHelperVisitor v{vector_width, nmodl_inline};
 const auto& functions = v.get_codegen_functions(node);
 instance_var_helper = v.get_instance_var_helper();
 sym_tab = node.get_symbol_table();
diff --git a/src/codegen/llvm/codegen_llvm_visitor.hpp b/src/codegen/llvm/codegen_llvm_visitor.hpp
index c3beb53640..cbc0f9b949 100644
--- a/src/codegen/llvm/codegen_llvm_visitor.hpp
+++ b/src/codegen/llvm/codegen_llvm_visitor.hpp
@@ -69,6 +69,9 @@ class CodegenLLVMVisitor: public visitor::ConstAstVisitor {
 /// Output directory for code generation.
 std::string output_dir;

+ /// Variable to check if Functions and Procedures are inlined by NMODL passes
+ bool nmodl_inline;
+
 private:
 /// Underlying LLVM context.
 std::unique_ptr<llvm::LLVMContext> context = std::make_unique<llvm::LLVMContext>();

@@ -114,9 +117,11 @@
 int vector_width = 1,
 std::string vec_lib = "none",
 bool add_debug_information = false,
- std::vector<std::string> fast_math_flags = {})
+ std::vector<std::string> fast_math_flags = {},
+ bool nmodl_inline = false)
 : mod_filename(mod_filename)
 , output_dir(output_dir)
+ , nmodl_inline(nmodl_inline)
 , opt_passes(opt_passes)
 , vector_width(vector_width)
 , vector_library(veclib_map.at(vec_lib))
diff --git a/src/main.cpp b/src/main.cpp
index b3f86c9e0b..fc2922a73e 100644
--- a/src/main.cpp
+++ b/src/main.cpp
@@ -666,7 +666,8 @@ int main(int argc, const char* argv[]) {
 llvm_vec_width,
 vector_library,
 !disable_debug_information,
- llvm_fast_math_flags);
+ llvm_fast_math_flags,
+ nmodl_inline);
 visitor.visit_program(*ast);
 ast_to_nmodl(*ast, filepath("llvm", "mod"));
 ast_to_json(*ast, filepath("llvm", "json"));
diff --git a/test/unit/codegen/codegen_llvm_ir.cpp b/test/unit/codegen/codegen_llvm_ir.cpp
index f338e13234..0953034c99 100644
--- a/test/unit/codegen/codegen_llvm_ir.cpp
+++ b/test/unit/codegen/codegen_llvm_ir.cpp
@@ -16,6 +16,7 @@
 #include "codegen/llvm/codegen_llvm_visitor.hpp"
 #include "parser/nmodl_driver.hpp"
 #include "visitors/checkparent_visitor.hpp"
+#include "visitors/inline_visitor.hpp"
 #include "visitors/neuron_solve_visitor.hpp"
 #include "visitors/solve_block_visitor.hpp"
 #include "visitors/symtab_visitor.hpp"
@@ -38,11 +39,15 @@ std::string run_llvm_visitor(const std::string& text,
 bool use_single_precision = false,
 int vector_width = 1,
 std::string vec_lib = "none",
- std::vector<std::string> fast_math_flags = {}) {
+ std::vector<std::string> fast_math_flags = {},
+ bool nmodl_inline = false) {
 NmodlDriver driver;
 const auto& ast = driver.parse_string(text);

 SymtabVisitor().visit_program(*ast);
+ if (nmodl_inline) {
+ InlineVisitor().visit_program(*ast);
+ }
 NeuronSolveVisitor().visit_program(*ast);
 SolveBlockVisitor().visit_program(*ast);

@@ -53,7 +58,9 @@ std::string run_llvm_visitor(const std::string& text,
 vector_width,
 vec_lib,
 /*add_debug_information=*/false,
- fast_math_flags);
+ fast_math_flags,
+ nmodl_inline);
+
 llvm_visitor.visit_program(*ast);
 return llvm_visitor.dump_module();
 }
@@ -71,7 +78,7 @@ std::vector<std::shared_ptr<ast::Ast>> run_llvm_visitor_helper(
 SymtabVisitor().visit_program(*ast);
 SolveBlockVisitor().visit_program(*ast);

- CodegenLLVMHelperVisitor(vector_width).visit_program(*ast);
+ CodegenLLVMHelperVisitor(vector_width, /*nmodl_inline=*/false).visit_program(*ast);

 const auto& nodes = collect_nodes(*ast, nodes_to_collect);

@@ -1436,3 +1443,56 @@ SCENARIO("Dead code removal", "[visitor][llvm][opt]") {
 }
 }
 }
+
+//=============================================================================
+// Inlining: remove inline code blocks
+//=============================================================================
+
+SCENARIO("Removal of inlined functions and procedures", "[visitor][llvm][inline]") {
+ GIVEN("Simple breakpoint block calling a function and a procedure") {
+ std::string nmodl_text = R"(
+ NEURON {
+ SUFFIX test_inline
+ RANGE a, b, s
+ }
+ ASSIGNED {
+ a
+ b
+ s
+ }
+ PROCEDURE test_add(a, b) {
+ LOCAL i
+ i = a + b
+ }
+ FUNCTION test_sub(a, b) {
+ test_sub = a - b
+ }
+ BREAKPOINT {
+ SOLVE states METHOD cnexp
+ }
+ DERIVATIVE states {
+ a = 1
+ b = 2
+ test_add(a, b)
+ s = test_sub(a, b)
+ }
+ )";
+
+ THEN("when the code is inlined the procedure and function blocks are removed") {
+ std::string module_string = run_llvm_visitor(nmodl_text,
+ /*opt=*/false,
+ /*use_single_precision=*/false,
+ /*vector_width=*/1,
+ /*vec_lib=*/"none",
+ /*fast_math_flags=*/{},
+ /*nmodl_inline=*/true);
+ std::smatch m;
+
+ // Check if the procedure and function declarations are removed
+ std::regex add_proc(R"(define i32 @test_add\(double %a[0-9].*, double %b[0-9].*\))");
+ REQUIRE(!std::regex_search(module_string, m, add_proc));
+ std::regex sub_func(R"(define double @test_sub\(double %a[0-9].*, double %b[0-9].*\))");
+ REQUIRE(!std::regex_search(module_string, m, sub_func));
+ }
+ }
+}

From 7fdbb4f2f6900e5a605992b482a6dbfba296c07c Mon Sep 17 00:00:00 2001
From: George Mitenkov
Date: Fri, 21 May 2021 08:21:07 -0700
Subject: [PATCH 066/331] Fixed typo in benchmarking metrics (#665)

---
 test/benchmark/llvm_benchmark.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/test/benchmark/llvm_benchmark.cpp b/test/benchmark/llvm_benchmark.cpp
index b9f2fdeced..e48df0d457 100644
--- a/test/benchmark/llvm_benchmark.cpp
+++ b/test/benchmark/llvm_benchmark.cpp
@@ -144,7 +144,7 @@ void LLVMBenchmark::run_benchmark(const std::shared_ptr<ast::Program>& node) {
 logger->info("Compute time variance = {:g}",
 time_squared_sum / num_experiments - time_mean * time_mean);
 logger->info("Minimum compute time = {:.6f}", time_min);
- logger->info("Minimum compute time = {:.6f}\n", time_max);
+ logger->info("Maximum compute time = {:.6f}\n", time_max);
 }
 }

From 4c585f396df70bb50ed322d52554cf2c2c5a061a Mon Sep 17 00:00:00 2001
From: Ioannis Magkanaris
Date: Fri, 21 May 2021 23:04:19 +0200
Subject: [PATCH 067/331] Remove only inlined blocks from AST based on symtab
 properties (#668)

---
 .../llvm/codegen_llvm_helper_visitor.cpp   | 36 ++++++++++---------
 .../llvm/codegen_llvm_helper_visitor.hpp   | 11 +++---
 src/codegen/llvm/codegen_llvm_visitor.cpp  |  2 +-
 src/codegen/llvm/codegen_llvm_visitor.hpp  |  7 +---
 src/main.cpp                               |  3 +-
 test/unit/codegen/codegen_llvm_ir.cpp      |  5 ++-
 6 files changed, 29 insertions(+), 35 deletions(-)

diff --git a/src/codegen/llvm/codegen_llvm_helper_visitor.cpp b/src/codegen/llvm/codegen_llvm_helper_visitor.cpp
index ee9387be94..654afd8ef5 100644
--- a/src/codegen/llvm/codegen_llvm_helper_visitor.cpp
+++ b/src/codegen/llvm/codegen_llvm_helper_visitor.cpp
@@ -10,6 +10,7 @@
 #include "ast/all.hpp"
 #include "codegen/codegen_helper_visitor.hpp"
+#include "symtab/symbol_table.hpp"
 #include "utils/logger.hpp"
 #include "visitors/rename_visitor.hpp"
 #include "visitors/visitor_utils.hpp"
@@ -19,6 +20,8 @@ namespace codegen {

 using namespace fmt::literals;

+using symtab::syminfo::Status;
+
 /// initialize static member variables
 const ast::AstNodeType CodegenLLVMHelperVisitor::INTEGER_TYPE = ast::AstNodeType::INTEGER;
 const ast::AstNodeType CodegenLLVMHelperVisitor::FLOAT_TYPE = ast::AstNodeType::DOUBLE;
@@ -537,17 +540,11 @@ void CodegenLLVMHelperVisitor::rename_local_variables(ast::StatementBlock& node)

 void CodegenLLVMHelperVisitor::visit_procedure_block(ast::ProcedureBlock& node) {
- // if the Procedure block is already inlined, there is no reason to generate the LLVM IR code
- if (nmodl_inline)
- return;
 node.visit_children(*this);
 create_function_for_node(node);
 }

 void CodegenLLVMHelperVisitor::visit_function_block(ast::FunctionBlock& node) {
- // if the Function block is already inlined, there is no reason to generate the LLVM IR code
- if (nmodl_inline)
- return;
 node.visit_children(*this);
 create_function_for_node(node);
 }
@@ -780,6 +777,21 @@ void CodegenLLVMHelperVisitor::visit_nrn_state_block(ast::NrnStateBlock& node) {
 std::cout << nmodl::to_nmodl(function) << std::endl;
 }

+void CodegenLLVMHelperVisitor::remove_inlined_nodes(ast::Program& node) {
+ auto program_symtab = node.get_model_symbol_table();
+ const auto& func_proc_nodes =
+ collect_nodes(node, {ast::AstNodeType::FUNCTION_BLOCK, ast::AstNodeType::PROCEDURE_BLOCK});
+ std::unordered_set<ast::Node*> nodes_to_erase;
+ for (const auto& ast_node: func_proc_nodes) {
+ if (program_symtab->lookup(ast_node->get_node_name())
+ .get()
+ ->has_all_status(Status::inlined)) {
+ nodes_to_erase.insert(static_cast<ast::Node*>(ast_node.get()));
+ }
+ }
+ node.erase_node(nodes_to_erase);
+}
+
 void CodegenLLVMHelperVisitor::visit_program(ast::Program& node) {
 /// run codegen helper visitor to collect information
 CodegenHelperVisitor v;
@@ -789,21 +801,11 @@ void CodegenLLVMHelperVisitor::visit_program(ast::Program& node) {
 node.emplace_back_node(instance_var_helper.instance);

 logger->info("Running CodegenLLVMHelperVisitor");
+ remove_inlined_nodes(node);
 node.visit_children(*this);
 for (auto& fun: codegen_functions) {
 node.emplace_back_node(fun);
 }
- // Remove Function and Procedure blocks from the Program since they are already inlined
- if (nmodl_inline) {
- const auto& func_proc_nodes =
- collect_nodes(node,
- {ast::AstNodeType::FUNCTION_BLOCK, ast::AstNodeType::PROCEDURE_BLOCK});
- std::unordered_set<ast::Node*> nodes_to_erase;
- for (const auto& ast_node: func_proc_nodes) {
- nodes_to_erase.insert(static_cast<ast::Node*>(ast_node.get()));
- }
- node.erase_node(nodes_to_erase);
- }
 }

diff --git a/src/codegen/llvm/codegen_llvm_helper_visitor.hpp b/src/codegen/llvm/codegen_llvm_helper_visitor.hpp
index 3619cbc32e..9d79e24803 100644
--- a/src/codegen/llvm/codegen_llvm_helper_visitor.hpp
+++ b/src/codegen/llvm/codegen_llvm_helper_visitor.hpp
@@ -103,9 +103,6 @@ class CodegenLLVMHelperVisitor: public visitor::AstVisitor {
 /// explicit vectorisation width
 int vector_width;

- /// variable to check whether Function and Procedure blocks are inlined by NMODL passes
- bool nmodl_inline;
-
 /// newly generated code generation specific functions
 CodegenFunctionVector codegen_functions;

@@ -137,9 +134,8 @@ class CodegenLLVMHelperVisitor: public visitor::AstVisitor {
 static const std::string VOLTAGE_VAR;
 static const std::string NODE_INDEX_VAR;

- CodegenLLVMHelperVisitor(int vector_width, bool nmodl_inline)
- : vector_width(vector_width)
- , nmodl_inline(nmodl_inline) {}
+ CodegenLLVMHelperVisitor(int vector_width)
+ : vector_width(vector_width) {}

 const InstanceVarHelper& get_instance_var_helper() {
 return instance_var_helper;
@@ -169,6 +165,9 @@
 void convert_local_statement(ast::StatementBlock& node);
 void rename_local_variables(ast::StatementBlock& node);

+ /// Remove Function and Procedure blocks from the node since they are already inlined
+ void remove_inlined_nodes(ast::Program& node);
+
 void visit_procedure_block(ast::ProcedureBlock& node) override;
 void visit_function_block(ast::FunctionBlock& node) override;
 void visit_nrn_state_block(ast::NrnStateBlock& node) override;
diff --git a/src/codegen/llvm/codegen_llvm_visitor.cpp b/src/codegen/llvm/codegen_llvm_visitor.cpp
index 515949e329..6df5820d42 100644
--- a/src/codegen/llvm/codegen_llvm_visitor.cpp
+++ b/src/codegen/llvm/codegen_llvm_visitor.cpp
@@ -770,7 +770,7 @@ void CodegenLLVMVisitor::visit_program(const ast::Program& node) {
 // - convert function and procedure blocks into CodegenFunctions
 // - gather information about AST. For now, information about functions
 // and procedures is used only.
- CodegenLLVMHelperVisitor v{vector_width, nmodl_inline};
+ CodegenLLVMHelperVisitor v{vector_width};
 const auto& functions = v.get_codegen_functions(node);
 instance_var_helper = v.get_instance_var_helper();
 sym_tab = node.get_symbol_table();
diff --git a/src/codegen/llvm/codegen_llvm_visitor.hpp b/src/codegen/llvm/codegen_llvm_visitor.hpp
index cbc0f9b949..c3beb53640 100644
--- a/src/codegen/llvm/codegen_llvm_visitor.hpp
+++ b/src/codegen/llvm/codegen_llvm_visitor.hpp
@@ -69,9 +69,6 @@ class CodegenLLVMVisitor: public visitor::ConstAstVisitor {
 /// Output directory for code generation.
 std::string output_dir;

- /// Variable to check if Functions and Procedures are inlined by NMODL passes
- bool nmodl_inline;
-
 private:
 /// Underlying LLVM context.
 std::unique_ptr<llvm::LLVMContext> context = std::make_unique<llvm::LLVMContext>();
@@ -114,9 +114,7 @@
 int vector_width = 1,
 std::string vec_lib = "none",
 bool add_debug_information = false,
- std::vector<std::string> fast_math_flags = {},
- bool nmodl_inline = false)
+ std::vector<std::string> fast_math_flags = {})
 : mod_filename(mod_filename)
 , output_dir(output_dir)
- , nmodl_inline(nmodl_inline)
 , opt_passes(opt_passes)
 , vector_width(vector_width)
 , vector_library(veclib_map.at(vec_lib))
diff --git a/src/main.cpp b/src/main.cpp
index fc2922a73e..b3f86c9e0b 100644
--- a/src/main.cpp
+++ b/src/main.cpp
@@ -666,8 +666,7 @@ int main(int argc, const char* argv[]) {
 llvm_vec_width,
 vector_library,
 !disable_debug_information,
- llvm_fast_math_flags,
- nmodl_inline);
+ llvm_fast_math_flags);
 visitor.visit_program(*ast);
 ast_to_nmodl(*ast, filepath("llvm", "mod"));
 ast_to_json(*ast, filepath("llvm", "json"));
diff --git a/test/unit/codegen/codegen_llvm_ir.cpp b/test/unit/codegen/codegen_llvm_ir.cpp
index 0953034c99..a0a4af297c 100644
--- a/test/unit/codegen/codegen_llvm_ir.cpp
+++ b/test/unit/codegen/codegen_llvm_ir.cpp
@@ -58,8 +58,7 @@ std::string run_llvm_visitor(const std::string& text,
 vector_width,
 vec_lib,
 /*add_debug_information=*/false,
- fast_math_flags,
- nmodl_inline);
+ fast_math_flags);

 llvm_visitor.visit_program(*ast);
 return llvm_visitor.dump_module();
@@ -78,7 +77,7 @@ std::vector<std::shared_ptr<ast::Ast>> run_llvm_visitor_helper(
 SymtabVisitor().visit_program(*ast);
 SolveBlockVisitor().visit_program(*ast);

- CodegenLLVMHelperVisitor(vector_width, /*nmodl_inline=*/false).visit_program(*ast);
+ CodegenLLVMHelperVisitor(vector_width).visit_program(*ast);

 const auto& nodes = collect_nodes(*ast, nodes_to_collect);

From f0a3afc70e23a277259bd2543588788ef9b5f633 Mon Sep 17 00:00:00 2001
From: Pramod Kumbhar
Date: Tue, 25 May 2021 12:22:14 +0200
Subject: [PATCH 068/331] Use VarName on the RHS of assignment expression (#669)

- NMODL parser uses VarName on the LHS of assignment expression
- Inline visitor was using Name on the LHS of assignment expression

Related to #667
---
 src/visitors/inline_visitor.cpp | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/src/visitors/inline_visitor.cpp b/src/visitors/inline_visitor.cpp
index cb723f0f1c..d628233dd7 100644
--- a/src/visitors/inline_visitor.cpp
+++ b/src/visitors/inline_visitor.cpp
@@ -298,6 +298,8 @@ void InlineVisitor::visit_statement_block(StatementBlock& node) {
 /** Visit all wrapped expressions which can contain function calls.
 * If a function call is replaced then the wrapped expression is
 * also replaced with new variable node from the inlining result.
+ * Note that we use `VarName` so that LHS of assignment expression
+ * is `VarName`, similar to parser.
 */
 void InlineVisitor::visit_wrapped_expression(WrappedExpression& node) {
 node.visit_children(*this);
@@ -306,7 +308,9 @@ void InlineVisitor::visit_wrapped_expression(WrappedExpression& node) {
 auto expression = dynamic_cast<FunctionCall*>(e.get());
 if (replaced_fun_calls.find(expression) != replaced_fun_calls.end()) {
 auto var = replaced_fun_calls[expression];
- node.set_expression(std::make_shared<Name>(new String(var)));
+ node.set_expression(std::make_shared<VarName>(new Name(new String(var)),
+ /*at=*/nullptr,
+ /*index=*/nullptr));
 }
 }
 }

From 2609f877ae63b457cf59cfbffa5a367dca22414b Mon Sep 17 00:00:00 2001
From: George Mitenkov
Date: Sun, 30 May 2021 07:07:19 -0700
Subject: [PATCH 069/331] [LLVM] SLEEF and libsystem_m vector libraries support
 (#674)

* Added support for `libsystem_m` and `SLEEF` vector libraries. The first
  one is supported by LLVM internally, so it comes for free with LLVM 13.
  For `SLEEF`, basic support was added for AArch64 and x86 architectures.
  Currently, we support
  - `exp`
  - `pow`

* Added corresponding IR checks for `libsystem_m` and `SLEEF` (both AArch64
  and x86).

* Updated LLVM binaries for MAC OS CI, as well as for latest LLVM 13 (trunk)
  to fix link errors for Darwin vector library.

Co-authored-by: Pramod Kumbhar
---
 azure-pipelines.yml                       |  4 +-
 ci/bb5-pr.sh                              |  2 +-
 src/codegen/llvm/codegen_llvm_visitor.cpp | 79 +++++++++++++++++++----
 src/codegen/llvm/codegen_llvm_visitor.hpp | 21 +++---
 src/main.cpp                              |  2 +-
 test/unit/codegen/codegen_llvm_ir.cpp     | 30 +++++++++
 6 files changed, 111 insertions(+), 27 deletions(-)

diff --git a/azure-pipelines.yml b/azure-pipelines.yml
index ffe744d6f9..59f5d5bb04 100644
--- a/azure-pipelines.yml
+++ b/azure-pipelines.yml
@@ -134,13 +134,13 @@ jobs:
 displayName: 'Install Dependencies'
 - script: |
 cd $HOME
- git clone https://github.com/pramodk/llvm-nightly.git
+ git clone --depth 1 https://github.com/pramodk/llvm-nightly.git
 displayName: 'Setup LLVM v13'
 - script: |
 export PATH=/usr/local/opt/flex/bin:/usr/local/opt/bison/bin:$PATH;
 mkdir -p $(Build.Repository.LocalPath)/build
 cd $(Build.Repository.LocalPath)/build
- cmake .. -DPYTHON_EXECUTABLE=$(which python3) -DCMAKE_INSTALL_PREFIX=$HOME/nmodl -DCMAKE_BUILD_TYPE=RelWithDebInfo -DNMODL_ENABLE_PYTHON_BINDINGS=OFF -DLLVM_DIR=$HOME/llvm-nightly/0421/osx/lib/cmake/llvm -DNMODL_ENABLE_LLVM=ON
+ cmake .. -DPYTHON_EXECUTABLE=$(which python3) -DCMAKE_INSTALL_PREFIX=$HOME/nmodl -DCMAKE_BUILD_TYPE=RelWithDebInfo -DNMODL_ENABLE_PYTHON_BINDINGS=OFF -DLLVM_DIR=$HOME/llvm-nightly/0621/osx/lib/cmake/llvm -DNMODL_ENABLE_LLVM=ON
 make -j 2
 if [ $? -ne 0 ]
 then
diff --git a/ci/bb5-pr.sh b/ci/bb5-pr.sh
index 6ecff76144..abdce2d867 100755
--- a/ci/bb5-pr.sh
+++ b/ci/bb5-pr.sh
@@ -42,7 +42,7 @@ function build_with() {
 -DNMODL_FORMATTING:BOOL=ON \
 -DClangFormat_EXECUTABLE=$clang_format_exe \
 -DNMODL_ENABLE_JIT_EVENT_LISTENERS=ON \
- -DLLVM_DIR=/gpfs/bbp.cscs.ch/data/project/proj16/software/llvm/install/0521/lib/cmake/llvm
+ -DLLVM_DIR=/gpfs/bbp.cscs.ch/apps/hpc/llvm-install/0621/lib/cmake/llvm
 make -j6
 popd
}
diff --git a/src/codegen/llvm/codegen_llvm_visitor.cpp b/src/codegen/llvm/codegen_llvm_visitor.cpp
index 6df5820d42..1e5ca89c6d 100644
--- a/src/codegen/llvm/codegen_llvm_visitor.cpp
+++ b/src/codegen/llvm/codegen_llvm_visitor.cpp
@@ -65,6 +65,68 @@ static bool can_vectorize(const ast::CodegenForStatement& statement, symtab::Sym
 return unsupported.empty() && supported.size() <= 1;
 }

+#if LLVM_VERSION_MAJOR >= 13
+void CodegenLLVMVisitor::add_vectorizable_functions_from_vec_lib(llvm::TargetLibraryInfoImpl& tli,
+ llvm::Triple& triple) {
+ // Since LLVM does not support SLEEF as a vector library yet, process it separately.
+ if (vector_library == "SLEEF") {
+ // Populate function definitions of only exp and pow (for now)
+#define FIXED(w) llvm::ElementCount::getFixed(w)
+#define DISPATCH(func, vec_func, width) {func, vec_func, width},
+ const llvm::VecDesc aarch64_functions[] = {
+ // clang-format off
+ DISPATCH("llvm.exp.f32", "_ZGVnN4v_expf", FIXED(4))
+ DISPATCH("llvm.exp.f64", "_ZGVnN2v_exp", FIXED(2))
+ DISPATCH("llvm.pow.f32", "_ZGVnN4vv_powf", FIXED(4))
+ DISPATCH("llvm.pow.f64", "_ZGVnN2vv_pow", FIXED(2))
+ // clang-format on
+ };
+ const llvm::VecDesc x86_functions[] = {
+ // clang-format off
+ DISPATCH("llvm.exp.f64", "_ZGVbN2v_exp", FIXED(2))
+ DISPATCH("llvm.exp.f64", "_ZGVdN4v_exp", FIXED(4))
+ DISPATCH("llvm.exp.f64", "_ZGVeN8v_exp", FIXED(8))
+ DISPATCH("llvm.pow.f64", "_ZGVbN2vv_pow", FIXED(2))
+ DISPATCH("llvm.pow.f64", "_ZGVdN4vv_pow", FIXED(4))
+ DISPATCH("llvm.pow.f64", "_ZGVeN8vv_pow", FIXED(8))
+ // clang-format on
+ };
+#undef DISPATCH
+
+ if (triple.isAArch64()) {
+ tli.addVectorizableFunctions(aarch64_functions);
+ }
+ if (triple.isX86() && triple.isArch64Bit()) {
+ tli.addVectorizableFunctions(x86_functions);
+ }
+
+ } else {
+ // A map to query vector library by its string value.
+ using VecLib = llvm::TargetLibraryInfoImpl::VectorLibrary;
+ static const std::map<std::string, VecLib> llvm_supported_vector_libraries = {
+ {"Accelerate", VecLib::Accelerate},
+ {"libmvec", VecLib::LIBMVEC_X86},
+ {"libsystem_m", VecLib ::DarwinLibSystemM},
+ {"MASSV", VecLib::MASSV},
+ {"none", VecLib::NoLibrary},
+ {"SVML", VecLib::SVML}};
+ const auto& library = llvm_supported_vector_libraries.find(vector_library);
+ if (library == llvm_supported_vector_libraries.end())
+ throw std::runtime_error("Error: unknown vector library - " + vector_library + "\n");
+
+ // Add vectorizable functions to the target library info.
+ switch (library->second) {
+ case VecLib::LIBMVEC_X86:
+ if (!triple.isX86() || !triple.isArch64Bit())
+ break;
+ default:
+ tli.addVectorizableFunctionsFromVecLib(library->second);
+ break;
+ }
+ }
+}
+#endif
+
 llvm::Value* CodegenLLVMVisitor::accept_and_get(const std::shared_ptr<ast::Node>& node) {
 node->accept(*this);
 return ir_builder.pop_last_value();
 }
@@ -817,25 +879,20 @@ void CodegenLLVMVisitor::visit_program(const ast::Program& node) {
 run_ir_opt_passes();
 }

- // Optionally, replace LLVM's maths intrinsics with vector library calls.
- if (vector_width > 1 && vector_library != llvm::TargetLibraryInfoImpl::NoLibrary) {
+ // Optionally, replace LLVM math intrinsics with vector library calls.
+ if (vector_width > 1) {
#if LLVM_VERSION_MAJOR < 13
 logger->warn(
 "This version of LLVM does not support replacement of LLVM intrinsics with vector "
 "library calls");
#else
- // First, get the target library information.
+ // First, get the target library information and add vectorizable functions for the
+ // specified vector library.
 llvm::Triple triple(llvm::sys::getDefaultTargetTriple());
 llvm::TargetLibraryInfoImpl target_lib_info = llvm::TargetLibraryInfoImpl(triple);
+ add_vectorizable_functions_from_vec_lib(target_lib_info, triple);

- // Populate target library information with vectorisable functions. Since libmvec is
- // supported for x86_64 only, have a check to catch other architectures.
- if (vector_library != llvm::TargetLibraryInfoImpl::LIBMVEC_X86 ||
- (triple.isX86() && triple.isArch64Bit())) {
- target_lib_info.addVectorizableFunctionsFromVecLib(vector_library);
- }
-
- // Run the codegen optimisation passes that replace maths intrinsics.
+ // Run passes that replace math intrinsics.
 codegen_pm.add(new llvm::TargetLibraryInfoWrapperPass(target_lib_info));
 codegen_pm.add(new llvm::ReplaceWithVeclibLegacy);
 codegen_pm.doInitialization();
diff --git a/src/codegen/llvm/codegen_llvm_visitor.hpp b/src/codegen/llvm/codegen_llvm_visitor.hpp
index c3beb53640..49285f9941 100644
--- a/src/codegen/llvm/codegen_llvm_visitor.hpp
+++ b/src/codegen/llvm/codegen_llvm_visitor.hpp
@@ -48,15 +48,6 @@ namespace codegen {
 * @{
 */

-/// A map to query vector library by its string value.
-static const std::map<std::string, llvm::TargetLibraryInfoImpl::VectorLibrary> veclib_map = {
- {"Accelerate", llvm::TargetLibraryInfoImpl::Accelerate},
-#if LLVM_VERSION_MAJOR >= 13
- {"libmvec", llvm::TargetLibraryInfoImpl::LIBMVEC_X86},
-#endif
- {"MASSV", llvm::TargetLibraryInfoImpl::MASSV},
- {"SVML", llvm::TargetLibraryInfoImpl::SVML},
- {"none", llvm::TargetLibraryInfoImpl::NoLibrary}};

 /**
 * \class CodegenLLVMVisitor
@@ -100,8 +91,8 @@ class CodegenLLVMVisitor: public visitor::ConstAstVisitor {
 /// Pass manager for optimisation passes that are used for target code generation.
 llvm::legacy::FunctionPassManager codegen_pm;

- /// Vector library used for maths functions.
- llvm::TargetLibraryInfoImpl::VectorLibrary vector_library;
+ /// Vector library used for math functions.
+ std::string vector_library;

 /// Explicit vectorisation width.
 int vector_width;
@@ -119,7 +110,7 @@
 , output_dir(output_dir)
 , opt_passes(opt_passes)
 , vector_width(vector_width)
- , vector_library(veclib_map.at(vec_lib))
+ , vector_library(vec_lib)
 , add_debug_information(add_debug_information)
 , ir_builder(*context, use_single_precision, vector_width, fast_math_flags)
 , debug_builder(*module)
@@ -183,6 +174,12 @@
 void wrap_kernel_functions();

 private:
+#if LLVM_VERSION_MAJOR >= 13
+ /// Populates target library info with the vector library definitions.
+ void add_vectorizable_functions_from_vec_lib(llvm::TargetLibraryInfoImpl& tli,
+ llvm::Triple& triple);
+#endif
+
 /// Accepts the given AST node and returns the processed value.
 llvm::Value* accept_and_get(const std::shared_ptr<ast::Node>& node);

diff --git a/src/main.cpp b/src/main.cpp
index b3f86c9e0b..64a5a99fca 100644
--- a/src/main.cpp
+++ b/src/main.cpp
@@ -332,7 +332,7 @@ int main(int argc, const char* argv[]) {
 "LLVM explicit vectorisation width ({})"_format(llvm_vec_width))->ignore_case();
 llvm_opt->add_option("--veclib",
 vector_library,
- "Vector library for maths functions ({})"_format(vector_library))->check(CLI::IsMember({"Accelerate", "libmvec", "MASSV", "SVML", "none"}));
+ "Vector library for maths functions ({})"_format(vector_library))->check(CLI::IsMember({"Accelerate", "libsystem_m", "libmvec", "MASSV", "SLEEF", "SVML", "none"}));
 llvm_opt->add_option("--fmf",
 llvm_fast_math_flags,
 "Fast math flags for floating-point optimizations (none)")->check(CLI::IsMember({"afn", "arcp", "contract", "ninf", "nnan", "nsz", "reassoc", "fast"}));
diff --git a/test/unit/codegen/codegen_llvm_ir.cpp b/test/unit/codegen/codegen_llvm_ir.cpp
index a0a4af297c..fa0a649f2d 100644
--- a/test/unit/codegen/codegen_llvm_ir.cpp
+++ b/test/unit/codegen/codegen_llvm_ir.cpp
@@ -1382,6 +1382,36 @@ SCENARIO("Vector library calls", "[visitor][llvm][vector_lib]") {
 REQUIRE(std::regex_search(accelerate_library_module_str, m, accelerate_exp_decl));
 REQUIRE(std::regex_search(accelerate_library_module_str, m, accelerate_exp_call));
 REQUIRE(!std::regex_search(accelerate_library_module_str, m, fexp_call));
+
+ // Check correct replacement of @llvm.exp.v2f64 into @_ZGV?N?v_exp when using SLEEF.
+ std::string sleef_library_module_str = run_llvm_visitor(nmodl_text,
+ /*opt=*/false,
+ /*use_single_precision=*/false,
+ /*vector_width=*/2,
+ /*vec_lib=*/"SLEEF");
+#if defined(__arm64__) || defined(__aarch64__)
+ std::regex sleef_exp_decl(R"(declare <2 x double> @_ZGVnN2v_exp\(<2 x double>\))");
+ std::regex sleef_exp_call(R"(call <2 x double> @_ZGVnN2v_exp\(<2 x double> .*\))");
+#else
+ std::regex sleef_exp_decl(R"(declare <2 x double> @_ZGVbN2v_exp\(<2 x double>\))");
+ std::regex sleef_exp_call(R"(call <2 x double> @_ZGVbN2v_exp\(<2 x double> .*\))");
+#endif
+ REQUIRE(std::regex_search(sleef_library_module_str, m, sleef_exp_decl));
+ REQUIRE(std::regex_search(sleef_library_module_str, m, sleef_exp_call));
+ REQUIRE(!std::regex_search(sleef_library_module_str, m, fexp_call));
+
+ // Check the replacements when using Darwin's libsystem_m.
+ std::string libsystem_m_library_module_str =
+ run_llvm_visitor(nmodl_text,
+ /*opt=*/false,
+ /*use_single_precision=*/true,
+ /*vector_width=*/4,
+ /*vec_lib=*/"libsystem_m");
+ std::regex libsystem_m_exp_decl(R"(declare <4 x float> @_simd_exp_f4\(<4 x float>\))");
+ std::regex libsystem_m_exp_call(R"(call <4 x float> @_simd_exp_f4\(<4 x float> .*\))");
+ REQUIRE(std::regex_search(libsystem_m_library_module_str, m, libsystem_m_exp_decl));
+ REQUIRE(std::regex_search(libsystem_m_library_module_str, m, libsystem_m_exp_call));
+ REQUIRE(!std::regex_search(libsystem_m_library_module_str, m, fexp_call));
#endif
 }
 }

From da1ed52c5b6621d22f9ea381cef175e2b17110c8 Mon Sep 17 00:00:00 2001
From: Castiglioni Giacomo
Date: Tue, 18 May 2021 11:00:57 +0200
Subject: [PATCH 070/331] hacky support for timing JIT vs statically compiled
 kernels

---
 src/CMakeLists.txt                        |  3 ++-
 test/benchmark/llvm_benchmark.cpp         | 24 +++++++++++++++++++++++
 test/unit/codegen/codegen_data_helper.cpp | 13 ++++++++++++
 3 files changed, 39 insertions(+), 1 deletion(-)

diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt
index e4da0b713c..bdbd2b0879 100644
--- a/src/CMakeLists.txt
+++ b/src/CMakeLists.txt
@@ -29,7 +29,8 @@ target_link_libraries(
 lexer
 ${NMODL_WRAPPER_LIBS})
 if(NMODL_ENABLE_LLVM)
-  target_link_libraries(nmodl llvm_codegen llvm_benchmark benchmark_data ${LLVM_LIBS_TO_LINK})
+  target_link_libraries(nmodl llvm_codegen llvm_benchmark benchmark_data ${LLVM_LIBS_TO_LINK})
+  target_link_libraries(nmodl "/gpfs/bbp.cscs.ch/home/gcastigl/project16/nmodl-llvm-test/intel-kernel/libintelkernel.so")
 endif()

 # =============================================================================
diff --git a/test/benchmark/llvm_benchmark.cpp b/test/benchmark/llvm_benchmark.cpp
index e48df0d457..112212c3dd 100644
--- a/test/benchmark/llvm_benchmark.cpp
+++ b/test/benchmark/llvm_benchmark.cpp
@@ -16,6 +16,9 @@
 #include "test/unit/codegen/codegen_data_helper.hpp"

+void nrn_state_hh_intel(void*);
+
+
 namespace nmodl {
 namespace benchmark {

@@ -146,6 +149,27 @@ void LLVMBenchmark::run_benchmark(const std::shared_ptr<ast::Program>& node) {
 logger->info("Minimum compute time = {:.6f}", time_min);
 logger->info("Maximum compute time = {:.6f}\n", time_max);
 }
+ // benchmark intel kernel
+ logger->info("Benchmarking external intel kernel");
+ // Initialise the data.
+ auto instance_data = codegen_data.create_data(instance_size, /*seed=*/1);
+ double time_sum = 0.0;
+ for (int i = 0; i < num_experiments; ++i) {
+ // Record the execution time of the kernel.
+ auto start = std::chrono::high_resolution_clock::now();
+ nrn_state_hh_intel(instance_data.base_ptr);
+ auto end = std::chrono::high_resolution_clock::now();
+ std::chrono::duration<double> diff = end - start;
+
+ // Log the time taken for each run.
+ logger->info("Experiment {} compute time = {:.6f} sec", i, diff.count());
+
+ time_sum += diff.count();
+ }
+ // Log the average time taken for the kernel.
+ logger->info("Average compute time = {:.6f} \n", time_sum / num_experiments);
+
+ // For every kernel run the benchmark `num_experiments` times.
 }

 // namespace benchmark

diff --git a/test/unit/codegen/codegen_data_helper.cpp b/test/unit/codegen/codegen_data_helper.cpp
index a0ee6ec957..caa8705dad 100644
--- a/test/unit/codegen/codegen_data_helper.cpp
+++ b/test/unit/codegen/codegen_data_helper.cpp
@@ -96,6 +96,7 @@ CodegenInstanceData CodegenDataHelper::create_data(size_t num_elements, size_t s

 // allocate each variable and allocate memory at particular offset in base pointer
 for (auto& var: variables) {
+
 // only process until first non-pointer variable
 if (!var->get_is_pointer()) {
 break;
 }
@@ -139,6 +140,18 @@ CodegenInstanceData CodegenDataHelper::create_data(size_t num_elements, size_t s
 variable_index++;
 }

+
+ int cnt{};
+ for (auto& var: variables) {
+ // printout vars
+ std::cout << cnt++
+ << ":\t" << to_string(var->get_type()->get_type())
+ << '\t' << var->get_is_pointer()
+ << '\t' << var->get_name()->get_node_name() << '\n';
+ }
+
+
+
 // we are now switching from pointer type to next member type (e.g. double)
 // ideally we should use padding but switching from double* to double should
 // already meet alignment requirements

From 6ca88c2d9076a6b410b41200fbd2db6049076c01 Mon Sep 17 00:00:00 2001
From: Castiglioni Giacomo
Date: Wed, 19 May 2021 11:08:12 +0200
Subject: [PATCH 071/331] renamed folder

---
 src/CMakeLists.txt                | 2 +-
 test/benchmark/llvm_benchmark.cpp | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt
index bdbd2b0879..2b4e09097a 100644
--- a/src/CMakeLists.txt
+++ b/src/CMakeLists.txt
@@ -30,7 +30,7 @@ target_link_libraries(
 ${NMODL_WRAPPER_LIBS})
 if(NMODL_ENABLE_LLVM)
 target_link_libraries(nmodl llvm_codegen llvm_benchmark benchmark_data ${LLVM_LIBS_TO_LINK})
-  target_link_libraries(nmodl "/gpfs/bbp.cscs.ch/home/gcastigl/project16/nmodl-llvm-test/intel-kernel/libintelkernel.so")
+  target_link_libraries(nmodl "/gpfs/bbp.cscs.ch/home/gcastigl/project16/nmodl-llvm-timing/external-kernel/libextkernel.so")
 endif()

 # =============================================================================
diff --git a/test/benchmark/llvm_benchmark.cpp b/test/benchmark/llvm_benchmark.cpp
index 112212c3dd..5024cca43c 100644
--- a/test/benchmark/llvm_benchmark.cpp
+++ b/test/benchmark/llvm_benchmark.cpp
@@ -150,7 +150,7 @@ void LLVMBenchmark::run_benchmark(const std::shared_ptr<ast::Program>& node) {
 // benchmark intel kernel
- logger->info("Benchmarking external intel kernel");
+ logger->info("Benchmarking external kernel");
 // Initialise the data.
 auto instance_data = codegen_data.create_data(instance_size, /*seed=*/1);
 double time_sum = 0.0;
 for (int i = 0; i < num_experiments; ++i) {

From bf8c9d51f81c177ff6924b76c167191a465168a9 Mon Sep 17 00:00:00 2001
From: Castiglioni Giacomo
Date: Thu, 20 May 2021 15:43:51 +0200
Subject: [PATCH 072/331] slightly better stub for ext kernel, init data at
 every iteration

---
 src/CMakeLists.txt                |  1 -
 test/benchmark/CMakeLists.txt     |  7 +++++++
 test/benchmark/ext_kernel.cpp     | 15 +++++++++++++++
 test/benchmark/ext_kernel.hpp     |  9 +++++++++
 test/benchmark/llvm_benchmark.cpp | 23 ++++++++++++++++-------
 5 files changed, 47 insertions(+), 8 deletions(-)
 create mode 100644 test/benchmark/ext_kernel.cpp
 create mode 100644 test/benchmark/ext_kernel.hpp

diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt
index 2b4e09097a..aa1675c9c4 100644
--- a/src/CMakeLists.txt
+++ b/src/CMakeLists.txt
@@ -30,7 +30,6 @@ target_link_libraries(
 ${NMODL_WRAPPER_LIBS})
 if(NMODL_ENABLE_LLVM)
 target_link_libraries(nmodl llvm_codegen llvm_benchmark benchmark_data ${LLVM_LIBS_TO_LINK})
-  target_link_libraries(nmodl "/gpfs/bbp.cscs.ch/home/gcastigl/project16/nmodl-llvm-timing/external-kernel/libextkernel.so")
 endif()

 # =============================================================================
diff --git a/test/benchmark/CMakeLists.txt b/test/benchmark/CMakeLists.txt
index 4441d53251..84f98512b5 100644
--- a/test/benchmark/CMakeLists.txt
+++ b/test/benchmark/CMakeLists.txt
@@ -15,3 +15,10 @@ add_dependencies(llvm_benchmark lexer util visitor)
 if(NMODL_ENABLE_JIT_EVENT_LISTENERS)
 target_compile_definitions(llvm_benchmark PUBLIC NMODL_HAVE_JIT_EVENT_LISTENERS)
 endif()
+
+# =============================================================================
+# external kernel stub
+# =============================================================================
+add_library(extkernel SHARED ext_kernel.cpp)
+set_target_properties(extkernel PROPERTIES CXX_VISIBILITY_PRESET default)
+target_link_libraries(llvm_benchmark PUBLIC extkernel)
\ No newline at end of file
diff --git a/test/benchmark/ext_kernel.cpp b/test/benchmark/ext_kernel.cpp
new file mode 100644
index 0000000000..333891bb59
--- /dev/null
+++ b/test/benchmark/ext_kernel.cpp
@@ -0,0 +1,15 @@
+/*************************************************************************
+ * Copyright (C) 2018-2021 Blue Brain Project
+ *
+ * This file is part of NMODL distributed under the terms of the GNU
+ * Lesser General Public License. See top-level LICENSE file for details.
+ *************************************************************************/
+
+#include <iostream>
+
+// external kernel stub
+void nrn_state_hh_ext(void* ){
+    std::cout << "stub kernel" << '\n';
+}
\ No newline at end of file
diff --git a/test/benchmark/ext_kernel.hpp b/test/benchmark/ext_kernel.hpp
new file mode 100644
index 0000000000..faf7895a09
--- /dev/null
+++ b/test/benchmark/ext_kernel.hpp
@@ -0,0 +1,9 @@
+/*************************************************************************
+ * Copyright (C) 2018-2021 Blue Brain Project
+ *
+ * This file is part of NMODL distributed under the terms of the GNU
+ * Lesser General Public License. See top-level LICENSE file for details.
+ *************************************************************************/ +#pragma once + +void nrn_state_hh_ext(void*); diff --git a/test/benchmark/llvm_benchmark.cpp b/test/benchmark/llvm_benchmark.cpp index 5024cca43c..1ee95229bf 100644 --- a/test/benchmark/llvm_benchmark.cpp +++ b/test/benchmark/llvm_benchmark.cpp @@ -10,14 +10,12 @@ #include "codegen/llvm/codegen_llvm_visitor.hpp" #include "llvm_benchmark.hpp" +#include "ext_kernel.hpp" #include "test/benchmark/jit_driver.hpp" -#include "llvm/Support/Host.h" - #include "test/unit/codegen/codegen_data_helper.hpp" +#include "llvm/Support/Host.h" -void nrn_state_hh_intel(void*); - namespace nmodl { namespace benchmark { @@ -110,6 +108,14 @@ void LLVMBenchmark::run_benchmark(const std::shared_ptr<ast::Program>& node) { // Benchmark every kernel. for (const auto& kernel_name: kernel_names) { +<<<<<<< HEAD +======= + + // double size_mbs = instance_data.num_bytes / (1024.0 * 1024.0); + // logger->info("Benchmarking kernel '{}' with {} MBs dataset", kernel_name, size_mbs); + + logger->info("Benchmarking kernel '{}'", kernel_name); +>>>>>>> slightly better stub for ext kernel, init data at every iteration // For every kernel run the benchmark `num_experiments` times. double time_min = std::numeric_limits<double>::max(); double time_max = 0.0; @@ -118,6 +124,7 @@ for (int i = 0; i < num_experiments; ++i) { // Initialise the data. auto instance_data = codegen_data.create_data(instance_size, /*seed=*/1); +<<<<<<< HEAD // Log instance size once. if (i == 0) { @@ -125,6 +132,8 @@ logger->info("Benchmarking kernel '{}' with {} MBs dataset", kernel_name, size_mbs); } +======= +>>>>>>> slightly better stub for ext kernel, init data at every iteration // Record the execution time of the kernel. std::string wrapper_name = "__" + kernel_name + "_wrapper"; auto start = std::chrono::high_resolution_clock::now(); @@ -149,15 +158,15 @@ logger->info("Minimum compute time = {:.6f}", time_min); logger->info("Maximum compute time = {:.6f}\n", time_max); } - // benchmark intel kernel + // benchmark external kernel logger->info("Benchmarking external kernel"); // Initialise the data. - auto instance_data = codegen_data.create_data(instance_size, /*seed=*/1); double time_sum = 0.0; for (int i = 0; i < num_experiments; ++i) { + auto instance_data = codegen_data.create_data(instance_size, /*seed=*/1); // Record the execution time of the kernel.
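// (only the nrn_state_hh_ext call itself falls inside the timed region)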
auto start = std::chrono::high_resolution_clock::now(); - nrn_state_hh_intel(instance_data.base_ptr); + nrn_state_hh_ext(instance_data.base_ptr); auto end = std::chrono::high_resolution_clock::now(); std::chrono::duration<double> diff = end - start; From 57d785cb64bd0267589585d268b6685af9d3e3e7 Mon Sep 17 00:00:00 2001 From: Castiglioni Giacomo Date: Thu, 20 May 2021 17:06:01 +0200 Subject: [PATCH 073/331] added script and simple kernels --- test/benchmark/kernels/compute-bound.cpp | 29 +++++ test/benchmark/kernels/compute-bound.mod | 24 ++++ test/benchmark/kernels/memory-bound.cpp | 38 +++++++ test/benchmark/kernels/memory-bound.mod | 25 +++++ test/benchmark/nmodl-llvm-time.sh | 136 +++++++++++++++++++++++ 5 files changed, 252 insertions(+) create mode 100644 test/benchmark/kernels/compute-bound.cpp create mode 100644 test/benchmark/kernels/compute-bound.mod create mode 100644 test/benchmark/kernels/memory-bound.cpp create mode 100644 test/benchmark/kernels/memory-bound.mod create mode 100755 test/benchmark/nmodl-llvm-time.sh diff --git a/test/benchmark/kernels/compute-bound.cpp b/test/benchmark/kernels/compute-bound.cpp new file mode 100644 index 0000000000..a43edbb03e --- /dev/null +++ b/test/benchmark/kernels/compute-bound.cpp @@ -0,0 +1,29 @@ +#include <cmath> + +struct hh_Instance { + double* __restrict__ minf; + double* __restrict__ mtau; + double* __restrict__ m; + double* __restrict__ Dm; + double* __restrict__ v_unused; + double* __restrict__ g_unused; + double* __restrict__ voltage; + int* __restrict__ node_index; + double t; + double dt; + double celsius; + int secondorder; + int node_count; +}; + +void nrn_state_hh_ext(void* __restrict__ mech){ + auto inst = static_cast<hh_Instance*>(mech); + int id; + int node_id; + double v; + for(int id = 0; id < inst->node_count; ++id) { + node_id = inst->node_index[id]; + v = inst->voltage[node_id]; + inst->m[id] = exp(inst->m[id])+exp(inst->minf[id])+(inst->minf[id]-inst->m[id])/inst->mtau[id]+inst->m[id]+inst->minf[id]*inst->mtau[id]; + } +} diff --git a/test/benchmark/kernels/compute-bound.mod b/test/benchmark/kernels/compute-bound.mod new file mode 100644 index 0000000000..ded2618cf4 --- /dev/null +++ b/test/benchmark/kernels/compute-bound.mod @@ -0,0 +1,24 @@ +NEURON { + SUFFIX hh + NONSPECIFIC_CURRENT il + RANGE minf, mtau, gl, el +} + +STATE { + m +} + +ASSIGNED { + v (mV) + minf + mtau (ms) +} + +BREAKPOINT { + SOLVE states METHOD cnexp + il = gl*(v - el) +} + +DERIVATIVE states { + m = exp(m) + exp(minf) + (minf-m)/mtau + m + minf * mtau +} diff --git a/test/benchmark/kernels/memory-bound.cpp b/test/benchmark/kernels/memory-bound.cpp new file mode 100644 index 0000000000..8beead4fde --- /dev/null +++ b/test/benchmark/kernels/memory-bound.cpp @@ -0,0 +1,38 @@ + + +struct hh_Instance { + double* __restrict__ minf; + double* __restrict__ mtau; + double* __restrict__ m; + double* __restrict__ nai; + double* __restrict__ Dm; + double* __restrict__ v_unused; + double* __restrict__ g_unused; + double* __restrict__ ion_nai; + double* __restrict__ style_na; + int* __restrict__ ion_nai_index; + int* __restrict__ style_na_index; + double* __restrict__ voltage; + int* __restrict__ node_index; + double t; + double dt; + double celsius; + int secondorder; + int node_count; +}; + +void nrn_state_hh_ext(void* __restrict__ mech){ + auto inst = static_cast<hh_Instance*>(mech); + int id; + int node_id, nai_id, ion_nai_id; + double v; + for(int id = 0; id < inst->node_count; ++id) { + node_id = inst->node_index[id]; + nai_id = inst->ion_nai_index[id]; + ion_nai_id = inst->ion_nai_index[id]; + v =
inst->voltage[node_id]; + inst->nai[id] = inst->ion_nai[nai_id]; + inst->m[id] = (inst->minf[id]-inst->m[id])/inst->mtau[id]; + inst->ion_nai[ion_nai_id] = inst->nai[id]; + } +} diff --git a/test/benchmark/kernels/memory-bound.mod b/test/benchmark/kernels/memory-bound.mod new file mode 100644 index 0000000000..1e3df520a9 --- /dev/null +++ b/test/benchmark/kernels/memory-bound.mod @@ -0,0 +1,25 @@ +NEURON { + SUFFIX hh + NONSPECIFIC_CURRENT il + RANGE x, minf, mtau, gl, el + USEION na WRITE nai +} + +STATE { + m +} + +ASSIGNED { + v (mV) + minf + mtau (ms) +} + +BREAKPOINT { + SOLVE states METHOD cnexp + il = gl*(v - el) +} + +DERIVATIVE states { + m = (minf-m)/mtau +} diff --git a/test/benchmark/nmodl-llvm-time.sh b/test/benchmark/nmodl-llvm-time.sh new file mode 100755 index 0000000000..0b7ce7e32f --- /dev/null +++ b/test/benchmark/nmodl-llvm-time.sh @@ -0,0 +1,136 @@ +#!/bin/bash +# set -x +# +# Driver for nmodl-llvm benchmarking +# + + +# default params +inst_size=100000000 +num_exp=10 +vec_width=8 + +# version +version="0.0.1" +version_date="20-5-2021" +version_string="nmodl-llvm-time $version ($version_date)" + +# show usage and handle arguments +function showusage { + echo "usage: nmodl-llvm-time [options]. +-n NUMBER, --instance-size NUMBER +-e NUMBER, --num-experiment NUMBER +-v NUMBER, --vec-width NUMBER +-d, --dry-run +-h, --help Display this usage information. +-V, --version Show version and exit. +Driver for benchmarking. +" +} + + +while [[ "$1" != "" ]]; do + case $1 in + "") + shift + ;; + -n|--instance-size) + inst_size=$2 + shift + shift + ;; + -e|--num-experiment) + num_exp=$2 + shift + shift + ;; + -v|--vec-width) + vec_width=$2 + shift + shift + ;; + -d|--dry-run) + echo "debug mode" + debug=echo + shift + ;; + -V|--version) + echo "$version_string" + exit 0 + ;; + -h|-\?|--help) + showusage + exit 0 + ;; + *) + showusage + exit 1 + ;; + esac + +done + +# vec libs +vec_lib_path="/gpfs/bbp.cscs.ch/ssd/apps/hpc/jenkins/deploy/compilers/2021-01-06/linux-rhel7-x86_64/gcc-4.8.5/intel-20.0.2-ilowey/lib/intel64_lin" +vec_lib="libsvml.so" + +# nmodl +nmodl_exe="/gpfs/bbp.cscs.ch/home/gcastigl/project16/nmodl-llvm/build/install/bin/nmodl" + +# external kernel +nmodl_src_path="/gpfs/bbp.cscs.ch/home/gcastigl/project16/nmodl-llvm" +kernels_path=${nmodl_src_path}/"test/benchmark/kernels" +ext_lib="libextkernel.so" + +# compilers +icpc_exe=icpc +declare -a icpc_flags=( + # "-O2" + "-O2 -march=skylake-avx512 -mtune=skylake-avx512 -fimf-use-svml" + "-O2 -qopt-zmm-usage=high -xCORE-AVX512 -fimf-use-svml" + ) + +clang_bin_path="/gpfs/bbp.cscs.ch/data/project/proj16/software/llvm/install/0521/bin" +clang_exe=${clang_bin_path}/clang++ +declare -a clang_flags=( + # "-O3" + "-O3 -march=skylake-avx512 -fveclib=SVML" + "-O3 -march=skylake-avx512 -ffast-math -fveclib=SVML" + "-O3 -mavx512f -ffast-math -fveclib=SVML" + ) + +# loop over options +for kernel_target in compute-bound memory-bound; do # add here hh + echo "kernel: "${kernel_target} + + for compiler in icpc clang; do + echo "| compiler: "${compiler} + + compiler_exe=${compiler}_exe + compiler_flags=${compiler}_flags[@] + + for flags in "${!compiler_flags}"; do + echo "| | flags: "${flags} + + spec=${compiler}_${flags//[[:blank:]]/} + rel_ext_path=${kernel_target}_${spec} + + ${debug} mkdir ${rel_ext_path} + ${debug} cd ${rel_ext_path} + ext_path=$(pwd) + ${debug} ${!compiler_exe} ${flags} ${kernels_path}/${kernel_target}.cpp \ -shared -fpic -o ${ext_lib} + ${debug} eval "llvm-objdump ${ext_lib} -d > ${ext_lib::-1}" + ${debug} cd
.. + + nmodl_args="${kernels_path}/${kernel_target}.mod llvm --ir --vector-width ${vec_width} --veclib SVML benchmark \ + --opt-level-ir 3 --opt-level-codegen 3 --run --instance-size ${inst_size} \ + --repeat ${num_exp} \ + --libs ${vec_lib_path}/${vec_lib} \ + --backend default" + + # run experiment + ${debug} eval "LD_LIBRARY_PATH=${ext_path}:${vec_lib_path} ${nmodl_exe} ${nmodl_args} &> ${kernel_target}_${spec}.log" + done + done +done \ No newline at end of file From a7149037461d9ed505630ac698b25c0f7718d0dd Mon Sep 17 00:00:00 2001 From: Castiglioni Giacomo Date: Thu, 20 May 2021 17:50:54 +0200 Subject: [PATCH 074/331] updated benchmark stats for hacky ext kernel --- test/benchmark/llvm_benchmark.cpp | 29 +++++++++++++++-------------- 1 file changed, 15 insertions(+), 14 deletions(-) diff --git a/test/benchmark/llvm_benchmark.cpp b/test/benchmark/llvm_benchmark.cpp index 1ee95229bf..88543c5b83 100644 --- a/test/benchmark/llvm_benchmark.cpp +++ b/test/benchmark/llvm_benchmark.cpp @@ -108,14 +108,6 @@ void LLVMBenchmark::run_benchmark(const std::shared_ptr<ast::Program>& node) { // Benchmark every kernel. for (const auto& kernel_name: kernel_names) { -<<<<<<< HEAD -======= - - // double size_mbs = instance_data.num_bytes / (1024.0 * 1024.0); - // logger->info("Benchmarking kernel '{}' with {} MBs dataset", kernel_name, size_mbs); - - logger->info("Benchmarking kernel '{}'", kernel_name); ->>>>>>> slightly better stub for ext kernel, init data at every iteration // For every kernel run the benchmark `num_experiments` times. double time_min = std::numeric_limits<double>::max(); double time_max = 0.0; @@ -124,7 +116,6 @@ void LLVMBenchmark::run_benchmark(const std::shared_ptr<ast::Program>& node) { for (int i = 0; i < num_experiments; ++i) { // Initialise the data. auto instance_data = codegen_data.create_data(instance_size, /*seed=*/1); -<<<<<<< HEAD // Log instance size once. if (i == 0) { @@ -132,8 +123,6 @@ void LLVMBenchmark::run_benchmark(const std::shared_ptr<ast::Program>& node) { logger->info("Benchmarking kernel '{}' with {} MBs dataset", kernel_name, size_mbs); } -======= ->>>>>>> slightly better stub for ext kernel, init data at every iteration // Record the execution time of the kernel. std::string wrapper_name = "__" + kernel_name + "_wrapper"; auto start = std::chrono::high_resolution_clock::now(); @@ -160,10 +149,14 @@ void LLVMBenchmark::run_benchmark(const std::shared_ptr<ast::Program>& node) { } // benchmark external kernel logger->info("Benchmarking external kernel"); - // Initialise the data. + double time_min = std::numeric_limits<double>::max(); + double time_max = 0.0; double time_sum = 0.0; + double time_squared_sum = 0.0; for (int i = 0; i < num_experiments; ++i) { + // Initialise the data. auto instance_data = codegen_data.create_data(instance_size, /*seed=*/1); + // Record the execution time of the kernel. auto start = std::chrono::high_resolution_clock::now(); nrn_state_hh_ext(instance_data.base_ptr); auto end = std::chrono::high_resolution_clock::now(); std::chrono::duration<double> diff = end - start; // Log the time taken for each run. logger->info("Experiment {} compute time = {:.6f} sec", i, diff.count()); + // Update statistics. time_sum += diff.count(); + time_squared_sum += diff.count() * diff.count(); + time_min = std::min(time_min, diff.count()); + time_max = std::max(time_max, diff.count()); } // Log the average time taken for the kernel.
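// (the variance below is computed as E[t^2] - E[t]^2 over the per-experiment times)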
- logger->info("Average compute time = {:.6f} \n", time_sum / num_experiments); + double time_mean = time_sum / num_experiments; + logger->info("Average compute time = {:.6f}", time_mean); + logger->info("Compute time variance = {:g}", + time_squared_sum / num_experiments - time_mean * time_mean); + logger->info("Minimum compute time = {:.6f}", time_min); + logger->info("Minimum compute time = {:.6f}\n", time_max); - // For every kernel run the benchmark `num_experiments` times. } } // namespace benchmark From 2853d230074a65553d844fbfc977c8e4cd32aa96 Mon Sep 17 00:00:00 2001 From: Castiglioni Giacomo Date: Thu, 20 May 2021 17:58:36 +0200 Subject: [PATCH 075/331] fixed typo, cleaned up stub --- test/benchmark/ext_kernel.cpp | 4 +--- test/benchmark/llvm_benchmark.cpp | 2 +- 2 files changed, 2 insertions(+), 4 deletions(-) diff --git a/test/benchmark/ext_kernel.cpp b/test/benchmark/ext_kernel.cpp index 333891bb59..e0ce026338 100644 --- a/test/benchmark/ext_kernel.cpp +++ b/test/benchmark/ext_kernel.cpp @@ -10,6 +10,4 @@ #include // external kernel stub -void nrn_state_hh_ext(void* ){ - std::cout << "stub kernel" << '\n'; -} \ No newline at end of file +void nrn_state_hh_ext(void* ){} diff --git a/test/benchmark/llvm_benchmark.cpp b/test/benchmark/llvm_benchmark.cpp index 88543c5b83..0b6c6babfb 100644 --- a/test/benchmark/llvm_benchmark.cpp +++ b/test/benchmark/llvm_benchmark.cpp @@ -178,7 +178,7 @@ void LLVMBenchmark::run_benchmark(const std::shared_ptr& node) { logger->info("Compute time variance = {:g}", time_squared_sum / num_experiments - time_mean * time_mean); logger->info("Minimum compute time = {:.6f}", time_min); - logger->info("Minimum compute time = {:.6f}\n", time_max); + logger->info("Maximum compute time = {:.6f}\n", time_max); } From 6b5e0225457153029d3244bd6f78b50a33280198 Mon Sep 17 00:00:00 2001 From: Castiglioni Giacomo Date: Fri, 21 May 2021 11:16:32 +0200 Subject: [PATCH 076/331] added hh kernel --- test/benchmark/kernels/hh.cpp | 83 ++++++++++++++++++++++ test/benchmark/kernels/hh.mod | 114 ++++++++++++++++++++++++++++++ test/benchmark/nmodl-llvm-time.sh | 2 +- 3 files changed, 198 insertions(+), 1 deletion(-) create mode 100644 test/benchmark/kernels/hh.cpp create mode 100644 test/benchmark/kernels/hh.mod diff --git a/test/benchmark/kernels/hh.cpp b/test/benchmark/kernels/hh.cpp new file mode 100644 index 0000000000..eac7f58818 --- /dev/null +++ b/test/benchmark/kernels/hh.cpp @@ -0,0 +1,83 @@ +#include + +struct hh_Instance { + double* __restrict__ gnabar; + double* __restrict__ gkbar; + double* __restrict__ gl; + double* __restrict__ el; + double* __restrict__ gna; + double* __restrict__ gk; + double* __restrict__ il; + double* __restrict__ minf; + double* __restrict__ hinf; + double* __restrict__ ninf; + double* __restrict__ mtau; + double* __restrict__ htau; + double* __restrict__ ntau; + double* __restrict__ m; + double* __restrict__ h; + double* __restrict__ n; + double* __restrict__ Dm; + double* __restrict__ Dh; + double* __restrict__ Dn; + double* __restrict__ ena; + double* __restrict__ ek; + double* __restrict__ ina; + double* __restrict__ ik; + double* __restrict__ v_unused; + double* __restrict__ g_unused; + double* __restrict__ ion_ena; + double* __restrict__ ion_ina; + double* __restrict__ ion_dinadv; + double* __restrict__ ion_ek; + double* __restrict__ ion_ik; + double* __restrict__ ion_dikdv; + int* __restrict__ ion_ena_index; + int* __restrict__ ion_ina_index; + int* __restrict__ ion_dinadv_index; + int* __restrict__ ion_ek_index; + int* __restrict__ 
ion_ik_index; + int* __restrict__ ion_dikdv_index; + double* __restrict__ voltage; + int* __restrict__ node_index; + double t; + double dt; + double celsius; + int secondorder; + int node_count; +}; + +void nrn_state_hh_ext(void* __restrict__ mech){ + auto inst = static_cast<hh_Instance*>(mech); + int id; + int node_id, ena_id, ek_id; + double v; + for(id = 0; id < inst->node_count; id = ++id) { + node_id = inst->node_index[id]; + ena_id = inst->ion_ena_index[id]; + ek_id = inst->ion_ek_index[id]; + v = inst->voltage[node_id]; + inst->ena[id] = inst->ion_ena[ena_id]; + inst->ek[id] = inst->ion_ek[ek_id]; + { + double alpha, beta, sum, q10, vtrap_in_0, v_in_1; + v_in_1 = v; + q10 = 3*((inst->celsius-6.3)/10); + alpha = .07*exp(-(v_in_1+65)/20); + beta = 1/(exp(-(v_in_1+35)/10)+1); + sum = alpha+beta; + inst->htau[id] = 1/(q10*sum); + inst->hinf[id] = alpha/sum; + { + double x_in_0, y_in_0; + x_in_0 = alpha; + y_in_0 = alpha; + vtrap_in_0 = y_in_0*(1-x_in_0/y_in_0/2); + } + inst->hinf[id] = vtrap_in_0; + } + inst->m[id] = inst->m[id]+(1.0-exp(inst->dt*((((-1.0)))/inst->mtau[id])))*(-(((inst->minf[id]))/inst->mtau[id])/((((-1.0)))/inst->mtau[id])-inst->m[id]); + inst->h[id] = inst->h[id]+(1.0-exp(inst->dt*((((-1.0)))/inst->htau[id])))*(-(((inst->hinf[id]))/inst->htau[id])/((((-1.0)))/inst->htau[id])-inst->h[id]); + inst->n[id] = inst->n[id]+(1.0-exp(inst->dt*((((-1.0)))/inst->ntau[id])))*(-(((inst->ninf[id]))/inst->ntau[id])/((((-1.0)))/inst->ntau[id])-inst->n[id]); + } +} diff --git a/test/benchmark/kernels/hh.mod b/test/benchmark/kernels/hh.mod new file mode 100644 index 0000000000..5e807ff2ec --- /dev/null +++ b/test/benchmark/kernels/hh.mod @@ -0,0 +1,114 @@ +TITLE hh.mod squid sodium, potassium, and leak channels +COMMENT + This is the original Hodgkin-Huxley treatment for the set of sodium, + potassium, and leakage channels found in the squid giant axon membrane. + ("A quantitative description of membrane current and its application to + conduction and excitation in nerve" J.Physiol. (Lond.) 117:500-544 (1952).) + Membrane voltage is in absolute mV and has been reversed in polarity + from the original HH convention and shifted to reflect a resting potential + of -65 mV. + Remember to set celsius=6.3 (or whatever) in your HOC file. + See squid.hoc for an example of a simulation using this model.
+ SW Jaslove 6 March, 1992 +ENDCOMMENT +UNITS { + (mA) = (milliamp) + (mV) = (millivolt) + (S) = (siemens) +} +NEURON { + SUFFIX hh + USEION na READ ena WRITE ina + USEION k READ ek WRITE ik + NONSPECIFIC_CURRENT il + RANGE gnabar, gkbar, gl, el, gna, gk + RANGE minf, hinf, ninf, mtau, htau, ntau + THREADSAFE +} +PARAMETER { + gnabar = .12 (S/cm2) <0,1e9> + gkbar = .036 (S/cm2) <0,1e9> + gl = .0003 (S/cm2) <0,1e9> + el = -54.3 (mV) +} +STATE { + m + h + n +} +ASSIGNED { + v (mV) + celsius (degC) + ena (mV) + ek (mV) + gna (S/cm2) + gk (S/cm2) + ina (mA/cm2) + ik (mA/cm2) + il (mA/cm2) + minf + hinf + ninf + mtau (ms) + htau (ms) + ntau (ms) +} +BREAKPOINT { + SOLVE states METHOD cnexp + gna = gnabar*m*m*m*h + ina = gna*(v-ena) + gk = gkbar*n*n*n*n + ik = gk*(v-ek) + il = gl*(v-el) +} +INITIAL { + { + : inlined rates + LOCAL alpha, beta, sum, q10, vtrap_in_0, v_in_0 + v_in_0 = v + q10 = 3*((celsius-6.3)/10) + alpha = .07*exp(-(v_in_0+65)/20) + beta = 1/(exp(-(v_in_0+35)/10)+1) + sum = alpha+beta + htau = 1/(q10*sum) + hinf = alpha/sum + { + : inlined vtrap + LOCAL x_in_0, y_in_0 + x_in_0 = alpha + y_in_0 = alpha + : no control flow + vtrap_in_0 = y_in_0*(1-x_in_0/y_in_0/2) + } + hinf = vtrap_in_0 + } + m = minf + h = hinf + n = ninf +} +DERIVATIVE states { + { + : inlined rates + LOCAL alpha, beta, sum, q10, vtrap_in_0, v_in_1 + v_in_1 = v + q10 = 3*((celsius-6.3)/10) + alpha = .07*exp(-(v_in_1+65)/20) + beta = 1/(exp(-(v_in_1+35)/10)+1) + sum = alpha+beta + htau = 1/(q10*sum) + hinf = alpha/sum + { + : inlined vtrap + LOCAL x_in_0, y_in_0 + x_in_0 = alpha + y_in_0 = alpha + : no control flow + vtrap_in_0 = y_in_0*(1-x_in_0/y_in_0/2) + } + hinf = vtrap_in_0 + } + m = m+(1.0-exp(dt*((((-1.0)))/mtau)))*(-(((minf))/mtau)/((((-1.0)))/mtau)-m) + h = h+(1.0-exp(dt*((((-1.0)))/htau)))*(-(((hinf))/htau)/((((-1.0)))/htau)-h) + n = n+(1.0-exp(dt*((((-1.0)))/ntau)))*(-(((ninf))/ntau)/((((-1.0)))/ntau)-n) +} +UNITSON \ No newline at end of file diff --git a/test/benchmark/nmodl-llvm-time.sh b/test/benchmark/nmodl-llvm-time.sh index 0b7ce7e32f..38dd3c2feb 100755 --- a/test/benchmark/nmodl-llvm-time.sh +++ b/test/benchmark/nmodl-llvm-time.sh @@ -100,7 +100,7 @@ declare -a clang_flags=( ) # loop over options -for kernel_target in compute-bound memory-bound; do # add here hh +for kernel_target in compute-bound memory-bound hh; do echo "kernel: "${kernel_target} for compiler in icpc clang; do From 7b7b33b719f6e514cfdcc0993d4dedf2d02bb6bd Mon Sep 17 00:00:00 2001 From: Castiglioni Giacomo Date: Fri, 21 May 2021 11:21:44 +0200 Subject: [PATCH 077/331] add fast math for JIT in script --- test/benchmark/nmodl-llvm-time.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/benchmark/nmodl-llvm-time.sh b/test/benchmark/nmodl-llvm-time.sh index 38dd3c2feb..b9a8240d5c 100755 --- a/test/benchmark/nmodl-llvm-time.sh +++ b/test/benchmark/nmodl-llvm-time.sh @@ -123,7 +123,7 @@ for kernel_target in compute-bound memory-bound hh; do ${debug} eval "llvm-objdump ${ext_lib} -d > ${ext_lib::-1}" ${debug} cd .. 
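# assemble the nmodl JIT invocation for this kernel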
- nmodl_args="${kernels_path}/${kernel_target}.mod llvm --ir --vector-width ${vec_width} --veclib SVML benchmark \ + nmodl_args="${kernels_path}/${kernel_target}.mod llvm --ir --fmf nnan contract afn --vector-width ${vec_width} --veclib SVML benchmark \ --opt-level-ir 3 --opt-level-codegen 3 --run --instance-size ${inst_size} \ --repeat ${num_exp} \ --libs ${vec_lib_path}/${vec_lib} \ --backend default" From a5da1d1eca7d8257a74d57210909605d26949123 Mon Sep 17 00:00:00 2001 From: Castiglioni Giacomo Date: Tue, 25 May 2021 17:32:03 +0200 Subject: [PATCH 078/331] added precise division flag to icpc --- test/benchmark/nmodl-llvm-time.sh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/test/benchmark/nmodl-llvm-time.sh b/test/benchmark/nmodl-llvm-time.sh index b9a8240d5c..e240005522 100755 --- a/test/benchmark/nmodl-llvm-time.sh +++ b/test/benchmark/nmodl-llvm-time.sh @@ -86,8 +86,8 @@ ext_lib="libextkernel.so" icpc_exe=icpc declare -a icpc_flags=( # "-O2" - "-O2 -march=skylake-avx512 -mtune=skylake-avx512 -fimf-use-svml" - "-O2 -qopt-zmm-usage=high -xCORE-AVX512 -fimf-use-svml" + "-O2 -march=skylake-avx512 -mtune=skylake-avx512 -prec-div -fimf-use-svml" + "-O2 -qopt-zmm-usage=high -xCORE-AVX512 -prec-div -fimf-use-svml" ) From b1b851c07e2fe66d9b931dcb235c868af4227a6b Mon Sep 17 00:00:00 2001 From: Castiglioni Giacomo Date: Wed, 26 May 2021 15:58:16 +0200 Subject: [PATCH 079/331] added pragmas to cpp kernel, add ext kernel sse2 and avx2 --- test/benchmark/kernels/compute-bound.cpp | 29 ++++++++++++------------ test/benchmark/kernels/hh.cpp | 1 + test/benchmark/kernels/memory-bound.cpp | 1 + test/benchmark/nmodl-llvm-time.sh | 7 ++++++ 4 files changed, 24 insertions(+), 14 deletions(-) diff --git a/test/benchmark/kernels/compute-bound.cpp b/test/benchmark/kernels/compute-bound.cpp index a43edbb03e..db5cfb7cd2 100644 --- a/test/benchmark/kernels/compute-bound.cpp +++ b/test/benchmark/kernels/compute-bound.cpp @@ -1,19 +1,19 @@ #include <cmath> -struct hh_Instance { - double* __restrict__ minf; - double* __restrict__ mtau; - double* __restrict__ m; - double* __restrict__ Dm; - double* __restrict__ v_unused; - double* __restrict__ g_unused; - double* __restrict__ voltage; - int* __restrict__ node_index; - double t; - double dt; - double celsius; - int secondorder; - int node_count; +struct hh_Instance { // address + double* __restrict__ minf; // 0 + double* __restrict__ mtau; // 8 + double* __restrict__ m; // 16 + double* __restrict__ Dm; // 24 + double* __restrict__ v_unused; // 32 + double* __restrict__ g_unused; // 40 + double* __restrict__ voltage; // 48 + int* __restrict__ node_index; // 56 + double t; // 64 + double dt; // 72 + double celsius; // 80 + int secondorder; // 88 + int node_count; // 92 }; void nrn_state_hh_ext(void* __restrict__ mech){ @@ -21,6 +21,7 @@ void nrn_state_hh_ext(void* __restrict__ mech){ auto inst = static_cast<hh_Instance*>(mech); int id; int node_id; double v; + #pragma ivdep for(int id = 0; id < inst->node_count; ++id) { node_id = inst->node_index[id]; v = inst->voltage[node_id]; diff --git a/test/benchmark/kernels/hh.cpp b/test/benchmark/kernels/hh.cpp index eac7f58818..2c441712ad 100644 --- a/test/benchmark/kernels/hh.cpp +++ b/test/benchmark/kernels/hh.cpp @@ -52,6 +52,7 @@ void nrn_state_hh_ext(void* __restrict__ mech){ int id; int node_id, ena_id, ek_id; double v; + #pragma ivdep for(id = 0; id < inst->node_count; id = ++id) { node_id = inst->node_index[id]; ena_id = inst->ion_ena_index[id]; diff --git a/test/benchmark/kernels/memory-bound.cpp
b/test/benchmark/kernels/memory-bound.cpp index 8beead4fde..8f2c165f72 100644 --- a/test/benchmark/kernels/memory-bound.cpp +++ b/test/benchmark/kernels/memory-bound.cpp @@ -26,6 +26,7 @@ void nrn_state_hh_ext(void* __restrict__ mech){ int id; int node_id, nai_id, ion_nai_id; double v; + #pragma ivdep for(int id = 0; id < inst->node_count; ++id) { node_id = inst->node_index[id]; nai_id = inst->ion_nai_index[id]; diff --git a/test/benchmark/nmodl-llvm-time.sh b/test/benchmark/nmodl-llvm-time.sh index e240005522..53b2bbe9a6 100755 --- a/test/benchmark/nmodl-llvm-time.sh +++ b/test/benchmark/nmodl-llvm-time.sh @@ -88,6 +88,9 @@ declare -a icpc_flags=( # "-O2" "-O2 -march=skylake-avx512 -mtune=skylake-avx512 -prec-div -fimf-use-svml" "-O2 -qopt-zmm-usage=high -xCORE-AVX512 -prec-div -fimf-use-svml" + "-O2 -mavx512f -prec-div -fimf-use-svml" + "-O2 -mavx2 -prec-div -fimf-use-svml" + "-O2 -msse2 -prec-div -fimf-use-svml" ) clang_bin_path="/gpfs/bbp.cscs.ch/data/project/proj16/software/llvm/install/0521/bin" @@ -97,12 +100,15 @@ declare -a clang_flags=( # "-O3" "-O3 -march=skylake-avx512 -fveclib=SVML" "-O3 -march=skylake-avx512 -ffast-math -fveclib=SVML" "-O3 -mavx512f -ffast-math -fveclib=SVML" + "-O3 -mavx2 -ffast-math -fveclib=SVML" +
"-O3 -msse2 -ffast-math -fveclib=SVML" + "-O3 -mavx512f -fveclib=SVML" + "-O3 -mavx2 -ffast-math -fopemp -fveclib=SVML" + "-O3 -msse2 -ffast-math -fopemp -fveclib=SVML" + ) + +gcc_bin_path="/gpfs/bbp.cscs.ch/ssd/apps/hpc/jenkins/deploy/compilers/2021-01-06/linux-rhel7-x86_64/gcc-4.8.5/gcc-9.3.0-45gzrp/bin" +gcc_exe=${gcc_bin_path}/g++ +declare -a gcc_flags=( + "-O3 -mavx512f -ffast-math -ftree-vectorize -mveclibabi=svml" + "-O3 -mavx2 -ffast-math -ftree-vectorize -mveclibabi=svml" + "-O3 -msse2 -ffast-math -ftree-vectorize -mveclibabi=svml" ) # loop over options @@ -109,7 +116,7 @@ for kernel_target in compute-bound memory-bound hh; do echo "kernel: "${kernel_target} # loop over other compilers - for compiler in icpc clang; do + for compiler in icpc clang gcc; do echo "| compiler: "${compiler} compiler_exe=${compiler}_exe From 80e85bf22acab3e4a972f722c7af8c7358910d39 Mon Sep 17 00:00:00 2001 From: Castiglioni Giacomo Date: Fri, 28 May 2021 09:56:17 +0200 Subject: [PATCH 081/331] flag typo --- test/benchmark/nmodl-llvm-time.sh | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/test/benchmark/nmodl-llvm-time.sh b/test/benchmark/nmodl-llvm-time.sh index 439f51e4d8..f73db98384 100755 --- a/test/benchmark/nmodl-llvm-time.sh +++ b/test/benchmark/nmodl-llvm-time.sh @@ -96,11 +96,11 @@ clang_bin_path="/gpfs/bbp.cscs.ch/data/project/proj16/software/llvm/install/0521 clang_exe=${clang_bin_path}/clang++ declare -a clang_flags=( "-O3 -march=skylake-avx512 -ffast-math -fveclib=SVML" - "-O3 -mavx512f -ffast-math -fopemp -fveclib=SVML" + "-O3 -mavx512f -ffast-math -fopenmp -fveclib=SVML" "-O3 -mavx512f -ffast-math -fveclib=SVML" "-O3 -mavx512f -fveclib=SVML" - "-O3 -mavx2 -ffast-math -fopemp -fveclib=SVML" - "-O3 -msse2 -ffast-math -fopemp -fveclib=SVML" + "-O3 -mavx2 -ffast-math -fopenmp -fveclib=SVML" + "-O3 -msse2 -ffast-math -fopenmp -fveclib=SVML" ) gcc_bin_path="/gpfs/bbp.cscs.ch/ssd/apps/hpc/jenkins/deploy/compilers/2021-01-06/linux-rhel7-x86_64/gcc-4.8.5/gcc-9.3.0-45gzrp/bin" From 102210e936668c6383d619e0f84a72565653ad64 Mon Sep 17 00:00:00 2001 From: Castiglioni Giacomo Date: Mon, 31 May 2021 10:05:31 +0200 Subject: [PATCH 082/331] rebased and updated llvm path --- test/benchmark/nmodl-llvm-time.sh | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/test/benchmark/nmodl-llvm-time.sh b/test/benchmark/nmodl-llvm-time.sh index f73db98384..810389b2a4 100755 --- a/test/benchmark/nmodl-llvm-time.sh +++ b/test/benchmark/nmodl-llvm-time.sh @@ -92,7 +92,7 @@ declare -a icpc_flags=( "-O2 -msse2 -prec-div -fimf-use-svml" ) -clang_bin_path="/gpfs/bbp.cscs.ch/data/project/proj16/software/llvm/install/0521/bin" +clang_bin_path="/gpfs/bbp.cscs.ch/data/project/proj16/software/llvm/install/0621/bin" clang_exe=${clang_bin_path}/clang++ declare -a clang_flags=( "-O3 -march=skylake-avx512 -ffast-math -fveclib=SVML" @@ -112,11 +112,13 @@ declare -a gcc_flags=( ) # loop over options -for kernel_target in compute-bound memory-bound hh; do +# for kernel_target in compute-bound memory-bound hh; do +for kernel_target in hh; do echo "kernel: "${kernel_target} # loop over other compilers - for compiler in icpc clang gcc; do + # for compiler in icpc clang gcc; do + for compiler in clang; do echo "| compiler: "${compiler} compiler_exe=${compiler}_exe From e3a25becc7888ecb55d817f9cda5dd77d7717e3f Mon Sep 17 00:00:00 2001 From: Castiglioni Giacomo Date: Mon, 31 May 2021 18:03:14 +0200 Subject: [PATCH 083/331] fixed llvm path --- test/benchmark/nmodl-llvm-time.sh | 16 +++++++++------- 
1 file changed, 9 insertions(+), 7 deletions(-) diff --git a/test/benchmark/nmodl-llvm-time.sh b/test/benchmark/nmodl-llvm-time.sh index 810389b2a4..d0998e8f3e 100755 --- a/test/benchmark/nmodl-llvm-time.sh +++ b/test/benchmark/nmodl-llvm-time.sh @@ -92,15 +92,16 @@ declare -a icpc_flags=( "-O2 -msse2 -prec-div -fimf-use-svml" ) -clang_bin_path="/gpfs/bbp.cscs.ch/data/project/proj16/software/llvm/install/0621/bin" -clang_exe=${clang_bin_path}/clang++ +llvm_path="/gpfs/bbp.cscs.ch/apps/hpc/llvm-install/0621" +llvm_lib=${llvm_path}/lib +clang_exe=${llvm_path}/bin/clang++ declare -a clang_flags=( - "-O3 -march=skylake-avx512 -ffast-math -fveclib=SVML" "-O3 -mavx512f -ffast-math -fopenmp -fveclib=SVML" - "-O3 -mavx512f -ffast-math -fveclib=SVML" - "-O3 -mavx512f -fveclib=SVML" "-O3 -mavx2 -ffast-math -fopenmp -fveclib=SVML" "-O3 -msse2 -ffast-math -fopenmp -fveclib=SVML" + "-O3 -mavx512f -ffast-math -fveclib=SVML" + "-O3 -mavx512f -fveclib=SVML" + "-O3 -march=skylake-avx512 -ffast-math -fopenmp -fveclib=SVML" ) gcc_bin_path="/gpfs/bbp.cscs.ch/ssd/apps/hpc/jenkins/deploy/compilers/2021-01-06/linux-rhel7-x86_64/gcc-4.8.5/gcc-9.3.0-45gzrp/bin" @@ -113,7 +114,8 @@ declare -a gcc_flags=( # loop over options # for kernel_target in compute-bound memory-bound hh; do -for kernel_target in hh; do +for kernel_target in compute-bound memory-bound; do +# for kernel_target in hh; do echo "kernel: "${kernel_target} # loop over other compilers @@ -145,7 +147,7 @@ for kernel_target in hh; do --backend default" # run experiment - ${debug} eval "LD_LIBRARY_PATH=${ext_path}:${vec_lib_path} ${nmodl_exe} ${nmodl_args} &> ${kernel_target}_${spec}.log" + ${debug} eval "LD_LIBRARY_PATH=${ext_path}:${vec_lib_path}:${llvm_lib} ${nmodl_exe} ${nmodl_args} &> ${kernel_target}_${spec}.log" done done From db55a30c2d8d462d7043892922e06a38c5e2fa75 Mon Sep 17 00:00:00 2001 From: Ioannis Magkanaris Date: Tue, 1 Jun 2021 15:37:52 +0200 Subject: [PATCH 084/331] Edited run script for generating graphs --- test/benchmark/nmodl-llvm-time.sh | 98 ++++++++++++++++++++++++------- 1 file changed, 78 insertions(+), 20 deletions(-) diff --git a/test/benchmark/nmodl-llvm-time.sh b/test/benchmark/nmodl-llvm-time.sh index d0998e8f3e..8e869ff0c9 100755 --- a/test/benchmark/nmodl-llvm-time.sh +++ b/test/benchmark/nmodl-llvm-time.sh @@ -4,11 +4,15 @@ # Driver for nmodl-llvm benchmarking # - +# sh nmodl-llvm-time.sh -vec-sweep -mod-dir /gpfs/bbp.cscs.ch/data/scratch/proj16/magkanar/nmodl/bbp_mod -n 100000000 # default params inst_size=100000000 num_exp=10 vec_width=8 +external_kernel_exec=false +modfile_directory=$(pwd) +vec_width_sweep=false +output_dir=$(pwd) # version version="0.0.1" @@ -49,6 +53,24 @@ while [[ "$1" != "" ]]; do shift shift ;; + -ext|--external-kernel) + external_kernel_exec=true + shift + ;; + -vec-sweep|--vec-width-sweep) + vec_width_sweep=true + shift + ;; + -mod-dir|--modfile-directory) + modfile_directory=$2 + shift + shift + ;; + -o|--output-directory) + output_dir=$2 + shift + shift + ;; -d|--dry-run) echo "debug mode" debug=echo @@ -75,12 +97,17 @@ vec_lib_path="/gpfs/bbp.cscs.ch/ssd/apps/hpc/jenkins/deploy/compilers/2021-01-06 vec_lib="libsvml.so" # nmodl -nmodl_exe="/gpfs/bbp.cscs.ch/home/gcastigl/project16/nmodl-llvm/build/install/bin/nmodl" +nmodl_exe="/gpfs/bbp.cscs.ch/data/scratch/proj16/magkanar/nmodl/build_llvm/install/bin/nmodl" # external kernel nmodl_src_path="/gpfs/bbp.cscs.ch/home/gcastigl/project16/nmodl-llvm" kernels_path=${nmodl_src_path}/"test/benchmark/kernels" ext_lib="libextkernel.so" +if 
${external_kernel_exec}; then + modfile_directory=${kernels_path} +fi + +mkdir -p ${output_dir} # compilers icpc_exe=icpc @@ -114,7 +141,9 @@ declare -a gcc_flags=( # loop over options # for kernel_target in compute-bound memory-bound hh; do -for kernel_target in compute-bound memory-bound; do +#for kernel_target in compute-bound memory-bound; do +#for kernel_target in Ca_HVA2 can2 cat DetAMPANMDA DetGABAAB SKv3_1; do +for kernel_target in ; do # for kernel_target in hh; do echo "kernel: "${kernel_target} @@ -126,29 +155,58 @@ for kernel_target in compute-bound memory-bound; do compiler_exe=${compiler}_exe compiler_flags=${compiler}_flags[@] - for flags in "${!compiler_flags}"; do - echo "| | flags: "${flags} - - spec=${compiler}_${flags//[[:blank:]]/} - rel_ext_path=${kernel_target}_${spec} - - ${debug} mkdir ${rel_ext_path} - ${debug} cd ${rel_ext_path} - ext_path=$(pwd) - ${debug} ${!compiler_exe} ${flags} ${kernels_path}/${kernel_target}.cpp \ - -shared -fpic -o ${ext_lib} - ${debug} eval "llvm-objdump ${ext_lib} -d > ${ext_lib::-1}" - ${debug} cd .. - - nmodl_args="${kernels_path}/${kernel_target}.mod llvm --ir --fmf nnan contract afn --vector-width ${vec_width} --veclib SVML benchmark \ + if $external_kernel_exec; then + for flags in "${!compiler_flags}"; do + echo "| | flags: "${flags} + + + spec=${compiler}_${flags//[[:blank:]]/} + rel_ext_path=${kernel_target}_${spec} + + ${debug} mkdir ${rel_ext_path} + ${debug} cd ${rel_ext_path} + ext_path=$(pwd) + ${debug} ${!compiler_exe} ${flags} ${kernels_path}/${kernel_target}.cpp \ + -shared -fpic -o ${ext_lib} + ${debug} eval "llvm-objdump ${ext_lib} -d > ${ext_lib::-1}" + ${debug} cd .. + done + fi + + if $vec_width_sweep; then + for power_of_two in $(seq 1 3); do + vec_width=$((2**${power_of_two})) + echo "| | Running JIT for vec width ${vec_width}" + nmodl_args="${modfile_directory}/${kernel_target}.mod passes --inline \ + llvm --ir --fmf nnan contract afn --vector-width ${vec_width} --veclib SVML \ + benchmark \ + --opt-level-ir 3 --opt-level-codegen 3 --run --instance-size ${inst_size} \ + --repeat ${num_exp} \ + --libs ${vec_lib_path}/${vec_lib} \ + --backend default" + + # run experiment + if $external_kernel_exec; then + ${debug} eval "LD_LIBRARY_PATH=${ext_path}:${vec_lib_path}:${llvm_lib} ${nmodl_exe} ${nmodl_args} &> ${output_dir}/${kernel_target}_${spec}_v${vec_width}.log" + else + ${debug} eval "LD_LIBRARY_PATH=${vec_lib_path}:${llvm_lib} ${nmodl_exe} ${nmodl_args} &> ${output_dir}/${kernel_target}_${spec}_v${vec_width}.log" + fi + done + else + nmodl_args="${modfile_directory}/${kernel_target}.mod passes --inline \ + llvm --ir --fmf nnan contract afn --vector-width ${vec_width} --veclib SVML benchmark \ --opt-level-ir 3 --opt-level-codegen 3 --run --instance-size ${inst_size} \ --repeat ${num_exp} \ --libs ${vec_lib_path}/${vec_lib} \ --backend default" # run experiment - ${debug} eval "LD_LIBRARY_PATH=${ext_path}:${vec_lib_path}:${llvm_lib} ${nmodl_exe} ${nmodl_args} &> ${kernel_target}_${spec}.log" - done + if $external_kernel_exec; then + ${debug} eval "LD_LIBRARY_PATH=${ext_path}:${vec_lib_path}:${llvm_lib} ${nmodl_exe} ${nmodl_args} &> ${output_dir}/${kernel_target}_${spec}_v${vec_width}.log" + else + ${debug} eval "LD_LIBRARY_PATH=${vec_lib_path}:${llvm_lib} ${nmodl_exe} ${nmodl_args} &> ${output_dir}/${kernel_target}_${spec}_v${vec_width}.log" + fi + fi done done \ No newline at end of file From ca97d232dc5d442329ef87421445ddf49eff35fb Mon Sep 17 00:00:00 2001 From: George Mitenkov Date: Thu, 3 Jun 2021 00:01:58 
-0700 Subject: [PATCH 085/331] [LLVM] Enhancements for optimization pipeline (#683) Added several improvements to the way optimizations are run for the LLVM code generation pipeline and benchmarking. 1. Created `llvm_utils` files that currently have logic/implementations for optimizing the IR. In the future, things like dumping IR to a file will also go there. This allows sharing the optimization infrastructure between benchmarking and the LLVM visitor. 2. Replaced `--opt` with `--opt-level-ir` for the LLVM visitor. The `--opt` option was duplicated by `--opt-level-ir` in the benchmarking infrastructure. With the new `llvm_utils` package, we can simply reuse the optimization routines and use optimization levels instead. 3. Added IPO and AggressiveInstCombine passes. Importantly, if running the benchmark, the IR is still optimized after the `targetMachine` is created to benefit from target-specific optimizations. Example: ```bash bin/nmodl test.mod llvm --ir --single-precision --vector-width 4 --opt-level-ir 3 \ benchmark --run --opt-level-codegen 3 ``` Co-authored-by: Pramod Kumbhar --- ci/bb5-pr.sh | 5 +- cmake/LLVMHelper.cmake | 1 + src/codegen/llvm/CMakeLists.txt | 4 +- src/codegen/llvm/codegen_llvm_visitor.cpp | 37 +++------ src/codegen/llvm/codegen_llvm_visitor.hpp | 27 ++----- src/codegen/llvm/llvm_utils.cpp | 79 +++++++++++++++++++ src/codegen/llvm/llvm_utils.hpp | 23 ++++++ src/codegen/llvm/main.cpp | 2 +- src/main.cpp | 18 ++--- test/benchmark/jit_driver.cpp | 65 +--------------- test/benchmark/jit_driver.hpp | 2 +- test/benchmark/llvm_benchmark.hpp | 2 +- test/unit/codegen/codegen_llvm_execution.cpp | 12 +-- .../codegen/codegen_llvm_instance_struct.cpp | 6 +- test/unit/codegen/codegen_llvm_ir.cpp | 38 ++++----- 15 files changed, 166 insertions(+), 155 deletions(-) create mode 100644 src/codegen/llvm/llvm_utils.cpp create mode 100644 src/codegen/llvm/llvm_utils.hpp diff --git a/ci/bb5-pr.sh b/ci/bb5-pr.sh index abdce2d867..c1e7414192 100755 --- a/ci/bb5-pr.sh +++ b/ci/bb5-pr.sh @@ -20,9 +20,8 @@ function bb5_pr_setup_virtualenv() { } function find_clang_format() { - module load llvm - clang_format_exe=$(which clang-format) - module unload llvm + # bb5 has only llvm-12 which is not compatible with hpc-coding-convention + clang_format_exe=/gpfs/bbp.cscs.ch/ssd/apps/hpc/jenkins/deploy/externals/2021-01-06/linux-rhel7-x86_64/gcc-9.3.0/llvm-11.0.0-kzl4o5/bin/clang-format } function build_with() { diff --git a/cmake/LLVMHelper.cmake b/cmake/LLVMHelper.cmake index 780ae29cfa..9e4af5d503 100644 --- a/cmake/LLVMHelper.cmake +++ b/cmake/LLVMHelper.cmake @@ -6,6 +6,7 @@ find_package(LLVM REQUIRED CONFIG) # include LLVM libraries set(NMODL_LLVM_COMPONENTS + aggressiveinstcombine analysis codegen core diff --git a/src/codegen/llvm/CMakeLists.txt b/src/codegen/llvm/CMakeLists.txt index b927475f15..5c7eadc91c 100644 --- a/src/codegen/llvm/CMakeLists.txt +++ b/src/codegen/llvm/CMakeLists.txt @@ -9,7 +9,9 @@ set(LLVM_CODEGEN_SOURCE_FILES ${CMAKE_CURRENT_SOURCE_DIR}/llvm_debug_builder.cpp ${CMAKE_CURRENT_SOURCE_DIR}/llvm_debug_builder.hpp ${CMAKE_CURRENT_SOURCE_DIR}/llvm_ir_builder.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/llvm_ir_builder.hpp) + ${CMAKE_CURRENT_SOURCE_DIR}/llvm_ir_builder.hpp + ${CMAKE_CURRENT_SOURCE_DIR}/llvm_utils.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/llvm_utils.hpp) # ============================================================================= # LLVM codegen library and executable diff --git a/src/codegen/llvm/codegen_llvm_visitor.cpp b/src/codegen/llvm/codegen_llvm_visitor.cpp index 1e5ca89c6d..ffbedbb063 100644 ---
a/src/codegen/llvm/codegen_llvm_visitor.cpp +++ b/src/codegen/llvm/codegen_llvm_visitor.cpp @@ -6,6 +6,7 @@ *************************************************************************/ #include "codegen/llvm/codegen_llvm_visitor.hpp" +#include "codegen/llvm/llvm_utils.hpp" #include "ast/all.hpp" #include "visitors/rename_visitor.hpp" @@ -15,6 +16,7 @@ #include "llvm/IR/AssemblyAnnotationWriter.h" #include "llvm/IR/BasicBlock.h" #include "llvm/IR/Function.h" +#include "llvm/IR/LegacyPassManager.h" #include "llvm/IR/Type.h" #include "llvm/Support/FileSystem.h" #include "llvm/Support/Host.h" @@ -431,25 +433,6 @@ llvm::Value* CodegenLLVMVisitor::read_variable(const ast::VarName& node) { "' is not supported\n"); } -void CodegenLLVMVisitor::run_ir_opt_passes() { - // Run some common optimisation passes that are commonly suggested. - opt_pm.add(llvm::createInstructionCombiningPass()); - opt_pm.add(llvm::createReassociatePass()); - opt_pm.add(llvm::createGVNPass()); - opt_pm.add(llvm::createCFGSimplificationPass()); - - // Initialize pass manager. - opt_pm.doInitialization(); - - // Iterate over all functions and run the optimisation passes. - auto& functions = module->getFunctionList(); - for (auto& function: functions) { - llvm::verifyFunction(function); - opt_pm.run(function); - } - opt_pm.doFinalization(); -} - void CodegenLLVMVisitor::write_to_variable(const ast::VarName& node, llvm::Value* value) { const auto& identifier = node.get_name(); if (!identifier->is_name() && !identifier->is_indexed_name() && @@ -874,9 +857,10 @@ void CodegenLLVMVisitor::visit_program(const ast::Program& node) { throw std::runtime_error("Error: incorrect IR has been generated!\n" + ostream.str()); } - if (opt_passes) { + if (opt_level_ir) { logger->info("Running LLVM optimisation passes"); - run_ir_opt_passes(); + utils::initialise_optimisation_passes(); + utils::optimise_module(*module, opt_level_ir); } // Optionally, replace LLVM math intrinsics with vector library calls. @@ -893,14 +877,15 @@ void CodegenLLVMVisitor::visit_program(const ast::Program& node) { add_vectorizable_functions_from_vec_lib(target_lib_info, triple); // Run passes that replace math intrinsics. - codegen_pm.add(new llvm::TargetLibraryInfoWrapperPass(target_lib_info)); - codegen_pm.add(new llvm::ReplaceWithVeclibLegacy); - codegen_pm.doInitialization(); + llvm::legacy::FunctionPassManager fpm(module.get()); + fpm.add(new llvm::TargetLibraryInfoWrapperPass(target_lib_info)); + fpm.add(new llvm::ReplaceWithVeclibLegacy); + fpm.doInitialization(); for (auto& function: module->getFunctionList()) { if (!function.isDeclaration()) - codegen_pm.run(function); + fpm.run(function); } - codegen_pm.doFinalization(); + fpm.doFinalization(); #endif } diff --git a/src/codegen/llvm/codegen_llvm_visitor.hpp b/src/codegen/llvm/codegen_llvm_visitor.hpp index 49285f9941..5dd8eda15c 100644 --- a/src/codegen/llvm/codegen_llvm_visitor.hpp +++ b/src/codegen/llvm/codegen_llvm_visitor.hpp @@ -28,12 +28,8 @@ #include "llvm/Analysis/TargetLibraryInfo.h" #include "llvm/IR/DIBuilder.h" #include "llvm/IR/LLVMContext.h" -#include "llvm/IR/LegacyPassManager.h" #include "llvm/IR/Module.h" #include "llvm/IR/Verifier.h" -#include "llvm/Transforms/InstCombine/InstCombine.h" -#include "llvm/Transforms/Scalar.h" -#include "llvm/Transforms/Scalar/GVN.h" namespace nmodl { namespace codegen { @@ -82,14 +78,8 @@ class CodegenLLVMVisitor: public visitor::ConstAstVisitor { /// Instance variable helper. InstanceVarHelper instance_var_helper; - /// Run optimisation passes if true. 
- bool opt_passes; - - /// Pass manager for optimisation passes that are run on IR and are not related to target. - llvm::legacy::FunctionPassManager opt_pm; - - /// Pass manager for optimisation passes that are used for target code generation. - llvm::legacy::FunctionPassManager codegen_pm; + /// Optimisation level for LLVM IR transformations. + int opt_level_ir; /// Vector library used for math functions. std::string vector_library; @@ -100,7 +90,7 @@ class CodegenLLVMVisitor: public visitor::ConstAstVisitor { public: CodegenLLVMVisitor(const std::string& mod_filename, const std::string& output_dir, - bool opt_passes, + int opt_level_ir, bool use_single_precision = false, int vector_width = 1, std::string vec_lib = "none", @@ -108,14 +98,12 @@ class CodegenLLVMVisitor: public visitor::ConstAstVisitor { std::vector<std::string> fast_math_flags = {}) : mod_filename(mod_filename) , output_dir(output_dir) - , opt_passes(opt_passes) + , opt_level_ir(opt_level_ir) , vector_width(vector_width) , vector_library(vec_lib) , add_debug_information(add_debug_information) , ir_builder(*context, use_single_precision, vector_width, fast_math_flags) - , debug_builder(*module) - , codegen_pm(module.get()) - , opt_pm(module.get()) {} + , debug_builder(*module) {} /// Dumps the generated LLVM IR module to string. std::string dump_module() const { @@ -228,11 +216,6 @@ class CodegenLLVMVisitor: public visitor::ConstAstVisitor { /// Reads the given variable and returns the processed value. llvm::Value* read_variable(const ast::VarName& node); - - /// Run multiple LLVM optimisation passes on generated IR. - /// TODO: this can be moved to a dedicated file or deprecated. - void run_ir_opt_passes(); - //// Writes the value to the given variable. void write_to_variable(const ast::VarName& node, llvm::Value* value); }; diff --git a/src/codegen/llvm/llvm_utils.cpp b/src/codegen/llvm/llvm_utils.cpp new file mode 100644 index 0000000000..684f962b76 --- /dev/null +++ b/src/codegen/llvm/llvm_utils.cpp @@ -0,0 +1,79 @@ +/************************************************************************* + * Copyright (C) 2018-2020 Blue Brain Project + * + * This file is part of NMODL distributed under the terms of the GNU + * Lesser General Public License. See top-level LICENSE file for details. + *************************************************************************/ + +#include "codegen/llvm/llvm_utils.hpp" + +#include "llvm/Analysis/TargetTransformInfo.h" +#include "llvm/IR/LegacyPassManager.h" +#include "llvm/IR/Verifier.h" +#include "llvm/InitializePasses.h" +#include "llvm/Target/TargetMachine.h" +#include "llvm/Transforms/IPO/PassManagerBuilder.h" + +namespace nmodl { +namespace utils { + +/// Populates pass managers with passes for the given optimisation levels. +static void populate_pms(llvm::legacy::FunctionPassManager& func_pm, + llvm::legacy::PassManager& module_pm, + int opt_level, + int size_level, + llvm::TargetMachine* tm) { + // First, set the pass manager builder with some basic optimisation information. + llvm::PassManagerBuilder pm_builder; + pm_builder.OptLevel = opt_level; + pm_builder.SizeLevel = size_level; + pm_builder.DisableUnrollLoops = opt_level == 0; + + // If target machine is defined, then initialise the TargetTransformInfo for the target. + if (tm) { + module_pm.add(createTargetTransformInfoWrapperPass(tm->getTargetIRAnalysis())); + func_pm.add(createTargetTransformInfoWrapperPass(tm->getTargetIRAnalysis())); + } + + // Populate pass managers.
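+    // note: PassManagerBuilder assembles the standard -O pipelines, so the resulting pass selection roughly matches what clang/opt run at the same level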
+ pm_builder.populateModulePassManager(module_pm); + pm_builder.populateFunctionPassManager(func_pm); +} + +/// Runs the function and module passes on the provided module. +static void run_optimisation_passes(llvm::Module& module, + llvm::legacy::FunctionPassManager& func_pm, + llvm::legacy::PassManager& module_pm) { + func_pm.doInitialization(); + auto& functions = module.getFunctionList(); + for (auto& function: functions) { + llvm::verifyFunction(function); + func_pm.run(function); + } + func_pm.doFinalization(); + module_pm.run(module); +} + +/****************************************************************************************/ +/* Optimisation utils */ +/****************************************************************************************/ + +void initialise_optimisation_passes() { + auto& registry = *llvm::PassRegistry::getPassRegistry(); + llvm::initializeCore(registry); + llvm::initializeTransformUtils(registry); + llvm::initializeScalarOpts(registry); + llvm::initializeIPO(registry); + llvm::initializeInstCombine(registry); + llvm::initializeAggressiveInstCombine(registry); + llvm::initializeAnalysis(registry); +} + +void optimise_module(llvm::Module& module, int opt_level, llvm::TargetMachine* tm) { + llvm::legacy::FunctionPassManager func_pm(&module); + llvm::legacy::PassManager module_pm; + populate_pms(func_pm, module_pm, opt_level, /*size_level=*/0, tm); + run_optimisation_passes(module, func_pm, module_pm); +} +} // namespace utils +} // namespace nmodl diff --git a/src/codegen/llvm/llvm_utils.hpp b/src/codegen/llvm/llvm_utils.hpp new file mode 100644 index 0000000000..81dc30d97f --- /dev/null +++ b/src/codegen/llvm/llvm_utils.hpp @@ -0,0 +1,23 @@ +/************************************************************************* + * Copyright (C) 2018-2020 Blue Brain Project + * + * This file is part of NMODL distributed under the terms of the GNU + * Lesser General Public License. See top-level LICENSE file for details. + *************************************************************************/ + +#pragma once + +#include "llvm/IR/Module.h" +#include "llvm/Support/TargetRegistry.h" + +namespace nmodl { +namespace utils { + +/// Initialises some LLVM optimisation passes. +void initialise_optimisation_passes(); + +/// Optimises the given LLVM IR module. 
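+/// If a target machine is given, target-specific analyses (TargetTransformInfo) are registered as well.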
+void optimise_module(llvm::Module& module, int opt_level, llvm::TargetMachine* tm = nullptr); + +} // namespace utils +} // namespace nmodl diff --git a/src/codegen/llvm/main.cpp b/src/codegen/llvm/main.cpp index 2f4e1f653d..6d374999c3 100644 --- a/src/codegen/llvm/main.cpp +++ b/src/codegen/llvm/main.cpp @@ -48,7 +48,7 @@ int main(int argc, const char* argv[]) { visitor::SymtabVisitor().visit_program(*ast); logger->info("Running LLVM Visitor"); - codegen::CodegenLLVMVisitor llvm_visitor(filename, /*output_dir=*/".", /*opt_passes=*/false); + codegen::CodegenLLVMVisitor llvm_visitor(filename, /*output_dir=*/".", /*opt_level_ir=*/0); llvm_visitor.visit_program(*ast); std::unique_ptr<llvm::Module> module = llvm_visitor.get_module(); diff --git a/src/main.cpp b/src/main.cpp index 64a5a99fca..4192a44f57 100644 --- a/src/main.cpp +++ b/src/main.cpp @@ -171,9 +171,6 @@ int main(int argc, const char* argv[]) { /// use single precision floating-point types bool llvm_float_type(false); - /// run llvm optimisation passes - bool llvm_ir_opt_passes(false); - /// llvm vector width int llvm_vec_width = 1; @@ -321,9 +318,9 @@ int main(int argc, const char* argv[]) { llvm_opt->add_flag("--disable-debug-info", disable_debug_information, "Disable debug information ({})"_format(disable_debug_information))->ignore_case(); - llvm_opt->add_flag("--opt", - llvm_ir_opt_passes, - "Run few common LLVM IR optimisation passes ({})"_format(llvm_ir_opt_passes))->ignore_case(); + llvm_opt->add_option("--opt-level-ir", + llvm_opt_level_ir, + "LLVM IR optimisation level (O{})"_format(llvm_opt_level_ir))->ignore_case()->check(CLI::IsMember({"0", "1", "2", "3"})); llvm_opt->add_flag("--single-precision", llvm_float_type, "Use single precision floating-point types ({})"_format(llvm_float_type))->ignore_case(); @@ -342,9 +339,6 @@ int main(int argc, const char* argv[]) { benchmark_opt->add_flag("--run", run_llvm_benchmark, "Run LLVM benchmark ({})"_format(run_llvm_benchmark))->ignore_case(); - benchmark_opt->add_option("--opt-level-ir", - llvm_opt_level_ir, - "LLVM IR optimisation level (O{})"_format(llvm_opt_level_ir))->ignore_case()->check(CLI::IsMember({"0", "1", "2", "3"})); benchmark_opt->add_option("--opt-level-codegen", llvm_opt_level_codegen, "Machine code optimisation level (O{})"_format(llvm_opt_level_codegen))->ignore_case()->check(CLI::IsMember({"0", "1", "2", "3"})); @@ -658,10 +652,14 @@ int main(int argc, const char* argv[]) { #ifdef NMODL_LLVM_BACKEND if (llvm_ir || run_llvm_benchmark) { + // If benchmarking, we want to optimize the IR with target information and not in + // LLVM visitor. + int llvm_opt_level = run_llvm_benchmark ?
0 : llvm_opt_level_ir; + logger->info("Running LLVM backend code generator"); CodegenLLVMVisitor visitor(modfile, output_dir, - llvm_ir_opt_passes, + llvm_opt_level, llvm_float_type, llvm_vec_width, vector_library, diff --git a/test/benchmark/jit_driver.cpp b/test/benchmark/jit_driver.cpp index a2d8df63f4..e5a7cd8928 100644 --- a/test/benchmark/jit_driver.cpp +++ b/test/benchmark/jit_driver.cpp @@ -7,9 +7,9 @@ #include "jit_driver.hpp" #include "codegen/llvm/codegen_llvm_visitor.hpp" +#include "codegen/llvm/llvm_utils.hpp" #include "utils/common_utils.hpp" -#include "llvm/Analysis/TargetTransformInfo.h" #include "llvm/ExecutionEngine/JITEventListener.h" #include "llvm/ExecutionEngine/ObjectCache.h" #include "llvm/ExecutionEngine/Orc/CompileUtils.h" @@ -21,12 +21,10 @@ #include "llvm/ExecutionEngine/Orc/RTDyldObjectLinkingLayer.h" #include "llvm/ExecutionEngine/SectionMemoryManager.h" #include "llvm/IR/AssemblyAnnotationWriter.h" -#include "llvm/InitializePasses.h" #include "llvm/Support/Host.h" #include "llvm/Support/TargetRegistry.h" #include "llvm/Support/TargetSelect.h" #include "llvm/Support/ToolOutputFile.h" -#include "llvm/Transforms/IPO/PassManagerBuilder.h" namespace nmodl { namespace runner { @@ -35,63 +33,6 @@ namespace runner { /* Utilities for JIT driver */ /****************************************************************************************/ -/// Initialises some LLVM optimisation passes. -static void initialise_optimisation_passes() { - auto& registry = *llvm::PassRegistry::getPassRegistry(); - llvm::initializeCore(registry); - llvm::initializeTransformUtils(registry); - llvm::initializeScalarOpts(registry); - llvm::initializeInstCombine(registry); - llvm::initializeAnalysis(registry); -} - -/// Populates pass managers with passes for the given optimisation levels. -static void populate_pms(llvm::legacy::FunctionPassManager& func_pm, - llvm::legacy::PassManager& module_pm, - int opt_level, - int size_level, - llvm::TargetMachine* tm) { - // First, set the pass manager builder with some basic optimisation information. - llvm::PassManagerBuilder pm_builder; - pm_builder.OptLevel = opt_level; - pm_builder.SizeLevel = size_level; - pm_builder.DisableUnrollLoops = opt_level == 0; - - // If target machine is defined, then initialise the TargetTransformInfo for the target. - if (tm) { - module_pm.add(createTargetTransformInfoWrapperPass(tm->getTargetIRAnalysis())); - func_pm.add(createTargetTransformInfoWrapperPass(tm->getTargetIRAnalysis())); - } - - // Populate pass managers. - pm_builder.populateModulePassManager(module_pm); - pm_builder.populateFunctionPassManager(func_pm); -} - -/// Runs the function and module passes on the provided module. -static void run_optimisation_passes(llvm::Module& module, - llvm::legacy::FunctionPassManager& func_pm, - llvm::legacy::PassManager& module_pm) { - func_pm.doInitialization(); - auto& functions = module.getFunctionList(); - for (auto& function: functions) { - llvm::verifyFunction(function); - func_pm.run(function); - } - func_pm.doFinalization(); - module_pm.run(module); -} - -/// Optimises the given LLVM IR module. -static void optimise_module(llvm::Module& module, - int opt_level, - llvm::TargetMachine* tm = nullptr) { - llvm::legacy::FunctionPassManager func_pm(&module); - llvm::legacy::PassManager module_pm; - populate_pms(func_pm, module_pm, opt_level, /*size_level=*/0, tm); - run_optimisation_passes(module, func_pm, module_pm); -} - /// Sets the target triple and the data layout of the module. 
static void set_triple_and_data_layout(llvm::Module& module, const std::string& features) { // Get the default target triple for the host. @@ -149,7 +90,7 @@ void JITDriver::init(std::string features, BenchmarkInfo* benchmark_info) { llvm::InitializeNativeTarget(); llvm::InitializeNativeTargetAsmPrinter(); - initialise_optimisation_passes(); + utils::initialise_optimisation_passes(); // Set the target triple and the data layout for the module. set_triple_and_data_layout(*module, features); @@ -211,7 +152,7 @@ void JITDriver::init(std::string features, // Optimise the LLVM IR module and save it to .ll file if benchmarking. if (benchmark_info) { - optimise_module(*module, benchmark_info->opt_level_ir, tm.get()); + utils::optimise_module(*module, benchmark_info->opt_level_ir, tm.get()); std::error_code error_code; std::unique_ptr out = diff --git a/test/benchmark/jit_driver.hpp b/test/benchmark/jit_driver.hpp index afb1317cd8..d8e1127417 100644 --- a/test/benchmark/jit_driver.hpp +++ b/test/benchmark/jit_driver.hpp @@ -29,7 +29,7 @@ struct BenchmarkInfo { /// Object file output directory. std::string output_dir; - /// Optimisation level for generated IR. + /// Optimisation level for IR. int opt_level_ir; /// Optimisation level for machine code generation. diff --git a/test/benchmark/llvm_benchmark.hpp b/test/benchmark/llvm_benchmark.hpp index 9696191172..4a66de52fc 100644 --- a/test/benchmark/llvm_benchmark.hpp +++ b/test/benchmark/llvm_benchmark.hpp @@ -43,7 +43,7 @@ class LLVMBenchmark { /// Benchmarking backend std::string backend; - /// Optimisation level for LLVM IR transformations. + /// Optimisation level for IR generation. int opt_level_ir; /// Optimisation level for machine code generation. diff --git a/test/unit/codegen/codegen_llvm_execution.cpp b/test/unit/codegen/codegen_llvm_execution.cpp index aa77a4e493..41605ecbd3 100644 --- a/test/unit/codegen/codegen_llvm_execution.cpp +++ b/test/unit/codegen/codegen_llvm_execution.cpp @@ -126,7 +126,7 @@ SCENARIO("Arithmetic expression", "[llvm][runner]") { SymtabVisitor().visit_program(*ast); codegen::CodegenLLVMVisitor llvm_visitor(/*mod_filename=*/"unknown", /*output_dir=*/".", - /*opt_passes=*/false); + /*opt_level_ir=*/0); llvm_visitor.visit_program(*ast); std::unique_ptr m = llvm_visitor.get_module(); @@ -228,7 +228,7 @@ SCENARIO("Optimised arithmetic expression", "[llvm][runner]") { SymtabVisitor().visit_program(*ast); codegen::CodegenLLVMVisitor llvm_visitor(/*mod_filename=*/"unknown", /*output_dir=*/".", - /*opt_passes=*/true); + /*opt_level_ir=*/3); llvm_visitor.visit_program(*ast); std::unique_ptr m = llvm_visitor.get_module(); @@ -301,7 +301,7 @@ SCENARIO("Simple scalar kernel", "[llvm][runner]") { SolveBlockVisitor().visit_program(*ast); codegen::CodegenLLVMVisitor llvm_visitor(/*mod_filename=*/"unknown", /*output_dir=*/".", - /*opt_passes=*/false, + /*opt_level_ir=*/0, /*use_single_precision=*/false, /*vector_width=*/1); llvm_visitor.visit_program(*ast); @@ -383,7 +383,7 @@ SCENARIO("Simple vectorised kernel", "[llvm][runner]") { SolveBlockVisitor().visit_program(*ast); codegen::CodegenLLVMVisitor llvm_visitor(/*mod_filename=*/"unknown", /*output_dir=*/".", - /*opt_passes=*/true, + /*opt_level_ir=*/3, /*use_single_precision=*/false, /*vector_width=*/4); llvm_visitor.visit_program(*ast); @@ -465,7 +465,7 @@ SCENARIO("Vectorised kernel with scatter instruction", "[llvm][runner]") { SolveBlockVisitor().visit_program(*ast); codegen::CodegenLLVMVisitor llvm_visitor(/*mod_filename=*/"unknown", /*output_dir=*/".", - 
/*opt_passes=*/false, + /*opt_level_ir=*/0, /*use_single_precision=*/false, /*vector_width=*/2); llvm_visitor.visit_program(*ast); @@ -556,7 +556,7 @@ SCENARIO("Vectorised kernel with simple control flow", "[llvm][runner]") { SolveBlockVisitor().visit_program(*ast); codegen::CodegenLLVMVisitor llvm_visitor(/*mod_filename=*/"unknown", /*output_dir=*/".", - /*opt_passes=*/false, + /*opt_level_ir=*/0, /*use_single_precision=*/false, /*vector_width=*/2); llvm_visitor.visit_program(*ast); diff --git a/test/unit/codegen/codegen_llvm_instance_struct.cpp b/test/unit/codegen/codegen_llvm_instance_struct.cpp index e77b6844ae..6042aecfc8 100644 --- a/test/unit/codegen/codegen_llvm_instance_struct.cpp +++ b/test/unit/codegen/codegen_llvm_instance_struct.cpp @@ -27,7 +27,7 @@ using nmodl::parser::NmodlDriver; //============================================================================= codegen::CodegenInstanceData generate_instance_data(const std::string& text, - bool opt = false, + int opt_level = 0, bool use_single_precision = false, int vector_width = 1, size_t num_elements = 100, @@ -41,7 +41,7 @@ codegen::CodegenInstanceData generate_instance_data(const std::string& text, codegen::CodegenLLVMVisitor llvm_visitor(/*mod_filename=*/"test", /*output_dir=*/".", - opt, + opt_level, use_single_precision, vector_width); llvm_visitor.visit_program(*ast); @@ -104,7 +104,7 @@ SCENARIO("Instance Struct creation", "[visitor][llvm][instance_struct]") { const size_t num_elements = 10; constexpr static double seed = 42; auto instance_data = generate_instance_data(nmodl_text, - /*opt=*/false, + /*opt_level=*/0, /*use_single_precision=*/true, /*vector_width*/ 1, num_elements, diff --git a/test/unit/codegen/codegen_llvm_ir.cpp b/test/unit/codegen/codegen_llvm_ir.cpp index fa0a649f2d..d43d99282d 100644 --- a/test/unit/codegen/codegen_llvm_ir.cpp +++ b/test/unit/codegen/codegen_llvm_ir.cpp @@ -35,7 +35,7 @@ using nmodl::parser::NmodlDriver; //============================================================================= std::string run_llvm_visitor(const std::string& text, - bool opt = false, + int opt_level = 0, bool use_single_precision = false, int vector_width = 1, std::string vec_lib = "none", @@ -53,7 +53,7 @@ std::string run_llvm_visitor(const std::string& text, codegen::CodegenLLVMVisitor llvm_visitor(/*mod_filename=*/"unknown", /*output_dir=*/".", - opt, + opt_level, use_single_precision, vector_width, vec_lib, @@ -99,7 +99,7 @@ SCENARIO("Binary expression", "[visitor][llvm]") { THEN("variables are loaded and add instruction is created") { std::string module_string = - run_llvm_visitor(nmodl_text, /*opt=*/false, /*use_single_precision=*/true); + run_llvm_visitor(nmodl_text, /*opt_level=*/0, /*use_single_precision=*/true); std::smatch m; std::regex rhs(R"(%1 = load float, float\* %b)"); @@ -179,7 +179,7 @@ SCENARIO("Binary expression", "[visitor][llvm]") { THEN("'pow' intrinsic is created") { std::string module_string = - run_llvm_visitor(nmodl_text, /*opt=*/false, /*use_single_precision=*/true); + run_llvm_visitor(nmodl_text, /*opt_level=*/0, /*use_single_precision=*/true); std::smatch m; // Check 'pow' intrinsic. 
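A note on the `opt_level_ir` values threaded through the tests above: all IR optimisation now funnels into the two helpers declared in llvm_utils.hpp, with the old boolean `--opt` toggle replaced by the usual 0-3 levels. A minimal sketch of how a caller drives them; `optimise_at` is an illustrative wrapper name, only the two `nmodl::utils` functions come from these patches:

    #include "codegen/llvm/llvm_utils.hpp"

    // Illustrative wrapper: optimise a freshly generated module at the given
    // level (0-3). Without a llvm::TargetMachine* argument, optimise_module
    // runs the passes with generic, target-agnostic cost information.
    void optimise_at(llvm::Module& module, int opt_level) {
        nmodl::utils::initialise_optimisation_passes();  // registers the passes once
        nmodl::utils::optimise_module(module, opt_level);
    }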
@@ -1046,7 +1046,7 @@ SCENARIO("Vectorised simple kernel", "[visitor][llvm]") { THEN("a gather instructions is created") { std::string module_string = run_llvm_visitor(nmodl_text, - /*opt=*/false, + /*opt_level=*/0, /*use_single_precision=*/false, /*vector_width=*/4); std::smatch m; @@ -1098,7 +1098,7 @@ SCENARIO("Vectorised simple kernel with ion writes", "[visitor][llvm]") { THEN("a scatter instructions is created") { std::string module_string = run_llvm_visitor(nmodl_text, - /*opt=*/false, + /*opt_level=*/0, /*use_single_precision=*/false, /*vector_width=*/4); std::smatch m; @@ -1154,7 +1154,7 @@ SCENARIO("Vectorised simple kernel with control flow", "[visitor][llvm]") { THEN("masked load and stores are created") { std::string module_string = run_llvm_visitor(nmodl_text, - /*opt=*/false, + /*opt_level=*/0, /*use_single_precision=*/true, /*vector_width=*/8); std::smatch m; @@ -1326,7 +1326,7 @@ SCENARIO("Vector library calls", "[visitor][llvm][vector_lib]") { // Check exponential intrinsic is created. std::string no_library_module_str = run_llvm_visitor(nmodl_text, - /*opt=*/false, + /*opt_level=*/0, /*use_single_precision=*/false, /*vector_width=*/2); std::regex exp_decl(R"(declare <2 x double> @llvm\.exp\.v2f64\(<2 x double>\))"); @@ -1337,7 +1337,7 @@ SCENARIO("Vector library calls", "[visitor][llvm][vector_lib]") { #if LLVM_VERSION_MAJOR >= 13 // Check exponential calls are replaced with calls to SVML library. std::string svml_library_module_str = run_llvm_visitor(nmodl_text, - /*opt=*/false, + /*opt_level=*/0, /*use_single_precision=*/false, /*vector_width=*/2, /*vec_lib=*/"SVML"); @@ -1350,7 +1350,7 @@ SCENARIO("Vector library calls", "[visitor][llvm][vector_lib]") { // Check that supported exponential calls are replaced with calls to MASSV library (i.e. // operating on vector of width 2). std::string massv2_library_module_str = run_llvm_visitor(nmodl_text, - /*opt=*/false, + /*opt_level=*/0, /*use_single_precision=*/false, /*vector_width=*/2, /*vec_lib=*/"MASSV"); @@ -1362,7 +1362,7 @@ SCENARIO("Vector library calls", "[visitor][llvm][vector_lib]") { // Check no replacement for MASSV happens for non-supported vector widths. std::string massv4_library_module_str = run_llvm_visitor(nmodl_text, - /*opt=*/false, + /*opt_level=*/0, /*use_single_precision=*/false, /*vector_width=*/4, /*vec_lib=*/"MASSV"); @@ -1372,7 +1372,7 @@ SCENARIO("Vector library calls", "[visitor][llvm][vector_lib]") { // Check correct replacement of @llvm.exp.v4f32 into @vexpf when using Accelerate. std::string accelerate_library_module_str = run_llvm_visitor(nmodl_text, - /*opt=*/false, + /*opt_level=*/0, /*use_single_precision=*/true, /*vector_width=*/4, /*vec_lib=*/"Accelerate"); @@ -1385,7 +1385,7 @@ SCENARIO("Vector library calls", "[visitor][llvm][vector_lib]") { // Check correct replacement of @llvm.exp.v2f64 into @_ZGV?N?v_exp when using SLEEF. std::string sleef_library_module_str = run_llvm_visitor(nmodl_text, - /*opt=*/false, + /*opt_level=*/0, /*use_single_precision=*/false, /*vector_width=*/2, /*vec_lib=*/"SLEEF"); @@ -1403,7 +1403,7 @@ SCENARIO("Vector library calls", "[visitor][llvm][vector_lib]") { // Check the replacements when using Darwin's libsystem_m. 
std::string libsystem_m_library_module_str = run_llvm_visitor(nmodl_text, - /*opt=*/false, + /*opt_level=*/0, /*use_single_precision=*/true, /*vector_width=*/4, /*vec_lib=*/"libsystem_m"); @@ -1432,7 +1432,7 @@ SCENARIO("Fast math flags", "[visitor][llvm]") { THEN("instructions are generated with the flags set") { std::string module_string = run_llvm_visitor(nmodl_text, - /*opt=*/true, + /*opt_level=*/3, /*use_single_precision=*/false, /*vector_width=*/1, /*vec_lib=*/"none", @@ -1462,12 +1462,12 @@ SCENARIO("Dead code removal", "[visitor][llvm][opt]") { )"; THEN("with optimisation enabled, all ops are eliminated") { - std::string module_string = run_llvm_visitor(nmodl_text, true); + std::string module_string = run_llvm_visitor(nmodl_text, /*opt_level=*/3); std::smatch m; - // Check if the values are optimised out + // Check if the values are optimised out. std::regex empty_proc( - R"(define i32 @add\(double %a[0-9].*, double %b[0-9].*\) \{\n(\s)*ret i32 0\n\})"); + R"(define i32 @add\(double %a[0-9].*, double %b[0-9].*\).*\{\n(\s)*ret i32 0\n\})"); REQUIRE(std::regex_search(module_string, m, empty_proc)); } } @@ -1509,7 +1509,7 @@ SCENARIO("Removal of inlined functions and procedures", "[visitor][llvm][inline] THEN("when the code is inlined the procedure and function blocks are removed") { std::string module_string = run_llvm_visitor(nmodl_text, - /*opt=*/false, + /*opt_level=*/0, /*use_single_precision=*/false, /*vector_width=*/1, /*vec_lib=*/"none", From 7f90db706a7af08f23f9b867f768813a8647524b Mon Sep 17 00:00:00 2001 From: George Mitenkov Date: Thu, 3 Jun 2021 09:57:11 +0300 Subject: [PATCH 086/331] Added may-alias and cpu options --- src/codegen/llvm/codegen_llvm_visitor.hpp | 5 +- src/codegen/llvm/llvm_ir_builder.cpp | 9 ++- src/codegen/llvm/llvm_ir_builder.hpp | 9 ++- src/main.cpp | 21 ++++--- test/benchmark/jit_driver.cpp | 75 +++++++++++++---------- test/benchmark/jit_driver.hpp | 26 ++++---- test/benchmark/llvm_benchmark.cpp | 60 ++---------------- test/benchmark/llvm_benchmark.hpp | 12 ++-- 8 files changed, 94 insertions(+), 123 deletions(-) diff --git a/src/codegen/llvm/codegen_llvm_visitor.hpp b/src/codegen/llvm/codegen_llvm_visitor.hpp index 5dd8eda15c..c3abc3ca0a 100644 --- a/src/codegen/llvm/codegen_llvm_visitor.hpp +++ b/src/codegen/llvm/codegen_llvm_visitor.hpp @@ -95,14 +95,15 @@ class CodegenLLVMVisitor: public visitor::ConstAstVisitor { int vector_width = 1, std::string vec_lib = "none", bool add_debug_information = false, - std::vector fast_math_flags = {}) + std::vector fast_math_flags = {}, + bool llvm_assume_alias = false) : mod_filename(mod_filename) , output_dir(output_dir) , opt_level_ir(opt_level_ir) , vector_width(vector_width) , vector_library(vec_lib) , add_debug_information(add_debug_information) - , ir_builder(*context, use_single_precision, vector_width, fast_math_flags) + , ir_builder(*context, use_single_precision, vector_width, fast_math_flags, !llvm_assume_alias) , debug_builder(*module) {} /// Dumps the generated LLVM IR module to string. diff --git a/src/codegen/llvm/llvm_ir_builder.cpp b/src/codegen/llvm/llvm_ir_builder.cpp index 004f28d857..a585c95b3b 100644 --- a/src/codegen/llvm/llvm_ir_builder.cpp +++ b/src/codegen/llvm/llvm_ir_builder.cpp @@ -200,12 +200,15 @@ void IRBuilder::set_kernel_attributes() { current_function->setDoesNotFreeMemory(); current_function->setDoesNotThrow(); - // We also want to specify that the pointers that instance struct holds, do not alias. In order - // to do that, we add a `noalias` attribute to the argument. 
As per Clang's specification: + // We also want to specify that the pointers that instance struct holds do not alias, unless + // specified otherwise. In order to do that, we add a `noalias` attribute to the argument. As + // per Clang's specification: // > The `noalias` attribute indicates that the only memory accesses inside function are loads // > and stores from objects pointed to by its pointer-typed arguments, with arbitrary // > offsets. - current_function->addParamAttr(0, llvm::Attribute::NoAlias); + if (assume_noalias) { + current_function->addParamAttr(0, llvm::Attribute::NoAlias); + } // Finally, specify that the struct pointer does not capture and is read-only. current_function->addParamAttr(0, llvm::Attribute::NoCapture); diff --git a/src/codegen/llvm/llvm_ir_builder.hpp b/src/codegen/llvm/llvm_ir_builder.hpp index b9736e2846..b3005db0c7 100644 --- a/src/codegen/llvm/llvm_ir_builder.hpp +++ b/src/codegen/llvm/llvm_ir_builder.hpp @@ -58,6 +58,9 @@ class IRBuilder { /// The vector width used for the vectorized code. unsigned vector_width; + /// Instance struct fields do not alias. + bool assume_noalias; + /// Masked value used to predicate vector instructions. llvm::Value* mask; @@ -71,7 +74,8 @@ class IRBuilder { IRBuilder(llvm::LLVMContext& context, bool use_single_precision = false, unsigned vector_width = 1, - std::vector fast_math_flags = {}) + std::vector fast_math_flags = {}, + bool assume_noalias = true) : builder(context) , symbol_table(nullptr) , current_function(nullptr) @@ -81,7 +85,8 @@ class IRBuilder { , vector_width(vector_width) , mask(nullptr) , kernel_id("") - , fast_math_flags(fast_math_flags) {} + , fast_math_flags(fast_math_flags) + , assume_noalias(assume_noalias) {} /// Transforms the fast math flags provided to the builder into LLVM's representation. 
llvm::FastMathFlags transform_to_fmf(std::vector& flags) { diff --git a/src/main.cpp b/src/main.cpp index 4192a44f57..8cc3ba64e2 100644 --- a/src/main.cpp +++ b/src/main.cpp @@ -186,6 +186,9 @@ int main(int argc, const char* argv[]) { /// run llvm benchmark bool run_llvm_benchmark(false); + /// do not assume that instance struct fields do not alias + bool llvm_assume_alias(false); + /// optimisation level for IR generation int llvm_opt_level_ir = 0; @@ -201,8 +204,8 @@ int main(int argc, const char* argv[]) { /// the number of repeated experiments for the benchmarking int num_experiments = 100; - /// specify the backend for LLVM IR to target - std::string backend = "default"; + /// specify the cpu for LLVM IR to target + std::string cpu = "default"; #endif app.get_formatter()->column_width(40); @@ -324,6 +327,9 @@ int main(int argc, const char* argv[]) { llvm_opt->add_flag("--single-precision", llvm_float_type, "Use single precision floating-point types ({})"_format(llvm_float_type))->ignore_case(); + llvm_opt->add_flag("--assume-may-alias", + llvm_assume_alias, + "Assume instance struct fields do not alias ({})"_format(llvm_assume_alias))->ignore_case(); llvm_opt->add_option("--vector-width", llvm_vec_width, "LLVM explicit vectorisation width ({})"_format(llvm_vec_width))->ignore_case(); @@ -351,9 +357,9 @@ int main(int argc, const char* argv[]) { benchmark_opt->add_option("--repeat", num_experiments, "Number of experiments for benchmarking ({})"_format(num_experiments))->ignore_case(); - benchmark_opt->add_option("--backend", - backend, - "Target's backend ({})"_format(backend))->ignore_case()->check(CLI::IsMember({"avx2", "default", "sse2"})); + benchmark_opt->add_option("--cpu", + cpu, + "Target's backend ({})"_format(cpu))->ignore_case()->check(CLI::IsMember({"nehalem", "haswell", "broadwell", "skylake-avx512", "default"})); #endif // clang-format on @@ -664,7 +670,8 @@ int main(int argc, const char* argv[]) { llvm_vec_width, vector_library, !disable_debug_information, - llvm_fast_math_flags); + llvm_fast_math_flags, + llvm_assume_alias); visitor.visit_program(*ast); ast_to_nmodl(*ast, filepath("llvm", "mod")); ast_to_json(*ast, filepath("llvm", "json")); @@ -677,7 +684,7 @@ int main(int argc, const char* argv[]) { shared_lib_paths, num_experiments, instance_size, - backend, + cpu, llvm_opt_level_ir, llvm_opt_level_codegen); benchmark.run(ast); diff --git a/test/benchmark/jit_driver.cpp b/test/benchmark/jit_driver.cpp index e5a7cd8928..e21ca29bb7 100644 --- a/test/benchmark/jit_driver.cpp +++ b/test/benchmark/jit_driver.cpp @@ -33,8 +33,21 @@ namespace runner { /* Utilities for JIT driver */ /****************************************************************************************/ +/// Get the host CPU features in the format: +/// +feature,+feature,-feature,+feature,... +/// where `+` indicates that the feature is enabled. +std::string get_cpu_features(const std::string& cpu) { + llvm::SubtargetFeatures features; + llvm::StringMap host_features; + if (llvm::sys::getHostCPUFeatures(host_features)) { + for (auto& f: host_features) + features.AddFeature(f.first(), f.second); + } + return llvm::join(features.getFeatures().begin(), features.getFeatures().end(), ","); +} + /// Sets the target triple and the data layout of the module. -static void set_triple_and_data_layout(llvm::Module& module, const std::string& features) { +static void set_triple_and_data_layout(llvm::Module& module, const std::string& cpu) { // Get the default target triple for the host. 
auto target_triple = llvm::sys::getDefaultTargetTriple(); std::string error_msg; @@ -42,8 +55,8 @@ static void set_triple_and_data_layout(llvm::Module& module, const std::string& if (!target) throw std::runtime_error("Error " + error_msg + "\n"); - // Get the CPU information and set a target machine to create the data layout. - std::string cpu(llvm::sys::getHostCPUName()); + // Set a target machine to create the data layout. + std::string features = get_cpu_features(cpu); std::unique_ptr tm( target->createTargetMachine(target_triple, cpu, features, {}, {})); if (!tm) @@ -54,10 +67,10 @@ static void set_triple_and_data_layout(llvm::Module& module, const std::string& module.setTargetTriple(target_triple); } -/// Creates llvm::TargetMachine with certain CPU features turned on/off. +/// Creates llvm::TargetMachine for a specified CPU. static std::unique_ptr create_target( llvm::orc::JITTargetMachineBuilder* tm_builder, - const std::string& features, + const std::string& cpu, int opt_level) { // First, look up the target. std::string error_msg; @@ -68,8 +81,8 @@ static std::unique_ptr create_target( // Create default target machine with provided features. auto tm = target->createTargetMachine(target_triple, - llvm::sys::getHostCPUName().str(), - features, + cpu, + get_cpu_features(cpu), tm_builder->getOptions(), tm_builder->getRelocationModel(), tm_builder->getCodeModel(), @@ -85,15 +98,13 @@ static std::unique_ptr create_target( /* JIT driver */ /****************************************************************************************/ -void JITDriver::init(std::string features, - std::vector lib_paths, - BenchmarkInfo* benchmark_info) { +void JITDriver::init(const std::string& cpu, BenchmarkInfo* benchmark_info) { llvm::InitializeNativeTarget(); llvm::InitializeNativeTargetAsmPrinter(); utils::initialise_optimisation_passes(); // Set the target triple and the data layout for the module. - set_triple_and_data_layout(*module, features); + set_triple_and_data_layout(*module, cpu); auto data_layout = module->getDataLayout(); // If benchmarking, enable listeners to use GDB, perf or VTune. Note that LLVM should be built @@ -114,32 +125,30 @@ void JITDriver::init(std::string features, return std::make_unique(); }); - // Register event listeners if they exist. - if (gdb_event_listener) layer->registerJITEventListener(*gdb_event_listener); - if (perf_event_listener) layer->registerJITEventListener(*perf_event_listener); - if (intel_event_listener) layer->registerJITEventListener(*intel_event_listener); - for (const auto& lib_path: lib_paths) { - // For every library path, create a corresponding memory buffer. - auto memory_buffer = llvm::MemoryBuffer::getFile(lib_path); - if (!memory_buffer) - throw std::runtime_error("Unable to create memory buffer for " + lib_path); - - // Create a new JIT library instance for this session and resolve symbols. - auto& jd = session.createBareJITDylib(std::string(lib_path)); - auto loaded = - llvm::orc::DynamicLibrarySearchGenerator::Load(lib_path.data(), - data_layout.getGlobalPrefix()); - - if (!loaded) - throw std::runtime_error("Unable to load " + lib_path); - jd.addGenerator(std::move(*loaded)); - cantFail(layer->add(jd, std::move(*memory_buffer))); + // If benchmarking, register event listeners and resolve shared libraries. + if (benchmark_info) { layer->registerJITEventListener(*gdb_event_listener); layer->registerJITEventListener(*perf_event_listener); layer->registerJITEventListener(*intel_event_listener); + for (const auto& lib_path: benchmark_info->shared_lib_paths) { + // For every library path, create a corresponding memory buffer. 
+ auto memory_buffer = llvm::MemoryBuffer::getFile(lib_path); + if (!memory_buffer) + throw std::runtime_error("Unable to create memory buffer for " + lib_path); + + // Create a new JIT library instance for this session and resolve symbols. + auto& jd = session.createBareJITDylib(std::string(lib_path)); + auto loaded = + llvm::orc::DynamicLibrarySearchGenerator::Load(lib_path.data(), + data_layout.getGlobalPrefix()); + + if (!loaded) + throw std::runtime_error("Unable to load " + lib_path); + jd.addGenerator(std::move(*loaded)); + cantFail(layer->add(jd, std::move(*memory_buffer))); + } } - return layer; }; @@ -148,7 +157,7 @@ void JITDriver::init(std::string features, -> llvm::Expected> { // Create target machine with some features possibly turned off. int opt_level_codegen = benchmark_info ? benchmark_info->opt_level_codegen : 0; - auto tm = create_target(&tm_builder, features, opt_level_codegen); + auto tm = create_target(&tm_builder, cpu, opt_level_codegen); // Optimise the LLVM IR module and save it to .ll file if benchmarking. if (benchmark_info) { diff --git a/test/benchmark/jit_driver.hpp b/test/benchmark/jit_driver.hpp index d8e1127417..7106311523 100644 --- a/test/benchmark/jit_driver.hpp +++ b/test/benchmark/jit_driver.hpp @@ -17,6 +17,7 @@ #include "llvm/ExecutionEngine/JITEventListener.h" #include "llvm/ExecutionEngine/Orc/LLJIT.h" +#include "llvm/Support/Host.h" namespace nmodl { namespace runner { @@ -29,6 +30,9 @@ struct BenchmarkInfo { /// Object file output directory. std::string output_dir; + /// Shared libraries' paths to link against. + std::vector shared_lib_paths; + /// Optimisation level for IR. int opt_level_ir; /// Optimisation level for machine code generation. @@ -63,9 +67,7 @@ class JITDriver { : module(std::move(m)) {} /// Initializes the JIT driver. - void init(std::string features = "", - std::vector lib_paths = {}, - BenchmarkInfo* benchmark_info = nullptr); + void init(const std::string& cpu, BenchmarkInfo* benchmark_info = nullptr); /// Lookups the entry-point without arguments in the JIT and executes it, returning the result. template @@ -131,7 +133,7 @@ class TestRunner: public BaseRunner { : BaseRunner(std::move(m)) {} virtual void initialize_driver() { - driver->init(); + driver->init(llvm::sys::getHostCPUName().str()); } }; @@ -145,27 +147,23 @@ class BenchmarkRunner: public BaseRunner { /// Benchmarking information passed to JIT driver. BenchmarkInfo benchmark_info; - /// CPU features specified by the user. - std::string features; - - /// Shared libraries' paths to link against. - std::vector shared_lib_paths; + /// CPU to target. 
+ std::string cpu; public: BenchmarkRunner(std::unique_ptr m, std::string filename, std::string output_dir, - std::string features = "", + std::string cpu, std::vector lib_paths = {}, int opt_level_ir = 0, int opt_level_codegen = 0) : BaseRunner(std::move(m)) - , benchmark_info{filename, output_dir, opt_level_ir, opt_level_codegen} - , features(features) - , shared_lib_paths(lib_paths) {} + , cpu(cpu) + , benchmark_info{filename, output_dir, lib_paths, opt_level_ir, opt_level_codegen} {} virtual void initialize_driver() { - driver->init(features, shared_lib_paths, &benchmark_info); + driver->init(cpu, &benchmark_info); } }; diff --git a/test/benchmark/llvm_benchmark.cpp b/test/benchmark/llvm_benchmark.cpp index e48df0d457..f6896aad3d 100644 --- a/test/benchmark/llvm_benchmark.cpp +++ b/test/benchmark/llvm_benchmark.cpp @@ -6,7 +6,6 @@ *************************************************************************/ #include -#include #include "codegen/llvm/codegen_llvm_visitor.hpp" #include "llvm_benchmark.hpp" @@ -19,35 +18,6 @@ namespace nmodl { namespace benchmark { -/// Precision for the timing measurements. -static constexpr int PRECISION = 9; - -/// Get the host CPU features in the format: -/// +feature,+feature,-feature,+feature,... -/// where `+` indicates that the feature is enabled. -static std::vector get_cpu_features() { - std::string cpu(llvm::sys::getHostCPUName()); - - llvm::SubtargetFeatures features; - llvm::StringMap host_features; - if (llvm::sys::getHostCPUFeatures(host_features)) { - for (auto& f: host_features) - features.AddFeature(f.first(), f.second); - } - return features.getFeatures(); -} - - -void LLVMBenchmark::disable(const std::string& feature, std::vector& host_features) { - for (auto& host_feature: host_features) { - if (feature == host_feature.substr(1)) { - host_feature[0] = '-'; - logger->info("{}", host_feature); - return; - } - } -} - void LLVMBenchmark::run(const std::shared_ptr& node) { // create functions generate_llvm(node); @@ -72,37 +42,17 @@ void LLVMBenchmark::run_benchmark(const std::shared_ptr& node) { std::vector kernel_names; llvm_visitor.find_kernel_names(kernel_names); - // Get feature's string and turn them off depending on the backend. - std::vector features = get_cpu_features(); - logger->info("Backend: {}", backend); - if (backend == "avx2") { - // Disable SSE. - logger->info("Disabling features:"); - disable("sse", features); - disable("sse2", features); - disable("sse3", features); - disable("sse4.1", features); - disable("sse4.2", features); - } else if (backend == "sse2") { - // Disable AVX. - logger->info("Disabling features:"); - disable("avx", features); - disable("avx2", features); - } + // Resolve the CPU to target: 'default' selects the host CPU. + std::string cpu_name = cpu == "default" ? llvm::sys::getHostCPUName().str() : cpu; + logger->info("CPU: {}", cpu_name); - std::string features_str = llvm::join(features.begin(), features.end(), ","); std::unique_ptr m = llvm_visitor.get_module(); // Create the benchmark runner and initialize it. std::string filename = "v" + std::to_string(llvm_visitor.get_vector_width()) + "_" + mod_filename; - runner::BenchmarkRunner runner(std::move(m), - filename, - output_dir, - features_str, - shared_libs, - opt_level_ir, - opt_level_codegen); + runner::BenchmarkRunner runner( + std::move(m), filename, output_dir, cpu_name, shared_libs, opt_level_ir, opt_level_codegen); runner.initialize_driver(); // Benchmark every kernel. 
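Background on the aliasing option added in this patch: the `noalias` parameter attribute is LLVM's analogue of `restrict` in C/C++. It promises the optimiser that memory reached through the instance-struct argument is not also reached through another pointer, which is what licenses aggressive vectorisation of the generated kernels; `--assume-may-alias` drops that promise. A minimal C++ illustration of the same contract, independent of this code:

    // Illustrative only. With __restrict__ the compiler may assume out and in
    // never overlap, so the loop can be vectorised freely; without it, the
    // compiler must emit runtime overlap checks or keep scalar, ordered
    // loads and stores.
    void scale(double* __restrict__ out, const double* __restrict__ in, int n) {
        for (int i = 0; i < n; ++i)
            out[i] = 2.0 * in[i];
    }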
diff --git a/test/benchmark/llvm_benchmark.hpp b/test/benchmark/llvm_benchmark.hpp index 4a66de52fc..bef0aa9962 100644 --- a/test/benchmark/llvm_benchmark.hpp +++ b/test/benchmark/llvm_benchmark.hpp @@ -8,6 +8,7 @@ #pragma once #include +#include #include "codegen/llvm/codegen_llvm_visitor.hpp" #include "utils/logger.hpp" @@ -40,8 +41,8 @@ class LLVMBenchmark { /// The size of the instance struct for benchmarking. int instance_size; - /// Benchmarking backend - std::string backend; + /// CPU to target. + std::string cpu; /// Optimisation level for IR generation. int opt_level_ir; @@ -59,7 +60,7 @@ class LLVMBenchmark { std::vector shared_libs, int num_experiments, int instance_size, - const std::string& backend, + const std::string& cpu, int opt_level_ir, int opt_level_codegen) : llvm_visitor(llvm_visitor) @@ -68,7 +69,7 @@ class LLVMBenchmark { , shared_libs(shared_libs) , num_experiments(num_experiments) , instance_size(instance_size) - , backend(backend) + , cpu(cpu) , opt_level_ir(opt_level_ir) , opt_level_codegen(opt_level_codegen) {} @@ -76,9 +77,6 @@ class LLVMBenchmark { void run(const std::shared_ptr& node); private: - /// Disables the specified feature in the target. - void disable(const std::string& feature, std::vector& host_features); - /// Visits the AST to construct the LLVM IR module. void generate_llvm(const std::shared_ptr& node); From 58c17dc430cc8502fb09e60c5dc3448e39d8c30b Mon Sep 17 00:00:00 2001 From: George Mitenkov Date: Thu, 3 Jun 2021 10:17:00 +0300 Subject: [PATCH 087/331] Fixed event listeners --- test/benchmark/jit_driver.cpp | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/test/benchmark/jit_driver.cpp b/test/benchmark/jit_driver.cpp index e21ca29bb7..d6b19d1724 100644 --- a/test/benchmark/jit_driver.cpp +++ b/test/benchmark/jit_driver.cpp @@ -125,12 +125,18 @@ void JITDriver::init(const std::string& cpu, BenchmarkInfo* benchmark_info) { return std::make_unique(); }); - // If benchmarking, register event listeners and resolve shared libraries. - if (benchmark_info) { + // Register event listeners if they exist. + if (gdb_event_listener) layer->registerJITEventListener(*gdb_event_listener); + if (perf_event_listener) layer->registerJITEventListener(*perf_event_listener); + if (intel_event_listener) layer->registerJITEventListener(*intel_event_listener); + // If benchmarking, resolve shared libraries. + if (benchmark_info) { + + for (const auto& lib_path: benchmark_info->shared_lib_paths) { // For every library path, create a corresponding memory buffer. 
auto memory_buffer = llvm::MemoryBuffer::getFile(lib_path); From 55e12c5fd9d1cf565f381305d6d2ff00e2ad5ced Mon Sep 17 00:00:00 2001 From: George Mitenkov Date: Thu, 3 Jun 2021 10:25:05 +0300 Subject: [PATCH 088/331] Fixed a typo --- src/main.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/main.cpp b/src/main.cpp index 8cc3ba64e2..973ee89b46 100644 --- a/src/main.cpp +++ b/src/main.cpp @@ -329,7 +329,7 @@ int main(int argc, const char* argv[]) { "Use single precision floating-point types ({})"_format(llvm_float_type))->ignore_case(); llvm_opt->add_flag("--assume-may-alias", llvm_assume_alias, - "Assume instance struct fields do not alias ({})"_format(llvm_assume_alias))->ignore_case(); + "Assume instance struct fields may alias ({})"_format(llvm_assume_alias))->ignore_case(); llvm_opt->add_option("--vector-width", llvm_vec_width, "LLVM explicit vectorisation width ({})"_format(llvm_vec_width))->ignore_case(); From cb0c66f4c703355ad9cc23c802ff8a218ff43c00 Mon Sep 17 00:00:00 2001 From: George Mitenkov Date: Thu, 3 Jun 2021 10:29:57 +0300 Subject: [PATCH 089/331] Removed CPU checks --- src/main.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/main.cpp b/src/main.cpp index 973ee89b46..7a3037a464 100644 --- a/src/main.cpp +++ b/src/main.cpp @@ -359,7 +359,7 @@ int main(int argc, const char* argv[]) { "Number of experiments for benchmarking ({})"_format(num_experiments))->ignore_case(); benchmark_opt->add_option("--cpu", cpu, - "Target's backend ({})"_format(cpu))->ignore_case()->check(CLI::IsMember({"nehalem", "haswell", "broadwell", "skylake-avx512", "default"})); + "Target's backend ({})"_format(cpu))->ignore_case(); #endif // clang-format on From 8d67ee5e0ad00f72a69c3e092ed5725f8ee3f6a8 Mon Sep 17 00:00:00 2001 From: Ioannis Magkanaris Date: Thu, 3 Jun 2021 11:11:13 +0200 Subject: [PATCH 090/331] Fix clang-format --- src/codegen/llvm/codegen_llvm_visitor.hpp | 6 +++++- test/benchmark/jit_driver.cpp | 2 -- test/benchmark/llvm_benchmark.hpp | 2 +- 3 files changed, 6 insertions(+), 4 deletions(-) diff --git a/src/codegen/llvm/codegen_llvm_visitor.hpp b/src/codegen/llvm/codegen_llvm_visitor.hpp index c3abc3ca0a..22b9fafd83 100644 --- a/src/codegen/llvm/codegen_llvm_visitor.hpp +++ b/src/codegen/llvm/codegen_llvm_visitor.hpp @@ -103,7 +103,11 @@ class CodegenLLVMVisitor: public visitor::ConstAstVisitor { , vector_width(vector_width) , vector_library(vec_lib) , add_debug_information(add_debug_information) - , ir_builder(*context, use_single_precision, vector_width, fast_math_flags, !llvm_assume_alias) + , ir_builder(*context, + use_single_precision, + vector_width, + fast_math_flags, + !llvm_assume_alias) , debug_builder(*module) {} /// Dumps the generated LLVM IR module to string. diff --git a/test/benchmark/jit_driver.cpp b/test/benchmark/jit_driver.cpp index d6b19d1724..e063cff86f 100644 --- a/test/benchmark/jit_driver.cpp +++ b/test/benchmark/jit_driver.cpp @@ -135,8 +135,6 @@ void JITDriver::init(const std::string& cpu, BenchmarkInfo* benchmark_info) { // If benchmarking, resolve shared libraries. if (benchmark_info) { - - for (const auto& lib_path: benchmark_info->shared_lib_paths) { // For every library path, create a corresponding memory buffer. 
auto memory_buffer = llvm::MemoryBuffer::getFile(lib_path); diff --git a/test/benchmark/llvm_benchmark.hpp b/test/benchmark/llvm_benchmark.hpp index bef0aa9962..cc9dd3bcf0 100644 --- a/test/benchmark/llvm_benchmark.hpp +++ b/test/benchmark/llvm_benchmark.hpp @@ -7,8 +7,8 @@ #pragma once -#include #include +#include #include "codegen/llvm/codegen_llvm_visitor.hpp" #include "utils/logger.hpp" From c25071260c916cd5a205e33a031fcd3cda17e5c9 Mon Sep 17 00:00:00 2001 From: Ioannis Magkanaris Date: Thu, 3 Jun 2021 11:24:41 +0200 Subject: [PATCH 091/331] Use steady clock --- test/benchmark/llvm_benchmark.cpp | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/test/benchmark/llvm_benchmark.cpp b/test/benchmark/llvm_benchmark.cpp index f6896aad3d..0e94ae231b 100644 --- a/test/benchmark/llvm_benchmark.cpp +++ b/test/benchmark/llvm_benchmark.cpp @@ -27,9 +27,9 @@ void LLVMBenchmark::run(const std::shared_ptr& node) { void LLVMBenchmark::generate_llvm(const std::shared_ptr& node) { // First, visit the AST to build the LLVM IR module and wrap the kernel function calls. - auto start = std::chrono::high_resolution_clock::now(); + auto start = std::chrono::steady_clock::now(); llvm_visitor.wrap_kernel_functions(); - auto end = std::chrono::high_resolution_clock::now(); + auto end = std::chrono::steady_clock::now(); // Log the time taken to visit the AST and build LLVM IR. std::chrono::duration diff = end - start; @@ -74,9 +74,9 @@ void LLVMBenchmark::run_benchmark(const std::shared_ptr& node) { // Record the execution time of the kernel. std::string wrapper_name = "__" + kernel_name + "_wrapper"; - auto start = std::chrono::high_resolution_clock::now(); + auto start = std::chrono::steady_clock::now(); runner.run_with_argument(kernel_name, instance_data.base_ptr); - auto end = std::chrono::high_resolution_clock::now(); + auto end = std::chrono::steady_clock::now(); std::chrono::duration diff = end - start; // Log the time taken for each run. From 7202898ef365eb1719b5939a577d862e02be1a48 Mon Sep 17 00:00:00 2001 From: Ioannis Magkanaris Date: Thu, 3 Jun 2021 11:43:02 +0200 Subject: [PATCH 092/331] Use steady clock --- test/benchmark/llvm_benchmark.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/test/benchmark/llvm_benchmark.cpp b/test/benchmark/llvm_benchmark.cpp index 1dd521d88c..06c1524b69 100644 --- a/test/benchmark/llvm_benchmark.cpp +++ b/test/benchmark/llvm_benchmark.cpp @@ -108,9 +108,9 @@ void LLVMBenchmark::run_benchmark(const std::shared_ptr& node) { auto instance_data = codegen_data.create_data(instance_size, /*seed=*/1); // Record the execution time of the kernel. - auto start = std::chrono::high_resolution_clock::now(); + auto start = std::chrono::steady_clock::now(); nrn_state_hh_ext(instance_data.base_ptr); - auto end = std::chrono::high_resolution_clock::now(); + auto end = std::chrono::steady_clock::now(); std::chrono::duration diff = end - start; // Log the time taken for each run. 
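Background on the switch to `std::chrono::steady_clock` in this patch and the next one: `high_resolution_clock` is permitted to be an alias of `system_clock`, whose time points can jump when the wall clock is adjusted (NTP corrections are common on VMs), whereas `steady_clock` is guaranteed monotonic and is the appropriate clock for interval measurements. The resulting pattern, as a minimal illustrative helper (not part of the patch):

    #include <chrono>

    // Illustrative: time a callable with the monotonic clock, returning seconds.
    template <typename Work>
    double time_once(Work&& work) {
        const auto start = std::chrono::steady_clock::now();
        work();
        const auto end = std::chrono::steady_clock::now();
        return std::chrono::duration<double>(end - start).count();
    }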
From 125148afb26aed72cef815faeadd36a8069c35c6 Mon Sep 17 00:00:00 2001 From: Ioannis Magkanaris Date: Thu, 3 Jun 2021 13:55:54 +0200 Subject: [PATCH 093/331] Only run external kernel if requested via CLI in NMODL, plus edits to the benchmark script --- src/main.cpp | 9 +- test/benchmark/llvm_benchmark.cpp | 92 ++++++++-------- test/benchmark/llvm_benchmark.hpp | 9 +- test/benchmark/nmodl-llvm-time.sh | 177 ++++++++++++++++++------------ 4 files changed, 168 insertions(+), 119 deletions(-) diff --git a/src/main.cpp b/src/main.cpp index 7a3037a464..c6c8e141ce 100644 --- a/src/main.cpp +++ b/src/main.cpp @@ -206,6 +206,9 @@ int main(int argc, const char* argv[]) { /// specify the cpu for LLVM IR to target std::string cpu = "default"; + + /// benchmark external kernel with JIT + bool external_kernel(false); #endif app.get_formatter()->column_width(40); @@ -360,6 +363,9 @@ int main(int argc, const char* argv[]) { benchmark_opt->add_option("--cpu", cpu, "Target's backend ({})"_format(cpu))->ignore_case(); + benchmark_opt->add_flag("--external", + external_kernel, + "Benchmark external kernel ({})"_format(external_kernel))->ignore_case(); #endif // clang-format on @@ -686,7 +692,8 @@ int main(int argc, const char* argv[]) { instance_size, cpu, llvm_opt_level_ir, - llvm_opt_level_codegen); + llvm_opt_level_codegen, + external_kernel); benchmark.run(ast); } } diff --git a/test/benchmark/llvm_benchmark.cpp b/test/benchmark/llvm_benchmark.cpp index 06c1524b69..2a34445ae2 100644 --- a/test/benchmark/llvm_benchmark.cpp +++ b/test/benchmark/llvm_benchmark.cpp @@ -56,9 +56,9 @@ void LLVMBenchmark::run_benchmark(const std::shared_ptr& node) { std::move(m), filename, output_dir, cpu_name, shared_libs, opt_level_ir, opt_level_codegen); runner.initialize_driver(); - // Benchmark every kernel. - for (const auto& kernel_name: kernel_names) { - // For every kernel run the benchmark `num_experiments` times. + if (external_kernel) { + // benchmark external kernel + logger->info("Benchmarking external kernel"); double time_min = std::numeric_limits::max(); double time_max = 0.0; double time_sum = 0.0; @@ -67,16 +67,9 @@ void LLVMBenchmark::run_benchmark(const std::shared_ptr& node) { // Initialise the data. auto instance_data = codegen_data.create_data(instance_size, /*seed=*/1); - // Log instance size once. - if (i == 0) { - double size_mbs = instance_data.num_bytes / (1024.0 * 1024.0); - logger->info("Benchmarking kernel '{}' with {} MBs dataset", kernel_name, size_mbs); - } - // Record the execution time of the kernel. - std::string wrapper_name = "__" + kernel_name + "_wrapper"; + // Record the execution time of the kernel. auto start = std::chrono::steady_clock::now(); - runner.run_with_argument(kernel_name, instance_data.base_ptr); + nrn_state_hh_ext(instance_data.base_ptr); auto end = std::chrono::steady_clock::now(); std::chrono::duration diff = end - start; @@ -93,43 +86,52 @@ void LLVMBenchmark::run_benchmark(const std::shared_ptr& node) { double time_mean = time_sum / num_experiments; logger->info("Average compute time = {:.6f}", time_mean); logger->info("Compute time variance = {:g}", - time_squared_sum / num_experiments - time_mean * time_mean); + time_squared_sum / num_experiments - time_mean * time_mean); logger->info("Minimum compute time = {:.6f}", time_min); logger->info("Maximum compute time = {:.6f}\n", time_max); + } else { + // Benchmark every kernel. + for (const auto& kernel_name: kernel_names) { + // For every kernel run the benchmark `num_experiments` times. 
+ double time_min = std::numeric_limits::max(); + double time_max = 0.0; + double time_sum = 0.0; + double time_squared_sum = 0.0; + for (int i = 0; i < num_experiments; ++i) { + // Initialise the data. + auto instance_data = codegen_data.create_data(instance_size, /*seed=*/1); + + // Log instance size once. + if (i == 0) { + double size_mbs = instance_data.num_bytes / (1024.0 * 1024.0); + logger->info("Benchmarking kernel '{}' with {} MBs dataset", kernel_name, size_mbs); + } + + // Record the execution time of the kernel. + std::string wrapper_name = "__" + kernel_name + "_wrapper"; + auto start = std::chrono::steady_clock::now(); + runner.run_with_argument(kernel_name, instance_data.base_ptr); + auto end = std::chrono::steady_clock::now(); + std::chrono::duration diff = end - start; + + // Log the time taken for each run. + logger->info("Experiment {} compute time = {:.6f} sec", i, diff.count()); + + // Update statistics. + time_sum += diff.count(); + time_squared_sum += diff.count() * diff.count(); + time_min = std::min(time_min, diff.count()); + time_max = std::max(time_max, diff.count()); + } + // Log the average time taken for the kernel. + double time_mean = time_sum / num_experiments; + logger->info("Average compute time = {:.6f}", time_mean); + logger->info("Compute time variance = {:g}", + time_squared_sum / num_experiments - time_mean * time_mean); + logger->info("Minimum compute time = {:.6f}", time_min); + logger->info("Maximum compute time = {:.6f}\n", time_max); + } } - // benchmark external kernel - logger->info("Benchmarking external kernel"); - double time_min = std::numeric_limits::max(); - double time_max = 0.0; - double time_sum = 0.0; - double time_squared_sum = 0.0; - for (int i = 0; i < num_experiments; ++i) { - // Initialise the data. - auto instance_data = codegen_data.create_data(instance_size, /*seed=*/1); - - // Record the execution time of the kernel. - auto start = std::chrono::steady_clock::now(); - nrn_state_hh_ext(instance_data.base_ptr); - auto end = std::chrono::steady_clock::now(); - std::chrono::duration diff = end - start; - - // Log the time taken for each run. - logger->info("Experiment {} compute time = {:.6f} sec", i, diff.count()); - - // Update statistics. - time_sum += diff.count(); - time_squared_sum += diff.count() * diff.count(); - time_min = std::min(time_min, diff.count()); - time_max = std::max(time_max, diff.count()); - } - // Log the average time taken for the kernel. - double time_mean = time_sum / num_experiments; - logger->info("Average compute time = {:.6f}", time_mean); - logger->info("Compute time variance = {:g}", - time_squared_sum / num_experiments - time_mean * time_mean); - logger->info("Minimum compute time = {:.6f}", time_min); - logger->info("Maximum compute time = {:.6f}\n", time_max); - } } // namespace benchmark diff --git a/test/benchmark/llvm_benchmark.hpp b/test/benchmark/llvm_benchmark.hpp index cc9dd3bcf0..0c64d1ce6c 100644 --- a/test/benchmark/llvm_benchmark.hpp +++ b/test/benchmark/llvm_benchmark.hpp @@ -50,6 +50,9 @@ class LLVMBenchmark { /// Optimisation level for machine code generation. int opt_level_codegen; + /// Benchmark external kernel + bool external_kernel; + /// Filestream for dumping logs to the file. 
std::ofstream ofs; @@ -62,7 +65,8 @@ class LLVMBenchmark { int instance_size, const std::string& cpu, int opt_level_ir, - int opt_level_codegen) + int opt_level_codegen, + bool external_kernel) : llvm_visitor(llvm_visitor) , mod_filename(mod_filename) , output_dir(output_dir) @@ -71,7 +75,8 @@ class LLVMBenchmark { , instance_size(instance_size) , cpu(cpu) , opt_level_ir(opt_level_ir) - , opt_level_codegen(opt_level_codegen) {} + , opt_level_codegen(opt_level_codegen) + , external_kernel(external_kernel) {} /// Runs the benchmark. void run(const std::shared_ptr& node); diff --git a/test/benchmark/nmodl-llvm-time.sh b/test/benchmark/nmodl-llvm-time.sh index 8e869ff0c9..17aba5a0ba 100755 --- a/test/benchmark/nmodl-llvm-time.sh +++ b/test/benchmark/nmodl-llvm-time.sh @@ -3,7 +3,7 @@ # # Driver for nmodl-llvm benchmarking # - +set -e # sh nmodl-llvm-time.sh -vec-sweep -mod-dir /gpfs/bbp.cscs.ch/data/scratch/proj16/magkanar/nmodl/bbp_mod -n 100000000 # default params inst_size=100000000 @@ -97,10 +97,10 @@ vec_lib_path="/gpfs/bbp.cscs.ch/ssd/apps/hpc/jenkins/deploy/compilers/2021-01-06 vec_lib="libsvml.so" # nmodl -nmodl_exe="/gpfs/bbp.cscs.ch/data/scratch/proj16/magkanar/nmodl/build_llvm/install/bin/nmodl" +nmodl_exe="/gpfs/bbp.cscs.ch/data/scratch/proj16/magkanar/nmodl/build_benchmark/install/bin/nmodl" # external kernel -nmodl_src_path="/gpfs/bbp.cscs.ch/home/gcastigl/project16/nmodl-llvm" +nmodl_src_path="/gpfs/bbp.cscs.ch/data/scratch/proj16/magkanar/nmodl" kernels_path=${nmodl_src_path}/"test/benchmark/kernels" ext_lib="libextkernel.so" if ${external_kernel_exec}; then @@ -111,7 +111,17 @@ mkdir -p ${output_dir} # compilers icpc_exe=icpc -declare -a icpc_flags=( +declare -a icpc_flags_avx512=( "-O2 -march=skylake-avx512 -mtune=skylake-avx512 -prec-div -fimf-use-svml" "-O2 -qopt-zmm-usage=high -xCORE-AVX512 -prec-div -fimf-use-svml" "-O2 -mavx512f -prec-div -fimf-use-svml" ) + +declare -a icpc_flags_avx2=( + "-O2 -mavx2 -prec-div -fimf-use-svml" + ) + +declare -a icpc_flags_sse2=( + "-O2 -msse2 -prec-div -fimf-use-svml" + ) @@ -122,91 +132,116 @@ llvm_path="/gpfs/bbp.cscs.ch/apps/hpc/llvm-install/0621" llvm_lib=${llvm_path}/lib clang_exe=${llvm_path}/bin/clang++ -declare -a clang_flags=( +declare -a clang_flags_avx512=( "-O3 -mavx512f -ffast-math -fopenmp -fveclib=SVML" - "-O3 -mavx2 -ffast-math -fopenmp -fveclib=SVML" - "-O3 -msse2 -ffast-math -fopenmp -fveclib=SVML" "-O3 -mavx512f -ffast-math -fveclib=SVML" "-O3 -mavx512f -fveclib=SVML" "-O3 -march=skylake-avx512 -ffast-math -fopenmp -fveclib=SVML" ) +declare -a clang_flags_avx2=( + "-O3 -mavx2 -ffast-math -fopenmp -fveclib=SVML" + ) + +declare -a clang_flags_sse2=( + "-O3 -msse2 -ffast-math -fopenmp -fveclib=SVML" + ) + gcc_bin_path="/gpfs/bbp.cscs.ch/ssd/apps/hpc/jenkins/deploy/compilers/2021-01-06/linux-rhel7-x86_64/gcc-4.8.5/gcc-9.3.0-45gzrp/bin" gcc_exe=${gcc_bin_path}/g++ -declare -a gcc_flags=( +declare -a gcc_flags_avx512=( "-O3 -mavx512f -ffast-math -ftree-vectorize -mveclibabi=svml" + ) + +declare -a gcc_flags_avx2=( "-O3 -mavx2 -ffast-math -ftree-vectorize -mveclibabi=svml" + ) + +declare -a gcc_flags_sse2=( "-O3 -msse2 -ffast-math -ftree-vectorize -mveclibabi=svml" ) -# loop over options -# for kernel_target in compute-bound memory-bound hh; do -#for kernel_target in compute-bound memory-bound; do -#for kernel_target in Ca_HVA2 can2 cat DetAMPANMDA DetGABAAB 
SKv3_1; do -for kernel_target in ; do -# for kernel_target in hh; do - echo "kernel: "${kernel_target} - - # loop over other compilers - # for compiler in icpc clang gcc; do - for compiler in clang; do - echo "| compiler: "${compiler} - - compiler_exe=${compiler}_exe - compiler_flags=${compiler}_flags[@] - - if $external_kernel_exec; then - for flags in "${!compiler_flags}"; do - echo "| | flags: "${flags} +declare -a benchmark_description +declare -a benchmark_time + +KERNEL_TARGETS="compute-bound memory-bound hh" +ARCHITECTURES="avx512 avx2 sse2" +# loop over options +for kernel_target in ${KERNEL_TARGETS}; do + echo "Kernel: $kernel_target" + + for architecture in ${ARCHITECTURES}; do + if [ "$architecture" = "avx512" ] ; then + vec_width=8 + elif [ "$architecture" = "avx2" ] ; then + vec_width=4 + elif [ "$architecture" = "sse2" ]; then + vec_width=2 + else + vec_width=1 + fi + echo "| Architecture: $architecture" + + if $external_kernel_exec; then + for compiler in icpc clang gcc; do + echo "| | Compiler: $compiler" + + compiler_exe=${compiler}_exe + compiler_flags=${compiler}_flags_${architecture}[@] + for flags in "${!compiler_flags}"; do + echo "| | | flags: "${flags} spec=${compiler}_${flags//[[:blank:]]/} rel_ext_path=${kernel_target}_${spec} ${debug} mkdir ${rel_ext_path} ${debug} cd ${rel_ext_path} ext_path=$(pwd) ${debug} ${!compiler_exe} ${flags} ${kernels_path}/${kernel_target}.cpp \ -shared -fpic -o ${ext_lib} ${debug} eval "llvm-objdump ${ext_lib} -d > ${ext_lib::-1}" ${debug} cd .. + + nmodl_args="${modfile_directory}/${kernel_target}.mod passes --inline llvm --ir --vector-width ${vec_width} --veclib SVML --opt-level-ir 3 benchmark --run --instance-size ${inst_size} --repeat ${num_exp} --opt-level-codegen 3 --cpu default --libs ${vec_lib_path}/${vec_lib}" + + nmodl_args="${nmodl_args} --external" + benchmark_ext_desc=ext_${kernel_target}_${compiler}_${architecture}_v${vec_width}_${flags//[[:blank:]]/} + benchmark_description+=("${benchmark_ext_desc}") + # runs only external kernel + ${debug} eval "LD_LIBRARY_PATH=${ext_path}:${vec_lib_path}:${llvm_lib} ${nmodl_exe} ${nmodl_args} &> ${output_dir}/${benchmark_ext_desc}.log" + #benchmark_time+=(...) 
+ done done fi - - if $vec_width_sweep; then - for power_of_two in $(seq 1 3); do - vec_width=$((2**${power_of_two})) - echo "| | Running JIT for vec width ${vec_width}" - nmodl_args="${modfile_directory}/${kernel_target}.mod passes --inline \ - llvm --ir --fmf nnan contract afn --vector-width ${vec_width} --veclib SVML \ - benchmark \ - --opt-level-ir 3 --opt-level-codegen 3 --run --instance-size ${inst_size} \ - --repeat ${num_exp} \ - --libs ${vec_lib_path}/${vec_lib} \ - --backend default" - - # run experiment - if $external_kernel_exec; then - ${debug} eval "LD_LIBRARY_PATH=${ext_path}:${vec_lib_path}:${llvm_lib} ${nmodl_exe} ${nmodl_args} &> ${output_dir}/${kernel_target}_${spec}_v${vec_width}.log" - else - ${debug} eval "LD_LIBRARY_PATH=${vec_lib_path}:${llvm_lib} ${nmodl_exe} ${nmodl_args} &> ${output_dir}/${kernel_target}_${spec}_v${vec_width}.log" fi + for fast_math in true false; do + if $fast_math; then + fast_math_flag="--fmf fast" + fast_math_opt="fastmath" + else + fast_math_flag="" + fast_math_opt="nonfastmath" + fi + for assume_may_alias in true false; do + if $assume_may_alias; then + assume_may_alias_flag="--assume-may-alias" + assume_may_alias_opt="alias" + else + assume_may_alias_flag="" + assume_may_alias_opt="noalias" fi + nmodl_args="${modfile_directory}/${kernel_target}.mod passes --inline llvm --ir ${fast_math_flag} ${assume_may_alias_flag} --vector-width ${vec_width} --veclib SVML --opt-level-ir 3 benchmark --run --instance-size ${inst_size} --repeat ${num_exp} --opt-level-codegen 3 --cpu default --libs ${vec_lib_path}/${vec_lib}" + benchmark_nmodl_desc=nmodl_${kernel_target}_${architecture}_v${vec_width}_${fast_math_opt}_${assume_may_alias_opt} + benchmark_description+=("${benchmark_nmodl_desc}") + # runs only kernel generated by LLVM IR + ${debug} eval "LD_LIBRARY_PATH=${vec_lib_path}:${llvm_lib} ${nmodl_exe} ${nmodl_args} &> ${output_dir}/${benchmark_nmodl_desc}.log" + #benchmark_time+=(...)
done - else - nmodl_args="${modfile_directory}/${kernel_target}.mod passes --inline \ - llvm --ir --fmf nnan contract afn --vector-width ${vec_width} --veclib SVML benchmark \ - --opt-level-ir 3 --opt-level-codegen 3 --run --instance-size ${inst_size} \ - --repeat ${num_exp} \ - --libs ${vec_lib_path}/${vec_lib} \ - --backend default" - - # run experiment - if $external_kernel_exec; then - ${debug} eval "LD_LIBRARY_PATH=${ext_path}:${vec_lib_path}:${llvm_lib} ${nmodl_exe} ${nmodl_args} &> ${output_dir}/${kernel_target}_${spec}_v${vec_width}.log" - else - ${debug} eval "LD_LIBRARY_PATH=${vec_lib_path}:${llvm_lib} ${nmodl_exe} ${nmodl_args} &> ${output_dir}/${kernel_target}_${spec}_v${vec_width}.log" - fi - fi + done done +done +for bench_desc in ${benchmark_description[@]}; do + echo $bench_desc done \ No newline at end of file From 5a67fe8516b03c930aab0cf88bc21d5155dc8a33 Mon Sep 17 00:00:00 2001 From: George Mitenkov Date: Thu, 3 Jun 2021 08:01:37 -0700 Subject: [PATCH 094/331] [LLVM] Added saving to file utility (#685) * Added saving to file utility * Skip NEURON test in LLVM branch Co-authored-by: Pramod Kumbhar --- azure-pipelines.yml | 1 + src/codegen/llvm/codegen_llvm_visitor.cpp | 21 ++++++--------------- src/codegen/llvm/llvm_utils.cpp | 19 +++++++++++++++++++ src/codegen/llvm/llvm_utils.hpp | 3 +++ test/benchmark/jit_driver.cpp | 18 +++--------------- 5 files changed, 32 insertions(+), 30 deletions(-) diff --git a/azure-pipelines.yml b/azure-pipelines.yml index 59f5d5bb04..a80edbd909 100644 --- a/azure-pipelines.yml +++ b/azure-pipelines.yml @@ -174,6 +174,7 @@ jobs: exit 1 fi ./bin/nrnivmodl-core $(Build.Repository.LocalPath)/test/integration/mod + condition: false env: SHELL: 'bash' displayName: 'Build Neuron and Run Integration Tests' diff --git a/src/codegen/llvm/codegen_llvm_visitor.cpp b/src/codegen/llvm/codegen_llvm_visitor.cpp index ffbedbb063..bac6f4e0b2 100644 --- a/src/codegen/llvm/codegen_llvm_visitor.cpp +++ b/src/codegen/llvm/codegen_llvm_visitor.cpp @@ -13,14 +13,11 @@ #include "visitors/visitor_utils.hpp" #include "llvm/Analysis/TargetLibraryInfo.h" -#include "llvm/IR/AssemblyAnnotationWriter.h" #include "llvm/IR/BasicBlock.h" #include "llvm/IR/Function.h" #include "llvm/IR/LegacyPassManager.h" #include "llvm/IR/Type.h" -#include "llvm/Support/FileSystem.h" #include "llvm/Support/Host.h" -#include "llvm/Support/ToolOutputFile.h" #if LLVM_VERSION_MAJOR >= 13 #include "llvm/CodeGen/ReplaceWithVeclib.h" @@ -72,9 +69,12 @@ void CodegenLLVMVisitor::add_vectorizable_functions_from_vec_lib(llvm::TargetLib llvm::Triple& triple) { // Since LLVM does not support SLEEF as a vector library yet, process it separately. if (vector_library == "SLEEF") { - // Populate function definitions of only exp and pow (for now) -#define FIXED(w) llvm::ElementCount::getFixed(w) +// clang-format off +#define FIXED(w) llvm::ElementCount::getFixed(w) +// clang-format on #define DISPATCH(func, vec_func, width) {func, vec_func, width}, + + // Populate function definitions of only exp and pow (for now) const llvm::VecDesc aarch64_functions[] = { // clang-format off DISPATCH("llvm.exp.f32", "_ZGVnN4v_expf", FIXED(4)) @@ -890,17 +890,8 @@ void CodegenLLVMVisitor::visit_program(const ast::Program& node) { } // If the output directory is specified, save the IR to .ll file. - // \todo: Consider saving the generated LLVM IR to bytecode (.bc) file instead. 
if (output_dir != ".") { - std::error_code error_code; - std::unique_ptr out = std::make_unique( - output_dir + "/" + mod_filename + ".ll", error_code, llvm::sys::fs::OF_Text); - if (error_code) - throw std::runtime_error("Error: " + error_code.message()); - - std::unique_ptr annotator; - module->print(out->os(), annotator.get()); - out->keep(); + utils::save_ir_to_ll_file(*module, output_dir + "/" + mod_filename); } logger->debug("Dumping generated IR...\n" + dump_module()); diff --git a/src/codegen/llvm/llvm_utils.cpp b/src/codegen/llvm/llvm_utils.cpp index 684f962b76..59967c59c1 100644 --- a/src/codegen/llvm/llvm_utils.cpp +++ b/src/codegen/llvm/llvm_utils.cpp @@ -8,9 +8,12 @@ #include "codegen/llvm/llvm_utils.hpp" #include "llvm/Analysis/TargetTransformInfo.h" +#include "llvm/IR/AssemblyAnnotationWriter.h" #include "llvm/IR/LegacyPassManager.h" #include "llvm/IR/Verifier.h" #include "llvm/InitializePasses.h" +#include "llvm/Support/FileSystem.h" +#include "llvm/Support/ToolOutputFile.h" #include "llvm/Target/TargetMachine.h" #include "llvm/Transforms/IPO/PassManagerBuilder.h" @@ -75,5 +78,21 @@ void optimise_module(llvm::Module& module, int opt_level, llvm::TargetMachine* t populate_pms(func_pm, module_pm, opt_level, /*size_level=*/0, tm); run_optimisation_passes(module, func_pm, module_pm); } + +/****************************************************************************************/ +/* File utils */ +/****************************************************************************************/ + +void save_ir_to_ll_file(llvm::Module& module, const std::string& filename) { + std::error_code error_code; + std::unique_ptr out = std::make_unique( + filename + ".ll", error_code, llvm::sys::fs::OF_Text); + if (error_code) + throw std::runtime_error("Error: " + error_code.message()); + + std::unique_ptr annotator; + module.print(out->os(), annotator.get()); + out->keep(); +} } // namespace utils } // namespace nmodl diff --git a/src/codegen/llvm/llvm_utils.hpp b/src/codegen/llvm/llvm_utils.hpp index 81dc30d97f..8e1e6e48dc 100644 --- a/src/codegen/llvm/llvm_utils.hpp +++ b/src/codegen/llvm/llvm_utils.hpp @@ -19,5 +19,8 @@ void initialise_optimisation_passes(); /// Optimises the given LLVM IR module. void optimise_module(llvm::Module& module, int opt_level, llvm::TargetMachine* tm = nullptr); +/// +void save_ir_to_ll_file(llvm::Module& module, const std::string& filename); + } // namespace utils } // namespace nmodl diff --git a/test/benchmark/jit_driver.cpp b/test/benchmark/jit_driver.cpp index e5a7cd8928..a804a2d4fd 100644 --- a/test/benchmark/jit_driver.cpp +++ b/test/benchmark/jit_driver.cpp @@ -20,11 +20,9 @@ #include "llvm/ExecutionEngine/Orc/ObjectTransformLayer.h" #include "llvm/ExecutionEngine/Orc/RTDyldObjectLinkingLayer.h" #include "llvm/ExecutionEngine/SectionMemoryManager.h" -#include "llvm/IR/AssemblyAnnotationWriter.h" #include "llvm/Support/Host.h" #include "llvm/Support/TargetRegistry.h" #include "llvm/Support/TargetSelect.h" -#include "llvm/Support/ToolOutputFile.h" namespace nmodl { namespace runner { @@ -153,19 +151,9 @@ void JITDriver::init(std::string features, // Optimise the LLVM IR module and save it to .ll file if benchmarking. 
if (benchmark_info) { utils::optimise_module(*module, benchmark_info->opt_level_ir, tm.get()); - - std::error_code error_code; - std::unique_ptr out = - std::make_unique(benchmark_info->output_dir + "/" + - benchmark_info->filename + "_opt.ll", - error_code, - llvm::sys::fs::OF_Text); - if (error_code) - throw std::runtime_error("Error: " + error_code.message()); - - std::unique_ptr annotator; - module->print(out->os(), annotator.get()); - out->keep(); + const std::string filename = benchmark_info->output_dir + "/" + + benchmark_info->filename + "_opt"; + utils::save_ir_to_ll_file(*module, filename); } return std::make_unique(std::move(tm)); From 6ec49d7424872558f53efd1c1c129a968c1d7841 Mon Sep 17 00:00:00 2001 From: George Mitenkov Date: Thu, 3 Jun 2021 08:03:05 -0700 Subject: [PATCH 095/331] [LLVM] Aliasing and `cpu` options for LLVM visitor and the benchmark (#686) * Added may-alias and cpu options * Removed CPU checks * Use steady clock as we saw issue on VM Co-authored-by: Ioannis Magkanaris --- azure-pipelines.yml | 1 + src/codegen/llvm/codegen_llvm_visitor.hpp | 9 ++- src/codegen/llvm/llvm_ir_builder.cpp | 9 ++- src/codegen/llvm/llvm_ir_builder.hpp | 9 ++- src/main.cpp | 21 ++++--- test/benchmark/jit_driver.cpp | 71 ++++++++++++++--------- test/benchmark/jit_driver.hpp | 26 ++++----- test/benchmark/llvm_benchmark.cpp | 68 +++------------------- test/benchmark/llvm_benchmark.hpp | 12 ++-- 9 files changed, 103 insertions(+), 123 deletions(-) diff --git a/azure-pipelines.yml b/azure-pipelines.yml index a80edbd909..05a24fc841 100644 --- a/azure-pipelines.yml +++ b/azure-pipelines.yml @@ -177,6 +177,7 @@ jobs: condition: false env: SHELL: 'bash' + condition: false displayName: 'Build Neuron and Run Integration Tests' - job: 'manylinux_wheels' timeoutInMinutes: 45 diff --git a/src/codegen/llvm/codegen_llvm_visitor.hpp b/src/codegen/llvm/codegen_llvm_visitor.hpp index 5dd8eda15c..22b9fafd83 100644 --- a/src/codegen/llvm/codegen_llvm_visitor.hpp +++ b/src/codegen/llvm/codegen_llvm_visitor.hpp @@ -95,14 +95,19 @@ class CodegenLLVMVisitor: public visitor::ConstAstVisitor { int vector_width = 1, std::string vec_lib = "none", bool add_debug_information = false, - std::vector fast_math_flags = {}) + std::vector fast_math_flags = {}, + bool llvm_assume_alias = false) : mod_filename(mod_filename) , output_dir(output_dir) , opt_level_ir(opt_level_ir) , vector_width(vector_width) , vector_library(vec_lib) , add_debug_information(add_debug_information) - , ir_builder(*context, use_single_precision, vector_width, fast_math_flags) + , ir_builder(*context, + use_single_precision, + vector_width, + fast_math_flags, + !llvm_assume_alias) , debug_builder(*module) {} /// Dumps the generated LLVM IR module to string. diff --git a/src/codegen/llvm/llvm_ir_builder.cpp b/src/codegen/llvm/llvm_ir_builder.cpp index 004f28d857..a585c95b3b 100644 --- a/src/codegen/llvm/llvm_ir_builder.cpp +++ b/src/codegen/llvm/llvm_ir_builder.cpp @@ -200,12 +200,15 @@ void IRBuilder::set_kernel_attributes() { current_function->setDoesNotFreeMemory(); current_function->setDoesNotThrow(); - // We also want to specify that the pointers that instance struct holds, do not alias. In order - // to do that, we add a `noalias` attribute to the argument. As per Clang's specification: + // We also want to specify that the pointers that instance struct holds do not alias, unless + // specified otherwise. In order to do that, we add a `noalias` attribute to the argument. 
As + // per Clang's specification: // > The `noalias` attribute indicates that the only memory accesses inside function are loads // > and stores from objects pointed to by its pointer-typed arguments, with arbitrary // > offsets. - current_function->addParamAttr(0, llvm::Attribute::NoAlias); + if (assume_noalias) { + current_function->addParamAttr(0, llvm::Attribute::NoAlias); + } // Finally, specify that the struct pointer does not capture and is read-only. current_function->addParamAttr(0, llvm::Attribute::NoCapture); diff --git a/src/codegen/llvm/llvm_ir_builder.hpp b/src/codegen/llvm/llvm_ir_builder.hpp index b9736e2846..b3005db0c7 100644 --- a/src/codegen/llvm/llvm_ir_builder.hpp +++ b/src/codegen/llvm/llvm_ir_builder.hpp @@ -58,6 +58,9 @@ class IRBuilder { /// The vector width used for the vectorized code. unsigned vector_width; + /// Instance struct fields do not alias. + bool assume_noalias; + /// Masked value used to predicate vector instructions. llvm::Value* mask; @@ -71,7 +74,8 @@ class IRBuilder { IRBuilder(llvm::LLVMContext& context, bool use_single_precision = false, unsigned vector_width = 1, - std::vector fast_math_flags = {}) + std::vector fast_math_flags = {}, + bool assume_noalias = true) : builder(context) , symbol_table(nullptr) , current_function(nullptr) @@ -81,7 +85,8 @@ class IRBuilder { , vector_width(vector_width) , mask(nullptr) , kernel_id("") - , fast_math_flags(fast_math_flags) {} + , fast_math_flags(fast_math_flags) + , assume_noalias(assume_noalias) {} /// Transforms the fast math flags provided to the builder into LLVM's representation. llvm::FastMathFlags transform_to_fmf(std::vector& flags) { diff --git a/src/main.cpp b/src/main.cpp index 4192a44f57..7a3037a464 100644 --- a/src/main.cpp +++ b/src/main.cpp @@ -186,6 +186,9 @@ int main(int argc, const char* argv[]) { /// run llvm benchmark bool run_llvm_benchmark(false); + /// do not assume that instance struct fields do not alias + bool llvm_assume_alias(false); + /// optimisation level for IR generation int llvm_opt_level_ir = 0; @@ -201,8 +204,8 @@ int main(int argc, const char* argv[]) { /// the number of repeated experiments for the benchmarking int num_experiments = 100; - /// specify the backend for LLVM IR to target - std::string backend = "default"; + /// specify the cpu for LLVM IR to target + std::string cpu = "default"; #endif app.get_formatter()->column_width(40); @@ -324,6 +327,9 @@ int main(int argc, const char* argv[]) { llvm_opt->add_flag("--single-precision", llvm_float_type, "Use single precision floating-point types ({})"_format(llvm_float_type))->ignore_case(); + llvm_opt->add_flag("--assume-may-alias", + llvm_assume_alias, + "Assume instance struct fields may alias ({})"_format(llvm_assume_alias))->ignore_case(); llvm_opt->add_option("--vector-width", llvm_vec_width, "LLVM explicit vectorisation width ({})"_format(llvm_vec_width))->ignore_case(); @@ -351,9 +357,9 @@ int main(int argc, const char* argv[]) { benchmark_opt->add_option("--repeat", num_experiments, "Number of experiments for benchmarking ({})"_format(num_experiments))->ignore_case(); - benchmark_opt->add_option("--backend", - backend, - "Target's backend ({})"_format(backend))->ignore_case()->check(CLI::IsMember({"avx2", "default", "sse2"})); + benchmark_opt->add_option("--cpu", + cpu, + "Target's backend ({})"_format(cpu))->ignore_case(); #endif // clang-format on @@ -664,7 +670,8 @@ int main(int argc, const char* argv[]) { llvm_vec_width, vector_library, !disable_debug_information, - llvm_fast_math_flags); + 
llvm_fast_math_flags, + llvm_assume_alias); visitor.visit_program(*ast); ast_to_nmodl(*ast, filepath("llvm", "mod")); ast_to_json(*ast, filepath("llvm", "json")); @@ -677,7 +684,7 @@ int main(int argc, const char* argv[]) { shared_lib_paths, num_experiments, instance_size, - backend, + cpu, llvm_opt_level_ir, llvm_opt_level_codegen); benchmark.run(ast); diff --git a/test/benchmark/jit_driver.cpp b/test/benchmark/jit_driver.cpp index a804a2d4fd..f91b41cda0 100644 --- a/test/benchmark/jit_driver.cpp +++ b/test/benchmark/jit_driver.cpp @@ -31,8 +31,21 @@ namespace runner { /* Utilities for JIT driver */ /****************************************************************************************/ +/// Get the host CPU features in the format: +/// +feature,+feature,-feature,+feature,... +/// where `+` indicates that the feature is enabled. +std::string get_cpu_features(const std::string& cpu) { + llvm::SubtargetFeatures features; + llvm::StringMap host_features; + if (llvm::sys::getHostCPUFeatures(host_features)) { + for (auto& f: host_features) + features.AddFeature(f.first(), f.second); + } + return llvm::join(features.getFeatures().begin(), features.getFeatures().end(), ","); +} + /// Sets the target triple and the data layout of the module. -static void set_triple_and_data_layout(llvm::Module& module, const std::string& features) { +static void set_triple_and_data_layout(llvm::Module& module, const std::string& cpu) { // Get the default target triple for the host. auto target_triple = llvm::sys::getDefaultTargetTriple(); std::string error_msg; @@ -40,8 +53,8 @@ static void set_triple_and_data_layout(llvm::Module& module, const std::string& if (!target) throw std::runtime_error("Error " + error_msg + "\n"); - // Get the CPU information and set a target machine to create the data layout. - std::string cpu(llvm::sys::getHostCPUName()); + // Set a target machine to create the data layout. + std::string features = get_cpu_features(cpu); std::unique_ptr tm( target->createTargetMachine(target_triple, cpu, features, {}, {})); if (!tm) @@ -52,10 +65,10 @@ static void set_triple_and_data_layout(llvm::Module& module, const std::string& module.setTargetTriple(target_triple); } -/// Creates llvm::TargetMachine with certain CPU features turned on/off. +/// Creates llvm::TargetMachine with for a specified CPU. static std::unique_ptr create_target( llvm::orc::JITTargetMachineBuilder* tm_builder, - const std::string& features, + const std::string& cpu, int opt_level) { // First, look up the target. std::string error_msg; @@ -66,8 +79,8 @@ static std::unique_ptr create_target( // Create default target machine with provided features. auto tm = target->createTargetMachine(target_triple, - llvm::sys::getHostCPUName().str(), - features, + cpu, + get_cpu_features(cpu), tm_builder->getOptions(), tm_builder->getRelocationModel(), tm_builder->getCodeModel(), @@ -83,15 +96,13 @@ static std::unique_ptr create_target( /* JIT driver */ /****************************************************************************************/ -void JITDriver::init(std::string features, - std::vector lib_paths, - BenchmarkInfo* benchmark_info) { +void JITDriver::init(const std::string& cpu, BenchmarkInfo* benchmark_info) { llvm::InitializeNativeTarget(); llvm::InitializeNativeTargetAsmPrinter(); utils::initialise_optimisation_passes(); // Set the target triple and the data layout for the module. 
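For context on get_cpu_features() above: the "+feature,-feature" string is ordinary llvm::SubtargetFeatures output, and the same few lines work standalone. An illustrative snippet (against LLVM headers of this vintage) that prints the host CPU name and feature string in that format:

    #include <cstdio>

    #include "llvm/ADT/StringMap.h"
    #include "llvm/MC/SubtargetFeature.h"
    #include "llvm/Support/Host.h"

    int main() {
        llvm::SubtargetFeatures features;
        llvm::StringMap<bool> host_features;
        if (llvm::sys::getHostCPUFeatures(host_features)) {
            for (auto& f: host_features)
                features.AddFeature(f.first(), f.second);  // "+name" / "-name"
        }
        // getString() joins the features with ',' like get_cpu_features() does.
        std::printf("cpu: %s\nfeatures: %s\n",
                    llvm::sys::getHostCPUName().str().c_str(),
                    features.getString().c_str());
        return 0;
    }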
- set_triple_and_data_layout(*module, features); + set_triple_and_data_layout(*module, cpu); auto data_layout = module->getDataLayout(); // If benchmarking, enable listeners to use GDB, perf or VTune. Note that LLVM should be built @@ -120,24 +131,26 @@ void JITDriver::init(std::string features, if (intel_event_listener) layer->registerJITEventListener(*intel_event_listener); - for (const auto& lib_path: lib_paths) { - // For every library path, create a corresponding memory buffer. - auto memory_buffer = llvm::MemoryBuffer::getFile(lib_path); - if (!memory_buffer) - throw std::runtime_error("Unable to create memory buffer for " + lib_path); - - // Create a new JIT library instance for this session and resolve symbols. - auto& jd = session.createBareJITDylib(std::string(lib_path)); - auto loaded = - llvm::orc::DynamicLibrarySearchGenerator::Load(lib_path.data(), - data_layout.getGlobalPrefix()); - - if (!loaded) - throw std::runtime_error("Unable to load " + lib_path); - jd.addGenerator(std::move(*loaded)); - cantFail(layer->add(jd, std::move(*memory_buffer))); + // If benchmarking, resolve shared libraries. + if (benchmark_info) { + for (const auto& lib_path: benchmark_info->shared_lib_paths) { + // For every library path, create a corresponding memory buffer. + auto memory_buffer = llvm::MemoryBuffer::getFile(lib_path); + if (!memory_buffer) + throw std::runtime_error("Unable to create memory buffer for " + lib_path); + + // Create a new JIT library instance for this session and resolve symbols. + auto& jd = session.createBareJITDylib(std::string(lib_path)); + auto loaded = + llvm::orc::DynamicLibrarySearchGenerator::Load(lib_path.data(), + data_layout.getGlobalPrefix()); + + if (!loaded) + throw std::runtime_error("Unable to load " + lib_path); + jd.addGenerator(std::move(*loaded)); + cantFail(layer->add(jd, std::move(*memory_buffer))); + } } - return layer; }; @@ -146,7 +159,7 @@ void JITDriver::init(std::string features, -> llvm::Expected> { // Create target machine with some features possibly turned off. int opt_level_codegen = benchmark_info ? benchmark_info->opt_level_codegen : 0; - auto tm = create_target(&tm_builder, features, opt_level_codegen); + auto tm = create_target(&tm_builder, cpu, opt_level_codegen); // Optimise the LLVM IR module and save it to .ll file if benchmarking. if (benchmark_info) { diff --git a/test/benchmark/jit_driver.hpp b/test/benchmark/jit_driver.hpp index d8e1127417..7106311523 100644 --- a/test/benchmark/jit_driver.hpp +++ b/test/benchmark/jit_driver.hpp @@ -17,6 +17,7 @@ #include "llvm/ExecutionEngine/JITEventListener.h" #include "llvm/ExecutionEngine/Orc/LLJIT.h" +#include "llvm/Support/Host.h" namespace nmodl { namespace runner { @@ -29,6 +30,9 @@ struct BenchmarkInfo { /// Object file output directory. std::string output_dir; + /// Shared libraries' paths to link against. + std::vector shared_lib_paths; + /// Optimisation level for IT. int opt_level_ir; @@ -63,9 +67,7 @@ class JITDriver { : module(std::move(m)) {} /// Initializes the JIT driver. - void init(std::string features = "", - std::vector lib_paths = {}, - BenchmarkInfo* benchmark_info = nullptr); + void init(const std::string& cpu, BenchmarkInfo* benchmark_info = nullptr); /// Lookups the entry-point without arguments in the JIT and executes it, returning the result. 
template @@ -131,7 +133,7 @@ class TestRunner: public BaseRunner { : BaseRunner(std::move(m)) {} virtual void initialize_driver() { - driver->init(); + driver->init(llvm::sys::getHostCPUName().str()); } }; @@ -145,27 +147,23 @@ class BenchmarkRunner: public BaseRunner { /// Benchmarking information passed to JIT driver. BenchmarkInfo benchmark_info; - /// CPU features specified by the user. - std::string features; - - /// Shared libraries' paths to link against. - std::vector shared_lib_paths; + /// CPU to target. + std::string cpu; public: BenchmarkRunner(std::unique_ptr m, std::string filename, std::string output_dir, - std::string features = "", + std::string cpu, std::vector lib_paths = {}, int opt_level_ir = 0, int opt_level_codegen = 0) : BaseRunner(std::move(m)) - , benchmark_info{filename, output_dir, opt_level_ir, opt_level_codegen} - , features(features) - , shared_lib_paths(lib_paths) {} + , cpu(cpu) + , benchmark_info{filename, output_dir, lib_paths, opt_level_ir, opt_level_codegen} {} virtual void initialize_driver() { - driver->init(features, shared_lib_paths, &benchmark_info); + driver->init(cpu, &benchmark_info); } }; diff --git a/test/benchmark/llvm_benchmark.cpp b/test/benchmark/llvm_benchmark.cpp index e48df0d457..0e94ae231b 100644 --- a/test/benchmark/llvm_benchmark.cpp +++ b/test/benchmark/llvm_benchmark.cpp @@ -6,7 +6,6 @@ *************************************************************************/ #include -#include #include "codegen/llvm/codegen_llvm_visitor.hpp" #include "llvm_benchmark.hpp" @@ -19,35 +18,6 @@ namespace nmodl { namespace benchmark { -/// Precision for the timing measurements. -static constexpr int PRECISION = 9; - -/// Get the host CPU features in the format: -/// +feature,+feature,-feature,+feature,... -/// where `+` indicates that the feature is enabled. -static std::vector get_cpu_features() { - std::string cpu(llvm::sys::getHostCPUName()); - - llvm::SubtargetFeatures features; - llvm::StringMap host_features; - if (llvm::sys::getHostCPUFeatures(host_features)) { - for (auto& f: host_features) - features.AddFeature(f.first(), f.second); - } - return features.getFeatures(); -} - - -void LLVMBenchmark::disable(const std::string& feature, std::vector& host_features) { - for (auto& host_feature: host_features) { - if (feature == host_feature.substr(1)) { - host_feature[0] = '-'; - logger->info("{}", host_feature); - return; - } - } -} - void LLVMBenchmark::run(const std::shared_ptr& node) { // create functions generate_llvm(node); @@ -57,9 +27,9 @@ void LLVMBenchmark::run(const std::shared_ptr& node) { void LLVMBenchmark::generate_llvm(const std::shared_ptr& node) { // First, visit the AST to build the LLVM IR module and wrap the kernel function calls. - auto start = std::chrono::high_resolution_clock::now(); + auto start = std::chrono::steady_clock::now(); llvm_visitor.wrap_kernel_functions(); - auto end = std::chrono::high_resolution_clock::now(); + auto end = std::chrono::steady_clock::now(); // Log the time taken to visit the AST and build LLVM IR. std::chrono::duration diff = end - start; @@ -72,37 +42,17 @@ void LLVMBenchmark::run_benchmark(const std::shared_ptr& node) { std::vector kernel_names; llvm_visitor.find_kernel_names(kernel_names); - // Get feature's string and turn them off depending on the backend. - std::vector features = get_cpu_features(); - logger->info("Backend: {}", backend); - if (backend == "avx2") { - // Disable SSE. 
- logger->info("Disabling features:"); - disable("sse", features); - disable("sse2", features); - disable("sse3", features); - disable("sse4.1", features); - disable("sse4.2", features); - } else if (backend == "sse2") { - // Disable AVX. - logger->info("Disabling features:"); - disable("avx", features); - disable("avx2", features); - } + // Get feature's string and turn them off depending on the cpu. + std::string cpu_name = cpu == "default" ? llvm::sys::getHostCPUName().str() : cpu; + logger->info("CPU: {}", cpu_name); - std::string features_str = llvm::join(features.begin(), features.end(), ","); std::unique_ptr m = llvm_visitor.get_module(); // Create the benchmark runner and initialize it. std::string filename = "v" + std::to_string(llvm_visitor.get_vector_width()) + "_" + mod_filename; - runner::BenchmarkRunner runner(std::move(m), - filename, - output_dir, - features_str, - shared_libs, - opt_level_ir, - opt_level_codegen); + runner::BenchmarkRunner runner( + std::move(m), filename, output_dir, cpu_name, shared_libs, opt_level_ir, opt_level_codegen); runner.initialize_driver(); // Benchmark every kernel. @@ -124,9 +74,9 @@ void LLVMBenchmark::run_benchmark(const std::shared_ptr& node) { // Record the execution time of the kernel. std::string wrapper_name = "__" + kernel_name + "_wrapper"; - auto start = std::chrono::high_resolution_clock::now(); + auto start = std::chrono::steady_clock::now(); runner.run_with_argument(kernel_name, instance_data.base_ptr); - auto end = std::chrono::high_resolution_clock::now(); + auto end = std::chrono::steady_clock::now(); std::chrono::duration diff = end - start; // Log the time taken for each run. diff --git a/test/benchmark/llvm_benchmark.hpp b/test/benchmark/llvm_benchmark.hpp index 4a66de52fc..cc9dd3bcf0 100644 --- a/test/benchmark/llvm_benchmark.hpp +++ b/test/benchmark/llvm_benchmark.hpp @@ -7,6 +7,7 @@ #pragma once +#include #include #include "codegen/llvm/codegen_llvm_visitor.hpp" @@ -40,8 +41,8 @@ class LLVMBenchmark { /// The size of the instance struct for benchmarking. int instance_size; - /// Benchmarking backend - std::string backend; + /// CPU to target. + std::string cpu; /// Optimisation level for IR generation. int opt_level_ir; @@ -59,7 +60,7 @@ class LLVMBenchmark { std::vector shared_libs, int num_experiments, int instance_size, - const std::string& backend, + const std::string& cpu, int opt_level_ir, int opt_level_codegen) : llvm_visitor(llvm_visitor) @@ -68,7 +69,7 @@ class LLVMBenchmark { , shared_libs(shared_libs) , num_experiments(num_experiments) , instance_size(instance_size) - , backend(backend) + , cpu(cpu) , opt_level_ir(opt_level_ir) , opt_level_codegen(opt_level_codegen) {} @@ -76,9 +77,6 @@ class LLVMBenchmark { void run(const std::shared_ptr& node); private: - /// Disables the specified feature in the target. - void disable(const std::string& feature, std::vector& host_features); - /// Visits the AST to construct the LLVM IR module. 
void generate_llvm(const std::shared_ptr& node); From 313996f5c91d2495ebc17ad334fbc9465da3c5c4 Mon Sep 17 00:00:00 2001 From: Ioannis Magkanaris Date: Thu, 3 Jun 2021 17:54:35 +0200 Subject: [PATCH 096/331] Updates after meeting --- test/benchmark/kernels/compute-bound.cpp | 1 - test/benchmark/kernels/hh.cpp | 1 - test/benchmark/kernels/memory-bound.cpp | 1 - test/benchmark/nmodl-llvm-time.sh | 40 +++++++++++++----------- 4 files changed, 22 insertions(+), 21 deletions(-) diff --git a/test/benchmark/kernels/compute-bound.cpp b/test/benchmark/kernels/compute-bound.cpp index db5cfb7cd2..b331b22f76 100644 --- a/test/benchmark/kernels/compute-bound.cpp +++ b/test/benchmark/kernels/compute-bound.cpp @@ -21,7 +21,6 @@ void nrn_state_hh_ext(void* __restrict__ mech){ int id; int node_id; double v; - #pragma ivdep for(int id = 0; idnode_count; ++id) { node_id = inst->node_index[id]; v = inst->voltage[node_id]; diff --git a/test/benchmark/kernels/hh.cpp b/test/benchmark/kernels/hh.cpp index 9fa11beadf..c72fe467af 100644 --- a/test/benchmark/kernels/hh.cpp +++ b/test/benchmark/kernels/hh.cpp @@ -52,7 +52,6 @@ void nrn_state_hh_ext(void* __restrict__ mech){ int id; int node_id, ena_id, ek_id; double v; - #pragma ivdep // icpc vec helper #pragma omp simd // clang vec helper for(id = 0; idnode_count; ++id) { node_id = inst->node_index[id]; diff --git a/test/benchmark/kernels/memory-bound.cpp b/test/benchmark/kernels/memory-bound.cpp index 8f2c165f72..8beead4fde 100644 --- a/test/benchmark/kernels/memory-bound.cpp +++ b/test/benchmark/kernels/memory-bound.cpp @@ -26,7 +26,6 @@ void nrn_state_hh_ext(void* __restrict__ mech){ int id; int node_id, nai_id, ion_nai_id; double v; - #pragma ivdep for(int id = 0; idnode_count; ++id) { node_id = inst->node_index[id]; nai_id = inst->ion_nai_index[id]; diff --git a/test/benchmark/nmodl-llvm-time.sh b/test/benchmark/nmodl-llvm-time.sh index 17aba5a0ba..8224faf881 100755 --- a/test/benchmark/nmodl-llvm-time.sh +++ b/test/benchmark/nmodl-llvm-time.sh @@ -112,33 +112,33 @@ mkdir -p ${output_dir} # compilers icpc_exe=icpc declare -a icpc_flags_avx512=( - "-O2 -march=skylake-avx512 -mtune=skylake-avx512 -prec-div -fimf-use-svml" - "-O2 -qopt-zmm-usage=high -xCORE-AVX512 -prec-div -fimf-use-svml" - "-O2 -mavx512f -prec-div -fimf-use-svml" + "-O2 -march=skylake-avx512 -mtune=skylake-avx512 -prec-div -fimf-use-svml" # avx2 + "-O2 -mavx512f -prec-div -fimf-use-svml" #generates avx512 code + "-O2 -march=skylake-avx512 -mtune=skylake-avx512 -prec-div -fimf-use-svml -fopenmp" # avx2 + "-O2 -mavx512f -prec-div -fimf-use-svml -fopenmp" ) + #delete ivdep in handwrtten kernels +# check for haswell architecture option +# replace -mavx2 with the procesor family +# for one run keep both family and -mavx2 declare -a icpc_flags_avx2=( "-O2 -mavx2 -prec-div -fimf-use-svml" ) declare -a icpc_flags_sse2=( - "-O2 -march=skylake-avx512 -mtune=skylake-avx512 -prec-div -fimf-use-svml" - "-O2 -qopt-zmm-usage=high -xCORE-AVX512 -prec-div -fimf-use-svml" - "-O2 -mavx512f -prec-div -fimf-use-svml" - "-O2 -mavx2 -prec-div -fimf-use-svml" "-O2 -msse2 -prec-div -fimf-use-svml" ) llvm_path="/gpfs/bbp.cscs.ch/apps/hpc/llvm-install/0621" llvm_lib=${llvm_path}/lib clang_exe=${llvm_path}/bin/clang++ +# -march=skylake-avx512 doesn't generate avx512 commands with clang declare -a clang_flags_avx512=( - "-O3 -mavx512f -ffast-math -fopenmp -fveclib=SVML" "-O3 -mavx512f -ffast-math -fveclib=SVML" - "-O3 -mavx512f -fveclib=SVML" - "-O3 -march=skylake-avx512 -ffast-math -fopenmp -fveclib=SVML" + "-O3 -mavx512f 
-ffast-math -fopenmp -fveclib=SVML" ) - +# check -mcpu= declare -a clang_flags_avx2=( "-O3 -mavx2 -ffast-math -fopenmp -fveclib=SVML" ) @@ -151,21 +151,25 @@ gcc_bin_path="/gpfs/bbp.cscs.ch/ssd/apps/hpc/jenkins/deploy/compilers/2021-01-06 gcc_exe=${gcc_bin_path}/g++ declare -a gcc_flags_avx512=( "-O3 -mavx512f -ffast-math -ftree-vectorize -mveclibabi=svml" + "-O3 -mavx512f -ffast-math -ftree-vectorize -mveclibabi=svml -fopenmp" ) declare -a gcc_flags_avx2=( - "-O3 -mavx2 -ffast-math -ftree-vectorize -mveclibabi=svml" + "-O3 -mavx2 -ffast-math -ftree-vectorize -mveclibabi=svml -fopenmp" ) declare -a gcc_flags_sse2=( - "-O3 -msse2 -ffast-math -ftree-vectorize -mveclibabi=svml" + "-O3 -msse2 -ffast-math -ftree-vectorize -mveclibabi=svml -fopenmp" ) declare -a benchmark_description declare -a benchmark_time - -KERNEL_TARGETS="compute-bound memory-bound hh" -ARCHITECTURES="avx512 avx2 sse2" +# also get variance +#KERNEL_TARGETS="compute-bound memory-bound hh" +KERNEL_TARGETS="compute-bound" +#ARCHITECTURES="avx512 avx2 sse2" +ARCHITECTURES="avx2" +# set cpu option in jit according to the architecture # loop over options for kernel_target in ${KERNEL_TARGETS}; do @@ -195,6 +199,7 @@ for kernel_target in ${KERNEL_TARGETS}; do spec=${compiler}_${flags//[[:blank:]]/} rel_ext_path=${kernel_target}_${spec} + # for avx512 compile and run without pragma omp simd and pragma ivdep ${debug} mkdir ${rel_ext_path} ${debug} cd ${rel_ext_path} ext_path=$(pwd) @@ -203,7 +208,6 @@ for kernel_target in ${KERNEL_TARGETS}; do ${debug} eval "llvm-objdump ${ext_lib} -d > ${ext_lib::-1}" ${debug} cd .. - nmodl_args="llvm --ir --vector-width ${vec_width} --veclib SVML --opt-level-ir 3 benchmark -run --instance-size ${inst_size} --repeat ${num_exp} --opt-level-codegen 3 --cpu default --libs {vec_lib_path}/${vec_lib}" nmodl_args="${nmodl_args} --external" @@ -244,4 +248,4 @@ done for bench_desc in ${benchmark_description[@]}; do echo $bench_desc -done \ No newline at end of file +done From d83206e7945f7ac7095b5649ec9c03255bc69aaa Mon Sep 17 00:00:00 2001 From: Pramod Kumbhar Date: Thu, 3 Jun 2021 18:52:35 +0200 Subject: [PATCH 097/331] Fix azure yaml pipeline from merge (#687) --- azure-pipelines.yml | 1 - 1 file changed, 1 deletion(-) diff --git a/azure-pipelines.yml b/azure-pipelines.yml index 05a24fc841..ee0e5152b1 100644 --- a/azure-pipelines.yml +++ b/azure-pipelines.yml @@ -174,7 +174,6 @@ jobs: exit 1 fi ./bin/nrnivmodl-core $(Build.Repository.LocalPath)/test/integration/mod - condition: false env: SHELL: 'bash' condition: false From 2e5adf55e7f9ee451620aaf3c0007dbe0644a5c3 Mon Sep 17 00:00:00 2001 From: Ioannis Magkanaris Date: Thu, 3 Jun 2021 20:44:50 +0200 Subject: [PATCH 098/331] Updated script --- src/main.cpp | 2 +- test/benchmark/llvm_benchmark.cpp | 6 ++++++ test/benchmark/nmodl-llvm-time.sh | 32 ++++++++++++++++++------------- 3 files changed, 26 insertions(+), 14 deletions(-) diff --git a/src/main.cpp b/src/main.cpp index c6c8e141ce..bb5ffaa299 100644 --- a/src/main.cpp +++ b/src/main.cpp @@ -363,7 +363,7 @@ int main(int argc, const char* argv[]) { benchmark_opt->add_option("--cpu", cpu, "Target's backend ({})"_format(cpu))->ignore_case(); - benchmark_opt->add_option("--external", + benchmark_opt->add_flag("--external", external_kernel, "Benchmark external kernel ({})"_format(external_kernel))->ignore_case(); #endif diff --git a/test/benchmark/llvm_benchmark.cpp b/test/benchmark/llvm_benchmark.cpp index 2a34445ae2..b0eb5b98e8 100644 --- a/test/benchmark/llvm_benchmark.cpp +++ 
b/test/benchmark/llvm_benchmark.cpp @@ -67,6 +67,12 @@ void LLVMBenchmark::run_benchmark(const std::shared_ptr& node) { // Initialise the data. auto instance_data = codegen_data.create_data(instance_size, /*seed=*/1); + // Log instance size once. + if (i == 0) { + double size_mbs = instance_data.num_bytes / (1024.0 * 1024.0); + logger->info("Benchmarking kernel nrn_state_hh_ext with {} MBs dataset", size_mbs); + } + // Record the execution time of the kernel. auto start = std::chrono::steady_clock::now(); nrn_state_hh_ext(instance_data.base_ptr); diff --git a/test/benchmark/nmodl-llvm-time.sh b/test/benchmark/nmodl-llvm-time.sh index 8224faf881..166e36d38c 100755 --- a/test/benchmark/nmodl-llvm-time.sh +++ b/test/benchmark/nmodl-llvm-time.sh @@ -103,14 +103,13 @@ nmodl_exe="/gpfs/bbp.cscs.ch/data/scratch/proj16/magkanar/nmodl/build_benchmark/ nmodl_src_path="/gpfs/bbp.cscs.ch/data/scratch/proj16/magkanar/nmodl" kernels_path=${nmodl_src_path}/"test/benchmark/kernels" ext_lib="libextkernel.so" -if ${external_kernel_exec}; then - modfile_directory=${kernels_path} -fi +modfile_directory=${kernels_path} mkdir -p ${output_dir} +intel_path="/gpfs/bbp.cscs.ch/ssd/apps/hpc/jenkins/deploy/compilers/2021-01-06/linux-rhel7-x86_64/gcc-4.8.5/intel-20.0.2-ilowey/bin" # compilers -icpc_exe=icpc +icpc_exe=${intel_path}/icpc declare -a icpc_flags_avx512=( "-O2 -march=skylake-avx512 -mtune=skylake-avx512 -prec-div -fimf-use-svml" # avx2 "-O2 -mavx512f -prec-div -fimf-use-svml" #generates avx512 code @@ -164,11 +163,12 @@ declare -a gcc_flags_sse2=( declare -a benchmark_description declare -a benchmark_time +declare -a benchmark_variance # also get variance #KERNEL_TARGETS="compute-bound memory-bound hh" KERNEL_TARGETS="compute-bound" #ARCHITECTURES="avx512 avx2 sse2" -ARCHITECTURES="avx2" +ARCHITECTURES="avx512" # set cpu option in jit according to the architecture # loop over options @@ -200,7 +200,7 @@ for kernel_target in ${KERNEL_TARGETS}; do rel_ext_path=${kernel_target}_${spec} # for avx512 compile and run without pragma omp simd and pragma ivdep - ${debug} mkdir ${rel_ext_path} + ${debug} mkdir -p ${rel_ext_path} ${debug} cd ${rel_ext_path} ext_path=$(pwd) ${debug} ${!compiler_exe} ${flags} ${kernels_path}/${kernel_target}.cpp \ @@ -208,14 +208,15 @@ for kernel_target in ${KERNEL_TARGETS}; do ${debug} eval "llvm-objdump ${ext_lib} -d > ${ext_lib::-1}" ${debug} cd .. - nmodl_args="llvm --ir --vector-width ${vec_width} --veclib SVML --opt-level-ir 3 benchmark -run --instance-size ${inst_size} --repeat ${num_exp} --opt-level-codegen 3 --cpu default --libs {vec_lib_path}/${vec_lib}" + nmodl_args="${kernels_path}/${kernel_target}.mod llvm --ir --vector-width ${vec_width} --veclib SVML --opt-level-ir 3 benchmark --run --instance-size ${inst_size} --repeat ${num_exp} --opt-level-codegen 3 --cpu default --libs ${vec_lib_path}/${vec_lib}" nmodl_args="${nmodl_args} --external" benchmark_ext_desc=ext_${kernel_target}_${compiler}_${architecture}_v${vec_width}_${flags//[[:blank:]]/} benchmark_description+=("${benchmark_ext_desc}") # runs only external kernel ${debug} eval "LD_LIBRARY_PATH=${ext_path}:${vec_lib_path}:${llvm_lib} ${nmodl_exe} ${nmodl_args} &> ${output_dir}/${benchmark_ext_desc}.log" - #benchmark_time+=(...) 
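For reference, the kernels exercised by --external are plain C++ functions resolved from the shared library on LD_LIBRARY_PATH. A compilable skeleton (the Instance layout here is hypothetical and heavily trimmed; the real structs and bodies live in test/benchmark/kernels/*.cpp):

    // Illustrative external kernel; field names follow the benchmark kernels,
    // but the actual instance struct is generated per mechanism.
    struct Instance {
        double* voltage;
        int* node_index;
        int node_count;
    };

    void nrn_state_hh_ext(void* __restrict__ mech) {
        auto* inst = static_cast<Instance*>(mech);
        for (int id = 0; id < inst->node_count; ++id) {
            const int node_id = inst->node_index[id];
            inst->voltage[node_id] += 0.0;  // placeholder for the real state update
        }
    }

Built with -shared -fpic, as done for the real kernels in the script.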
+ benchmark_time+=($(grep "Average compute time" ${output_dir}/${benchmark_ext_desc}.log | awk '{print $NF}')) + benchmark_variance+=($(grep "Compute time variance" ${output_dir}/${benchmark_ext_desc}.log | awk '{print $NF}')) done done fi @@ -235,17 +236,22 @@ for kernel_target in ${KERNEL_TARGETS}; do assume_may_alias_flag="" assume_may_alias_opt="noalias" fi - nmodl_args="llvm --ir ${fast_math_flag} ${assume_may_alias_flag} --vector-width ${vec_width} --veclib SVML --opt-level-ir 3 benchmark -run --instance-size ${inst_size} --repeat ${num_exp} --opt-level-codegen 3 --cpu default --libs {vec_lib_path}/${vec_lib}" + nmodl_args="${kernels_path}/${kernel_target}.mod llvm --ir ${fast_math_flag} ${assume_may_alias_flag} --vector-width ${vec_width} --veclib SVML --opt-level-ir 3 benchmark --run --instance-size ${inst_size} --repeat ${num_exp} --opt-level-codegen 3 --cpu default --libs ${vec_lib_path}/${vec_lib}" benchmark_nmodl_desc=nmodl_${kernel_target}_${architecture}_v${vec_width}_${fast_math_opt}_${assume_may_alias_opt} benchmark_description+=("${benchmark_nmodl_desc}") # runs only kernel generated by LLVM IR - ${debug} eval "LD_LIBRARY_PATH=${vec_lib_path}:${llvm_lib} ${nmodl_exe} ${nmodl_args} &> ${output_dir}/${benchmark_nmodl_desc}.log" - #benchmark_time+=(...) + ${debug} eval "LD_LIBRARY_PATH=${ext_path}:${vec_lib_path}:${llvm_lib} ${nmodl_exe} ${nmodl_args} &> ${output_dir}/${benchmark_nmodl_desc}.log" + benchmark_time+=($(grep "Average compute time" ${output_dir}/${benchmark_nmodl_desc}.log | awk '{print $NF}')) + benchmark_variance+=($(grep "Compute time variance" ${output_dir}/${benchmark_nmodl_desc}.log | awk '{print $NF}')) done done done done -for bench_desc in ${benchmark_description[@]}; do - echo $bench_desc +OUTPUT_FILE=${output_dir}/output_${KERNEL_TARGETS//[[:blank:]]/}_${ARCHITECTURES//[[:blank:]]/}.txt +rm -f ${OUTPUT_FILE} +for index in ${!benchmark_description[@]}; do + echo -e "${benchmark_description[$index]}\t${benchmark_time[$index]}\t${benchmark_variance[$index]}" &>> ${OUTPUT_FILE} done + +cat ${OUTPUT_FILE} From 7b0f6ff71ae416413e5a1dce7c049744157a86fe Mon Sep 17 00:00:00 2001 From: Ioannis Magkanaris Date: Fri, 4 Jun 2021 16:07:58 +0200 Subject: [PATCH 099/331] Updated script with architectures --- test/benchmark/nmodl-llvm-time.sh | 70 ++++++++++++++++++------------- 1 file changed, 40 insertions(+), 30 deletions(-) diff --git a/test/benchmark/nmodl-llvm-time.sh b/test/benchmark/nmodl-llvm-time.sh index 166e36d38c..61aece0cac 100755 --- a/test/benchmark/nmodl-llvm-time.sh +++ b/test/benchmark/nmodl-llvm-time.sh @@ -110,10 +110,10 @@ mkdir -p ${output_dir} intel_path="/gpfs/bbp.cscs.ch/ssd/apps/hpc/jenkins/deploy/compilers/2021-01-06/linux-rhel7-x86_64/gcc-4.8.5/intel-20.0.2-ilowey/bin" # compilers icpc_exe=${intel_path}/icpc -declare -a icpc_flags_avx512=( - "-O2 -march=skylake-avx512 -mtune=skylake-avx512 -prec-div -fimf-use-svml" # avx2 +declare -a icpc_flags_skylake_avx512=( + "-O2 -march=skylake-avx512 -mtune=skylake -prec-div -fimf-use-svml" # avx2 "-O2 -mavx512f -prec-div -fimf-use-svml" #generates avx512 code - "-O2 -march=skylake-avx512 -mtune=skylake-avx512 -prec-div -fimf-use-svml -fopenmp" # avx2 + "-O2 -march=skylake-avx512 -mtune=skylake -prec-div -fimf-use-svml -fopenmp" # avx2 "-O2 -mavx512f -prec-div -fimf-use-svml -fopenmp" ) @@ -121,11 +121,11 @@ declare -a icpc_flags_avx512=( # check for haswell architecture option # replace -mavx2 with the procesor family # for one run keep both family and -mavx2 -declare -a icpc_flags_avx2=( - "-O2 
-mavx2 -prec-div -fimf-use-svml" +declare -a icpc_flags_broadwell=( + "-O2 -march=broadwell -mtune=broadwell -prec-div -fimf-use-svml" ) -declare -a icpc_flags_sse2=( +declare -a icpc_flags_nehalem=( "-O2 -msse2 -prec-div -fimf-use-svml" ) @@ -133,59 +133,69 @@ llvm_path="/gpfs/bbp.cscs.ch/apps/hpc/llvm-install/0621" llvm_lib=${llvm_path}/lib clang_exe=${llvm_path}/bin/clang++ # -march=skylake-avx512 doesn't generate avx512 commands with clang -declare -a clang_flags_avx512=( - "-O3 -mavx512f -ffast-math -fveclib=SVML" - "-O3 -mavx512f -ffast-math -fopenmp -fveclib=SVML" +declare -a clang_flags_skylake_avx512=( + "-O3 -march=skylake-avx512 -mtune=skylake -ffast-math -fveclib=SVML" + "-O3 -march=skylake-avx512 -mtune=skylake -ffast-math -fopenmp -fveclib=SVML" ) # check -mcpu= -declare -a clang_flags_avx2=( - "-O3 -mavx2 -ffast-math -fopenmp -fveclib=SVML" +declare -a clang_flags_broadwell=( + "-O3 -march=broadwell -mtune=broadwell -ffast-math -fopenmp -fveclib=SVML" ) -declare -a clang_flags_sse2=( - "-O3 -msse2 -ffast-math -fopenmp -fveclib=SVML" +declare -a clang_flags_nehalem=( + "-O3 -march=nehalem -mtune=nehalem -ffast-math -fopenmp -fveclib=SVML" ) gcc_bin_path="/gpfs/bbp.cscs.ch/ssd/apps/hpc/jenkins/deploy/compilers/2021-01-06/linux-rhel7-x86_64/gcc-4.8.5/gcc-9.3.0-45gzrp/bin" gcc_exe=${gcc_bin_path}/g++ -declare -a gcc_flags_avx512=( - "-O3 -mavx512f -ffast-math -ftree-vectorize -mveclibabi=svml" - "-O3 -mavx512f -ffast-math -ftree-vectorize -mveclibabi=svml -fopenmp" +declare -a gcc_flags_skylake_avx512=( + "-O3 -march=skylake-avx512 -mtune=skylake -ffast-math -ftree-vectorize -mveclibabi=svml" + "-O3 -march=skylake-avx512 -mtune=skylake -ffast-math -ftree-vectorize -mveclibabi=svml -fopenmp" ) -declare -a gcc_flags_avx2=( - "-O3 -mavx2 -ffast-math -ftree-vectorize -mveclibabi=svml -fopenmp" +declare -a gcc_flags_broadwell=( + "-O3 -march=broadwell -mtune=broadwell -ffast-math -ftree-vectorize -mveclibabi=svml -fopenmp" ) -declare -a gcc_flags_sse2=( - "-O3 -msse2 -ffast-math -ftree-vectorize -mveclibabi=svml -fopenmp" +declare -a gcc_flags_nehalem=( + "-O3 -march=nehalem -mtune=nehalem -ffast-math -ftree-vectorize -mveclibabi=svml -fopenmp" ) declare -a benchmark_description declare -a benchmark_time declare -a benchmark_variance # also get variance -#KERNEL_TARGETS="compute-bound memory-bound hh" -KERNEL_TARGETS="compute-bound" -#ARCHITECTURES="avx512 avx2 sse2" -ARCHITECTURES="avx512" +KERNEL_TARGETS="compute-bound memory-bound hh" +#KERNEL_TARGETS="compute-bound" +ARCHITECTURES="skylake_avx512 broadwell nehalem" +#ARCHITECTURES="avx512" # set cpu option in jit according to the architecture +# +#inst_size_hh=$(($inst_size/5)) + # loop over options for kernel_target in ${KERNEL_TARGETS}; do echo "Kernel: $kernel_target" - + # if [ "$kernel_target" == "hh" ]; then + # inst_size=$(($inst_size/5)) + # fi for architecture in ${ARCHITECTURES}; do - if [ "$architecture" = "avx512" ] ; then + if [ "$architecture" = "skylake_avx512" ] ; then vec_width=8 - elif [ "$architecture" = "avx2" ] ; then + elif [ "$architecture" = "broadwell" ] ; then vec_width=4 - elif [ "$architecture" = "sse2" ]; then + elif [ "$architecture" = "nehalem" ]; then vec_width=2 else vec_width=1 fi echo "| Architecture: $architecture" + if [ "$architecture" = "skylake_avx512" ] ; then + nmodl_architecture="skylake-avx512" + else + nmodl_architecture=$architecture + fi if $external_kernel_exec; then for compiler in icpc clang gcc; do @@ -208,7 +218,7 @@ for kernel_target in ${KERNEL_TARGETS}; do ${debug} eval 
"llvm-objdump ${ext_lib} -d > ${ext_lib::-1}" ${debug} cd .. - nmodl_args="${kernels_path}/${kernel_target}.mod llvm --ir --vector-width ${vec_width} --veclib SVML --opt-level-ir 3 benchmark --run --instance-size ${inst_size} --repeat ${num_exp} --opt-level-codegen 3 --cpu default --libs ${vec_lib_path}/${vec_lib}" + nmodl_args="${kernels_path}/${kernel_target}.mod llvm --ir --vector-width ${vec_width} --veclib SVML --opt-level-ir 3 benchmark --run --instance-size ${inst_size} --repeat ${num_exp} --opt-level-codegen 3 --cpu ${nmodl_architecture} --libs ${vec_lib_path}/${vec_lib}" nmodl_args="${nmodl_args} --external" benchmark_ext_desc=ext_${kernel_target}_${compiler}_${architecture}_v${vec_width}_${flags//[[:blank:]]/} @@ -236,7 +246,7 @@ for kernel_target in ${KERNEL_TARGETS}; do assume_may_alias_flag="" assume_may_alias_opt="noalias" fi - nmodl_args="${kernels_path}/${kernel_target}.mod llvm --ir ${fast_math_flag} ${assume_may_alias_flag} --vector-width ${vec_width} --veclib SVML --opt-level-ir 3 benchmark --run --instance-size ${inst_size} --repeat ${num_exp} --opt-level-codegen 3 --cpu default --libs ${vec_lib_path}/${vec_lib}" + nmodl_args="${kernels_path}/${kernel_target}.mod llvm --ir ${fast_math_flag} ${assume_may_alias_flag} --vector-width ${vec_width} --veclib SVML --opt-level-ir 3 benchmark --run --instance-size ${inst_size} --repeat ${num_exp} --opt-level-codegen 3 --cpu ${nmodl_architecture} --libs ${vec_lib_path}/${vec_lib}" benchmark_nmodl_desc=nmodl_${kernel_target}_${architecture}_v${vec_width}_${fast_math_opt}_${assume_may_alias_opt} benchmark_description+=("${benchmark_nmodl_desc}") # runs only kernel generated by LLVM IR From 6a71c5cf7cf6f62bc78e25e47ee6d76079542ee0 Mon Sep 17 00:00:00 2001 From: Ioannis Magkanaris Date: Fri, 4 Jun 2021 16:13:11 +0200 Subject: [PATCH 100/331] Remove 2 runs with icpc and mavx512 --- test/benchmark/nmodl-llvm-time.sh | 2 -- 1 file changed, 2 deletions(-) diff --git a/test/benchmark/nmodl-llvm-time.sh b/test/benchmark/nmodl-llvm-time.sh index 61aece0cac..20fdc41bbf 100755 --- a/test/benchmark/nmodl-llvm-time.sh +++ b/test/benchmark/nmodl-llvm-time.sh @@ -112,9 +112,7 @@ intel_path="/gpfs/bbp.cscs.ch/ssd/apps/hpc/jenkins/deploy/compilers/2021-01-06/l icpc_exe=${intel_path}/icpc declare -a icpc_flags_skylake_avx512=( "-O2 -march=skylake-avx512 -mtune=skylake -prec-div -fimf-use-svml" # avx2 - "-O2 -mavx512f -prec-div -fimf-use-svml" #generates avx512 code "-O2 -march=skylake-avx512 -mtune=skylake -prec-div -fimf-use-svml -fopenmp" # avx2 - "-O2 -mavx512f -prec-div -fimf-use-svml -fopenmp" ) #delete ivdep in handwrtten kernels From 71d0049205386d2a7e93fa7293206dfaeed7616b Mon Sep 17 00:00:00 2001 From: Ioannis Magkanaris Date: Fri, 4 Jun 2021 16:48:07 +0200 Subject: [PATCH 101/331] Removed small comment for avx512 intel flags --- test/benchmark/nmodl-llvm-time.sh | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/test/benchmark/nmodl-llvm-time.sh b/test/benchmark/nmodl-llvm-time.sh index 20fdc41bbf..64677a929b 100755 --- a/test/benchmark/nmodl-llvm-time.sh +++ b/test/benchmark/nmodl-llvm-time.sh @@ -111,14 +111,10 @@ intel_path="/gpfs/bbp.cscs.ch/ssd/apps/hpc/jenkins/deploy/compilers/2021-01-06/l # compilers icpc_exe=${intel_path}/icpc declare -a icpc_flags_skylake_avx512=( - "-O2 -march=skylake-avx512 -mtune=skylake -prec-div -fimf-use-svml" # avx2 - "-O2 -march=skylake-avx512 -mtune=skylake -prec-div -fimf-use-svml -fopenmp" # avx2 + "-O2 -march=skylake-avx512 -mtune=skylake -prec-div -fimf-use-svml" + "-O2 
-march=skylake-avx512 -mtune=skylake -prec-div -fimf-use-svml -fopenmp" ) - #delete ivdep in handwrtten kernels -# check for haswell architecture option -# replace -mavx2 with the procesor family -# for one run keep both family and -mavx2 declare -a icpc_flags_broadwell=( "-O2 -march=broadwell -mtune=broadwell -prec-div -fimf-use-svml" ) From 9441fb5325120522f766f57b46bc15b60e9c32ba Mon Sep 17 00:00:00 2001 From: Ioannis Magkanaris Date: Fri, 4 Jun 2021 19:36:14 +0200 Subject: [PATCH 102/331] Updated script with running externally the .ll generated by JIT --- test/benchmark/nmodl-llvm-time.sh | 83 ++++++++++++++++++------------- 1 file changed, 48 insertions(+), 35 deletions(-) diff --git a/test/benchmark/nmodl-llvm-time.sh b/test/benchmark/nmodl-llvm-time.sh index 64677a929b..adc9c0ea30 100755 --- a/test/benchmark/nmodl-llvm-time.sh +++ b/test/benchmark/nmodl-llvm-time.sh @@ -25,9 +25,11 @@ function showusage { -n NUMBER, --instance-size NUMBER -e NUMBER, --num-exeperiment NUMBER -v NUMBER, --vec-width NUMBER --d, --dry-run --h, --help Display this usage information. --V, --version Show version and exit. +-ext, --external-kernel Runs external kernels. +-o, --output-directory Sets the output directory. +-d, --dry-run Debug run. +-h, --help Display this usage information. +-V, --version Show version and exit. Driver for benchmarking. " } @@ -57,15 +59,6 @@ while [[ "$1" != "" ]]; do external_kernel_exec=true shift ;; - -vec-sweep|--vec-width-sweep) - vec_width_sweep=true - shift - ;; - -mod-dir|--modfile-directory) - modfile_directory=$2 - shift - shift - ;; -o|--output-directory) output_dir=$2 shift @@ -126,12 +119,12 @@ declare -a icpc_flags_nehalem=( llvm_path="/gpfs/bbp.cscs.ch/apps/hpc/llvm-install/0621" llvm_lib=${llvm_path}/lib clang_exe=${llvm_path}/bin/clang++ -# -march=skylake-avx512 doesn't generate avx512 commands with clang +llc_exe=${llvm_path}/bin/llc declare -a clang_flags_skylake_avx512=( "-O3 -march=skylake-avx512 -mtune=skylake -ffast-math -fveclib=SVML" "-O3 -march=skylake-avx512 -mtune=skylake -ffast-math -fopenmp -fveclib=SVML" ) -# check -mcpu= + declare -a clang_flags_broadwell=( "-O3 -march=broadwell -mtune=broadwell -ffast-math -fopenmp -fveclib=SVML" ) @@ -161,19 +154,24 @@ declare -a benchmark_variance # also get variance KERNEL_TARGETS="compute-bound memory-bound hh" #KERNEL_TARGETS="compute-bound" +#KERNEL_TARGETS="hh" ARCHITECTURES="skylake_avx512 broadwell nehalem" #ARCHITECTURES="avx512" +#ARCHITECTURES="skylake_avx512" # set cpu option in jit according to the architecture - -# -#inst_size_hh=$(($inst_size/5)) +COMPILERS="icpc clang cpp" +#COMPILERS="clang" # loop over options for kernel_target in ${KERNEL_TARGETS}; do echo "Kernel: $kernel_target" - # if [ "$kernel_target" == "hh" ]; then - # inst_size=$(($inst_size/5)) - # fi + + # hh mechanism size 5 times the compute-bound and memory-bound + if [ "$kernel_target" == "hh" ]; then + kernel_inst_size=$(($inst_size/5)) + else + kernel_inst_size=$inst_size + fi for architecture in ${ARCHITECTURES}; do if [ "$architecture" = "skylake_avx512" ] ; then vec_width=8 @@ -192,7 +190,7 @@ for kernel_target in ${KERNEL_TARGETS}; do fi if $external_kernel_exec; then - for compiler in icpc clang gcc; do + for compiler in ${COMPILERS}; do echo "| | Compiler: $compiler" compiler_exe=${compiler}_exe @@ -201,34 +199,49 @@ for kernel_target in ${KERNEL_TARGETS}; do echo "| | | flags: "${flags} spec=${compiler}_${flags//[[:blank:]]/} - rel_ext_path=${kernel_target}_${spec} + rel_ext_path_cpp=${kernel_target}_${spec}_cpp - # 
for avx512 compile and run without pragma omp simd and pragma ivdep - ${debug} mkdir -p ${rel_ext_path} - ${debug} cd ${rel_ext_path} + ${debug} mkdir -p ${rel_ext_path_cpp} + ${debug} cd ${rel_ext_path_cpp} ext_path=$(pwd) ${debug} ${!compiler_exe} ${flags} ${kernels_path}/${kernel_target}.cpp \ -shared -fpic -o ${ext_lib} - ${debug} eval "llvm-objdump ${ext_lib} -d > ${ext_lib::-1}" + ${debug} eval "objdump ${ext_lib} -d > ${ext_lib::-1}" ${debug} cd .. - nmodl_args="${kernels_path}/${kernel_target}.mod llvm --ir --vector-width ${vec_width} --veclib SVML --opt-level-ir 3 benchmark --run --instance-size ${inst_size} --repeat ${num_exp} --opt-level-codegen 3 --cpu ${nmodl_architecture} --libs ${vec_lib_path}/${vec_lib}" - + # add --fmf nnan contract afn here to generate .ll file similar to the fast-math options from external compilers + nmodl_args="${kernels_path}/${kernel_target}.mod llvm --ir --vector-width ${vec_width} --veclib SVML --opt-level-ir 3 --fmf nnan contract afn benchmark --run --instance-size ${kernel_inst_size} --repeat ${num_exp} --opt-level-codegen 3 --cpu ${nmodl_architecture} --libs ${vec_lib_path}/${vec_lib}" nmodl_args="${nmodl_args} --external" - benchmark_ext_desc=ext_${kernel_target}_${compiler}_${architecture}_v${vec_width}_${flags//[[:blank:]]/} + + benchmark_ext_desc=ext_${kernel_target}_${compiler}_${nmodl_architecture}_v${vec_width}_${flags//[[:blank:]]/} benchmark_description+=("${benchmark_ext_desc}") # runs only external kernel ${debug} eval "LD_LIBRARY_PATH=${ext_path}:${vec_lib_path}:${llvm_lib} ${nmodl_exe} ${nmodl_args} &> ${output_dir}/${benchmark_ext_desc}.log" benchmark_time+=($(grep "Average compute time" ${output_dir}/${benchmark_ext_desc}.log | awk '{print $NF}')) benchmark_variance+=($(grep "Compute time variance" ${output_dir}/${benchmark_ext_desc}.log | awk '{print $NF}')) + + if [ "$compiler" == "clang" ]; then + rel_ext_path_llvm=${kernel_target}_${spec}_llvm + ${debug} mkdir -p ${rel_ext_path_llvm} + # Generate external library from LLVM IR of JIT + ${debug} sed 's/nrn_state_hh/_Z16nrn_state_hh_extPv/g' v${vec_width}_${kernel_target}_opt.ll > ${output_dir}/v${vec_width}_${kernel_target}_opt_ext.ll + ${debug} ${!compiler_exe} ${flags} -shared -fpic ${output_dir}/v${vec_width}_${kernel_target}_opt_ext.ll -o ${rel_ext_path_llvm}/${ext_lib} &>/dev/null # overwrites previous ${ext_lib} + + benchmark_ext_jit_desc=ext_jit_${kernel_target}_${compiler}_${nmodl_architecture}_v${vec_width}_${flags//[[:blank:]]/} + benchmark_description+=("${benchmark_ext_jit_desc}") + # run external library generated by the LLVM IR code of JIT + ${debug} eval "LD_LIBRARY_PATH=${rel_ext_path_llvm}:${vec_lib_path}:${llvm_lib} ${nmodl_exe} ${nmodl_args} &> ${output_dir}/${benchmark_ext_jit_desc}.log" + benchmark_time+=($(grep "Average compute time" ${output_dir}/${benchmark_ext_jit_desc}.log | awk '{print $NF}')) + benchmark_variance+=($(grep "Compute time variance" ${output_dir}/${benchmark_ext_jit_desc}.log | awk '{print $NF}')) + fi done done fi for fast_math in true false; do if $fast_math; then - fast_math_flag="--fmf fast" - fast_math_opt="fastmath" - else + fast_math_flag="--fmf nnan contract afn" + fast_math_opt="nnancontractafn" + else fast_math_flag="" fast_math_opt="nonfastmath" fi @@ -236,12 +249,12 @@ for kernel_target in ${KERNEL_TARGETS}; do if $assume_may_alias; then assume_may_alias_flag="--assume-may-alias" assume_may_alias_opt="alias" - else + else assume_may_alias_flag="" assume_may_alias_opt="noalias" fi - 
nmodl_args="${kernels_path}/${kernel_target}.mod llvm --ir ${fast_math_flag} ${assume_may_alias_flag} --vector-width ${vec_width} --veclib SVML --opt-level-ir 3 benchmark --run --instance-size ${inst_size} --repeat ${num_exp} --opt-level-codegen 3 --cpu ${nmodl_architecture} --libs ${vec_lib_path}/${vec_lib}" - benchmark_nmodl_desc=nmodl_${kernel_target}_${architecture}_v${vec_width}_${fast_math_opt}_${assume_may_alias_opt} + nmodl_args="${kernels_path}/${kernel_target}.mod llvm --ir ${fast_math_flag} ${assume_may_alias_flag} --vector-width ${vec_width} --veclib SVML --opt-level-ir 3 benchmark --run --instance-size ${kernel_inst_size} --repeat ${num_exp} --opt-level-codegen 3 --cpu ${nmodl_architecture} --libs ${vec_lib_path}/${vec_lib}" + benchmark_nmodl_desc=nmodl_${kernel_target}_${nmodl_architecture}_v${vec_width}_${fast_math_opt}_${assume_may_alias_opt} benchmark_description+=("${benchmark_nmodl_desc}") # runs only kernel generated by LLVM IR ${debug} eval "LD_LIBRARY_PATH=${ext_path}:${vec_lib_path}:${llvm_lib} ${nmodl_exe} ${nmodl_args} &> ${output_dir}/${benchmark_nmodl_desc}.log" From d51b4277d7d7bda83a29d2ab66559e5a8a2105ef Mon Sep 17 00:00:00 2001 From: Ioannis Magkanaris Date: Fri, 4 Jun 2021 19:57:13 +0200 Subject: [PATCH 103/331] Added some more printig for nmodl jit --- test/benchmark/nmodl-llvm-time.sh | 2 ++ 1 file changed, 2 insertions(+) diff --git a/test/benchmark/nmodl-llvm-time.sh b/test/benchmark/nmodl-llvm-time.sh index adc9c0ea30..9917ff9991 100755 --- a/test/benchmark/nmodl-llvm-time.sh +++ b/test/benchmark/nmodl-llvm-time.sh @@ -237,6 +237,7 @@ for kernel_target in ${KERNEL_TARGETS}; do done done fi + echo "| | NMODL JIT" for fast_math in true false; do if $fast_math; then fast_math_flag="--fmf nnan contract afn" @@ -253,6 +254,7 @@ for kernel_target in ${KERNEL_TARGETS}; do assume_may_alias_flag="" assume_may_alias_opt="noalias" fi + echo "| | options: ${fast_math_flag} ${assume_may_alias_flag}" nmodl_args="${kernels_path}/${kernel_target}.mod llvm --ir ${fast_math_flag} ${assume_may_alias_flag} --vector-width ${vec_width} --veclib SVML --opt-level-ir 3 benchmark --run --instance-size ${kernel_inst_size} --repeat ${num_exp} --opt-level-codegen 3 --cpu ${nmodl_architecture} --libs ${vec_lib_path}/${vec_lib}" benchmark_nmodl_desc=nmodl_${kernel_target}_${nmodl_architecture}_v${vec_width}_${fast_math_opt}_${assume_may_alias_opt} benchmark_description+=("${benchmark_nmodl_desc}") From 8fa6225fe4cba3f4c86ea70d131ee0dc34a86687 Mon Sep 17 00:00:00 2001 From: Ioannis Magkanaris Date: Fri, 4 Jun 2021 20:02:52 +0200 Subject: [PATCH 104/331] Improved output --- test/benchmark/nmodl-llvm-time.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/benchmark/nmodl-llvm-time.sh b/test/benchmark/nmodl-llvm-time.sh index 9917ff9991..dd38de44b9 100755 --- a/test/benchmark/nmodl-llvm-time.sh +++ b/test/benchmark/nmodl-llvm-time.sh @@ -254,7 +254,7 @@ for kernel_target in ${KERNEL_TARGETS}; do assume_may_alias_flag="" assume_may_alias_opt="noalias" fi - echo "| | options: ${fast_math_flag} ${assume_may_alias_flag}" + echo "| | | options: ${fast_math_flag} ${assume_may_alias_flag}" nmodl_args="${kernels_path}/${kernel_target}.mod llvm --ir ${fast_math_flag} ${assume_may_alias_flag} --vector-width ${vec_width} --veclib SVML --opt-level-ir 3 benchmark --run --instance-size ${kernel_inst_size} --repeat ${num_exp} --opt-level-codegen 3 --cpu ${nmodl_architecture} --libs ${vec_lib_path}/${vec_lib}" 
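The "--fmf nnan contract afn" triple used throughout the script corresponds to three llvm::FastMathFlags bits. Roughly, on the IR-builder side (a sketch only; in NMODL the flag strings are translated inside the IR builder wrapper):

    #include "llvm/IR/IRBuilder.h"

    void set_benchmark_fast_math(llvm::IRBuilderBase& builder) {
        llvm::FastMathFlags fmf;
        fmf.setNoNaNs();         // nnan: operands/results assumed not NaN
        fmf.setAllowContract();  // contract: allow FMA-style contraction
        fmf.setApproxFunc();     // afn: allow approximate math-function calls
        builder.setFastMathFlags(fmf);  // applies to FP ops created afterwards
    }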
benchmark_nmodl_desc=nmodl_${kernel_target}_${nmodl_architecture}_v${vec_width}_${fast_math_opt}_${assume_may_alias_opt} benchmark_description+=("${benchmark_nmodl_desc}") From ee91aaa39bf364323318d7e97474c3cc279625be Mon Sep 17 00:00:00 2001 From: Ioannis Magkanaris Date: Fri, 4 Jun 2021 20:10:10 +0200 Subject: [PATCH 105/331] Fixed compilers --- test/benchmark/nmodl-llvm-time.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/benchmark/nmodl-llvm-time.sh b/test/benchmark/nmodl-llvm-time.sh index dd38de44b9..979c5d200b 100755 --- a/test/benchmark/nmodl-llvm-time.sh +++ b/test/benchmark/nmodl-llvm-time.sh @@ -159,7 +159,7 @@ ARCHITECTURES="skylake_avx512 broadwell nehalem" #ARCHITECTURES="avx512" #ARCHITECTURES="skylake_avx512" # set cpu option in jit according to the architecture -COMPILERS="icpc clang cpp" +COMPILERS="icpc clang gcc" #COMPILERS="clang" # loop over options From 9286bacd76bacf4b955574ef9e8500ae35cbe8a6 Mon Sep 17 00:00:00 2001 From: Pramod Kumbhar Date: Sun, 6 Jun 2021 08:07:25 +0200 Subject: [PATCH 106/331] Changes for Ioannis script --- test/benchmark/ext_kernel.cpp | 4 +- test/benchmark/kernels/hh.cpp | 1 + test/benchmark/nmodl-llvm-time.sh | 141 ++++++++++++++++++------------ 3 files changed, 90 insertions(+), 56 deletions(-) diff --git a/test/benchmark/ext_kernel.cpp b/test/benchmark/ext_kernel.cpp index e0ce026338..632328e03f 100644 --- a/test/benchmark/ext_kernel.cpp +++ b/test/benchmark/ext_kernel.cpp @@ -10,4 +10,6 @@ #include // external kernel stub -void nrn_state_hh_ext(void* ){} +void nrn_state_hh_ext(void* ) { + throw std::runtime_error("Error: this should have been external nrn_state_hh_ext kernel, check library and LD_LIBRARY_PATH\n"); +} diff --git a/test/benchmark/kernels/hh.cpp b/test/benchmark/kernels/hh.cpp index c72fe467af..f84358f417 100644 --- a/test/benchmark/kernels/hh.cpp +++ b/test/benchmark/kernels/hh.cpp @@ -53,6 +53,7 @@ void nrn_state_hh_ext(void* __restrict__ mech){ int node_id, ena_id, ek_id; double v; #pragma omp simd // clang vec helper + #pragma clang loop vectorize(enable) for(id = 0; idnode_count; ++id) { node_id = inst->node_index[id]; ena_id = inst->ion_ena_index[id]; diff --git a/test/benchmark/nmodl-llvm-time.sh b/test/benchmark/nmodl-llvm-time.sh index 979c5d200b..2de18abb73 100755 --- a/test/benchmark/nmodl-llvm-time.sh +++ b/test/benchmark/nmodl-llvm-time.sh @@ -1,17 +1,21 @@ #!/bin/bash -# set -x + # # Driver for nmodl-llvm benchmarking # + set -e -# sh nmodl-llvm-time.sh -vec-sweep -mod-dir /gpfs/bbp.cscs.ch/data/scratch/proj16/magkanar/nmodl/bbp_mod -n 100000000 +#set -x + +# sample run +# sh nmodl-llvm-time.sh -n 100000000 -o llvm_benchmark_all_big_100mil -ext -e 5 + # default params inst_size=100000000 -num_exp=10 +num_exp=5 vec_width=8 external_kernel_exec=false modfile_directory=$(pwd) -vec_width_sweep=false output_dir=$(pwd) # version @@ -85,24 +89,37 @@ while [[ "$1" != "" ]]; do done -# vec libs -vec_lib_path="/gpfs/bbp.cscs.ch/ssd/apps/hpc/jenkins/deploy/compilers/2021-01-06/linux-rhel7-x86_64/gcc-4.8.5/intel-20.0.2-ilowey/lib/intel64_lin" -vec_lib="libsvml.so" +#intel paths +intel_compiler_dir=/gpfs/bbp.cscs.ch/ssd/apps/hpc/jenkins/deploy/compilers/2021-01-06/linux-rhel7-x86_64/gcc-4.8.5/intel-20.0.2-ilowey/ +svml_lib=$intel_compiler_dir/lib/intel64_lin/libsvml.so +icpc_exe=$intel_compiler_dir/bin/icpc + +#sleef library +sleef_lib=/gpfs/bbp.cscs.ch/apps/hpc/llvm-install/0621/sleef-3.5.1/lib64/libsleefgnuabi.so + +#llvm path +llvm_path="/gpfs/bbp.cscs.ch/apps/hpc/llvm-install/0621" 
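One detail behind the sed rename to _Z16nrn_state_hh_extPv used when the JIT-emitted .ll is rebuilt as an external library: the ext_kernel.cpp stub above has C++ linkage, so the symbol the benchmark resolves is the Itanium-mangled name, which decodes as follows:

    // _Z16nrn_state_hh_extPv, piece by piece (Itanium C++ ABI):
    //   _Z                mangled-name prefix
    //   16                length of the identifier that follows
    //   nrn_state_hh_ext  the function name (16 characters)
    //   Pv                one parameter: pointer (P) to void (v)
    // i.e. the declaration every external kernel library exports:
    void nrn_state_hh_ext(void*);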
+clang_exe=${llvm_path}/bin/clang++ +llc_exe=${llvm_path}/bin/llc + +#gcc path +gcc_exe=/gpfs/bbp.cscs.ch/ssd/apps/hpc/jenkins/deploy/compilers/2021-01-06/linux-rhel7-x86_64/gcc-4.8.5/gcc-9.3.0-45gzrp/bin/g++ + +#add ld library path +export LD_LIBRARY_PATH=`dirname $svml_lib`:`dirname $sleef_lib`:${llvm_path}/lib:$LD_LIBRARY_PATH -# nmodl -nmodl_exe="/gpfs/bbp.cscs.ch/data/scratch/proj16/magkanar/nmodl/build_benchmark/install/bin/nmodl" +# nmodl binary +nmodl_src_dir=$(pwd)/../../ +nmodl_exe=${nmodl_src_dir}/build/bin/nmodl +#nmodl_exe=/gpfs/bbp.cscs.ch/data/scratch/proj16/magkanar/nmodl/build_benchmark/install/bin/nmodl # external kernel -nmodl_src_path="/gpfs/bbp.cscs.ch/data/scratch/proj16/magkanar/nmodl" -kernels_path=${nmodl_src_path}/"test/benchmark/kernels" +kernels_path=${nmodl_src_dir}/test/benchmark/kernels +modfile_directory=${nmodl_src_dir}/test/benchmark/kernels ext_lib="libextkernel.so" -modfile_directory=${kernels_path} -mkdir -p ${output_dir} -intel_path="/gpfs/bbp.cscs.ch/ssd/apps/hpc/jenkins/deploy/compilers/2021-01-06/linux-rhel7-x86_64/gcc-4.8.5/intel-20.0.2-ilowey/bin" -# compilers -icpc_exe=${intel_path}/icpc +# compiler flags declare -a icpc_flags_skylake_avx512=( "-O2 -march=skylake-avx512 -mtune=skylake -prec-div -fimf-use-svml" "-O2 -march=skylake-avx512 -mtune=skylake -prec-div -fimf-use-svml -fopenmp" @@ -116,51 +133,56 @@ declare -a icpc_flags_nehalem=( "-O2 -msse2 -prec-div -fimf-use-svml" ) -llvm_path="/gpfs/bbp.cscs.ch/apps/hpc/llvm-install/0621" -llvm_lib=${llvm_path}/lib -clang_exe=${llvm_path}/bin/clang++ -llc_exe=${llvm_path}/bin/llc declare -a clang_flags_skylake_avx512=( + "-O3 -march=skylake-avx512 -mtune=skylake -ffast-math" + "-O3 -march=skylake-avx512 -mtune=skylake -ffast-math -fopenmp" "-O3 -march=skylake-avx512 -mtune=skylake -ffast-math -fveclib=SVML" "-O3 -march=skylake-avx512 -mtune=skylake -ffast-math -fopenmp -fveclib=SVML" ) declare -a clang_flags_broadwell=( + "-O3 -march=broadwell -mtune=broadwell -ffast-math -fopenmp" "-O3 -march=broadwell -mtune=broadwell -ffast-math -fopenmp -fveclib=SVML" ) declare -a clang_flags_nehalem=( + "-O3 -march=nehalem -mtune=nehalem -ffast-math -fopenmp" "-O3 -march=nehalem -mtune=nehalem -ffast-math -fopenmp -fveclib=SVML" ) -gcc_bin_path="/gpfs/bbp.cscs.ch/ssd/apps/hpc/jenkins/deploy/compilers/2021-01-06/linux-rhel7-x86_64/gcc-4.8.5/gcc-9.3.0-45gzrp/bin" -gcc_exe=${gcc_bin_path}/g++ declare -a gcc_flags_skylake_avx512=( + "-O3 -march=skylake-avx512 -mtune=skylake -ffast-math -ftree-vectorize" + "-O3 -march=skylake-avx512 -mtune=skylake -ffast-math -ftree-vectorize -fopenmp" "-O3 -march=skylake-avx512 -mtune=skylake -ffast-math -ftree-vectorize -mveclibabi=svml" "-O3 -march=skylake-avx512 -mtune=skylake -ffast-math -ftree-vectorize -mveclibabi=svml -fopenmp" ) declare -a gcc_flags_broadwell=( + "-O3 -march=broadwell -mtune=broadwell -ffast-math -ftree-vectorize -fopenmp" "-O3 -march=broadwell -mtune=broadwell -ffast-math -ftree-vectorize -mveclibabi=svml -fopenmp" ) declare -a gcc_flags_nehalem=( + "-O3 -march=nehalem -mtune=nehalem -ffast-math -ftree-vectorize -fopenmp" "-O3 -march=nehalem -mtune=nehalem -ffast-math -ftree-vectorize -mveclibabi=svml -fopenmp" ) declare -a benchmark_description declare -a benchmark_time declare -a benchmark_variance -# also get variance + +# Kernels, architectures and compilers loop + KERNEL_TARGETS="compute-bound memory-bound hh" -#KERNEL_TARGETS="compute-bound" -#KERNEL_TARGETS="hh" +KERNEL_TARGETS="hh" + +ARCHITECTURES="skylake_avx512" ARCHITECTURES="skylake_avx512 
broadwell nehalem" -#ARCHITECTURES="avx512" -#ARCHITECTURES="skylake_avx512" -# set cpu option in jit according to the architecture + +COMPILERS="clang" COMPILERS="icpc clang gcc" -#COMPILERS="clang" + +mkdir -p ${output_dir} # loop over options for kernel_target in ${KERNEL_TARGETS}; do @@ -192,49 +214,58 @@ for kernel_target in ${KERNEL_TARGETS}; do if $external_kernel_exec; then for compiler in ${COMPILERS}; do echo "| | Compiler: $compiler" - + compiler_exe=${compiler}_exe compiler_flags=${compiler}_flags_${architecture}[@] for flags in "${!compiler_flags}"; do echo "| | | flags: "${flags} - + spec=${compiler}_${flags//[[:blank:]]/} rel_ext_path_cpp=${kernel_target}_${spec}_cpp + rel_ext_path_cpp=${rel_ext_path_cpp//=/_} + rel_ext_path_cpp=${rel_ext_path_cpp//-/_} ${debug} mkdir -p ${rel_ext_path_cpp} ${debug} cd ${rel_ext_path_cpp} ext_path=$(pwd) - ${debug} ${!compiler_exe} ${flags} ${kernels_path}/${kernel_target}.cpp \ - -shared -fpic -o ${ext_lib} + ${debug} ${!compiler_exe} ${flags} ${kernels_path}/${kernel_target}.cpp -shared -fpic -o ${ext_lib} ${debug} eval "objdump ${ext_lib} -d > ${ext_lib::-1}" ${debug} cd .. # add --fmf nnan contract afn here to generate .ll file similar to the fast-math options from external compilers - nmodl_args="${kernels_path}/${kernel_target}.mod llvm --ir --vector-width ${vec_width} --veclib SVML --opt-level-ir 3 --fmf nnan contract afn benchmark --run --instance-size ${kernel_inst_size} --repeat ${num_exp} --opt-level-codegen 3 --cpu ${nmodl_architecture} --libs ${vec_lib_path}/${vec_lib}" - nmodl_args="${nmodl_args} --external" + nmodl_common_args="${kernels_path}/${kernel_target}.mod benchmark --run --instance-size ${kernel_inst_size} --repeat ${num_exp} --opt-level-codegen 3 --cpu ${nmodl_architecture} --libs ${svml_lib} ${sleef_lib} --external" + nmodl_llvm_args="llvm --ir --vector-width ${vec_width} --veclib SVML --opt-level-ir 3 --fmf nnan contract afn" - benchmark_ext_desc=ext_${kernel_target}_${compiler}_${nmodl_architecture}_v${vec_width}_${flags//[[:blank:]]/} + benchmark_ext_desc=${kernel_target}_${compiler}_${nmodl_architecture}_v${vec_width}_${flags//[[:blank:]]/} benchmark_description+=("${benchmark_ext_desc}") # runs only external kernel - ${debug} eval "LD_LIBRARY_PATH=${ext_path}:${vec_lib_path}:${llvm_lib} ${nmodl_exe} ${nmodl_args} &> ${output_dir}/${benchmark_ext_desc}.log" + ${debug} eval "LD_PRELOAD=${ext_path}/${ext_lib} ${nmodl_exe} ${nmodl_common_args} ${nmodl_llvm_args} 2>&1 | tee ${output_dir}/${benchmark_ext_desc}.log" benchmark_time+=($(grep "Average compute time" ${output_dir}/${benchmark_ext_desc}.log | awk '{print $NF}')) benchmark_variance+=($(grep "Compute time variance" ${output_dir}/${benchmark_ext_desc}.log | awk '{print $NF}')) - - if [ "$compiler" == "clang" ]; then - rel_ext_path_llvm=${kernel_target}_${spec}_llvm - ${debug} mkdir -p ${rel_ext_path_llvm} - # Generate external library from LLVM IR of JIT - ${debug} sed 's/nrn_state_hh/_Z16nrn_state_hh_extPv/g' v${vec_width}_${kernel_target}_opt.ll > ${output_dir}/v${vec_width}_${kernel_target}_opt_ext.ll - ${debug} ${!compiler_exe} ${flags} -shared -fpic ${output_dir}/v${vec_width}_${kernel_target}_opt_ext.ll -o ${rel_ext_path_llvm}/${ext_lib} &>/dev/null # overwrites previous ${ext_lib} - - benchmark_ext_jit_desc=ext_jit_${kernel_target}_${compiler}_${nmodl_architecture}_v${vec_width}_${flags//[[:blank:]]/} - benchmark_description+=("${benchmark_ext_jit_desc}") - # run external library generated by the LLVM IR code of JIT - ${debug} eval 
"LD_LIBRARY_PATH=${rel_ext_path_llvm}:${vec_lib_path}:${llvm_lib} ${nmodl_exe} ${nmodl_args} &> ${output_dir}/${benchmark_ext_jit_desc}.log" - benchmark_time+=($(grep "Average compute time" ${output_dir}/${benchmark_ext_jit_desc}.log | awk '{print $NF}')) - benchmark_variance+=($(grep "Compute time variance" ${output_dir}/${benchmark_ext_jit_desc}.log | awk '{print $NF}')) - fi done + + if [ "$compiler" == "clang" ]; then + for math_lib in SVML SLEEF; + do + nmodl_llvm_args="llvm --ir --vector-width ${vec_width} --veclib ${math_lib} --opt-level-ir 3 --fmf nnan contract afn" + rel_ext_path_llvm=${kernel_target}_nmodl_${spec}_llvm_${math_lib} + rel_ext_path_llvm=${rel_ext_path_llvm//=/_} + rel_ext_path_llvm=${rel_ext_path_llvm//-/_} + ${debug} mkdir -p ${rel_ext_path_llvm} + ${debug} eval "LD_PRELOAD=${ext_path}/${ext_lib} ${nmodl_exe} ${kernels_path}/${kernel_target}.mod ${nmodl_common_args} ${nmodl_llvm_args}" + # Generate external library from LLVM IR of JIT + ${debug} sed 's/nrn_state_hh/_Z16nrn_state_hh_extPv/g' v${vec_width}_${kernel_target}_opt.ll > ${rel_ext_path_llvm}/v${vec_width}_${kernel_target}_opt_ext.ll + ${debug} ${!compiler_exe} ${flags} -shared -fpic ${rel_ext_path_llvm}/v${vec_width}_${kernel_target}_opt_ext.ll -o ${rel_ext_path_llvm}/${ext_lib} &>/dev/null # overwrites previous ${ext_lib} + benchmark_ext_jit_desc=${kernel_target}_nmodl_${nmodl_architecture}_v${vec_width}_${flags//[[:blank:]]/}_${math_lib} + benchmark_description+=("${benchmark_ext_jit_desc}") + # run external library generated by the LLVM IR code of JIT + ${debug} eval "LD_PRELOAD=${rel_ext_path_llvm}/${ext_lib} ${nmodl_exe} ${nmodl_common_args} ${nmodl_llvm_args} 2>&1 | tee ${output_dir}/${benchmark_ext_jit_desc}.log" + benchmark_time+=($(grep "Average compute time" ${output_dir}/${benchmark_ext_jit_desc}.log | awk '{print $NF}')) + benchmark_variance+=($(grep "Compute time variance" ${output_dir}/${benchmark_ext_jit_desc}.log | awk '{print $NF}')) + + done + fi + done fi echo "| | NMODL JIT" @@ -255,11 +286,11 @@ for kernel_target in ${KERNEL_TARGETS}; do assume_may_alias_opt="noalias" fi echo "| | | options: ${fast_math_flag} ${assume_may_alias_flag}" - nmodl_args="${kernels_path}/${kernel_target}.mod llvm --ir ${fast_math_flag} ${assume_may_alias_flag} --vector-width ${vec_width} --veclib SVML --opt-level-ir 3 benchmark --run --instance-size ${kernel_inst_size} --repeat ${num_exp} --opt-level-codegen 3 --cpu ${nmodl_architecture} --libs ${vec_lib_path}/${vec_lib}" - benchmark_nmodl_desc=nmodl_${kernel_target}_${nmodl_architecture}_v${vec_width}_${fast_math_opt}_${assume_may_alias_opt} + nmodl_args="${kernels_path}/${kernel_target}.mod llvm --ir ${fast_math_flag} ${assume_may_alias_flag} --vector-width ${vec_width} --veclib SVML --opt-level-ir 3 benchmark --run --instance-size ${kernel_inst_size} --repeat ${num_exp} --opt-level-codegen 3 --cpu ${nmodl_architecture} --libs ${svml_lib}" + benchmark_nmodl_desc=${kernel_target}_nmodl-jit_${nmodl_architecture}_v${vec_width}_${fast_math_opt}_${assume_may_alias_opt} benchmark_description+=("${benchmark_nmodl_desc}") # runs only kernel generated by LLVM IR - ${debug} eval "LD_LIBRARY_PATH=${ext_path}:${vec_lib_path}:${llvm_lib} ${nmodl_exe} ${nmodl_args} &> ${output_dir}/${benchmark_nmodl_desc}.log" + ${debug} eval "${nmodl_exe} ${nmodl_args} 2>&1 | tee ${output_dir}/${benchmark_nmodl_desc}.log" benchmark_time+=($(grep "Average compute time" ${output_dir}/${benchmark_nmodl_desc}.log | awk '{print $NF}')) benchmark_variance+=($(grep "Compute time variance" 
${output_dir}/${benchmark_nmodl_desc}.log | awk '{print $NF}')) done From a5a259f6bfe534971c17c1779d1ec5d0e18d18ca Mon Sep 17 00:00:00 2001 From: Ioannis Magkanaris Date: Mon, 7 Jun 2021 19:51:56 +0200 Subject: [PATCH 107/331] Updated script and hh.cpp --- test/benchmark/kernels/hh.cpp | 3 +-- test/benchmark/nmodl-llvm-time.sh | 37 ++++++++++++++++++++----------- 2 files changed, 25 insertions(+), 15 deletions(-) diff --git a/test/benchmark/kernels/hh.cpp b/test/benchmark/kernels/hh.cpp index f84358f417..86c1bfb996 100644 --- a/test/benchmark/kernels/hh.cpp +++ b/test/benchmark/kernels/hh.cpp @@ -52,8 +52,7 @@ void nrn_state_hh_ext(void* __restrict__ mech){ int id; int node_id, ena_id, ek_id; double v; - #pragma omp simd // clang vec helper - #pragma clang loop vectorize(enable) + #pragma omp simd for(id = 0; idnode_count; ++id) { node_id = inst->node_index[id]; ena_id = inst->ion_ena_index[id]; diff --git a/test/benchmark/nmodl-llvm-time.sh b/test/benchmark/nmodl-llvm-time.sh index 2de18abb73..344a86a086 100755 --- a/test/benchmark/nmodl-llvm-time.sh +++ b/test/benchmark/nmodl-llvm-time.sh @@ -10,6 +10,11 @@ set -e # sample run # sh nmodl-llvm-time.sh -n 100000000 -o llvm_benchmark_all_big_100mil -ext -e 5 +module purge +unset MODULEPATH +export MODULEPATH="/gpfs/bbp.cscs.ch/ssd/apps/hpc/jenkins/modules/all" +module load unstable + # default params inst_size=100000000 num_exp=5 @@ -90,9 +95,9 @@ while [[ "$1" != "" ]]; do done #intel paths -intel_compiler_dir=/gpfs/bbp.cscs.ch/ssd/apps/hpc/jenkins/deploy/compilers/2021-01-06/linux-rhel7-x86_64/gcc-4.8.5/intel-20.0.2-ilowey/ -svml_lib=$intel_compiler_dir/lib/intel64_lin/libsvml.so -icpc_exe=$intel_compiler_dir/bin/icpc +intel_library_dir=$(module show intel 2>&1 | grep " LD_LIBRARY_PATH " | awk -F' ' '{print $3}' | head -n 1) +svml_lib=$intel_library_dir/intel64_lin/libsvml.so +intel_exe=$(module show intel 2>&1 | grep " PATH " | awk -F' ' '{print $3}' | head -n 1)/icpc #sleef library sleef_lib=/gpfs/bbp.cscs.ch/apps/hpc/llvm-install/0621/sleef-3.5.1/lib64/libsleefgnuabi.so @@ -103,15 +108,14 @@ clang_exe=${llvm_path}/bin/clang++ llc_exe=${llvm_path}/bin/llc #gcc path -gcc_exe=/gpfs/bbp.cscs.ch/ssd/apps/hpc/jenkins/deploy/compilers/2021-01-06/linux-rhel7-x86_64/gcc-4.8.5/gcc-9.3.0-45gzrp/bin/g++ +gcc_exe=$(module show gcc 2>&1 | grep " PATH " | awk -F' ' '{print $3}' | head -n 1)/g++ #add ld library path export LD_LIBRARY_PATH=`dirname $svml_lib`:`dirname $sleef_lib`:${llvm_path}/lib:$LD_LIBRARY_PATH # nmodl binary nmodl_src_dir=$(pwd)/../../ -nmodl_exe=${nmodl_src_dir}/build/bin/nmodl -#nmodl_exe=/gpfs/bbp.cscs.ch/data/scratch/proj16/magkanar/nmodl/build_benchmark/install/bin/nmodl +nmodl_exe=${nmodl_src_dir}/build_benchmark/bin/nmodl # external kernel kernels_path=${nmodl_src_dir}/test/benchmark/kernels @@ -120,16 +124,16 @@ ext_lib="libextkernel.so" # compiler flags -declare -a icpc_flags_skylake_avx512=( +declare -a intel_flags_skylake_avx512=( "-O2 -march=skylake-avx512 -mtune=skylake -prec-div -fimf-use-svml" "-O2 -march=skylake-avx512 -mtune=skylake -prec-div -fimf-use-svml -fopenmp" ) -declare -a icpc_flags_broadwell=( +declare -a intel_flags_broadwell=( "-O2 -march=broadwell -mtune=broadwell -prec-div -fimf-use-svml" ) -declare -a icpc_flags_nehalem=( +declare -a intel_flags_nehalem=( "-O2 -msse2 -prec-div -fimf-use-svml" ) @@ -174,13 +178,10 @@ declare -a benchmark_variance # Kernels, architectures and compilers loop KERNEL_TARGETS="compute-bound memory-bound hh" -KERNEL_TARGETS="hh" -ARCHITECTURES="skylake_avx512" 
ARCHITECTURES="skylake_avx512 broadwell nehalem" -COMPILERS="clang" -COMPILERS="icpc clang gcc" +COMPILERS="intel clang gcc" mkdir -p ${output_dir} @@ -228,6 +229,16 @@ for kernel_target in ${KERNEL_TARGETS}; do ${debug} mkdir -p ${rel_ext_path_cpp} ${debug} cd ${rel_ext_path_cpp} ext_path=$(pwd) + # replace pragmas with the corresponding ones for openmp or certain compiler + if [[ "$kernel_target" == "hh" ]]; then + if [[ "openmp" == *"$flags"* ]] || [[ "$compiler" == "intel" ]]; then + ${debug} sed -i 's/#pragma.*/#pragma omp simd/g' ${kernels_path}/${kernel_target}.cpp + elif [[ "$compiler" == "clang" ]]; then + ${debug} sed -i 's/#pragma.*/#pragma clang vectorize(enable)/g' ${kernels_path}/${kernel_target}.cpp + elif [[ "$compiler" == "gcc" ]]; then + ${debug} sed -i 's/#pragma.*/#pragma GCC ivdep/g' ${kernels_path}/${kernel_target}.cpp + fi + fi ${debug} ${!compiler_exe} ${flags} ${kernels_path}/${kernel_target}.cpp -shared -fpic -o ${ext_lib} ${debug} eval "objdump ${ext_lib} -d > ${ext_lib::-1}" ${debug} cd .. From a02dfa6b7078a090c70a71d1864a5a1e0526ffdf Mon Sep 17 00:00:00 2001 From: Ioannis Magkanaris Date: Tue, 8 Jun 2021 15:48:59 +0200 Subject: [PATCH 108/331] Small fix in openmp flags --- test/benchmark/nmodl-llvm-time.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/benchmark/nmodl-llvm-time.sh b/test/benchmark/nmodl-llvm-time.sh index 344a86a086..2e368a2d38 100755 --- a/test/benchmark/nmodl-llvm-time.sh +++ b/test/benchmark/nmodl-llvm-time.sh @@ -231,7 +231,7 @@ for kernel_target in ${KERNEL_TARGETS}; do ext_path=$(pwd) # replace pragmas with the corresponding ones for openmp or certain compiler if [[ "$kernel_target" == "hh" ]]; then - if [[ "openmp" == *"$flags"* ]] || [[ "$compiler" == "intel" ]]; then + if [[ "$flags" == *"openmp"* ]] || [[ "$compiler" == "intel" ]]; then ${debug} sed -i 's/#pragma.*/#pragma omp simd/g' ${kernels_path}/${kernel_target}.cpp elif [[ "$compiler" == "clang" ]]; then ${debug} sed -i 's/#pragma.*/#pragma clang vectorize(enable)/g' ${kernels_path}/${kernel_target}.cpp From a23b1edee9c06dcd615024b8c46eaf2e9a0b60ac Mon Sep 17 00:00:00 2001 From: Pramod Kumbhar Date: Fri, 27 Nov 2020 21:16:30 +0100 Subject: [PATCH 109/331] Disable python bindings for faster build - while working on NMODL + LLVM, we don't worry that much about Python bindings by default - so lets disable them by default --- CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 88b6cb07f8..bea7677040 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -20,7 +20,7 @@ set(CMAKE_RUNTIME_OUTPUT_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}/bin) # ============================================================================= # Build options for NMODL # ============================================================================= -option(NMODL_ENABLE_PYTHON_BINDINGS "Enable pybind11 based python bindings" ON) +option(NMODL_ENABLE_PYTHON_BINDINGS "Enable pybind11 based python bindings" OFF) option(NMODL_ENABLE_LEGACY_UNITS "Use original faraday, R, etc. 
instead of 2019 nist constants" OFF) if(NMODL_ENABLE_LEGACY_UNITS) add_definitions(-DUSE_LEGACY_UNITS) From 6698213109966f81c9a80266f85595a35330167c Mon Sep 17 00:00:00 2001 From: Pramod Kumbhar Date: Sat, 28 Nov 2020 06:22:04 +0100 Subject: [PATCH 110/331] Integrate LLVM into CMake build system * added NMODL_ENABLE_LLVM option to enable/disable llvm support in nmodl * LLVMHelper.cmake added to help with linking LLVM libraries - clang might need to use libstdc++ or libc++ linking - on BB5, using GCC with LLVM libraries is fine. But using clang results in lots of link errors. Adding -stdlib=libstdc++ solves the issue - use check_cxx_source_compiles to find out which cxx flag is needed --- CMakeLists.txt | 9 +++++++++ cmake/LLVMHelper.cmake | 45 ++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 54 insertions(+) create mode 100644 cmake/LLVMHelper.cmake diff --git a/CMakeLists.txt b/CMakeLists.txt index bea7677040..c841f141d4 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -22,6 +22,8 @@ set(CMAKE_RUNTIME_OUTPUT_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}/bin) # ============================================================================= option(NMODL_ENABLE_PYTHON_BINDINGS "Enable pybind11 based python bindings" OFF) option(NMODL_ENABLE_LEGACY_UNITS "Use original faraday, R, etc. instead of 2019 nist constants" OFF) +option(NMODL_ENABLE_LLVM "Enable LLVM based code generation" ON) + if(NMODL_ENABLE_LEGACY_UNITS) add_definitions(-DUSE_LEGACY_UNITS) endif() @@ -151,6 +153,13 @@ nmodl_find_python_module(sympy 1.2 REQUIRED) nmodl_find_python_module(textwrap 0.9 REQUIRED) nmodl_find_python_module(yaml 3.12 REQUIRED) +# ============================================================================= +# Find LLVM dependencies +# ============================================================================= +if(NMODL_ENABLE_LLVM) + include(LLVMHelper) +endif() + # ============================================================================= # Compiler specific flags for external submodules # ============================================================================= diff --git a/cmake/LLVMHelper.cmake b/cmake/LLVMHelper.cmake new file mode 100644 index 0000000000..a22cf4c835 --- /dev/null +++ b/cmake/LLVMHelper.cmake @@ -0,0 +1,45 @@ +# ============================================================================= +# LLVM/Clang needs to be linked with either libc++ or libstdc++ +# ============================================================================= +if(CMAKE_CXX_COMPILER_ID MATCHES "Clang" AND NMODL_ENABLE_LLVM) + find_package(LLVM REQUIRED CONFIG) + include(CheckCXXSourceCompiles) + + # test by including LLVM header and core library + llvm_map_components_to_libnames(LLVM_CORE_LIB core) + set(CMAKE_REQUIRED_INCLUDES ${LLVM_INCLUDE_DIRS}) + set(CMAKE_REQUIRED_LIBRARIES ${LLVM_CORE_LIB}) + + # simple code to test LLVM library linking + set(CODE_TO_TEST + " + #include + using namespace llvm; + int main(int argc, char* argv[]) { + std::unique_ptr> Builder; + }") + + # first compile without any flags + check_cxx_source_compiles("${CODE_TO_TEST}" LLVM_LIB_LINK_TEST) + + # if standard compilation fails + if(NOT LLVM_LIB_LINK_TEST) + # try libstdc++ first + set(CMAKE_REQUIRED_FLAGS "-stdlib=libstdc++") + check_cxx_source_compiles("${CODE_TO_TEST}" LLVM_LIBSTDCPP_TEST) + # on failure, try libc++ + if(NOT LLVM_LIBSTDCPP_TEST) + set(CMAKE_REQUIRED_FLAGS "-stdlib=libc++") + check_cxx_source_compiles("${CODE_TO_TEST}" LLVM_LIBCPP_TEST) + endif() + # if either library works then add it to CXX flags
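+ # CMAKE_REQUIRED_FLAGS still holds whichever -stdlib flag compiled successfully,
+ # so appending it to CMAKE_CXX_FLAGS below reuses the working choice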
+ if(LLVM_LIBSTDCPP_TEST OR LLVM_LIBCPP_TEST) + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${CMAKE_REQUIRED_FLAGS}") + message( + STATUS + "Adding ${CMAKE_REQUIRED_FLAGS} to CMAKE_CXX_FLAGS, required to link with LLVM libraries") + else() + message(STATUS "WARNING : -stdlib=libstdcx++ or -stdlib=libc++ didn't work to link with LLVM library") + endif() + endif() +endif() From c73d83d54ca4548be284e5ad52e81eae43442b31 Mon Sep 17 00:00:00 2001 From: Pramod Kumbhar Date: Sun, 29 Nov 2020 00:50:16 +0100 Subject: [PATCH 111/331] Code infrastructure for LLVM code generation backend - added llvm dir under codegen where LLVM code generation work will live - llvm codegen visitor created that can be used as template for initial work - cmake adapted to enable llvm codegen based on CMake option - simple procedure.mod added that can be initial target for testing - new CLI option --llvm that runs LLVM codegen visitor - Enable CXX 14 because new LLVM versions require it --- CMakeLists.txt | 1 + cmake/LLVMHelper.cmake | 4 +- src/CMakeLists.txt | 6 ++ src/codegen/CMakeLists.txt | 5 ++ src/codegen/llvm/CMakeLists.txt | 13 +++++ src/codegen/llvm/codegen_llvm_visitor.cpp | 46 ++++++++++++++++ src/codegen/llvm/codegen_llvm_visitor.hpp | 67 +++++++++++++++++++++++ src/main.cpp | 18 ++++++ test/integration/mod/procedure.mod | 15 +++++ 9 files changed, 173 insertions(+), 2 deletions(-) create mode 100644 src/codegen/llvm/CMakeLists.txt create mode 100644 src/codegen/llvm/codegen_llvm_visitor.cpp create mode 100644 src/codegen/llvm/codegen_llvm_visitor.hpp create mode 100644 test/integration/mod/procedure.mod diff --git a/CMakeLists.txt b/CMakeLists.txt index c841f141d4..354e69eb6c 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -158,6 +158,7 @@ nmodl_find_python_module(yaml 3.12 REQUIRED) # ============================================================================= if(NMODL_ENABLE_LLVM) include(LLVMHelper) + add_definitions(-DNMODL_LLVM_BACKEND) endif() # ============================================================================= diff --git a/cmake/LLVMHelper.cmake b/cmake/LLVMHelper.cmake index a22cf4c835..de078be7b5 100644 --- a/cmake/LLVMHelper.cmake +++ b/cmake/LLVMHelper.cmake @@ -6,9 +6,9 @@ if(CMAKE_CXX_COMPILER_ID MATCHES "Clang" AND NMODL_ENABLE_LLVM) include(CheckCXXSourceCompiles) # test by including LLVM header and core library - llvm_map_components_to_libnames(LLVM_CORE_LIB core) + llvm_map_components_to_libnames(LLVM_CORE_LIBS core) set(CMAKE_REQUIRED_INCLUDES ${LLVM_INCLUDE_DIRS}) - set(CMAKE_REQUIRED_LIBRARIES ${LLVM_CORE_LIB}) + set(CMAKE_REQUIRED_LIBRARIES ${LLVM_CORE_LIBS}) # simple code to test LLVM library linking set(CODE_TO_TEST diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 29e1e205a2..f5f535fb55 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -35,6 +35,12 @@ target_link_libraries( lexer ${NMODL_WRAPPER_LIBS}) +if(NMODL_ENABLE_LLVM) + # LLVM core libraries to link + llvm_map_components_to_libnames(LLVM_CORE_LIBS core) + target_link_libraries(nmodl llvm_codegen ${LLVM_CORE_LIBS}) +endif() + # ============================================================================= # Add dependency with nmodl pytnon module (for consumer projects) # ============================================================================= diff --git a/src/codegen/CMakeLists.txt b/src/codegen/CMakeLists.txt index 32ad4e1303..2d31e1b1d6 100644 --- a/src/codegen/CMakeLists.txt +++ b/src/codegen/CMakeLists.txt @@ -35,6 +35,11 @@ configure_file(${CMAKE_CURRENT_SOURCE_DIR}/fast_math.ispc 
configure_file(${CMAKE_CURRENT_SOURCE_DIR}/fast_math.hpp ${CMAKE_BINARY_DIR}/include/nmodl/fast_math.hpp COPYONLY) +# build llvm visitor if enabled +if(NMODL_ENABLE_LLVM) + add_subdirectory(llvm) +endif() + # ============================================================================= # Install include files # ============================================================================= diff --git a/src/codegen/llvm/CMakeLists.txt b/src/codegen/llvm/CMakeLists.txt new file mode 100644 index 0000000000..71ecca338c --- /dev/null +++ b/src/codegen/llvm/CMakeLists.txt @@ -0,0 +1,13 @@ +# ============================================================================= +# Codegen sources +# ============================================================================= +set(LLVM_CODEGEN_SOURCE_FILES ${CMAKE_CURRENT_SOURCE_DIR}/codegen_llvm_visitor.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/codegen_llvm_visitor.hpp) + +# ============================================================================= +# LLVM codegen library +# ============================================================================= + +include_directories(${LLVM_INCLUDE_DIRS}) +add_library(llvm_codegen STATIC ${LLVM_CODEGEN_SOURCE_FILES}) +add_dependencies(llvm_codegen lexer util visitor) diff --git a/src/codegen/llvm/codegen_llvm_visitor.cpp b/src/codegen/llvm/codegen_llvm_visitor.cpp new file mode 100644 index 0000000000..3f4e319503 --- /dev/null +++ b/src/codegen/llvm/codegen_llvm_visitor.cpp @@ -0,0 +1,46 @@ +/************************************************************************* + * Copyright (C) 2018-2020 Blue Brain Project + * + * This file is part of NMODL distributed under the terms of the GNU + * Lesser General Public License. See top-level LICENSE file for details. + *************************************************************************/ + +#include "codegen/llvm/codegen_llvm_visitor.hpp" +#include "ast/all.hpp" + +#include "llvm/IR/BasicBlock.h" +#include "llvm/IR/Function.h" +#include "llvm/IR/IRBuilder.h" +#include "llvm/IR/LLVMContext.h" +#include "llvm/IR/Module.h" + +namespace nmodl { +namespace codegen { + + +// LLVM code generator objects +using namespace llvm; +static std::unique_ptr TheContext; +static std::unique_ptr TheModule; +static std::unique_ptr> Builder; +static std::map NamedValues; + + +void CodegenLLVMVisitor::visit_statement_block(const ast::StatementBlock& node) { + logger->info("CodegenLLVMVisitor : visiting statement block"); + node.visit_children(*this); + // TODO : code generation for new block scope +} + +void CodegenLLVMVisitor::visit_procedure_block(const ast::ProcedureBlock& node) { + logger->info("CodegenLLVMVisitor : visiting {} procedure", node.get_node_name()); + node.visit_children(*this); + // TODO : code generation for procedure block +} + +void CodegenLLVMVisitor::visit_program(const ast::Program& node) { + node.visit_children(*this); +} + +} // namespace codegen +} // namespace nmodl diff --git a/src/codegen/llvm/codegen_llvm_visitor.hpp b/src/codegen/llvm/codegen_llvm_visitor.hpp new file mode 100644 index 0000000000..2b77160cd5 --- /dev/null +++ b/src/codegen/llvm/codegen_llvm_visitor.hpp @@ -0,0 +1,67 @@ +/************************************************************************* + * Copyright (C) 2018-2020 Blue Brain Project + * + * This file is part of NMODL distributed under the terms of the GNU + * Lesser General Public License. See top-level LICENSE file for details. 
+ *************************************************************************/ + +#pragma once + +/** + * \dir + * \brief LLVM based code generation backend implementation for CoreNEURON + * + * \file + * \brief \copybrief nmodl::codegen::CodegenLLVMVisitor + */ + +#include +#include + +#include "utils/logger.hpp" +#include "visitors/ast_visitor.hpp" + +namespace nmodl { +namespace codegen { + +/** + * @defgroup llvm LLVM Based Code Generation Implementation + * @brief Implementations of LLVM based code generation + * + * @defgroup llvm_backends LLVM Codegen Backend + * @ingroup llvm + * @brief Code generation backends for NMODL AST to LLVM IR + * @{ + */ + +/** + * \class CodegenLLVMVisitor + * \brief %Visitor for transforming NMODL AST to LLVM IR + */ +class CodegenLLVMVisitor: public visitor::ConstAstVisitor { + // Name of mod file (without .mod suffix) + std::string mod_filename; + + // Output directory for code generation + std::string output_dir; + + public: + /** + * \brief Constructs the LLVM code generator visitor + * + * This constructor instantiates an NMODL LLVM code generator. This is + * just template to work with initial implementation. + */ + CodegenLLVMVisitor(const std::string& mod_filename, const std::string& output_dir) + : mod_filename(mod_filename) + , output_dir(output_dir) {} + + void visit_statement_block(const ast::StatementBlock& node) override; + void visit_procedure_block(const ast::ProcedureBlock& node) override; + void visit_program(const ast::Program& node) override; +}; + +/** \} */ // end of llvm_backends + +} // namespace codegen +} // namespace nmodl diff --git a/src/main.cpp b/src/main.cpp index 0e27e6a349..ec0d32b848 100644 --- a/src/main.cpp +++ b/src/main.cpp @@ -17,6 +17,9 @@ #include "codegen/codegen_cuda_visitor.hpp" #include "codegen/codegen_ispc_visitor.hpp" #include "codegen/codegen_omp_visitor.hpp" +#ifdef NMODL_LLVM_BACKEND +#include "codegen/llvm/codegen_llvm_visitor.hpp" +#endif #include "config/config.h" #include "parser/nmodl_driver.hpp" #include "pybind/pyembed.hpp" @@ -84,6 +87,9 @@ int main(int argc, const char* argv[]) { /// true if cuda code to be generated bool cuda_backend(false); + /// true if llvm code to be generated + bool llvm_backend(false); + /// true if sympy should be used for solving ODEs analytically bool sympy_analytic(false); @@ -167,6 +173,10 @@ int main(int argc, const char* argv[]) { ->ignore_case() ->check(CLI::IsMember({"trace", "debug", "info", "warning", "error", "critical", "off"})); +#ifdef NMODL_LLVM_BACKEND + app.add_flag("--llvm", llvm_backend, "Enable LLVM based code generation")->ignore_case(); +#endif + app.add_option("file", mod_files, "One or more MOD files to process") ->ignore_case() ->required() @@ -564,6 +574,14 @@ int main(int argc, const char* argv[]) { optimize_ionvar_copies_codegen); visitor.visit_program(*ast); } + +#ifdef NMODL_LLVM_BACKEND + if (llvm_backend) { + logger->info("Running LLVM backend code generator"); + CodegenLLVMVisitor visitor(modfile, output_dir); + visitor.visit_program(*ast); + } +#endif } } diff --git a/test/integration/mod/procedure.mod b/test/integration/mod/procedure.mod new file mode 100644 index 0000000000..3eb4817b3b --- /dev/null +++ b/test/integration/mod/procedure.mod @@ -0,0 +1,15 @@ +PROCEDURE state(x, y) { + LOCAL z + z = x + y +} + +PROCEDURE rates(v) { + LOCAL alpha, beta, sum + { + alpha = .1 * exp(-(v+40)) + beta = 4 * exp(-(v+65)/18) + } + { + sum = alpha + beta + } +} From c7c6e40298f0760b71273abe79e141b65e1ad44a Mon Sep 17 00:00:00 2001 From: Pramod 
Kumbhar Date: Sun, 29 Nov 2020 00:58:20 +0100 Subject: [PATCH 112/331] Azure CI fixes for LLVM build and README update - install llvm via brew - set LLVM_DIR variable so that CMake can find llvm-config --- INSTALL.md | 13 +++++++++++-- azure-pipelines.yml | 5 +++-- setup.py | 2 +- 3 files changed, 15 insertions(+), 5 deletions(-) diff --git a/INSTALL.md b/INSTALL.md index 8ce1c43d5f..1b65c1212c 100644 --- a/INSTALL.md +++ b/INSTALL.md @@ -31,7 +31,7 @@ Typically the versions of bison and flex provided by the system are outdated and To get recent version of all dependencies we recommend using [homebrew](https://brew.sh/): ```sh -brew install flex bison cmake python3 +brew install flex bison cmake python3 llvm ``` The necessary Python packages can then easily be added using the pip3 command. @@ -57,7 +57,7 @@ export PATH=/opt/homebrew/opt/flex/bin:/opt/homebrew/opt/bison/bin:$PATH On Ubuntu (>=18.04) flex/bison versions are recent enough and are installed along with the system toolchain: ```sh -apt-get install flex bison gcc python3 python3-pip +apt-get install flex bison gcc python3 python3-pip llvm-dev llvm-runtime llvm clang-format clang ``` The Python dependencies are installed using: @@ -79,6 +79,15 @@ cmake .. -DCMAKE_INSTALL_PREFIX=$HOME/nmodl make -j && make install ``` +If `llvm-config` is not in PATH then set LLVM_DIR as: + +```sh +cmake .. -DCMAKE_INSTALL_PREFIX=$HOME/nmodl -DLLVM_DIR=/path/to/llvm/install/lib/cmake/llvm + +# on OSX +cmake .. -DCMAKE_INSTALL_PREFIX=$HOME/nmodl -DLLVM_DIR=`brew --prefix llvm`/lib/cmake/llvm +``` + And set PYTHONPATH as: ```sh diff --git a/azure-pipelines.yml b/azure-pipelines.yml index fb540afe1f..c29d8b1f4f 100644 --- a/azure-pipelines.yml +++ b/azure-pipelines.yml @@ -57,7 +57,7 @@ jobs: mkdir -p $(Build.Repository.LocalPath)/build cd $(Build.Repository.LocalPath)/build cmake --version - cmake .. -DPYTHON_EXECUTABLE=$(which python3.7) -DCMAKE_INSTALL_PREFIX=$HOME/nmodl -DCMAKE_BUILD_TYPE=Release + cmake .. -DPYTHON_EXECUTABLE=$(which python3.7) -DCMAKE_INSTALL_PREFIX=$HOME/nmodl -DCMAKE_BUILD_TYPE=Release -DNMODL_ENABLE_LLVM=OFF make -j 2 if [ $? -ne 0 ] then @@ -125,6 +125,7 @@ jobs: submodules: True - script: | brew install flex bison cmake python@3 gcc@8 + brew install bison llvm python3 -m pip install --upgrade pip 'setuptools<59.7.0' python3 -m pip install --user 'Jinja2>=2.9.3' 'PyYAML>=3.13' pytest pytest-cov numpy 'sympy>=1.3,<1.9' displayName: 'Install Dependencies' @@ -132,7 +133,7 @@ jobs: export PATH=/usr/local/opt/flex/bin:/usr/local/opt/bison/bin:$PATH; mkdir -p $(Build.Repository.LocalPath)/build cd $(Build.Repository.LocalPath)/build - cmake .. -DPYTHON_EXECUTABLE=$(which python3) -DCMAKE_INSTALL_PREFIX=$HOME/nmodl -DCMAKE_BUILD_TYPE=RelWithDebInfo -DNMODL_ENABLE_PYTHON_BINDINGS=OFF + cmake .. -DPYTHON_EXECUTABLE=$(which python3) -DCMAKE_INSTALL_PREFIX=$HOME/nmodl -DCMAKE_BUILD_TYPE=RelWithDebInfo -DNMODL_ENABLE_PYTHON_BINDINGS=OFF -DLLVM_DIR=`brew --prefix llvm`/lib/cmake/llvm -DNMODL_ENABLE_LLVM=ON make -j 2 if [ $?
-ne 0 ] then diff --git a/setup.py b/setup.py index a99c01a276..d384af2d9f 100644 --- a/setup.py +++ b/setup.py @@ -97,7 +97,7 @@ def _config_exe(exe_name): ] -cmake_args = ["-DPYTHON_EXECUTABLE=" + sys.executable] +cmake_args = ["-DPYTHON_EXECUTABLE=" + sys.executable, "-DNMODL_ENABLE_LLVM=OFF"] if "bdist_wheel" in sys.argv: cmake_args.append("-DLINK_AGAINST_PYTHON=FALSE") From 6cf832033a16727696753204bf62461bc712d79e Mon Sep 17 00:00:00 2001 From: Pramod Kumbhar Date: Sun, 29 Nov 2020 08:40:54 +0100 Subject: [PATCH 113/331] Print build status after cmake configure stage - print table with different build options, flags and paths used that can be helpful for debugging - fix git revision date for older git version - update INSTALL.md with correct brew paths for flex and bison --- CMakeLists.txt | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/CMakeLists.txt b/CMakeLists.txt index 354e69eb6c..fc9beb9130 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -257,6 +257,12 @@ message(STATUS "Python Bindings | ${NMODL_ENABLE_PYTHON_BINDINGS}") message(STATUS "Flex | ${FLEX_EXECUTABLE}") message(STATUS "Bison | ${BISON_EXECUTABLE}") message(STATUS "Python | ${PYTHON_EXECUTABLE}") +message(STATUS "LLVM Codegen | ${NMODL_ENABLE_LLVM}") +if(NMODL_ENABLE_LLVM) + message(STATUS " VERSION | ${LLVM_PACKAGE_VERSION}") + message(STATUS " INCLUDE | ${LLVM_INCLUDE_DIRS}") + message(STATUS " CMAKE | ${LLVM_CMAKE_DIR}") +endif() if(NMODL_CLANG_FORMAT) message(STATUS "Clang Format | ${ClangFormat_EXECUTABLE}") endif() From d32e2a311f95024fff1bade36db4515d3954434d Mon Sep 17 00:00:00 2001 From: Pramod Kumbhar Date: Sun, 29 Nov 2020 16:46:29 +0100 Subject: [PATCH 114/331] Adding test template for LLVM codegen - test/unit/codegen/llvm.cpp added for unit testing LLVM code generation visitor - ./bin/testcodegen binary can be used to launch LLVM codegen specific tests - multiple llvm_map_components_to_libnames removed - update procedure.mod with simple examples for IR generation --- cmake/LLVMHelper.cmake | 8 ++-- src/CMakeLists.txt | 4 +- src/codegen/llvm/codegen_llvm_visitor.cpp | 10 +++++ src/codegen/llvm/codegen_llvm_visitor.hpp | 8 ++++ test/integration/mod/procedure.mod | 19 +++++++-- test/unit/CMakeLists.txt | 37 ++++++++++++---- test/unit/codegen/llvm.cpp | 51 +++++++++++++++++++++++ 7 files changed, 118 insertions(+), 19 deletions(-) create mode 100644 test/unit/codegen/llvm.cpp diff --git a/cmake/LLVMHelper.cmake b/cmake/LLVMHelper.cmake index de078be7b5..dbd29c92b6 100644 --- a/cmake/LLVMHelper.cmake +++ b/cmake/LLVMHelper.cmake @@ -6,9 +6,9 @@ if(CMAKE_CXX_COMPILER_ID MATCHES "Clang" AND NMODL_ENABLE_LLVM) include(CheckCXXSourceCompiles) # test by including LLVM header and core library - llvm_map_components_to_libnames(LLVM_CORE_LIBS core) + llvm_map_components_to_libnames(LLVM_LIBS_TO_LINK core) set(CMAKE_REQUIRED_INCLUDES ${LLVM_INCLUDE_DIRS}) - set(CMAKE_REQUIRED_LIBRARIES ${LLVM_CORE_LIBS}) + set(CMAKE_REQUIRED_LIBRARIES ${LLVM_LIBS_TO_LINK}) # simple code to test LLVM library linking set(CODE_TO_TEST @@ -39,7 +39,9 @@ if(CMAKE_CXX_COMPILER_ID MATCHES "Clang" AND NMODL_ENABLE_LLVM) STATUS "Adding ${CMAKE_REQUIRED_FLAGS} to CMAKE_CXX_FLAGS, required to link with LLVM libraries") else() - message(STATUS "WARNING : -stdlib=libstdcx++ or -stdlib=libc++ didn't work to link with LLVM library") + message( + STATUS + "WARNING : -stdlib=libstdcx++ or -stdlib=libc++ didn't work to link with LLVM library") endif() endif() endif() diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 
f5f535fb55..4c8a9801a7 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -36,9 +36,7 @@ target_link_libraries( ${NMODL_WRAPPER_LIBS}) if(NMODL_ENABLE_LLVM) - # LLVM core libraries to link - llvm_map_components_to_libnames(LLVM_CORE_LIBS core) - target_link_libraries(nmodl llvm_codegen ${LLVM_CORE_LIBS}) + target_link_libraries(nmodl llvm_codegen ${LLVM_LIBS_TO_LINK}) endif() # ============================================================================= diff --git a/src/codegen/llvm/codegen_llvm_visitor.cpp b/src/codegen/llvm/codegen_llvm_visitor.cpp index 3f4e319503..494d5fd1f3 100644 --- a/src/codegen/llvm/codegen_llvm_visitor.cpp +++ b/src/codegen/llvm/codegen_llvm_visitor.cpp @@ -7,6 +7,7 @@ #include "codegen/llvm/codegen_llvm_visitor.hpp" #include "ast/all.hpp" +#include "visitors/visitor_utils.hpp" #include "llvm/IR/BasicBlock.h" #include "llvm/IR/Function.h" @@ -34,12 +35,21 @@ void CodegenLLVMVisitor::visit_statement_block(const ast::StatementBlock& node) void CodegenLLVMVisitor::visit_procedure_block(const ast::ProcedureBlock& node) { logger->info("CodegenLLVMVisitor : visiting {} procedure", node.get_node_name()); + + // print position, nmodl and json form as + /* + logger->info("Location {} \n NMODL {} \n JSON : {} \n", + node.get_token()->position(), + to_nmodl(node), + to_json(node)); + */ node.visit_children(*this); // TODO : code generation for procedure block } void CodegenLLVMVisitor::visit_program(const ast::Program& node) { node.visit_children(*this); + result_code = "Hello World"; } } // namespace codegen diff --git a/src/codegen/llvm/codegen_llvm_visitor.hpp b/src/codegen/llvm/codegen_llvm_visitor.hpp index 2b77160cd5..5b0ad3a968 100644 --- a/src/codegen/llvm/codegen_llvm_visitor.hpp +++ b/src/codegen/llvm/codegen_llvm_visitor.hpp @@ -45,6 +45,9 @@ class CodegenLLVMVisitor: public visitor::ConstAstVisitor { // Output directory for code generation std::string output_dir; + // result string for demo + std::string result_code; + public: /** * \brief Constructs the LLVM code generator visitor @@ -59,6 +62,11 @@ class CodegenLLVMVisitor: public visitor::ConstAstVisitor { void visit_statement_block(const ast::StatementBlock& node) override; void visit_procedure_block(const ast::ProcedureBlock& node) override; void visit_program(const ast::Program& node) override; + + // demo method + std::string get_code() const { + return result_code; + } }; /** \} */ // end of llvm_backends diff --git a/test/integration/mod/procedure.mod b/test/integration/mod/procedure.mod index 3eb4817b3b..ebbc39f15a 100644 --- a/test/integration/mod/procedure.mod +++ b/test/integration/mod/procedure.mod @@ -1,15 +1,26 @@ -PROCEDURE state(x, y) { +PROCEDURE hello_world() { + print("Hello World") +} + +PROCEDURE simple_sum(x, y) { LOCAL z z = x + y } -PROCEDURE rates(v) { +PROCEDURE complex_sum(v) { LOCAL alpha, beta, sum { alpha = .1 * exp(-(v+40)) beta = 4 * exp(-(v+65)/18) - } - { sum = alpha + beta } } + +PROCEDURE loop_function(v) { + LOCAL i + i = 0 + WHILE(i < 10) { + print("Hello World") + i = i + 1 + } +} diff --git a/test/unit/CMakeLists.txt b/test/unit/CMakeLists.txt index 457f35a288..fa79367cd6 100644 --- a/test/unit/CMakeLists.txt +++ b/test/unit/CMakeLists.txt @@ -69,6 +69,11 @@ add_executable(testcodegen codegen/main.cpp codegen/codegen_ispc.cpp codegen/cod target_link_libraries(testmodtoken lexer util) target_link_libraries(testlexer lexer util) +target_link_libraries(testprinter printer util) +target_link_libraries(testsymtab symtab lexer util) 
+target_link_libraries(testunitlexer lexer util) +target_link_libraries(testunitparser lexer test_util config) + target_link_libraries( testparser visitor @@ -78,6 +83,7 @@ target_link_libraries( test_util printer ${NMODL_WRAPPER_LIBS}) + target_link_libraries( testvisitor visitor @@ -97,10 +103,22 @@ target_link_libraries( test_util printer ${NMODL_WRAPPER_LIBS}) -target_link_libraries(testprinter printer util) -target_link_libraries(testsymtab symtab lexer util) -target_link_libraries(testunitlexer lexer util) -target_link_libraries(testunitparser lexer test_util config) + +if(NMODL_ENABLE_LLVM) + add_executable(testcodegen visitor/main.cpp codegen/llvm.cpp) + target_link_libraries( + testcodegen + visitor + symtab + lexer + util + test_util + printer + llvm_codegen + ${NMODL_WRAPPER_LIBS} + ${LLVM_LIBS_TO_LINK}) + set(CODEGEN_TEST testcodegen) +endif() # ============================================================================= # Use catch_discover instead of add_test for granular test report if CMAKE ver is greater than 3.9, @@ -109,9 +127,10 @@ target_link_libraries(testunitparser lexer test_util config) set(testvisitor_env "PYTHONPATH=${PROJECT_BINARY_DIR}/lib:$ENV{PYTHONPATH}") if(NOT LINK_AGAINST_PYTHON) list(APPEND testvisitor_env "NMODL_PYLIB=$ENV{NMODL_PYLIB}") - list(APPEND testvisitor_env - "NMODL_WRAPLIB=${PROJECT_BINARY_DIR}/lib/nmodl/libpywrapper${CMAKE_SHARED_LIBRARY_SUFFIX}") - + list( + APPEND + testvisitor_env + "NMODL_WRAPLIB=${PROJECT_BINARY_DIR}/lib/nmodl/libpywrapper${CMAKE_SHARED_LIBRARY_SUFFIX}") endif() foreach( @@ -126,8 +145,8 @@ foreach( testnewton testfast_math testunitlexer - testunitparser) - + testunitparser + ${CODEGEN_TEST}) if(${test_name} STREQUAL "testvisitor") catch_discover_tests(${test_name} TEST_PREFIX "${test_name}/" PROPERTIES ENVIRONMENT "${testvisitor_env}") diff --git a/test/unit/codegen/llvm.cpp b/test/unit/codegen/llvm.cpp new file mode 100644 index 0000000000..b6efe2f9ca --- /dev/null +++ b/test/unit/codegen/llvm.cpp @@ -0,0 +1,51 @@ +/************************************************************************* + * Copyright (C) 2018-2020 Blue Brain Project + * + * This file is part of NMODL distributed under the terms of the GNU + * Lesser General Public License. See top-level LICENSE file for details. 
+ *************************************************************************/ + +#include + +#include "ast/program.hpp" +#include "parser/nmodl_driver.hpp" +#include "visitors/checkparent_visitor.hpp" +#include "visitors/inline_visitor.hpp" +#include "visitors/symtab_visitor.hpp" +#include "codegen/llvm/codegen_llvm_visitor.hpp" + +using namespace nmodl; +using namespace visitor; +using nmodl::parser::NmodlDriver; + +//============================================================================= +// Sample LLVM codegen test +//============================================================================= + +std::string run_llvm_visitor(const std::string& text) { + NmodlDriver driver; + const auto& ast = driver.parse_string(text); + + SymtabVisitor().visit_program(*ast); + InlineVisitor().visit_program(*ast); + + codegen::CodegenLLVMVisitor llvm_visitor("unknown", "."); + llvm_visitor.visit_program(*ast); + return llvm_visitor.get_code(); +} + +SCENARIO("Running LLVM Codegen", "[visitor][llvm]") { + GIVEN("Simple procedure with hello world message") { + std::string nmodl_text = R"( + PROCEDURE say_hello() { + print("Hello World") + } + )"; + + THEN("Hello world message is printed") { + std::string expected = "Hello World"; + auto result = run_llvm_visitor(nmodl_text); + REQUIRE(result == expected); + } + } +} \ No newline at end of file From 01001dfb02e28f6f3e746902f71430440662a49c Mon Sep 17 00:00:00 2001 From: George Mitenkov Date: Tue, 22 Dec 2020 13:54:32 +0300 Subject: [PATCH 115/331] Initial LLVM codegen visitor routines (#457) * Added LLVM code generation for `ProcedureBlock`. * Added code generation routines for double, integer and boolean variable types. * Added binary and unary operator code generation: - Supported binary operators: +, -, *, /. - Supported unary operators: -. - Assignment (=) is also supported. * Added regex matching unit tests for LLVM code generation. * Fixed Travis CI/builds.
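As a rough sketch of what these routines emit (the value numbering is illustrative, mirroring the regex patterns in the new unit tests), a procedure body like `i = a + b` is expected to lower to:

    %1 = load double, double* %b
    %2 = load double, double* %a
    %3 = fadd double %2, %1
    store double %3, double* %i

where `%a`, `%b` and `%i` are the stack allocas created for the parameters and the LOCAL variable.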
fixes #451, fixes #452, fixes #456 Co-authored-by: Pramod Kumbhar --- CMakeLists.txt | 1 + azure-pipelines.yml | 5 +- cmake/LLVMHelper.cmake | 14 +- setup.py | 2 +- src/codegen/llvm/codegen_llvm_visitor.cpp | 157 +++++++++++++++--- src/codegen/llvm/codegen_llvm_visitor.hpp | 40 ++++- test/integration/mod/procedure.mod | 9 +- test/unit/CMakeLists.txt | 7 +- test/unit/codegen/llvm.cpp | 188 ++++++++++++++++++++-- 9 files changed, 365 insertions(+), 58 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index fc9beb9130..26ff33eeb9 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -158,6 +158,7 @@ nmodl_find_python_module(yaml 3.12 REQUIRED) # ============================================================================= if(NMODL_ENABLE_LLVM) include(LLVMHelper) + include_directories(${LLVM_INCLUDE_DIRS}) add_definitions(-DNMODL_LLVM_BACKEND) endif() diff --git a/azure-pipelines.yml b/azure-pipelines.yml index c29d8b1f4f..d8e6408d74 100644 --- a/azure-pipelines.yml +++ b/azure-pipelines.yml @@ -124,8 +124,7 @@ jobs: - checkout: self submodules: True - script: | - brew install flex bison cmake python@3 gcc@8 - brew install bison llvm + brew install flex bison cmake python@3 gcc@8 llvm python3 -m pip install --upgrade pip 'setuptools<59.7.0' python3 -m pip install --user 'Jinja2>=2.9.3' 'PyYAML>=3.13' pytest pytest-cov numpy 'sympy>=1.3,<1.9' displayName: 'Install Dependencies' @@ -172,6 +171,7 @@ jobs: displayName: 'Build Neuron and Run Integration Tests' - job: 'manylinux_wheels' timeoutInMinutes: 45 + condition: eq(1,2) pool: vmImage: 'ubuntu-18.04' strategy: @@ -221,6 +221,7 @@ jobs: - template: ci/upload-wheels.yml - job: 'macos_wheels' timeoutInMinutes: 45 + condition: eq(1,2) pool: vmImage: 'macOS-10.15' strategy: diff --git a/cmake/LLVMHelper.cmake b/cmake/LLVMHelper.cmake index dbd29c92b6..982af48660 100644 --- a/cmake/LLVMHelper.cmake +++ b/cmake/LLVMHelper.cmake @@ -1,15 +1,17 @@ # ============================================================================= # LLVM/Clang needs to be linked with either libc++ or libstdc++ # ============================================================================= + +find_package(LLVM REQUIRED CONFIG) + +# include LLVM header and core library +llvm_map_components_to_libnames(LLVM_LIBS_TO_LINK core) +set(CMAKE_REQUIRED_INCLUDES ${LLVM_INCLUDE_DIRS}) +set(CMAKE_REQUIRED_LIBRARIES ${LLVM_LIBS_TO_LINK}) + if(CMAKE_CXX_COMPILER_ID MATCHES "Clang" AND NMODL_ENABLE_LLVM) - find_package(LLVM REQUIRED CONFIG) include(CheckCXXSourceCompiles) - # test by including LLVM header and core library - llvm_map_components_to_libnames(LLVM_LIBS_TO_LINK core) - set(CMAKE_REQUIRED_INCLUDES ${LLVM_INCLUDE_DIRS}) - set(CMAKE_REQUIRED_LIBRARIES ${LLVM_LIBS_TO_LINK}) - # simple code to test LLVM library linking set(CODE_TO_TEST " diff --git a/setup.py b/setup.py index d384af2d9f..dba102570a 100644 --- a/setup.py +++ b/setup.py @@ -97,7 +97,7 @@ def _config_exe(exe_name): ] -cmake_args = ["-DPYTHON_EXECUTABLE=" + sys.executable, "-DNMODL_ENABLE_LLVM=OFF"] +cmake_args = ["-DPYTHON_EXECUTABLE=" + sys.executable, "-DNMODL_ENABLE_LLVM=OFF", "-DNMODL_ENABLE_PYTHON_BINDINGS=ON"] if "bdist_wheel" in sys.argv: cmake_args.append("-DLINK_AGAINST_PYTHON=FALSE") diff --git a/src/codegen/llvm/codegen_llvm_visitor.cpp b/src/codegen/llvm/codegen_llvm_visitor.cpp index 494d5fd1f3..b8b3778e86 100644 --- a/src/codegen/llvm/codegen_llvm_visitor.cpp +++ b/src/codegen/llvm/codegen_llvm_visitor.cpp @@ -10,46 +10,153 @@ #include "visitors/visitor_utils.hpp" #include "llvm/IR/BasicBlock.h" 
+#include "llvm/IR/Constants.h" #include "llvm/IR/Function.h" -#include "llvm/IR/IRBuilder.h" #include "llvm/IR/LLVMContext.h" -#include "llvm/IR/Module.h" +#include "llvm/IR/Type.h" +#include "llvm/IR/ValueSymbolTable.h" namespace nmodl { namespace codegen { -// LLVM code generator objects -using namespace llvm; -static std::unique_ptr TheContext; -static std::unique_ptr TheModule; -static std::unique_ptr> Builder; -static std::map NamedValues; +/****************************************************************************************/ +/* Overloaded visitor routines */ +/****************************************************************************************/ -void CodegenLLVMVisitor::visit_statement_block(const ast::StatementBlock& node) { - logger->info("CodegenLLVMVisitor : visiting statement block"); - node.visit_children(*this); - // TODO : code generation for new block scope +void CodegenLLVMVisitor::visit_binary_expression(const ast::BinaryExpression& node) { + const auto& op = node.get_op().get_value(); + + // Process rhs first, since lhs is handled differently for assignment and binary + // operators. + node.get_rhs()->accept(*this); + llvm::Value* rhs = values.back(); + values.pop_back(); + if (op == ast::BinaryOp::BOP_ASSIGN) { + auto var = dynamic_cast(node.get_lhs().get()); + if (!var) { + throw std::runtime_error("Error: only VarName assignment is currently supported.\n"); + } + llvm::Value* alloca = named_values[var->get_node_name()]; + builder.CreateStore(rhs, alloca); + return; + } + + node.get_lhs()->accept(*this); + llvm::Value* lhs = values.back(); + values.pop_back(); + llvm::Value* result; + + // \todo: Support other binary operators + switch (op) { +#define DISPATCH(binary_op, llvm_op) \ + case binary_op: \ + result = llvm_op(lhs, rhs); \ + values.push_back(result); \ + break; + + DISPATCH(ast::BinaryOp::BOP_ADDITION, builder.CreateFAdd); + DISPATCH(ast::BinaryOp::BOP_DIVISION, builder.CreateFDiv); + DISPATCH(ast::BinaryOp::BOP_MULTIPLICATION, builder.CreateFMul); + DISPATCH(ast::BinaryOp::BOP_SUBTRACTION, builder.CreateFSub); + +#undef DISPATCH + } } -void CodegenLLVMVisitor::visit_procedure_block(const ast::ProcedureBlock& node) { - logger->info("CodegenLLVMVisitor : visiting {} procedure", node.get_node_name()); - - // print position, nmodl and json form as - /* - logger->info("Location {} \n NMODL {} \n JSON : {} \n", - node.get_token()->position(), - to_nmodl(node), - to_json(node)); - */ - node.visit_children(*this); - // TODO : code generation for procedure block +void CodegenLLVMVisitor::visit_boolean(const ast::Boolean& node) { + const auto& constant = llvm::ConstantInt::get(llvm::Type::getInt1Ty(*context), + node.get_value()); + values.push_back(constant); +} + +void CodegenLLVMVisitor::visit_double(const ast::Double& node) { + const auto& constant = llvm::ConstantFP::get(llvm::Type::getDoubleTy(*context), + node.get_value()); + values.push_back(constant); +} + +void CodegenLLVMVisitor::visit_integer(const ast::Integer& node) { + const auto& constant = llvm::ConstantInt::get(llvm::Type::getInt32Ty(*context), + node.get_value()); + values.push_back(constant); +} + +void CodegenLLVMVisitor::visit_local_list_statement(const ast::LocalListStatement& node) { + for (const auto& variable: node.get_variables()) { + // LocalVar always stores a Name. 
+ auto name = variable->get_node_name(); + llvm::Type* var_type = llvm::Type::getDoubleTy(*context); + llvm::Value* alloca = builder.CreateAlloca(var_type, /*ArraySize=*/nullptr, name); + named_values[name] = alloca; + } } void CodegenLLVMVisitor::visit_program(const ast::Program& node) { node.visit_children(*this); - result_code = "Hello World"; + // Keep this for easier development (maybe move to debug mode later). + std::cout << print_module(); +} + +void CodegenLLVMVisitor::visit_procedure_block(const ast::ProcedureBlock& node) { + const auto& name = node.get_node_name(); + const auto& parameters = node.get_parameters(); + + // The procedure parameters are doubles by default. + std::vector arg_types; + for (size_t i = 0, e = parameters.size(); i < e; ++i) + arg_types.push_back(llvm::Type::getDoubleTy(*context)); + llvm::Type* return_type = llvm::Type::getVoidTy(*context); + + llvm::Function* proc = + llvm::Function::Create(llvm::FunctionType::get(return_type, arg_types, /*isVarArg=*/false), + llvm::Function::ExternalLinkage, + name, + *module); + + llvm::BasicBlock* body = llvm::BasicBlock::Create(*context, /*Name=*/"", proc); + builder.SetInsertPoint(body); + + // First, allocate parameters on the stack and add them to the symbol table. + unsigned i = 0; + for (auto& arg: proc->args()) { + std::string arg_name = parameters[i++].get()->get_node_name(); + llvm::Value* alloca = builder.CreateAlloca(arg.getType(), /*ArraySize=*/nullptr, arg_name); + arg.setName(arg_name); + builder.CreateStore(&arg, alloca); + named_values[arg_name] = alloca; + } + + const auto& statements = node.get_statement_block()->get_statements(); + for (const auto& statement: statements) { + // \todo: Support other statement types. + if (statement->is_local_list_statement() || statement->is_expression_statement()) + statement->accept(*this); + } + + values.clear(); + // \todo: Add proper support for the symbol table. + named_values.clear(); +} + +void CodegenLLVMVisitor::visit_unary_expression(const ast::UnaryExpression& node) { + ast::UnaryOp op = node.get_op().get_value(); + node.get_expression()->accept(*this); + llvm::Value* value = values.back(); + values.pop_back(); + if (op == ast::UOP_NEGATION) { + llvm::Value* result = builder.CreateFNeg(value); + values.push_back(result); + } else { + // Support only `double` operators for now. 
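+ // (other unary operators such as ast::UOP_NOT would need non-double operand support first)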
+ throw std::runtime_error("Error: unsupported unary operator\n"); + } +} + +void CodegenLLVMVisitor::visit_var_name(const ast::VarName& node) { + llvm::Value* var = builder.CreateLoad(named_values[node.get_node_name()]); + values.push_back(var); } } // namespace codegen diff --git a/src/codegen/llvm/codegen_llvm_visitor.hpp b/src/codegen/llvm/codegen_llvm_visitor.hpp index 5b0ad3a968..5a288d9836 100644 --- a/src/codegen/llvm/codegen_llvm_visitor.hpp +++ b/src/codegen/llvm/codegen_llvm_visitor.hpp @@ -21,6 +21,10 @@ #include "utils/logger.hpp" #include "visitors/ast_visitor.hpp" +#include "llvm/IR/IRBuilder.h" +#include "llvm/IR/LLVMContext.h" +#include "llvm/IR/Module.h" + namespace nmodl { namespace codegen { @@ -45,8 +49,18 @@ class CodegenLLVMVisitor: public visitor::ConstAstVisitor { // Output directory for code generation std::string output_dir; - // result string for demo - std::string result_code; + private: + std::unique_ptr context = std::make_unique(); + + std::unique_ptr module = std::make_unique(mod_filename, *context); + + llvm::IRBuilder<> builder; + + // Stack to hold visited values + std::vector values; + + // Mappings for named values for lookups + std::map named_values; public: /** @@ -57,15 +71,27 @@ class CodegenLLVMVisitor: public visitor::ConstAstVisitor { */ CodegenLLVMVisitor(const std::string& mod_filename, const std::string& output_dir) : mod_filename(mod_filename) - , output_dir(output_dir) {} + , output_dir(output_dir) + , builder(*context) {} - void visit_statement_block(const ast::StatementBlock& node) override; + // Visitors + void visit_binary_expression(const ast::BinaryExpression& node) override; + void visit_boolean(const ast::Boolean& node) override; + void visit_double(const ast::Double& node) override; + void visit_integer(const ast::Integer& node) override; + void visit_local_list_statement(const ast::LocalListStatement& node) override; void visit_procedure_block(const ast::ProcedureBlock& node) override; void visit_program(const ast::Program& node) override; + void visit_unary_expression(const ast::UnaryExpression& node) override; + void visit_var_name(const ast::VarName& node) override; - // demo method - std::string get_code() const { - return result_code; + // TODO: use custom printer here + std::string print_module() const { + std::string str; + llvm::raw_string_ostream os(str); + os << *module; + os.flush(); + return str; } }; diff --git a/test/integration/mod/procedure.mod b/test/integration/mod/procedure.mod index ebbc39f15a..4017b6a505 100644 --- a/test/integration/mod/procedure.mod +++ b/test/integration/mod/procedure.mod @@ -1,5 +1,10 @@ +NEURON { + SUFFIX procedure_test + THREADSAFE +} + PROCEDURE hello_world() { - print("Hello World") + printf("Hello World") } PROCEDURE simple_sum(x, y) { @@ -20,7 +25,7 @@ PROCEDURE loop_function(v) { LOCAL i i = 0 WHILE(i < 10) { - print("Hello World") + printf("Hello World") i = i + 1 } } diff --git a/test/unit/CMakeLists.txt b/test/unit/CMakeLists.txt index fa79367cd6..29957a7530 100644 --- a/test/unit/CMakeLists.txt +++ b/test/unit/CMakeLists.txt @@ -105,9 +105,10 @@ target_link_libraries( ${NMODL_WRAPPER_LIBS}) if(NMODL_ENABLE_LLVM) - add_executable(testcodegen visitor/main.cpp codegen/llvm.cpp) + include_directories(${LLVM_INCLUDE_DIRS}) + add_executable(testllvm visitor/main.cpp codegen/llvm.cpp) target_link_libraries( - testcodegen + testllvm visitor symtab lexer @@ -117,7 +118,7 @@ if(NMODL_ENABLE_LLVM) llvm_codegen ${NMODL_WRAPPER_LIBS} ${LLVM_LIBS_TO_LINK}) - set(CODEGEN_TEST testcodegen) + 
set(CODEGEN_TEST testllvm) endif() # ============================================================================= diff --git a/test/unit/codegen/llvm.cpp b/test/unit/codegen/llvm.cpp index b6efe2f9ca..270ce97ec0 100644 --- a/test/unit/codegen/llvm.cpp +++ b/test/unit/codegen/llvm.cpp @@ -6,20 +6,21 @@ *************************************************************************/ #include +#include #include "ast/program.hpp" +#include "codegen/llvm/codegen_llvm_visitor.hpp" #include "parser/nmodl_driver.hpp" #include "visitors/checkparent_visitor.hpp" #include "visitors/inline_visitor.hpp" #include "visitors/symtab_visitor.hpp" -#include "codegen/llvm/codegen_llvm_visitor.hpp" using namespace nmodl; using namespace visitor; using nmodl::parser::NmodlDriver; //============================================================================= -// Sample LLVM codegen test +// Utility to get LLVM module as a string //============================================================================= std::string run_llvm_visitor(const std::string& text) { @@ -31,21 +32,184 @@ std::string run_llvm_visitor(const std::string& text) { codegen::CodegenLLVMVisitor llvm_visitor("unknown", "."); llvm_visitor.visit_program(*ast); - return llvm_visitor.get_code(); + return llvm_visitor.print_module(); } -SCENARIO("Running LLVM Codegen", "[visitor][llvm]") { - GIVEN("Simple procedure with hello world message") { +//============================================================================= +// BinaryExpression and Double +//============================================================================= + +SCENARIO("Binary expression", "[visitor][llvm]") { + GIVEN("Procedure with addition of its arguments") { + std::string nmodl_text = R"( + PROCEDURE add(a, b) { + LOCAL i + i = a + b + } + )"; + + THEN("variables are loaded and add instruction is created") { + std::string module_string = run_llvm_visitor(nmodl_text); + std::smatch m; + + // Check the values are loaded correctly and added + std::regex rhs(R"(%1 = load double, double\* %b)"); + std::regex lhs(R"(%2 = load double, double\* %a)"); + std::regex res(R"(%3 = fadd double %2, %1)"); + REQUIRE(std::regex_search(module_string, m, rhs)); + REQUIRE(std::regex_search(module_string, m, lhs)); + REQUIRE(std::regex_search(module_string, m, res)); + } + } + + GIVEN("Procedure with multiple binary operators") { std::string nmodl_text = R"( - PROCEDURE say_hello() { - print("Hello World") + PROCEDURE multiple(a, b) { + LOCAL i + i = (a - b) / (a + b) } )"; - THEN("Hello world message is printed") { - std::string expected = "Hello World"; - auto result = run_llvm_visitor(nmodl_text); - REQUIRE(result == expected); + THEN("variables are processed from rhs first") { + std::string module_string = run_llvm_visitor(nmodl_text); + std::smatch m; + + // Check rhs + std::regex rr(R"(%1 = load double, double\* %b)"); + std::regex rl(R"(%2 = load double, double\* %a)"); + std::regex x(R"(%3 = fadd double %2, %1)"); + REQUIRE(std::regex_search(module_string, m, rr)); + REQUIRE(std::regex_search(module_string, m, rl)); + REQUIRE(std::regex_search(module_string, m, x)); + + // Check lhs + std::regex lr(R"(%4 = load double, double\* %b)"); + std::regex ll(R"(%5 = load double, double\* %a)"); + std::regex y(R"(%6 = fsub double %5, %4)"); + REQUIRE(std::regex_search(module_string, m, lr)); + REQUIRE(std::regex_search(module_string, m, ll)); + REQUIRE(std::regex_search(module_string, m, y)); + + // Check result + std::regex res(R"(%7 = fdiv double %6, %3)"); + 
REQUIRE(std::regex_search(module_string, m, res)); } } -} \ No newline at end of file + + GIVEN("Procedure with assignment") { + std::string nmodl_text = R"( + PROCEDURE assignment() { + LOCAL i + i = 2 + } + )"; + + THEN("double constant is stored into i") { + std::string module_string = run_llvm_visitor(nmodl_text); + std::smatch m; + + // Check store immediate is created + std::regex allocation(R"(%i = alloca double)"); + std::regex assignment(R"(store double 2.0*e\+00, double\* %i)"); + REQUIRE(std::regex_search(module_string, m, allocation)); + REQUIRE(std::regex_search(module_string, m, assignment)); + } + } +} + +//============================================================================= +// LocalList and LocalVar +//============================================================================= + +SCENARIO("Local variable", "[visitor][llvm]") { + GIVEN("Procedure with some local variables") { + std::string nmodl_text = R"( + PROCEDURE local() { + LOCAL i, j + } + )"; + + THEN("local variables are allocated on the stack") { + std::string module_string = run_llvm_visitor(nmodl_text); + std::smatch m; + + // Check stack allocations for i and j + std::regex i(R"(%i = alloca double)"); + std::regex j(R"(%j = alloca double)"); + REQUIRE(std::regex_search(module_string, m, i)); + REQUIRE(std::regex_search(module_string, m, j)); + } + } +} + +//============================================================================= +// ProcedureBlock +//============================================================================= + +SCENARIO("Procedure", "[visitor][llvm]") { + GIVEN("Empty procedure with no arguments") { + std::string nmodl_text = R"( + PROCEDURE empty() {} + )"; + + THEN("empty void function is produced") { + std::string module_string = run_llvm_visitor(nmodl_text); + std::smatch m; + + // Check procedure has empty body + std::regex procedure(R"(define void @empty\(\) \{\n\})"); + REQUIRE(std::regex_search(module_string, m, procedure)); + } + } + + GIVEN("Empty procedure with arguments") { + std::string nmodl_text = R"( + PROCEDURE with_argument(x) {} + )"; + + THEN("void function is produced with arguments allocated on stack") { + std::string module_string = run_llvm_visitor(nmodl_text); + std::smatch m; + + // Check procedure signature + std::regex function_signature(R"(define void @with_argument\(double %x1\) \{)"); + REQUIRE(std::regex_search(module_string, m, function_signature)); + + // Check that procedure arguments are allocated on the local stack + std::regex alloca_instr(R"(%x = alloca double)"); + std::regex store_instr(R"(store double %x1, double\* %x)"); + REQUIRE(std::regex_search(module_string, m, alloca_instr)); + REQUIRE(std::regex_search(module_string, m, store_instr)); + } + } +} + +//============================================================================= +// UnaryExpression +//============================================================================= + +SCENARIO("Unary expression", "[visitor][llvm]") { + GIVEN("Procedure with negation") { + std::string nmodl_text = R"( + PROCEDURE negation(a) { + LOCAL i + i = -a + } + )"; + + THEN("fneg instruction is created") { + std::string module_string = run_llvm_visitor(nmodl_text); + std::smatch m; + + std::regex allocation(R"(%1 = load double, double\* %a)"); + REQUIRE(std::regex_search(module_string, m, allocation)); + + // llvm v9 and llvm v11 implementation for negation + std::regex negation_v9(R"(%2 = fsub double -0.000000e\+00, %1)"); + std::regex negation_v11(R"(fneg double %1)"); + bool result = 
std::regex_search(module_string, m, negation_v9) || + std::regex_search(module_string, m, negation_v11); + REQUIRE(result == true); + } + } +} From c8ee79b44ea3ae2b9b1a2dd597e6715e90d0594c Mon Sep 17 00:00:00 2001 From: George Mitenkov Date: Fri, 25 Dec 2020 13:29:02 +0300 Subject: [PATCH 116/331] FunctionBlock code generation and terminator checks (#470) * LLVM code generation for `FunctionBlock` is now supported. * Terminators in function or procedure blocks are enforced: - Every procedure must have `ret void` instruction. - Every function returns a double, specified by `ret_`. * For local symbol table, code generation now uses LLVM's builtin `llvm::ValueSymbolTable`. fixes #454, fixes #469 --- src/codegen/llvm/codegen_llvm_visitor.cpp | 123 ++++++++++++++-------- src/codegen/llvm/codegen_llvm_visitor.hpp | 11 +- test/unit/codegen/llvm.cpp | 50 ++++++++- 3 files changed, 137 insertions(+), 47 deletions(-) diff --git a/src/codegen/llvm/codegen_llvm_visitor.cpp b/src/codegen/llvm/codegen_llvm_visitor.cpp index b8b3778e86..6e1177cbec 100644 --- a/src/codegen/llvm/codegen_llvm_visitor.cpp +++ b/src/codegen/llvm/codegen_llvm_visitor.cpp @@ -7,6 +7,7 @@ #include "codegen/llvm/codegen_llvm_visitor.hpp" #include "ast/all.hpp" +#include "visitors/rename_visitor.hpp" #include "visitors/visitor_utils.hpp" #include "llvm/IR/BasicBlock.h" @@ -20,6 +21,80 @@ namespace nmodl { namespace codegen { +/****************************************************************************************/ +/* Helper routines */ +/****************************************************************************************/ + + +void CodegenLLVMVisitor::visit_procedure_or_function(const ast::Block& node) { + const auto& name = node.get_node_name(); + const auto& parameters = node.get_parameters(); + + // Procedure or function parameters are doubles by default. + std::vector arg_types; + for (size_t i = 0; i < parameters.size(); ++i) + arg_types.push_back(llvm::Type::getDoubleTy(*context)); + + // If visiting a function, the return type is a double by default. + llvm::Type* return_type = node.is_function_block() ? llvm::Type::getDoubleTy(*context) + : llvm::Type::getVoidTy(*context); + + llvm::Function* func = + llvm::Function::Create(llvm::FunctionType::get(return_type, arg_types, /*isVarArg=*/false), + llvm::Function::ExternalLinkage, + name, + *module); + + // Create the entry basic block of the function/procedure and point the local named values table + // to the symbol table. + llvm::BasicBlock* body = llvm::BasicBlock::Create(*context, /*Name=*/"", func); + builder.SetInsertPoint(body); + local_named_values = func->getValueSymbolTable(); + + // When processing a function, it returns a value named in NMODL. Therefore, we + // first run RenameVisitor to rename it into ret_. This will aid in avoiding + // symbolic conflicts. Then, allocate the return variable on the local stack. + std::string return_var_name = "ret_" + name; + const auto& block = node.get_statement_block(); + if (node.is_function_block()) { + visitor::RenameVisitor v(name, return_var_name); + block->accept(v); + builder.CreateAlloca(llvm::Type::getDoubleTy(*context), + /*ArraySize=*/nullptr, + return_var_name); + } + + // Allocate parameters on the stack and add them to the symbol table. 
+ unsigned i = 0; + for (auto& arg: func->args()) { + std::string arg_name = parameters[i++].get()->get_node_name(); + llvm::Value* alloca = builder.CreateAlloca(arg.getType(), /*ArraySize=*/nullptr, arg_name); + arg.setName(arg_name); + builder.CreateStore(&arg, alloca); + } + + // Process function or procedure body. + const auto& statements = block->get_statements(); + for (const auto& statement: statements) { + // \todo: Support other statement types. + if (statement->is_local_list_statement() || statement->is_expression_statement()) + statement->accept(*this); + } + + // Add the terminator. If visiting function, we need to return the value specified by + // ret_. + if (node.is_function_block()) { + llvm::Value* return_var = builder.CreateLoad(local_named_values->lookup(return_var_name)); + builder.CreateRet(return_var); + } else { + builder.CreateRetVoid(); + } + + // Clear local values stack and remove the pointer to the local symbol table. + values.clear(); + local_named_values = nullptr; +} + /****************************************************************************************/ /* Overloaded visitor routines */ /****************************************************************************************/ @@ -38,7 +113,7 @@ void CodegenLLVMVisitor::visit_binary_expression(const ast::BinaryExpression& no if (!var) { throw std::runtime_error("Error: only VarName assignment is currently supported.\n"); } - llvm::Value* alloca = named_values[var->get_node_name()]; + llvm::Value* alloca = local_named_values->lookup(var->get_node_name()); builder.CreateStore(rhs, alloca); return; } @@ -77,6 +152,10 @@ void CodegenLLVMVisitor::visit_double(const ast::Double& node) { values.push_back(constant); } +void CodegenLLVMVisitor::visit_function_block(const ast::FunctionBlock& node) { + visit_procedure_or_function(node); +} + void CodegenLLVMVisitor::visit_integer(const ast::Integer& node) { const auto& constant = llvm::ConstantInt::get(llvm::Type::getInt32Ty(*context), node.get_value()); @@ -89,7 +168,6 @@ void CodegenLLVMVisitor::visit_local_list_statement(const ast::LocalListStatemen auto name = variable->get_node_name(); llvm::Type* var_type = llvm::Type::getDoubleTy(*context); llvm::Value* alloca = builder.CreateAlloca(var_type, /*ArraySize=*/nullptr, name); - named_values[name] = alloca; } } @@ -100,44 +178,7 @@ void CodegenLLVMVisitor::visit_program(const ast::Program& node) { } void CodegenLLVMVisitor::visit_procedure_block(const ast::ProcedureBlock& node) { - const auto& name = node.get_node_name(); - const auto& parameters = node.get_parameters(); - - // The procedure parameters are doubles by default. - std::vector arg_types; - for (size_t i = 0, e = parameters.size(); i < e; ++i) - arg_types.push_back(llvm::Type::getDoubleTy(*context)); - llvm::Type* return_type = llvm::Type::getVoidTy(*context); - - llvm::Function* proc = - llvm::Function::Create(llvm::FunctionType::get(return_type, arg_types, /*isVarArg=*/false), - llvm::Function::ExternalLinkage, - name, - *module); - - llvm::BasicBlock* body = llvm::BasicBlock::Create(*context, /*Name=*/"", proc); - builder.SetInsertPoint(body); - - // First, allocate parameters on the stack and add them to the symbol table. 
- unsigned i = 0; - for (auto& arg: proc->args()) { - std::string arg_name = parameters[i++].get()->get_node_name(); - llvm::Value* alloca = builder.CreateAlloca(arg.getType(), /*ArraySize=*/nullptr, arg_name); - arg.setName(arg_name); - builder.CreateStore(&arg, alloca); - named_values[arg_name] = alloca; - } - - const auto& statements = node.get_statement_block()->get_statements(); - for (const auto& statement: statements) { - // \todo: Support other statement types. - if (statement->is_local_list_statement() || statement->is_expression_statement()) - statement->accept(*this); - } - - values.clear(); - // \todo: Add proper support for the symbol table. - named_values.clear(); + visit_procedure_or_function(node); } void CodegenLLVMVisitor::visit_unary_expression(const ast::UnaryExpression& node) { @@ -155,7 +196,7 @@ void CodegenLLVMVisitor::visit_unary_expression(const ast::UnaryExpression& node } void CodegenLLVMVisitor::visit_var_name(const ast::VarName& node) { - llvm::Value* var = builder.CreateLoad(named_values[node.get_node_name()]); + llvm::Value* var = builder.CreateLoad(local_named_values->lookup(node.get_node_name())); values.push_back(var); } diff --git a/src/codegen/llvm/codegen_llvm_visitor.hpp b/src/codegen/llvm/codegen_llvm_visitor.hpp index 5a288d9836..801922cdc1 100644 --- a/src/codegen/llvm/codegen_llvm_visitor.hpp +++ b/src/codegen/llvm/codegen_llvm_visitor.hpp @@ -59,8 +59,8 @@ class CodegenLLVMVisitor: public visitor::ConstAstVisitor { // Stack to hold visited values std::vector values; - // Mappings for named values for lookups - std::map named_values; + // Pointer to the local symbol table. + llvm::ValueSymbolTable* local_named_values = nullptr; public: /** @@ -74,10 +74,17 @@ class CodegenLLVMVisitor: public visitor::ConstAstVisitor { , output_dir(output_dir) , builder(*context) {} + /** + * Visit nmodl function or procedure + * \param node the AST node representing the function or procedure in NMODL + */ + void visit_procedure_or_function(const ast::Block& node); + // Visitors void visit_binary_expression(const ast::BinaryExpression& node) override; void visit_boolean(const ast::Boolean& node) override; void visit_double(const ast::Double& node) override; + void visit_function_block(const ast::FunctionBlock& node) override; void visit_integer(const ast::Integer& node) override; void visit_local_list_statement(const ast::LocalListStatement& node) override; void visit_procedure_block(const ast::ProcedureBlock& node) override; diff --git a/test/unit/codegen/llvm.cpp b/test/unit/codegen/llvm.cpp index 270ce97ec0..44ca18391b 100644 --- a/test/unit/codegen/llvm.cpp +++ b/test/unit/codegen/llvm.cpp @@ -117,6 +117,44 @@ SCENARIO("Binary expression", "[visitor][llvm]") { } } +//============================================================================= +// FunctionBlock +//============================================================================= + +SCENARIO("Function", "[visitor][llvm]") { + GIVEN("Simple function with arguments") { + std::string nmodl_text = R"( + FUNCTION foo(x) { + foo = x + } + )"; + + THEN("function is produced with arguments allocated on stack and a return instruction") { + std::string module_string = run_llvm_visitor(nmodl_text); + std::smatch m; + + // Check function signature. The return type should be the default double type. + std::regex function_signature(R"(define double @foo\(double %x1\) \{)"); + REQUIRE(std::regex_search(module_string, m, function_signature)); + + // Check that function arguments are allocated on the local stack. 
+ std::regex alloca_instr(R"(%x = alloca double)"); + std::regex store_instr(R"(store double %x1, double\* %x)"); + REQUIRE(std::regex_search(module_string, m, alloca_instr)); + REQUIRE(std::regex_search(module_string, m, store_instr)); + + // Check the return variable has also been allocated. + std::regex ret_instr(R"(%ret_foo = alloca double)"); + + // Check that the return value has been loaded and passed to terminator. + std::regex loaded(R"(%2 = load double, double\* %ret_foo)"); + std::regex terminator(R"(ret double %2)"); + REQUIRE(std::regex_search(module_string, m, loaded)); + REQUIRE(std::regex_search(module_string, m, terminator)); + } + } +} + //============================================================================= // LocalList and LocalVar //============================================================================= @@ -156,8 +194,8 @@ SCENARIO("Procedure", "[visitor][llvm]") { std::string module_string = run_llvm_visitor(nmodl_text); std::smatch m; - // Check procedure has empty body - std::regex procedure(R"(define void @empty\(\) \{\n\})"); + // Check procedure has empty body with a void return. + std::regex procedure(R"(define void @empty\(\) \{\n(\s)*ret void\n\})"); REQUIRE(std::regex_search(module_string, m, procedure)); } } @@ -171,15 +209,19 @@ SCENARIO("Procedure", "[visitor][llvm]") { std::string module_string = run_llvm_visitor(nmodl_text); std::smatch m; - // Check procedure signature + // Check procedure signature. std::regex function_signature(R"(define void @with_argument\(double %x1\) \{)"); REQUIRE(std::regex_search(module_string, m, function_signature)); - // Check that procedure arguments are allocated on the local stack + // Check that procedure arguments are allocated on the local stack. std::regex alloca_instr(R"(%x = alloca double)"); std::regex store_instr(R"(store double %x1, double\* %x)"); REQUIRE(std::regex_search(module_string, m, alloca_instr)); REQUIRE(std::regex_search(module_string, m, store_instr)); + + // Check terminator. 
+            std::regex terminator(R"(ret void)");
+            REQUIRE(std::regex_search(module_string, m, terminator));
         }
     }
 }

From f70c4d2213bf7bbc73332353d512b943199d66d7 Mon Sep 17 00:00:00 2001
From: Pramod Kumbhar
Date: Mon, 28 Dec 2020 23:59:20 +0100
Subject: [PATCH 117/331] Add option to run LLVM optimisation passes (#471)

* Add option to run LLVM optimisation passes

  - update CLI argument from --llvm to llvm --ir --opt
  - --ir runs CodegenLLVMVisitor and emits LLVM IR
  - if --opt is passed, we run basic LLVM optimisation passes
  - update simple test to check optimisation passes

* Add function example in procedure.mod

* Add test for LLVM optimisation passes and dead code removal
---
 cmake/LLVMHelper.cmake                    |  2 +-
 src/codegen/llvm/codegen_llvm_visitor.cpp | 25 ++++++++++++++++++
 src/codegen/llvm/codegen_llvm_visitor.hpp | 26 ++++++++++++++++--
 src/main.cpp                              | 25 +++++++++++++-----
 test/integration/mod/procedure.mod        |  8 +++++-
 test/unit/codegen/llvm.cpp                | 32 ++++++++++++++++++++---
 6 files changed, 105 insertions(+), 13 deletions(-)

diff --git a/cmake/LLVMHelper.cmake b/cmake/LLVMHelper.cmake
index 982af48660..5d451697b9 100644
--- a/cmake/LLVMHelper.cmake
+++ b/cmake/LLVMHelper.cmake
@@ -5,7 +5,7 @@
 find_package(LLVM REQUIRED CONFIG)
 
 # include LLVM header and core library
-llvm_map_components_to_libnames(LLVM_LIBS_TO_LINK core)
+llvm_map_components_to_libnames(LLVM_LIBS_TO_LINK core native)
 set(CMAKE_REQUIRED_INCLUDES ${LLVM_INCLUDE_DIRS})
 set(CMAKE_REQUIRED_LIBRARIES ${LLVM_LIBS_TO_LINK})
 
diff --git a/src/codegen/llvm/codegen_llvm_visitor.cpp b/src/codegen/llvm/codegen_llvm_visitor.cpp
index 6e1177cbec..d99e519dca 100644
--- a/src/codegen/llvm/codegen_llvm_visitor.cpp
+++ b/src/codegen/llvm/codegen_llvm_visitor.cpp
@@ -25,6 +25,24 @@ namespace codegen {
 /*                                  Helper routines                                     */
 /****************************************************************************************/
 
+void CodegenLLVMVisitor::run_llvm_opt_passes() {
+    /// run some common optimisation passes that are commonly suggested
+    fpm.add(llvm::createInstructionCombiningPass());
+    fpm.add(llvm::createReassociatePass());
+    fpm.add(llvm::createGVNPass());
+    fpm.add(llvm::createCFGSimplificationPass());
+
+    /// initialize pass manager
+    fpm.doInitialization();
+
+    /// iterate over all functions and run the optimisation passes
+    auto& functions = module->getFunctionList();
+    for (auto& function: functions) {
+        llvm::verifyFunction(function);
+        fpm.run(function);
+    }
+}
+
 void CodegenLLVMVisitor::visit_procedure_or_function(const ast::Block& node) {
     const auto& name = node.get_node_name();
     const auto& parameters = node.get_parameters();
@@ -95,6 +113,7 @@ void CodegenLLVMVisitor::visit_procedure_or_function(const ast::Block& node) {
     local_named_values = nullptr;
 }
 
+
 /****************************************************************************************/
 /*                             Overloaded visitor routines                              */
 /****************************************************************************************/
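// [Editor's note -- not part of the patch] llvm::verifyFunction() returns true
// when the IR is *broken*, and the result is discarded in run_llvm_opt_passes()
// above, so verification failures surface only through assertions in debug
// builds of LLVM. A stricter variant (a sketch only, not in this series) would
// be:
//
//     if (llvm::verifyFunction(function, &llvm::errs()))
//         throw std::runtime_error("generated IR failed verification");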
@@ -173,6 +192,12 @@ void CodegenLLVMVisitor::visit_local_list_statemen
 }
 
 void CodegenLLVMVisitor::visit_program(const ast::Program& node) {
     node.visit_children(*this);
+
+    if (opt_passes) {
+        logger->info("Running LLVM optimisation passes");
+        run_llvm_opt_passes();
+    }
+
     // Keep this for easier development (maybe move to debug mode later).
     std::cout << print_module();
 }
diff --git a/src/codegen/llvm/codegen_llvm_visitor.hpp b/src/codegen/llvm/codegen_llvm_visitor.hpp
index 801922cdc1..6b94ecffbe 100644
--- a/src/codegen/llvm/codegen_llvm_visitor.hpp
+++ b/src/codegen/llvm/codegen_llvm_visitor.hpp
@@ -23,7 +23,12 @@
 
 #include "llvm/IR/IRBuilder.h"
 #include "llvm/IR/LLVMContext.h"
+#include "llvm/IR/LegacyPassManager.h"
 #include "llvm/IR/Module.h"
+#include "llvm/IR/Verifier.h"
+#include "llvm/Transforms/InstCombine/InstCombine.h"
+#include "llvm/Transforms/Scalar.h"
+#include "llvm/Transforms/Scalar/GVN.h"
 
 namespace nmodl {
 namespace codegen {
@@ -56,12 +61,25 @@ class CodegenLLVMVisitor: public visitor::ConstAstVisitor {
 
     llvm::IRBuilder<> builder;
 
+    llvm::legacy::FunctionPassManager fpm;
+
     // Stack to hold visited values
    std::vector<llvm::Value*> values;
 
     // Pointer to the local symbol table.
     llvm::ValueSymbolTable* local_named_values = nullptr;
 
+    // Run optimisation passes if true
+    bool opt_passes;
+
+    /**
+     * \brief Run LLVM optimisation passes on generated IR
+     *
+     * LLVM provides a number of optimisation passes that can be run on the generated IR.
+     * Here we run common LLVM optimisation passes that benefit the generated code.
+     */
+    void run_llvm_opt_passes();
+
   public:
     /**
      * \brief Constructs the LLVM code generator visitor
@@ -69,10 +87,14 @@ class CodegenLLVMVisitor: public visitor::ConstAstVisitor {
      * This constructor instantiates an NMODL LLVM code generator. This is
      * just template to work with initial implementation.
      */
-    CodegenLLVMVisitor(const std::string& mod_filename, const std::string& output_dir)
+    CodegenLLVMVisitor(const std::string& mod_filename,
+                       const std::string& output_dir,
+                       bool opt_passes)
         : mod_filename(mod_filename)
-        , output_dir(output_dir) {}
+        , output_dir(output_dir)
+        , opt_passes(opt_passes)
+        , builder(*context)
+        , fpm(module.get()) {}
 
     /**
      * Visit nmodl function or procedure
diff --git a/src/main.cpp b/src/main.cpp
index ec0d32b848..336b877686 100644
--- a/src/main.cpp
+++ b/src/main.cpp
@@ -166,6 +166,14 @@ int main(int argc, const char* argv[]) {
     /// floating point data type
     std::string data_type("double");
 
+#ifdef NMODL_LLVM_BACKEND
+    /// generate llvm IR
+    bool llvm_ir(false);
+
+    /// run llvm optimisation passes
+    bool llvm_opt_passes(false);
+#endif
+
     app.get_formatter()->column_width(40);
     app.set_help_all_flag("-H,--help-all", "Print this help message including all sub-commands");
 
@@ -173,10 +181,6 @@ int main(int argc, const char* argv[]) {
         ->ignore_case()
         ->check(CLI::IsMember({"trace", "debug", "info", "warning", "error", "critical", "off"}));
 
-#ifdef NMODL_LLVM_BACKEND
-    app.add_flag("--llvm", llvm_backend, "Enable LLVM based code generation")->ignore_case();
-#endif
-
     app.add_option("file", mod_files, "One or more MOD files to process")
         ->ignore_case()
         ->required()
@@ -276,6 +280,15 @@ int main(int argc, const char* argv[]) {
         optimize_ionvar_copies_codegen,
         "Optimize copies of ion variables ({})"_format(optimize_ionvar_copies_codegen))->ignore_case();
 
+#ifdef NMODL_LLVM_BACKEND
+    auto llvm_opt = app.add_subcommand("llvm", "LLVM code generation option")->ignore_case();
+    llvm_opt->add_flag("--ir",
+        llvm_ir,
+        "Generate LLVM IR ({})"_format(llvm_ir))->ignore_case();
+    llvm_opt->add_flag("--opt",
+        llvm_opt_passes,
+        "Run LLVM optimisation passes ({})"_format(llvm_opt_passes))->ignore_case();
+#endif
     // clang-format on
 
     CLI11_PARSE(app, argc, argv);
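// [Editor's note -- not part of the patch] With the subcommand defined above,
// the expected invocation looks like the following, assuming the built binary
// is called `nmodl` and a mod file is available in the working directory:
//
//     nmodl procedure.mod llvm --ir --opt
//
// `--ir` selects the LLVM code generation visitor and `--opt` additionally
// runs the pass pipeline from run_llvm_opt_passes() before the module is
// printed.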
@@ -576,9 +589,9 @@ int main(int argc, const char* argv[]) {
     }
 
 #ifdef NMODL_LLVM_BACKEND
-        if (llvm_backend) {
+        if (llvm_ir) {
             logger->info("Running LLVM backend code generator");
-            CodegenLLVMVisitor visitor(modfile, output_dir);
+            CodegenLLVMVisitor visitor(modfile, output_dir, llvm_opt_passes);
             visitor.visit_program(*ast);
         }
 #endif
diff --git a/test/integration/mod/procedure.mod b/test/integration/mod/procedure.mod
index 4017b6a505..4a45af7d1e 100644
--- a/test/integration/mod/procedure.mod
+++ b/test/integration/mod/procedure.mod
@@ -21,7 +21,7 @@ PROCEDURE complex_sum(v) {
     }
 }
 
-PROCEDURE loop_function(v) {
+PROCEDURE loop_proc(v) {
     LOCAL i
     i = 0
     WHILE(i < 10) {
@@ -29,3 +29,9 @@ PROCEDURE loop_function(v) {
         i = i + 1
     }
 }
+
+FUNCTION square(x) {
+    LOCAL res
+    res = x * x
+    square = res
+}
diff --git a/test/unit/codegen/llvm.cpp b/test/unit/codegen/llvm.cpp
index 44ca18391b..9c86e8c30a 100644
--- a/test/unit/codegen/llvm.cpp
+++ b/test/unit/codegen/llvm.cpp
@@ -23,14 +23,14 @@ using nmodl::parser::NmodlDriver;
 // Utility to get LLVM module as a string
 //=============================================================================
 
-std::string run_llvm_visitor(const std::string& text) {
+std::string run_llvm_visitor(const std::string& text, bool opt = false) {
     NmodlDriver driver;
     const auto& ast = driver.parse_string(text);
 
     SymtabVisitor().visit_program(*ast);
     InlineVisitor().visit_program(*ast);
 
-    codegen::CodegenLLVMVisitor llvm_visitor("unknown", ".");
+    codegen::CodegenLLVMVisitor llvm_visitor("unknown", ".", opt);
     llvm_visitor.visit_program(*ast);
     return llvm_visitor.print_module();
 }
@@ -52,10 +52,11 @@ SCENARIO("Binary expression", "[visitor][llvm]") {
             std::string module_string = run_llvm_visitor(nmodl_text);
             std::smatch m;
 
-            // Check the values are loaded correctly and added
             std::regex rhs(R"(%1 = load double, double\* %b)");
             std::regex lhs(R"(%2 = load double, double\* %a)");
             std::regex res(R"(%3 = fadd double %2, %1)");
+
+            // Check the values are loaded correctly and added
             REQUIRE(std::regex_search(module_string, m, rhs));
             REQUIRE(std::regex_search(module_string, m, lhs));
             REQUIRE(std::regex_search(module_string, m, res));
@@ -255,3 +256,28 @@ SCENARIO("Unary expression", "[visitor][llvm]") {
         }
     }
 }
+
+//=============================================================================
+// Optimization : dead code removal
+//=============================================================================
+
+SCENARIO("Dead code removal", "[visitor][llvm][opt]") {
+    GIVEN("Procedure using local variables, without any side effects") {
+        std::string nmodl_text = R"(
+            PROCEDURE add(a, b) {
+                LOCAL i
+                i = a + b
+            }
+        )";
+
+        THEN("with optimisation enabled, all ops are eliminated") {
+            std::string module_string = run_llvm_visitor(nmodl_text, true);
+            std::smatch m;
+
+            // Check if the values are optimised out
+            std::regex empty_proc(
+                R"(define void @add\(double %a1, double %b2\) \{\n(\s)*ret void\n\})");
+            REQUIRE(std::regex_search(module_string, m, empty_proc));
+        }
+    }
+}
\ No newline at end of file
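Editor's note: for readers following the pass pipeline introduced in the patch
above, the snippet below is a minimal, self-contained sketch of the same
four-pass setup run against a trivial hand-built module. It assumes an LLVM
9-11 toolchain (the versions this series targets) and linking via
`llvm-config --cxxflags --ldflags --libs core`; the module construction here is
illustrative only and is not taken from the NMODL sources.

    #include "llvm/IR/IRBuilder.h"
    #include "llvm/IR/LLVMContext.h"
    #include "llvm/IR/LegacyPassManager.h"
    #include "llvm/IR/Module.h"
    #include "llvm/IR/Verifier.h"
    #include "llvm/Support/raw_ostream.h"
    #include "llvm/Transforms/InstCombine/InstCombine.h"
    #include "llvm/Transforms/Scalar.h"
    #include "llvm/Transforms/Scalar/GVN.h"

    int main() {
        llvm::LLVMContext context;
        llvm::Module module("example", context);
        llvm::IRBuilder<> builder(context);

        // Create `define void @empty()` containing a single `ret void`, so
        // the pass manager has something to run on.
        auto* func_type = llvm::FunctionType::get(builder.getVoidTy(), /*isVarArg=*/false);
        auto* func = llvm::Function::Create(func_type,
                                            llvm::Function::ExternalLinkage,
                                            "empty",
                                            &module);
        builder.SetInsertPoint(llvm::BasicBlock::Create(context, "", func));
        builder.CreateRetVoid();

        // Same four passes as run_llvm_opt_passes() in the patch above.
        llvm::legacy::FunctionPassManager fpm(&module);
        fpm.add(llvm::createInstructionCombiningPass());
        fpm.add(llvm::createReassociatePass());
        fpm.add(llvm::createGVNPass());
        fpm.add(llvm::createCFGSimplificationPass());
        fpm.doInitialization();

        llvm::verifyFunction(*func);
        fpm.run(*func);
        module.print(llvm::outs(), nullptr);
        return 0;
    }

This is also the mechanism the "Dead code removal" test above relies on: the
loads, fadd and store of `add(a, b)` have no observable effect, so the pipeline
reduces the body to a lone `ret void`.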
From de3308801605bf31c38560f9d13c076bbd9b0117 Mon Sep 17 00:00:00 2001
From: George Mitenkov
Date: Wed, 30 Dec 2020 22:14:00 +0300
Subject: [PATCH 118/331] Add function call LLVM code generation (#477)

This patch adds support for function call code generation, particularly:

- User-defined procedures and functions can now be lowered to LLVM IR.
- A framework for external method calls (e.g. sin, exp, etc.) has been
  created; currently `exp` and `pow` are supported.
- Corresponding tests added.

fixes #472
---
 src/codegen/llvm/codegen_llvm_visitor.cpp | 103 +++++++++++++++++++--
 src/codegen/llvm/codegen_llvm_visitor.hpp |  32 ++++++-
 test/unit/CMakeLists.txt                  |   3 +-
 test/unit/codegen/llvm.cpp                | 104 +++++++++++++++++++++-
 4 files changed, 231 insertions(+), 11 deletions(-)

diff --git a/src/codegen/llvm/codegen_llvm_visitor.cpp b/src/codegen/llvm/codegen_llvm_visitor.cpp
index d99e519dca..430f3d78de 100644
--- a/src/codegen/llvm/codegen_llvm_visitor.cpp
+++ b/src/codegen/llvm/codegen_llvm_visitor.cpp
@@ -7,8 +7,8 @@
 
 #include "codegen/llvm/codegen_llvm_visitor.hpp"
 #include "ast/all.hpp"
+#include "codegen/codegen_helper_visitor.hpp"
 #include "visitors/rename_visitor.hpp"
-#include "visitors/visitor_utils.hpp"
 
 #include "llvm/IR/BasicBlock.h"
 #include "llvm/IR/Constants.h"
@@ -44,7 +44,56 @@ void CodegenLLVMVisitor::run_llvm_opt_passes() {
     }
 }
 
-void CodegenLLVMVisitor::visit_procedure_or_function(const ast::Block& node) {
+void CodegenLLVMVisitor::create_external_method_call(const std::string& name,
+                                                     const ast::ExpressionVector& arguments) {
+    std::vector<llvm::Value*> argument_values;
+    std::vector<llvm::Type*> argument_types;
+    for (const auto& arg: arguments) {
+        arg->accept(*this);
+        llvm::Value* value = values.back();
+        llvm::Type* type = value->getType();
+        values.pop_back();
+        argument_types.push_back(type);
+        argument_values.push_back(value);
+    }
+
+#define DISPATCH(method_name, intrinsic)                                                           \
+    if (name == method_name) {                                                                     \
+        llvm::Value* result = builder.CreateIntrinsic(intrinsic, argument_types, argument_values); \
+        values.push_back(result);                                                                  \
+        return;                                                                                    \
+    }
+
+    DISPATCH("exp", llvm::Intrinsic::exp);
+    DISPATCH("pow", llvm::Intrinsic::pow);
+#undef DISPATCH
+
+    throw std::runtime_error("Error: External method " + name + " is not currently supported");
+}
+
+void CodegenLLVMVisitor::create_function_call(llvm::Function* func,
+                                              const std::string& name,
+                                              const ast::ExpressionVector& arguments) {
+    // Check that function is called with the expected number of arguments.
+    if (arguments.size() != func->arg_size()) {
+        throw std::runtime_error("Error: Incorrect number of arguments passed");
+    }
+
+    // Process each argument and add it to a vector to pass to the function call instruction. Note
+    // that type checks are not needed here as NMODL operates on doubles by default.
+    std::vector<llvm::Value*> argument_values;
+    for (const auto& arg: arguments) {
+        arg->accept(*this);
+        llvm::Value* value = values.back();
+        values.pop_back();
+        argument_values.push_back(value);
+    }
+
+    llvm::Value* call = builder.CreateCall(func, argument_values);
+    values.push_back(call);
+}
+
+void CodegenLLVMVisitor::emit_procedure_or_function_declaration(const ast::Block& node) {
     const auto& name = node.get_node_name();
     const auto& parameters = node.get_parameters();
 
@@ -57,11 +106,17 @@ void CodegenLLVMVisitor::visit_procedure_or_function(const ast::Block& node) {
     llvm::Type* return_type = node.is_function_block() ? llvm::Type::getDoubleTy(*context)
                                                        : llvm::Type::getVoidTy(*context);
 
-    llvm::Function* func =
-        llvm::Function::Create(llvm::FunctionType::get(return_type, arg_types, /*isVarArg=*/false),
-                               llvm::Function::ExternalLinkage,
-                               name,
-                               *module);
+    // Create a function that is automatically inserted into module's symbol table.
+ llvm::Function::Create(llvm::FunctionType::get(return_type, arg_types, /*isVarArg=*/false), + llvm::Function::ExternalLinkage, + name, + *module); +} + +void CodegenLLVMVisitor::visit_procedure_or_function(const ast::Block& node) { + const auto& name = node.get_node_name(); + const auto& parameters = node.get_parameters(); + llvm::Function* func = module->getFunction(name); // Create the entry basic block of the function/procedure and point the local named values table // to the symbol table. @@ -175,6 +230,22 @@ void CodegenLLVMVisitor::visit_function_block(const ast::FunctionBlock& node) { visit_procedure_or_function(node); } +void CodegenLLVMVisitor::visit_function_call(const ast::FunctionCall& node) { + const auto& name = node.get_node_name(); + auto func = module->getFunction(name); + if (func) { + create_function_call(func, name, node.get_arguments()); + } else { + auto symbol = sym_tab->lookup(name); + if (symbol && symbol->has_any_property(symtab::syminfo::NmodlType::extern_method)) { + create_external_method_call(name, node.get_arguments()); + } else { + throw std::runtime_error("Error: Unknown function name: " + name + + ". (External functions references are not supported)"); + } + } +} + void CodegenLLVMVisitor::visit_integer(const ast::Integer& node) { const auto& constant = llvm::ConstantInt::get(llvm::Type::getInt32Ty(*context), node.get_value()); @@ -191,6 +262,24 @@ void CodegenLLVMVisitor::visit_local_list_statement(const ast::LocalListStatemen } void CodegenLLVMVisitor::visit_program(const ast::Program& node) { + // Before generating LLVM, gather information about AST. For now, information about functions + // and procedures is used only. + CodegenHelperVisitor v; + CodegenInfo info = v.analyze(node); + + // For every function and procedure, generate its declaration. Thus, we can look up + // `llvm::Function` in the symbol table in the module. + for (const auto& func: info.functions) { + emit_procedure_or_function_declaration(*func); + } + for (const auto& proc: info.procedures) { + emit_procedure_or_function_declaration(*proc); + } + + // Set the AST symbol table. + sym_tab = node.get_symbol_table(); + + // Proceed with code generation. node.visit_children(*this); if (opt_passes) { diff --git a/src/codegen/llvm/codegen_llvm_visitor.hpp b/src/codegen/llvm/codegen_llvm_visitor.hpp index 6b94ecffbe..32347bdabd 100644 --- a/src/codegen/llvm/codegen_llvm_visitor.hpp +++ b/src/codegen/llvm/codegen_llvm_visitor.hpp @@ -18,6 +18,7 @@ #include #include +#include "symtab/symbol_table.hpp" #include "utils/logger.hpp" #include "visitors/ast_visitor.hpp" @@ -69,7 +70,10 @@ class CodegenLLVMVisitor: public visitor::ConstAstVisitor { // Pointer to the local symbol table. llvm::ValueSymbolTable* local_named_values = nullptr; - // Run optimisation passes if true + // Pointer to AST symbol table. + symtab::SymbolTable* sym_tab; + + // Run optimisation passes if true. 
bool opt_passes; /** @@ -96,6 +100,31 @@ class CodegenLLVMVisitor: public visitor::ConstAstVisitor { , builder(*context) , fpm(module.get()) {} + /** + * Create a function call to an external method + * \param name external method name + * \param arguments expressions passed as arguments to the given external method + */ + void create_external_method_call(const std::string& name, + const ast::ExpressionVector& arguments); + + /** + * Create a function call to NMODL function or procedure in the same mod file + * \param func LLVM function corresponding ti this call + * \param name function name + * \param arguments expressions passed as arguments to the function call + */ + void create_function_call(llvm::Function* func, + const std::string& name, + const ast::ExpressionVector& arguments); + + /** + * Emit function or procedure declaration in LLVM given the node + * + * \param node the AST node representing the function or procedure in NMODL + */ + void emit_procedure_or_function_declaration(const ast::Block& node); + /** * Visit nmodl function or procedure * \param node the AST node representing the function or procedure in NMODL @@ -107,6 +136,7 @@ class CodegenLLVMVisitor: public visitor::ConstAstVisitor { void visit_boolean(const ast::Boolean& node) override; void visit_double(const ast::Double& node) override; void visit_function_block(const ast::FunctionBlock& node) override; + void visit_function_call(const ast::FunctionCall& node) override; void visit_integer(const ast::Integer& node) override; void visit_local_list_statement(const ast::LocalListStatement& node) override; void visit_procedure_block(const ast::ProcedureBlock& node) override; diff --git a/test/unit/CMakeLists.txt b/test/unit/CMakeLists.txt index 29957a7530..f9c76827fd 100644 --- a/test/unit/CMakeLists.txt +++ b/test/unit/CMakeLists.txt @@ -109,13 +109,14 @@ if(NMODL_ENABLE_LLVM) add_executable(testllvm visitor/main.cpp codegen/llvm.cpp) target_link_libraries( testllvm + llvm_codegen + codegen visitor symtab lexer util test_util printer - llvm_codegen ${NMODL_WRAPPER_LIBS} ${LLVM_LIBS_TO_LINK}) set(CODEGEN_TEST testllvm) diff --git a/test/unit/codegen/llvm.cpp b/test/unit/codegen/llvm.cpp index 9c86e8c30a..d2c0a65e86 100644 --- a/test/unit/codegen/llvm.cpp +++ b/test/unit/codegen/llvm.cpp @@ -12,7 +12,6 @@ #include "codegen/llvm/codegen_llvm_visitor.hpp" #include "parser/nmodl_driver.hpp" #include "visitors/checkparent_visitor.hpp" -#include "visitors/inline_visitor.hpp" #include "visitors/symtab_visitor.hpp" using namespace nmodl; @@ -28,7 +27,6 @@ std::string run_llvm_visitor(const std::string& text, bool opt = false) { const auto& ast = driver.parse_string(text); SymtabVisitor().visit_program(*ast); - InlineVisitor().visit_program(*ast); codegen::CodegenLLVMVisitor llvm_visitor("unknown", ".", opt); llvm_visitor.visit_program(*ast); @@ -156,6 +154,108 @@ SCENARIO("Function", "[visitor][llvm]") { } } +//============================================================================= +// FunctionCall +//============================================================================= + +SCENARIO("Function call", "[visitor][llvm]") { + GIVEN("A call to procedure") { + std::string nmodl_text = R"( + PROCEDURE bar() {} + FUNCTION foo() { + bar() + } + )"; + + THEN("a void call instruction is created") { + std::string module_string = run_llvm_visitor(nmodl_text); + std::smatch m; + + // Check for call instruction. 
+ std::regex call(R"(call void @bar\(\))"); + REQUIRE(std::regex_search(module_string, m, call)); + } + } + + GIVEN("A call to function declared below the caller") { + std::string nmodl_text = R"( + FUNCTION foo(x) { + foo = 4 * bar() + } + FUNCTION bar() { + bar = 5 + } + )"; + + THEN("a correct call instruction is created") { + std::string module_string = run_llvm_visitor(nmodl_text); + std::smatch m; + + // Check for call instruction. + std::regex call(R"(%[0-9]+ = call double @bar\(\))"); + REQUIRE(std::regex_search(module_string, m, call)); + } + } + + GIVEN("A call to function with arguments") { + std::string nmodl_text = R"( + FUNCTION foo(x, y) { + foo = 4 * x - y + } + FUNCTION bar(i) { + bar = foo(i, 4) + } + )"; + + THEN("arguments are processed before the call and passed to call instruction") { + std::string module_string = run_llvm_visitor(nmodl_text); + std::smatch m; + + // Check correct arguments. + std::regex i(R"(%1 = load double, double\* %i)"); + std::regex call(R"(call double @foo\(double %1, double 4.000000e\+00\))"); + REQUIRE(std::regex_search(module_string, m, i)); + REQUIRE(std::regex_search(module_string, m, call)); + } + } + + GIVEN("A call to external method") { + std::string nmodl_text = R"( + FUNCTION bar(i) { + bar = exp(i) + } + )"; + + THEN("LLVM intrinsic corresponding to this method is created") { + std::string module_string = run_llvm_visitor(nmodl_text); + std::smatch m; + + // Check for intrinsic declaration. + std::regex exp(R"(declare double @llvm\.exp\.f64\(double\))"); + REQUIRE(std::regex_search(module_string, m, exp)); + + // Check the correct call is made. + std::regex call(R"(call double @llvm\.exp\.f64\(double %[0-9]+\))"); + REQUIRE(std::regex_search(module_string, m, call)); + } + } + + GIVEN("A call to function with the wrong number of arguments") { + std::string nmodl_text = R"( + FUNCTION foo(x, y) { + foo = 4 * x - y + } + FUNCTION bar(i) { + bar = foo(i) + } + )"; + + THEN("a runtime error is thrown") { + REQUIRE_THROWS_AS(run_llvm_visitor(nmodl_text), std::runtime_error); + } + } +} + //============================================================================= // LocalList and LocalVar //============================================================================= From d0fe34d56aea6d771fb33f8ffb46f18f3166345f Mon Sep 17 00:00:00 2001 From: George Mitenkov Date: Thu, 31 Dec 2020 00:49:13 +0300 Subject: [PATCH 119/331] Support for IndexedName codegen (#478) LLVM code generation for `IndexedName`s. - Added code generation for initialising arrays in LOCAL blocks (with both integer constants and macros). - Added support for indexing arrays. 
fixes #467
---
 src/codegen/llvm/codegen_llvm_visitor.cpp |  88 +++++++++++++++--
 src/codegen/llvm/codegen_llvm_visitor.hpp |  29 ++++++
 test/unit/codegen/llvm.cpp                | 111 ++++++++++++++++++++++
 3 files changed, 220 insertions(+), 8 deletions(-)

diff --git a/src/codegen/llvm/codegen_llvm_visitor.cpp b/src/codegen/llvm/codegen_llvm_visitor.cpp
index 430f3d78de..b2a09fdd96 100644
--- a/src/codegen/llvm/codegen_llvm_visitor.cpp
+++ b/src/codegen/llvm/codegen_llvm_visitor.cpp
@@ -25,6 +25,44 @@ namespace codegen {
 /*                                  Helper routines                                     */
 /****************************************************************************************/
 
+bool CodegenLLVMVisitor::check_array_bounds(const ast::IndexedName& node, unsigned index) {
+    llvm::Type* array_type =
+        local_named_values->lookup(node.get_node_name())->getType()->getPointerElementType();
+    unsigned length = array_type->getArrayNumElements();
+    return 0 <= index && index < length;
+}
+
+llvm::Value* CodegenLLVMVisitor::create_gep(const std::string& name, unsigned index) {
+    llvm::Type* index_type = llvm::Type::getInt32Ty(*context);
+    std::vector<llvm::Value*> indices;
+    indices.push_back(llvm::ConstantInt::get(index_type, 0));
+    indices.push_back(llvm::ConstantInt::get(index_type, index));
+
+    return builder.CreateInBoundsGEP(local_named_values->lookup(name), indices);
+}
+
+llvm::Value* CodegenLLVMVisitor::codegen_indexed_name(const ast::IndexedName& node) {
+    unsigned index = get_array_index_or_length(node);
+
+    // Check if index is within array bounds.
+    if (!check_array_bounds(node, index))
+        throw std::runtime_error("Error: Index is out of bounds");
+
+    return create_gep(node.get_node_name(), index);
+}
+
+unsigned CodegenLLVMVisitor::get_array_index_or_length(const ast::IndexedName& indexed_name) {
+    auto integer = std::dynamic_pointer_cast<ast::Integer>(indexed_name.get_length());
+    if (!integer)
+        throw std::runtime_error("Error: expecting integer index or length");
+
+    // Check if integer value is taken from a macro.
+ if (!integer->get_macro()) + return integer->get_value(); + const auto& macro = sym_tab->lookup(integer->get_macro()->get_node_name()); + return static_cast(*macro->get_value()); +} + void CodegenLLVMVisitor::run_llvm_opt_passes() { /// run some common optimisation passes that are commonly suggested fpm.add(llvm::createInstructionCombiningPass()); @@ -43,7 +81,6 @@ void CodegenLLVMVisitor::run_llvm_opt_passes() { } } - void CodegenLLVMVisitor::create_external_method_call(const std::string& name, const ast::ExpressionVector& arguments) { std::vector argument_values; @@ -187,8 +224,17 @@ void CodegenLLVMVisitor::visit_binary_expression(const ast::BinaryExpression& no if (!var) { throw std::runtime_error("Error: only VarName assignment is currently supported.\n"); } - llvm::Value* alloca = local_named_values->lookup(var->get_node_name()); - builder.CreateStore(rhs, alloca); + + const auto& identifier = var->get_name(); + if (identifier->is_name()) { + llvm::Value* alloca = local_named_values->lookup(var->get_node_name()); + builder.CreateStore(rhs, alloca); + } else if (identifier->is_indexed_name()) { + auto indexed_name = std::dynamic_pointer_cast(identifier); + builder.CreateStore(rhs, codegen_indexed_name(*indexed_name)); + } else { + throw std::runtime_error("Error: Unsupported variable type"); + } return; } @@ -254,10 +300,22 @@ void CodegenLLVMVisitor::visit_integer(const ast::Integer& node) { void CodegenLLVMVisitor::visit_local_list_statement(const ast::LocalListStatement& node) { for (const auto& variable: node.get_variables()) { - // LocalVar always stores a Name. - auto name = variable->get_node_name(); - llvm::Type* var_type = llvm::Type::getDoubleTy(*context); - llvm::Value* alloca = builder.CreateAlloca(var_type, /*ArraySize=*/nullptr, name); + std::string name = variable->get_node_name(); + const auto& identifier = variable->get_name(); + // Local variable can be a scalar (Node AST class) or an array (IndexedName AST class). For + // each case, create memory allocations with the corresponding LLVM type. + llvm::Type* var_type; + if (identifier->is_indexed_name()) { + auto indexed_name = std::dynamic_pointer_cast(identifier); + unsigned length = get_array_index_or_length(*indexed_name); + var_type = llvm::ArrayType::get(llvm::Type::getDoubleTy(*context), length); + } else if (identifier->is_name()) { + // This case corresponds to a scalar local variable. Its type is double by default. + var_type = llvm::Type::getDoubleTy(*context); + } else { + throw std::runtime_error("Error: Unsupported local variable type"); + } + builder.CreateAlloca(var_type, /*ArraySize=*/nullptr, name); } } @@ -310,7 +368,21 @@ void CodegenLLVMVisitor::visit_unary_expression(const ast::UnaryExpression& node } void CodegenLLVMVisitor::visit_var_name(const ast::VarName& node) { - llvm::Value* var = builder.CreateLoad(local_named_values->lookup(node.get_node_name())); + const auto& identifier = node.get_name(); + if (!identifier->is_name() && !identifier->is_indexed_name()) + throw std::runtime_error("Error: Unsupported variable type"); + + llvm::Value* ptr; + if (identifier->is_name()) + ptr = local_named_values->lookup(node.get_node_name()); + + if (identifier->is_indexed_name()) { + auto indexed_name = std::dynamic_pointer_cast(identifier); + ptr = codegen_indexed_name(*indexed_name); + } + + // Finally, load the variable from the pointer value. 
+    llvm::Value* var = builder.CreateLoad(ptr);
     values.push_back(var);
 }
 
diff --git a/src/codegen/llvm/codegen_llvm_visitor.hpp b/src/codegen/llvm/codegen_llvm_visitor.hpp
index 32347bdabd..be4eb04867 100644
--- a/src/codegen/llvm/codegen_llvm_visitor.hpp
+++ b/src/codegen/llvm/codegen_llvm_visitor.hpp
@@ -100,6 +100,35 @@ class CodegenLLVMVisitor: public visitor::ConstAstVisitor {
         , builder(*context)
         , fpm(module.get()) {}
 
+    /**
+     * Checks if array index specified by the given IndexedName is within bounds
+     * \param node IndexedName representing array
+     * \return \c true if the index is within bounds
+     */
+    bool check_array_bounds(const ast::IndexedName& node, unsigned index);
+
+    /**
+     * Generates LLVM code for the given IndexedName
+     * \param node IndexedName NMODL AST node
+     * \return LLVM code generated for this AST node
+     */
+    llvm::Value* codegen_indexed_name(const ast::IndexedName& node);
+
+    /**
+     * Returns GEP instruction to 1D array
+     * \param name 1D array name
+     * \param index element index
+     * \return GEP instruction value
+     */
+    llvm::Value* create_gep(const std::string& name, unsigned index);
+
+    /**
+     * Returns array index or length from given IndexedName
+     * \param node IndexedName representing array
+     * \return array index or length
+     */
+    unsigned get_array_index_or_length(const ast::IndexedName& node);
+
     /**
      * Create a function call to an external method
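Editor's note: the tests that follow pin down the lowering these declarations
imply. For a macro-sized local the pairing is (both lines are taken from the
tests below; they are shown together here only for readability):

    DEFINE N 100 ... LOCAL x[N]        (NMODL)
    %x = alloca [100 x double]         (LLVM IR)

get_array_index_or_length() resolves `N` through the symbol table entry created
for the DEFINE rather than re-reading the macro text, and the same path serves
constant indices such as `x[1]`, where create_gep() emits the
`getelementptr inbounds` instruction with a leading `i32 0` (stepping through
the pointer to the aggregate) followed by the element index.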
diff --git a/test/unit/codegen/llvm.cpp b/test/unit/codegen/llvm.cpp
index d2c0a65e86..0ceadbe6f1 100644
--- a/test/unit/codegen/llvm.cpp
+++ b/test/unit/codegen/llvm.cpp
@@ -116,6 +116,31 @@ SCENARIO("Binary expression", "[visitor][llvm]") {
     }
 }
 
+//=============================================================================
+// Define
+//=============================================================================
+
+SCENARIO("Define", "[visitor][llvm]") {
+    GIVEN("Procedure with array variable of length specified by DEFINE") {
+        std::string nmodl_text = R"(
+            DEFINE N 100
+
+            PROCEDURE foo() {
+                LOCAL x[N]
+            }
+        )";
+
+        THEN("macro is expanded and array is allocated") {
+            std::string module_string = run_llvm_visitor(nmodl_text);
+            std::smatch m;
+
+            // Check stack allocation for the array x
+            std::regex array(R"(%x = alloca \[100 x double\])");
+            REQUIRE(std::regex_search(module_string, m, array));
+        }
+    }
+}
+
 //=============================================================================
 // FunctionBlock
 //=============================================================================
@@ -256,6 +281,92 @@ SCENARIO("Function call", "[visitor][llvm]") {
     }
 }
 
+//=============================================================================
+// IndexedName
+//=============================================================================
+
+SCENARIO("Indexed name", "[visitor][llvm]") {
+    GIVEN("Procedure with a local array variable") {
+        std::string nmodl_text = R"(
+            PROCEDURE foo() {
+                LOCAL x[2]
+            }
+        )";
+
+        THEN("array is allocated") {
+            std::string module_string = run_llvm_visitor(nmodl_text);
+            std::smatch m;
+
+            std::regex array(R"(%x = alloca \[2 x double\])");
+            REQUIRE(std::regex_search(module_string, m, array));
+        }
+    }
+
+    GIVEN("Procedure with a local array assignment") {
+        std::string nmodl_text = R"(
+            PROCEDURE foo() {
+                LOCAL x[2]
+                x[1] = 3
+            }
+        )";
+
+        THEN("element is stored to the array") {
+            std::string module_string = run_llvm_visitor(nmodl_text);
+            std::smatch m;
+
+            // Check GEP is created correctly to point at array element.
+            std::regex GEP(
+                R"(%1 = getelementptr inbounds \[2 x double\], \[2 x double\]\* %x, i32 0, i32 1)");
+            REQUIRE(std::regex_search(module_string, m, GEP));
+
+            // Check the value is stored to the pointer.
+            std::regex store(R"(store double 3.000000e\+00, double\* %1)");
+            REQUIRE(std::regex_search(module_string, m, store));
+        }
+    }
+
+    GIVEN("Procedure with an assignment of array element") {
+        std::string nmodl_text = R"(
+            PROCEDURE foo() {
+                LOCAL x[2], y
+                x[1] = 3
+                y = x[1]
+            }
+        )";
+
+        THEN("array element is stored to the variable") {
+            std::string module_string = run_llvm_visitor(nmodl_text);
+            std::smatch m;
+
+            // Check GEP is created correctly to point at array element.
+            std::regex GEP(
+                R"(%2 = getelementptr inbounds \[2 x double\], \[2 x double\]\* %x, i32 0, i32 1)");
+            REQUIRE(std::regex_search(module_string, m, GEP));
+
+            // Check the value is loaded from the pointer.
+            std::regex load(R"(%3 = load double, double\* %2)");
+            REQUIRE(std::regex_search(module_string, m, load));
+
+            // Check the value is stored to the variable.
+            std::regex store(R"(store double %3, double\* %y)");
+            REQUIRE(std::regex_search(module_string, m, store));
+        }
+    }
+
+    GIVEN("Array with out of bounds access") {
+        std::string nmodl_text = R"(
+            PROCEDURE foo() {
+                LOCAL x[2]
+                x[5] = 3
+            }
+        )";
+
+        THEN("error is thrown") {
+            REQUIRE_THROWS_AS(run_llvm_visitor(nmodl_text), std::runtime_error);
+        }
+    }
+}
+
From c79689a6c52c80c0737bc6eec7b6f06cdf0a1bbd Mon Sep 17 00:00:00 2001
From: Pramod Kumbhar
Date: Wed, 6 Jan 2021 23:51:59 +0100
Subject: [PATCH 120/331] Improvements for code generation specific
 transformations (#483)

NMODL AST needs various transformations to generate C++ code or LLVM IR.
This PR is the beginning of AST transformations to simplify code
generation backends.

* New CodegenLLVMHelperVisitor to perform various AST transformations
  to simplify code generation for various backends and simulators.

* CodegenLLVMHelperVisitor is currently limited to LLVM backend to
  simplify initial implementation and keep C++ based backends working.
* CodegenLLVMHelperVisitor now handles FUNCTIONS and PROCEDURES blocks - Replace LocalListStatement with CodegenVarStatement - Added new AST types for code generation - CodegenVar to represent variable used for code generation - CodegenVarType to represent codegen variable - CodegenVarListStatement to represent list of CodegenVar - CodegenStruct will be used in future to represent struct like NrnThread or Mechanism class See #474 --- src/codegen/llvm/CMakeLists.txt | 7 +- .../llvm/codegen_llvm_helper_visitor.cpp | 113 ++++++++++++++++++ .../llvm/codegen_llvm_helper_visitor.hpp | 50 ++++++++ src/codegen/llvm/codegen_llvm_visitor.cpp | 8 ++ src/language/code_generator.cmake | 8 ++ src/language/codegen.yaml | 109 ++++++++++++++++- src/language/nmodl.yaml | 2 +- src/language/node_info.py | 2 + src/language/nodes.py | 4 + src/language/templates/ast/ast_decl.hpp | 10 ++ .../templates/visitors/nmodl_visitor.cpp | 3 + src/main.cpp | 3 + test/integration/mod/procedure.mod | 2 +- test/unit/codegen/llvm.cpp | 2 +- 14 files changed, 316 insertions(+), 7 deletions(-) create mode 100644 src/codegen/llvm/codegen_llvm_helper_visitor.cpp create mode 100644 src/codegen/llvm/codegen_llvm_helper_visitor.hpp diff --git a/src/codegen/llvm/CMakeLists.txt b/src/codegen/llvm/CMakeLists.txt index 71ecca338c..db16d4072c 100644 --- a/src/codegen/llvm/CMakeLists.txt +++ b/src/codegen/llvm/CMakeLists.txt @@ -1,8 +1,11 @@ # ============================================================================= # Codegen sources # ============================================================================= -set(LLVM_CODEGEN_SOURCE_FILES ${CMAKE_CURRENT_SOURCE_DIR}/codegen_llvm_visitor.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/codegen_llvm_visitor.hpp) +set(LLVM_CODEGEN_SOURCE_FILES + ${CMAKE_CURRENT_SOURCE_DIR}/codegen_llvm_visitor.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/codegen_llvm_visitor.hpp + ${CMAKE_CURRENT_SOURCE_DIR}/codegen_llvm_helper_visitor.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/codegen_llvm_helper_visitor.hpp) # ============================================================================= # LLVM codegen library diff --git a/src/codegen/llvm/codegen_llvm_helper_visitor.cpp b/src/codegen/llvm/codegen_llvm_helper_visitor.cpp new file mode 100644 index 0000000000..c52cc92a3d --- /dev/null +++ b/src/codegen/llvm/codegen_llvm_helper_visitor.cpp @@ -0,0 +1,113 @@ + +/************************************************************************* + * Copyright (C) 2018-2019 Blue Brain Project + * + * This file is part of NMODL distributed under the terms of the GNU + * Lesser General Public License. See top-level LICENSE file for details. 
+ *************************************************************************/
+
+#include "codegen_llvm_helper_visitor.hpp"
+
+#include "ast/all.hpp"
+#include "utils/logger.hpp"
+#include "visitors/visitor_utils.hpp"
+
+namespace nmodl {
+namespace codegen {
+
+using namespace fmt::literals;
+
+void CodegenLLVMHelperVisitor::visit_statement_block(ast::StatementBlock& node) {
+    node.visit_children(*this);
+
+    /// if local list statement exist, we have to replace it
+    auto local_statement = visitor::get_local_list_statement(node);
+    if (local_statement) {
+        /// create codegen variables from local variables
+        ast::CodegenVarVector variables;
+        for (const auto& var: local_statement->get_variables()) {
+            variables.emplace_back(new ast::CodegenVar(0, var->get_name()->clone()));
+        }
+
+        /// remove local list statement now
+        const auto& statements = node.get_statements();
+        node.erase_statement(statements.begin());
+
+        /// create new codegen variable statement
+        auto type = new ast::CodegenVarType(ast::AstNodeType::DOUBLE);
+        auto statement = std::make_shared<ast::CodegenVarListStatement>(type, variables);
+
+        /// insert codegen variable statement
+        node.insert_statement(statements.begin(), statement);
+    }
+}
+
+void CodegenLLVMHelperVisitor::add_function_procedure_node(ast::Block& node) {
+    std::string function_name = node.get_node_name();
+
+    const auto& source_node_type = node.get_node_type();
+    auto name = new ast::Name(new ast::String(function_name));
+    auto return_var = new ast::Name(new ast::String("ret_" + function_name));
+    ast::CodegenVarType* var_type = nullptr;
+    ast::CodegenVarType* return_type = nullptr;
+
+    /// return type based on node type
+    bool is_function = source_node_type == ast::AstNodeType::FUNCTION_BLOCK;
+    if (is_function) {
+        var_type = new ast::CodegenVarType(ast::AstNodeType::DOUBLE);
+    } else {
+        var_type = new ast::CodegenVarType(ast::AstNodeType::INTEGER);
+    }
+
+    /// return type is same as variable type
+    return_type = var_type->clone();
+
+    /// function body and its statements
+    auto block = node.get_statement_block()->clone();
+    const auto& statements = block->get_statements();
+
+    /// insert return variable at the start of the block
+    ast::CodegenVarVector codegen_vars;
+    codegen_vars.emplace_back(new ast::CodegenVar(0, return_var->clone()));
+    auto statement = std::make_shared<ast::CodegenVarListStatement>(var_type, codegen_vars);
+    block->insert_statement(statements.begin(), statement);
+
+    /// add return statement
+    auto return_statement = new ast::CodegenReturnStatement(return_var);
+    block->emplace_back_statement(return_statement);
+
+    /// prepare arguments
+    ast::CodegenArgumentVector code_arguments;
+    const auto& arguments = node.get_parameters();
+    for (const auto& arg: arguments) {
+        auto type = new ast::CodegenVarType(ast::AstNodeType::DOUBLE);
+        auto var = arg->get_name()->clone();
+        code_arguments.emplace_back(new ast::CodegenArgument(type, var));
+    }
+
+    /// add new node to AST
+    auto function =
+        std::make_shared<ast::CodegenFunction>(return_type, name, code_arguments, block);
+    codegen_functions.push_back(function);
+}
+
+void CodegenLLVMHelperVisitor::visit_procedure_block(ast::ProcedureBlock& node) {
+    node.visit_children(*this);
+    add_function_procedure_node(node);
+}
+
+void CodegenLLVMHelperVisitor::visit_function_block(ast::FunctionBlock& node) {
+    node.visit_children(*this);
+    add_function_procedure_node(node);
+}
+
+void CodegenLLVMHelperVisitor::visit_program(ast::Program& node) {
+    logger->info("Running CodegenLLVMHelperVisitor");
+    node.visit_children(*this);
+    for (auto& fun: codegen_functions) {
node.emplace_back_node(fun); + } +} + +} // namespace codegen +} // namespace nmodl diff --git a/src/codegen/llvm/codegen_llvm_helper_visitor.hpp b/src/codegen/llvm/codegen_llvm_helper_visitor.hpp new file mode 100644 index 0000000000..b7ff57aec1 --- /dev/null +++ b/src/codegen/llvm/codegen_llvm_helper_visitor.hpp @@ -0,0 +1,50 @@ +/************************************************************************* + * Copyright (C) 2018-2019 Blue Brain Project + * + * This file is part of NMODL distributed under the terms of the GNU + * Lesser General Public License. See top-level LICENSE file for details. + *************************************************************************/ + +#pragma once + +/** + * \file + * \brief \copybrief nmodl::codegen::CodegenLLVMHelperVisitor + */ + +#include + +#include "codegen/codegen_info.hpp" +#include "symtab/symbol_table.hpp" +#include "visitors/ast_visitor.hpp" + +namespace nmodl { +namespace codegen { + +/** + * @addtogroup llvm_codegen_details + * @{ + */ + +/** + * \class CodegenLLVMHelperVisitor + * \brief Helper visitor to gather AST information to help LLVM code generation + */ +class CodegenLLVMHelperVisitor: public visitor::AstVisitor { + std::vector> codegen_functions; + + void add_function_procedure_node(ast::Block& node); + + public: + CodegenLLVMHelperVisitor() = default; + + void visit_statement_block(ast::StatementBlock& node) override; + void visit_procedure_block(ast::ProcedureBlock& node) override; + void visit_function_block(ast::FunctionBlock& node) override; + void visit_program(ast::Program& node) override; +}; + +/** @} */ // end of llvm_codegen_details + +} // namespace codegen +} // namespace nmodl diff --git a/src/codegen/llvm/codegen_llvm_visitor.cpp b/src/codegen/llvm/codegen_llvm_visitor.cpp index b2a09fdd96..0fa0864d9a 100644 --- a/src/codegen/llvm/codegen_llvm_visitor.cpp +++ b/src/codegen/llvm/codegen_llvm_visitor.cpp @@ -6,6 +6,8 @@ *************************************************************************/ #include "codegen/llvm/codegen_llvm_visitor.hpp" +#include "codegen/llvm/codegen_llvm_helper_visitor.hpp" + #include "ast/all.hpp" #include "codegen/codegen_helper_visitor.hpp" #include "visitors/rename_visitor.hpp" @@ -347,6 +349,12 @@ void CodegenLLVMVisitor::visit_program(const ast::Program& node) { // Keep this for easier development (maybe move to debug mode later). 
std::cout << print_module(); + + // not used yet + { + CodegenLLVMHelperVisitor v; + v.visit_program(const_cast(node)); + } } void CodegenLLVMVisitor::visit_procedure_block(const ast::ProcedureBlock& node) { diff --git a/src/language/code_generator.cmake b/src/language/code_generator.cmake index 783acd8d5b..f4847f8ec8 100644 --- a/src/language/code_generator.cmake +++ b/src/language/code_generator.cmake @@ -65,6 +65,14 @@ set(AST_GENERATED_SOURCES ${PROJECT_BINARY_DIR}/src/ast/block_comment.hpp ${PROJECT_BINARY_DIR}/src/ast/boolean.hpp ${PROJECT_BINARY_DIR}/src/ast/breakpoint_block.hpp + ${PROJECT_BINARY_DIR}/src/ast/codegen_argument.hpp + ${PROJECT_BINARY_DIR}/src/ast/codegen_for_statement.hpp + ${PROJECT_BINARY_DIR}/src/ast/codegen_function.hpp + ${PROJECT_BINARY_DIR}/src/ast/codegen_return_statement.hpp + ${PROJECT_BINARY_DIR}/src/ast/codegen_struct.hpp + ${PROJECT_BINARY_DIR}/src/ast/codegen_var.hpp + ${PROJECT_BINARY_DIR}/src/ast/codegen_var_list_statement.hpp + ${PROJECT_BINARY_DIR}/src/ast/codegen_var_type.hpp ${PROJECT_BINARY_DIR}/src/ast/compartment.hpp ${PROJECT_BINARY_DIR}/src/ast/conductance_hint.hpp ${PROJECT_BINARY_DIR}/src/ast/conserve.hpp diff --git a/src/language/codegen.yaml b/src/language/codegen.yaml index a3156b4f3c..6a3f7fbf83 100644 --- a/src/language/codegen.yaml +++ b/src/language/codegen.yaml @@ -29,6 +29,34 @@ children: - Number: - Identifier: + children: + - CodegenVarType: + brief: "Represent type of the variable" + members: + - type: + brief: "Type of the ast node" + type: AstNodeType + - CodegenVar: + brief: "Represent variable used for code generation" + members: + - pointer: + brief: "If variable is pointer type" + type: int + - name: + brief: "Name of the variable" + type: Identifier + node_name: true + - CodegenArgument: + brief: "Represent argument to a function" + members: + - type: + brief: "Type of the argument" + type: CodegenVarType + suffix: {value: " "} + - name: + brief: "Name of the argument" + type: Identifier + node_name: true - Block: children: - NrnStateBlock: @@ -89,7 +117,29 @@ type: StatementBlock - finalize_block: brief: "Statement block to be executed after calling linear solver" - type: StatementBlock + type: StatementBlock + - CodegenFunction: + brief: "Function generated from FUNCTION or PROCEDURE block" + members: + - return_type: + brief: "Return type of the function" + type: CodegenVarType + suffix: {value: " "} + - name: + brief: "Name of the function" + type: Name + node_name: true + - arguments: + brief: "Vector of the parameters to the function" + type: CodegenArgument + vector: true + prefix: {value: "(", force: true} + suffix: {value: ")", force: true} + separator: ", " + - statement_block: + brief: "Body of the function" + type: StatementBlock + getter: {override: true} - WrappedExpression: brief: "Wrap any other expression type" members: @@ -110,6 +160,17 @@ - node_to_solve: brief: "Block to be solved (callback node or solution node itself)" type: Expression + - CodegenStruct: + brief: "Represent a struct or class for code generation" + members: + - variable_statements: + brief: "member variables of the class/struct" + type: CodegenVarListStatement + vector: true + - functions: + brief: "member functions of the class/struct" + type: CodegenFunction + vector: true - Statement: brief: "Statement base class" children: @@ -120,4 +181,48 @@ brief: "Value of new timestep" type: Double prefix: {value: " = "} - brief: "Statement to indicate a change in timestep in a given block" \ No newline at end of file + brief: "Statement to 
indicate a change in timestep in a given block" + - CodegenForStatement: + brief: "Represent for loop used for code generation" + nmodl: "for(" + members: + - initialization: + brief: "initialization expression for the loop" + type: Expression + optional: true + - condition: + brief: "condition expression for the loop" + type: Expression + optional: true + prefix: {value: ";"} + suffix: {value: "; "} + - increment: + brief: "increment or decrement expression for the loop" + type: Expression + optional: true + suffix: {value: ") "} + - statement_block: + brief: "body of the loop" + type: StatementBlock + getter: {override: true} + - CodegenReturnStatement: + brief: "Represent return statement for code generation" + nmodl: "return " + members: + - statement: + brief: "return statement" + type: Expression + optional: true + - CodegenVarListStatement: + brief: "Represent list of variables used for code generation" + members: + - var_type: + brief: "Type of the variables" + type: CodegenVarType + suffix: {value: " "} + - variables: + brief: "List of the variables to define" + type: CodegenVar + vector: true + separator: ", " + add: true diff --git a/src/language/nmodl.yaml b/src/language/nmodl.yaml index a71358f701..2bafd00af5 100644 --- a/src/language/nmodl.yaml +++ b/src/language/nmodl.yaml @@ -1374,7 +1374,7 @@ type: Double - Statement: - brief: "TODO" + brief: "Base class to represent a statement in the NMODL" children: - UnitState: brief: "TODO" diff --git a/src/language/node_info.py b/src/language/node_info.py index f4fb599347..bd81a0d14a 100644 --- a/src/language/node_info.py +++ b/src/language/node_info.py @@ -29,6 +29,7 @@ "QueueType", "BAType", "UnitStateType", + "AstNodeType", } BASE_TYPES = {"std::string" } | INTEGRAL_TYPES @@ -167,6 +168,7 @@ STATEMENT_BLOCK_NODE = "StatementBlock" STRING_NODE = "String" UNIT_BLOCK = "UnitBlock" +AST_NODETYPE_NODE= "AstNodeType" # name of variable in prime node which represent order of derivative ORDER_VAR_NAME = "order" diff --git a/src/language/nodes.py b/src/language/nodes.py index bbbd983b01..84cddd8d12 100644 --- a/src/language/nodes.py +++ b/src/language/nodes.py @@ -136,6 +136,10 @@ def is_boolean_node(self): def is_name_node(self): return self.class_name == node_info.NAME_NODE + @property + def is_ast_nodetype_node(self): + return self.class_name == node_info.AST_NODETYPE_NODE + @property def is_enum_node(self): data_type = node_info.DATA_TYPES[self.class_name] diff --git a/src/language/templates/ast/ast_decl.hpp b/src/language/templates/ast/ast_decl.hpp index 184bfc3315..17faa90d6f 100644 --- a/src/language/templates/ast/ast_decl.hpp +++ b/src/language/templates/ast/ast_decl.hpp @@ -12,6 +12,7 @@ #pragma once #include +#include #include /// \file @@ -50,6 +51,15 @@ enum class AstNodeType { /** @} */ // end of ast_type +static inline std::string to_string(AstNodeType type) { + {% for node in nodes %} + if(type == AstNodeType::{{ node.class_name|snake_case|upper }}) { + return "{{ node.class_name|snake_case|upper }}"; + } + {% endfor %} + throw std::runtime_error("Unhandled type in to_string(AstNodeType type)!"); +} + /** * @defgroup ast_vec_type AST Vector Type Aliases * @ingroup ast diff --git a/src/language/templates/visitors/nmodl_visitor.cpp b/src/language/templates/visitors/nmodl_visitor.cpp index a69c3b0b26..f7bb8279ca 100644 --- a/src/language/templates/visitors/nmodl_visitor.cpp +++ b/src/language/templates/visitors/nmodl_visitor.cpp @@ -116,6 +116,9 @@ void NmodlPrintVisitor::visit_{{ node.class_name|snake_case}}(const {{ node.clas 
{% for child in node.children %} {% call guard(child.force_prefix, child.force_suffix) -%} {% if child.is_base_type_node %} + {% if child.is_ast_nodetype_node %} + printer->add_element(ast::to_string(node.get_{{child.varname}}())); + {% endif %} {% else %} {% if child.optional or child.is_statement_block_node %} if(node.get_{{ child.varname }}()) { diff --git a/src/main.cpp b/src/main.cpp index 336b877686..c1b508e21e 100644 --- a/src/main.cpp +++ b/src/main.cpp @@ -17,9 +17,11 @@ #include "codegen/codegen_cuda_visitor.hpp" #include "codegen/codegen_ispc_visitor.hpp" #include "codegen/codegen_omp_visitor.hpp" + #ifdef NMODL_LLVM_BACKEND #include "codegen/llvm/codegen_llvm_visitor.hpp" #endif + #include "config/config.h" #include "parser/nmodl_driver.hpp" #include "pybind/pyembed.hpp" @@ -593,6 +595,7 @@ int main(int argc, const char* argv[]) { logger->info("Running LLVM backend code generator"); CodegenLLVMVisitor visitor(modfile, output_dir, llvm_opt_passes); visitor.visit_program(*ast); + ast_to_nmodl(*ast, filepath("llvm")); } #endif } diff --git a/test/integration/mod/procedure.mod b/test/integration/mod/procedure.mod index 4a45af7d1e..daa4ad33ad 100644 --- a/test/integration/mod/procedure.mod +++ b/test/integration/mod/procedure.mod @@ -21,7 +21,7 @@ PROCEDURE complex_sum(v) { } } -PROCEDURE loop_proc(v) { +PROCEDURE loop_proc(v, t) { LOCAL i i = 0 WHILE(i < 10) { diff --git a/test/unit/codegen/llvm.cpp b/test/unit/codegen/llvm.cpp index 0ceadbe6f1..d644947e79 100644 --- a/test/unit/codegen/llvm.cpp +++ b/test/unit/codegen/llvm.cpp @@ -491,4 +491,4 @@ SCENARIO("Dead code removal", "[visitor][llvm][opt]") { REQUIRE(std::regex_search(module_string, m, empty_proc)); } } -} \ No newline at end of file +} From 3115a32201d9c896927f5f87228a174e8ce10355 Mon Sep 17 00:00:00 2001 From: Pramod Kumbhar Date: Thu, 7 Jan 2021 00:50:36 +0100 Subject: [PATCH 121/331] nrn_state function generation in NMODL AST to help LLVM codegen (#484) * Added new BinaryOp for += and -= * Added string_to_binaryop function * Added Void node type to represent void return type * Added CodegenAtomicStatement for ion write statements * llvm helper started handling visit_nrn_state_block - NrnStateBlock is being converted into CodegenFunction - for loop body with solution blocks created - voltage and node index initialization code added - read and write ion statements are handled * Some of the functions are now moved into CodegenInfo Co-authored-by: Ioannis Magkanaris --- src/ast/ast_common.hpp | 25 +- src/codegen/codegen_c_visitor.hpp | 56 -- src/codegen/codegen_info.cpp | 74 +++ src/codegen/codegen_info.hpp | 94 ++++ .../llvm/codegen_llvm_helper_visitor.cpp | 500 ++++++++++++++++-- .../llvm/codegen_llvm_helper_visitor.hpp | 42 +- src/codegen/llvm/codegen_llvm_visitor.cpp | 2 +- src/language/code_generator.cmake | 2 + src/language/codegen.yaml | 37 +- test/unit/CMakeLists.txt | 6 +- 10 files changed, 724 insertions(+), 114 deletions(-) diff --git a/src/ast/ast_common.hpp b/src/ast/ast_common.hpp index eb854bb5c5..733fc406f7 100644 --- a/src/ast/ast_common.hpp +++ b/src/ast/ast_common.hpp @@ -43,9 +43,12 @@ namespace ast { * * NMODL support different binary operators and this * type is used to store their value in the AST. + * + * \note `+=` and `-=` are not supported by NMODL but they + * are added for code generation nodes. 
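+ *
+ * Illustrative example: once lowered, an ion current update can use
+ * these operators directly in the generated AST, e.g.
+ * \code{.cpp}
+ * ion_ina[ina_id] += ina    // BOP_ADD_ASSIGN
+ * \endcode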
*/ typedef enum { - BOP_ADDITION, ///< \+ + BOP_ADDITION = 0, ///< \+ BOP_SUBTRACTION, ///< -- BOP_MULTIPLICATION, ///< \c * BOP_DIVISION, ///< \/ @@ -58,7 +61,9 @@ typedef enum { BOP_LESS_EQUAL, ///< <= BOP_ASSIGN, ///< = BOP_NOT_EQUAL, ///< != - BOP_EXACT_EQUAL ///< == + BOP_EXACT_EQUAL, ///< == + BOP_ADD_ASSIGN, ///< \+= + BOP_SUB_ASSIGN ///< \-= } BinaryOp; /** @@ -68,7 +73,7 @@ typedef enum { * is used to lookup the corresponding symbol for the operator. */ static const std::string BinaryOpNames[] = - {"+", "-", "*", "/", "^", "&&", "||", ">", "<", ">=", "<=", "=", "!=", "=="}; + {"+", "-", "*", "/", "^", "&&", "||", ">", "<", ">=", "<=", "=", "!=", "==", "+=", "-="}; /// enum type for unary operators typedef enum { UOP_NOT, UOP_NEGATION } UnaryOp; @@ -106,6 +111,20 @@ typedef enum { LTMINUSGT, LTLT, MINUSGT } ReactionOp; /// string representation of ast::ReactionOp static const std::string ReactionOpNames[] = {"<->", "<<", "->"}; +/** + * Get corresponding ast::BinaryOp for given string + * @param op Binary operator in string format + * @return ast::BinaryOp for given string + */ +static inline BinaryOp string_to_binaryop(const std::string& op) { + /// check if binary operator supported otherwise error + auto it = std::find(std::begin(BinaryOpNames), std::end(BinaryOpNames), op); + if (it == std::end(BinaryOpNames)) { + throw std::runtime_error("Error in string_to_binaryop, can't find " + op); + } + int pos = std::distance(std::begin(BinaryOpNames), it); + return static_cast(pos); +} /** @} */ // end of ast_prop } // namespace ast diff --git a/src/codegen/codegen_c_visitor.hpp b/src/codegen/codegen_c_visitor.hpp index f9353bbc4a..a1eda7497b 100644 --- a/src/codegen/codegen_c_visitor.hpp +++ b/src/codegen/codegen_c_visitor.hpp @@ -47,46 +47,6 @@ namespace codegen { * @{ */ -/** - * \enum BlockType - * \brief Helper to represent various block types - * - * Note: do not assign integers to these enums - * - */ -enum BlockType { - /// initial block - Initial, - - /// constructor block - Constructor, - - /// destructor block - Destructor, - - /// breakpoint block - Equation, - - /// ode_* routines block (not used) - Ode, - - /// derivative block - State, - - /// watch block - Watch, - - /// net_receive block - NetReceive, - - /// before / after block - BeforeAfter, - - /// fake ending block type for loops on the enums. Keep it at the end - BlockTypeEnd -}; - - /** * \enum MemberType * \brief Helper to represent various variables types @@ -141,22 +101,6 @@ struct IndexVariableInfo { , is_integer(is_integer) {} }; - -/** - * \class ShadowUseStatement - * \brief Represents ion write statement during code generation - * - * Ion update statement needs use of shadow vectors for certain backends - * as atomics operations are not supported on cpu backend. 
- *
- * \todo If shadow_lhs is empty then we assume shadow statement not required
- */
-struct ShadowUseStatement {
-    std::string lhs;
-    std::string op;
-    std::string rhs;
-};
-
 /** @} */  // end of codegen_details
diff --git a/src/codegen/codegen_info.cpp b/src/codegen/codegen_info.cpp
index 8f6bd448f8..2219a18913 100644
--- a/src/codegen/codegen_info.cpp
+++ b/src/codegen/codegen_info.cpp
@@ -8,6 +8,7 @@
 #include "codegen/codegen_info.hpp"
 
 #include "ast/all.hpp"
+#include "utils/logger.hpp"
 #include "visitors/var_usage_visitor.hpp"
 #include "visitors/visitor_utils.hpp"
 
@@ -15,6 +16,8 @@
 namespace nmodl {
 namespace codegen {
 
+using namespace fmt::literals;
+using symtab::syminfo::NmodlType;
 using visitor::VarUsageVisitor;
 
 /// if any ion has write variable
@@ -131,5 +134,76 @@ bool CodegenInfo::is_voltage_used_by_watch_statements() const {
     return false;
 }
 
+bool CodegenInfo::state_variable(const std::string& name) const {
+    // clang-format off
+    auto result = std::find_if(state_vars.begin(),
+                               state_vars.end(),
+                               [&name](const SymbolType& sym) {
+                                   return name == sym->get_name();
+                               }
+    );
+    // clang-format on
+    return result != state_vars.end();
+}
+
+std::pair<std::string, std::string> CodegenInfo::read_ion_variable_name(
+    const std::string& name) const {
+    return {name, "ion_" + name};
+}
+
+
+std::pair<std::string, std::string> CodegenInfo::write_ion_variable_name(
+    const std::string& name) const {
+    return {"ion_" + name, name};
+}
+
+
+/**
+ * \details The current variable used in the breakpoint block could be a local
+ * variable. In this case, neuron has already renamed the variable by prepending
+ * "_l". In our implementation, the variable could have been renamed by
+ * one of the passes. Hence, we search all local variables and check if
+ * the variable is renamed. Note that we have to look into the symbol table
+ * of the statement block and not the breakpoint block.
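+ *
+ * For example (illustrative names): if an earlier pass renamed the local
+ * current `il` to `il_0`, then `breakpoint_current("il")` returns `il_0`.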
+ */
+std::string CodegenInfo::breakpoint_current(std::string current) const {
+    auto& breakpoint = breakpoint_node;
+    if (breakpoint == nullptr) {
+        return current;
+    }
+    const auto& symtab = breakpoint->get_statement_block()->get_symbol_table();
+    const auto& variables = symtab->get_variables_with_properties(NmodlType::local_var);
+    for (const auto& var: variables) {
+        std::string renamed_name = var->get_name();
+        std::string original_name = var->get_original_name();
+        if (current == original_name) {
+            current = renamed_name;
+            break;
+        }
+    }
+    return current;
+}
+
+
+bool CodegenInfo::is_an_instance_variable(const std::string& varname) const {
+    /// check if a symbol of the given name exists
+    auto check_symbol = [](const std::string& name, const std::vector<SymbolType>& symbols) {
+        for (auto& symbol: symbols) {
+            if (symbol->get_name() == name) {
+                return true;
+            }
+        }
+        return false;
+    };
+
+    /// check if the variable exists in any of the possible types
+    if (check_symbol(varname, assigned_vars) || check_symbol(varname, state_vars) ||
+        check_symbol(varname, range_parameter_vars) || check_symbol(varname, range_assigned_vars) ||
+        check_symbol(varname, range_state_vars)) {
+        return true;
+    }
+    return false;
+}
+
 }  // namespace codegen
 }  // namespace nmodl
diff --git a/src/codegen/codegen_info.hpp b/src/codegen/codegen_info.hpp
index 805470dad6..2cd3c7b98f 100644
--- a/src/codegen/codegen_info.hpp
+++ b/src/codegen/codegen_info.hpp
@@ -16,6 +16,7 @@
 #include
 
 #include "ast/ast.hpp"
+#include "codegen/codegen_naming.hpp"
 #include "symtab/symbol_table.hpp"
 
 namespace nmodl {
@@ -134,6 +135,59 @@ struct IndexSemantics {
         , size(size) {}
 };
 
+/**
+ * \enum BlockType
+ * \brief Helper to represent various block types
+ *
+ * Note: do not assign integers to these enums
+ *
+ */
+enum BlockType {
+    /// initial block
+    Initial,
+
+    /// destructor block
+    Destructor,
+
+    /// breakpoint block
+    Equation,
+
+    /// ode_* routines block (not used)
+    Ode,
+
+    /// derivative block
+    State,
+
+    /// watch block
+    Watch,
+
+    /// net_receive block
+    NetReceive,
+
+    /// fake ending block type for loops on the enums. Keep it at the end
+    BlockTypeEnd
+};
+
+/**
+ * \class ShadowUseStatement
+ * \brief Represents ion write statement during code generation
+ *
+ * Ion update statement needs use of shadow vectors for certain backends
+ * as atomic operations are not supported on the cpu backend.
+ *
+ * \todo Currently `nrn_wrote_conc` is also added to the shadow update statements
+ * list as it corresponds to the ion update statement in the INITIAL block. This
+ * needs to be factored out.
+ * \todo This can be represented as an AST node (like ast::CodegenAtomicStatement)
+ * but currently the C backend uses this same implementation. So we use the same
+ * structure and then convert it to ast::CodegenAtomicStatement for the LLVM
+ * visitor.
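+ *
+ * For example (illustrative), the ion write statement
+ * `ion_ina[indexes[some_index]] += ina[id]` is stored as:
+ * \code{.cpp}
+ * ShadowUseStatement statement{"ion_ina[indexes[some_index]]", "+=", "ina[id]"};
+ * \endcode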
+ */
+struct ShadowUseStatement {
+    std::string lhs;
+    std::string op;
+    std::string rhs;
+};
 
 /**
  * \class CodegenInfo
@@ -419,6 +473,46 @@ struct CodegenInfo {
     /// true if WatchStatement uses voltage v variable
     bool is_voltage_used_by_watch_statements() const;
 
+    /**
+     * Checks if the given variable name belongs to a state variable
+     * \param name The variable name
+     * \return \c true if the variable is a state variable
+     */
+    bool state_variable(const std::string& name) const;
+
+    /**
+     * Return ion variable name and corresponding ion read variable name
+     * \param name The ion variable name
+     * \return The ion read variable name
+     */
+    std::pair<std::string, std::string> read_ion_variable_name(const std::string& name) const;
+
+    /**
+     * Return ion variable name and corresponding ion write variable name
+     * \param name The ion variable name
+     * \return The ion write variable name
+     */
+    std::pair<std::string, std::string> write_ion_variable_name(const std::string& name) const;
+
+    /**
+     * Determine the variable name for the "current" used in breakpoint block taking into account
+     * intermediate code transformations.
+     * \param current The variable name for the current used in the model
+     * \return The name for the current to be printed in C
+     */
+    std::string breakpoint_current(std::string current) const;
+
+    /**
+     * Check if variable with given name is an instance variable
+     *
+     * Instance variables are local to each mechanism instance and
+     * need to be accessed with an array index. Such variables are
+     * assigned, range, parameter+range etc.
+     * @param varname Name of the variable
+     * @return True if variable is per mechanism instance
+     */
+    bool is_an_instance_variable(const std::string& varname) const;
+
     /// if we need a call back to wrote_conc in neuron/coreneuron
     bool require_wrote_conc = false;
 };
diff --git a/src/codegen/llvm/codegen_llvm_helper_visitor.cpp b/src/codegen/llvm/codegen_llvm_helper_visitor.cpp
index c52cc92a3d..341ab03fb6 100644
--- a/src/codegen/llvm/codegen_llvm_helper_visitor.cpp
+++ b/src/codegen/llvm/codegen_llvm_helper_visitor.cpp
@@ -9,6 +9,7 @@
 #include "codegen_llvm_helper_visitor.hpp"
 
 #include "ast/all.hpp"
+#include "codegen/codegen_helper_visitor.hpp"
 #include "utils/logger.hpp"
 #include "visitors/visitor_utils.hpp"
 
@@ -17,91 +18,496 @@ namespace codegen {
 
 using namespace fmt::literals;
 
+/**
+ * \brief Create variable definition statement
+ *
+ * `LOCAL` variables in NMODL don't have a type. These variables need
+ * to be defined with float type. The same applies to index, loop iteration
+ * and local variables. This helper function is used to create such
+ * type-specific local variables.
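+ *
+ * For example (illustrative), the call
+ * \code{.cpp}
+ * create_local_variable_statement({"node_id", "id"}, INTEGER_TYPE)
+ * \endcode
+ * creates a statement printed as `INTEGER node_id, id`.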
+ *
+ * @param names Name of the variables to be defined
+ * @param type Type of the variables
+ * @return Statement defining variables
+ */
+static std::shared_ptr<ast::CodegenVarListStatement> create_local_variable_statement(
+    const std::vector<std::string>& names,
+    ast::AstNodeType type) {
+    /// create variables for the given names
+    ast::CodegenVarVector variables;
+    for (const auto& name: names) {
+        auto varname = new ast::Name(new ast::String(name));
+        variables.emplace_back(new ast::CodegenVar(0, varname));
+    }
+    auto var_type = new ast::CodegenVarType(type);
+    /// construct statement and return it
+    return std::make_shared<ast::CodegenVarListStatement>(var_type, variables);
+}
+
+/**
+ * \brief Create expression for a given NMODL code statement
+ * @param code NMODL code statement
+ * @return Expression representing given NMODL code
+ */
+static std::shared_ptr<ast::Expression> create_statement_as_expression(const std::string& code) {
+    const auto& statement = visitor::create_statement(code);
+    auto expr_statement = std::dynamic_pointer_cast<ast::ExpressionStatement>(statement);
+    auto expr = expr_statement->get_expression()->clone();
+    return std::make_shared<ast::WrappedExpression>(expr);
+}
+
+/**
+ * \brief Create expression for given NMODL code expression
+ * @param code NMODL code expression
+ * @return Expression representing NMODL code
+ */
+std::shared_ptr<ast::Expression> create_expression(const std::string& code) {
+    /// as the provided code is only an expression and not a full statement, create
+    /// a temporary assignment statement
+    const auto& wrapped_expr = create_statement_as_expression("some_var = " + code);
+    /// now extract the RHS (representing the original code) and return it as expression
+    auto expr = std::dynamic_pointer_cast<ast::WrappedExpression>(wrapped_expr)->get_expression();
+    auto rhs = std::dynamic_pointer_cast<ast::BinaryExpression>(expr)->get_rhs();
+    return std::make_shared<ast::WrappedExpression>(rhs->clone());
+}
+
-void CodegenLLVMHelperVisitor::add_function_procedure_node(ast::Block& node) {
+/**
+ * \brief Add code generation function for FUNCTION or PROCEDURE block
+ * @param node AST node representing FUNCTION or PROCEDURE
+ *
+ * When we have a PROCEDURE or FUNCTION like
+ *
+ * \code{.mod}
+ *      FUNCTION sum(x, y) {
+ *          LOCAL res
+ *          res = x + y
+ *          sum = res
+ *      }
+ * \endcode
+ *
+ * this typically gets converted to C/C++ code as:
+ *
+ * \code{.cpp}
+ *      double sum(double x, double y) {
+ *          double res;
+ *          double ret_sum;
+ *          res = x + y;
+ *          ret_sum = res;
+ *          return ret_sum;
+ *      }
+ * \endcode
+ *
+ * We perform the following transformations so that code generation
+ * backends need only minimal logic:
+ *  - Add return type (int for PROCEDURE and double for FUNCTION)
+ *  - Add type for the function arguments
+ *  - Define variables and return variable
+ *  - Add return statement
+ */
+void CodegenLLVMHelperVisitor::create_function_for_node(ast::Block& node) {
+    /// name of the function from the node
     std::string function_name = node.get_node_name();
-
-    const auto& source_node_type = node.get_node_type();
     auto name = new ast::Name(new ast::String(function_name));
+
+    /// return variable name has "ret_" prefix
     auto return_var = new ast::Name(new ast::String("ret_" + function_name));
-    ast::CodegenVarType* var_type = nullptr;
-    ast::CodegenVarType* return_type = nullptr;
 
     /// return type based on node type
-    bool is_function = source_node_type == ast::AstNodeType::FUNCTION_BLOCK;
-    if (is_function) {
-        var_type = new ast::CodegenVarType(ast::AstNodeType::DOUBLE);
+    ast::CodegenVarType* ret_var_type = nullptr;
+    if (node.get_node_type() == ast::AstNodeType::FUNCTION_BLOCK) {
+        ret_var_type = new ast::CodegenVarType(FLOAT_TYPE);
     } else {
-        var_type = new ast::CodegenVarType(ast::AstNodeType::INTEGER);
+        ret_var_type = new ast::CodegenVarType(INTEGER_TYPE);
     }
 
-    /// return type is the same as the variable type
-    return_type = var_type->clone();
-
-    /// function body and its statements
+    /// function body and its statements, copy of the original block
     auto block = node.get_statement_block()->clone();
     const auto& statements = block->get_statements();
 
     /// insert return variable at the start of the block
     ast::CodegenVarVector codegen_vars;
     codegen_vars.emplace_back(new ast::CodegenVar(0, return_var->clone()));
-    auto statement = std::make_shared<ast::CodegenVarListStatement>(var_type, codegen_vars);
+    auto statement = std::make_shared<ast::CodegenVarListStatement>(ret_var_type, codegen_vars);
     block->insert_statement(statements.begin(), statement);
 
     /// add return statement
     auto return_statement = new ast::CodegenReturnStatement(return_var);
     block->emplace_back_statement(return_statement);
 
-    /// prepare arguments
-    ast::CodegenArgumentVector code_arguments;
-    const auto& arguments = node.get_parameters();
-    for (const auto& arg: arguments) {
-        auto type = new ast::CodegenVarType(ast::AstNodeType::DOUBLE);
-        auto var = arg->get_name()->clone();
-        code_arguments.emplace_back(new ast::CodegenArgument(type, var));
+    /// prepare function arguments based on the original node arguments
+    ast::CodegenArgumentVector arguments;
+    for (const auto& param: node.get_parameters()) {
+        /// create new type and name for creating new ast node
+        auto type = new ast::CodegenVarType(FLOAT_TYPE);
+        auto var = param->get_name()->clone();
+        arguments.emplace_back(new ast::CodegenArgument(type, var));
     }
 
-    /// add new node to AST
-    auto function =
-        std::make_shared<ast::CodegenFunction>(return_type, name, code_arguments, block);
+    /// return type of the function is the same as the return variable type
+    ast::CodegenVarType* fun_ret_type = ret_var_type->clone();
+
+    /// we have all information for the code generation function, create a new node
+    /// which will be inserted later into the AST
+    auto function = std::make_shared<ast::CodegenFunction>(fun_ret_type, name, arguments, block);
     codegen_functions.push_back(function);
 }
 
+static void append_statements_from_block(ast::StatementVector& statements,
+                                         const std::shared_ptr<ast::StatementBlock>& block) {
+    const auto& block_statements = block->get_statements();
+    statements.insert(statements.end(), block_statements.begin(), block_statements.end());
+}
+
+static std::shared_ptr<ast::CodegenAtomicStatement> create_atomic_statement(std::string& lhs_str,
+                                                                            std::string& op_str,
+                                                                            std::string& rhs_str) {
+    auto lhs = std::make_shared<ast::Name>(new ast::String(lhs_str));
+    auto op = ast::BinaryOperator(ast::string_to_binaryop(op_str));
+    auto rhs = create_expression(rhs_str);
+    return std::make_shared<ast::CodegenAtomicStatement>(lhs, op, rhs);
+}
+
+/**
+ * For a given block type, add read ion statements
+ *
+ * Depending upon the block type, we have to update read ion variables
+ * during code generation. Depending on the block/procedure being printed,
+ * this method adds the necessary read ion variable statements and also the
+ * corresponding index calculation statements. Note that index statements
+ * are added separately at the beginning just for readability.
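+ *
+ * For example (illustrative), a mechanism reading `ena` results in
+ * statements equivalent to:
+ * \code{.cpp}
+ * ena_id = ion_ena_index[id]    // index statement
+ * ena = ion_ena[ena_id]         // body statement
+ * \endcode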
+ *
+ * @param type The type of code block being generated
+ * @param int_variables Index variables to be created
+ * @param double_variables Floating point variables to be created
+ * @param index_statements Statements for loading indexes (typically for ions)
+ * @param body_statements Main compute/update statements
+ *
+ * \todo After looking into the mod2c and neuron implementations, it seems like
+ * the Ode block type is not used. Need to look into implementation details.
+ *
+ * \todo Ion copy optimization is not implemented yet. This is currently
+ * implemented in the C backend using `ion_read_statements_optimized()`.
+ */
+void CodegenLLVMHelperVisitor::ion_read_statements(BlockType type,
+                                                   std::vector<std::string>& int_variables,
+                                                   std::vector<std::string>& double_variables,
+                                                   ast::StatementVector& index_statements,
+                                                   ast::StatementVector& body_statements) {
+    /// create read ion and corresponding index statements
+    auto create_read_statements = [&](std::pair<std::string, std::string> variable_names) {
+        // variable in current mechanism instance
+        std::string& varname = variable_names.first;
+        // ion variable to be read
+        std::string& ion_varname = variable_names.second;
+        // index for reading ion variable
+        std::string index_varname = "{}_id"_format(varname);
+        // first load the index
+        std::string index_statement = "{} = {}_index[id]"_format(index_varname, ion_varname);
+        // now assign the value
+        std::string read_statement = "{} = {}[{}]"_format(varname, ion_varname, index_varname);
+        // push index definition, index statement and actual read statement
+        int_variables.push_back(index_varname);
+        index_statements.push_back(visitor::create_statement(index_statement));
+        body_statements.push_back(visitor::create_statement(read_statement));
+    };
+
+    /// iterate over all ions and create statements for given block type
+    for (const auto& ion: info.ions) {
+        const std::string& name = ion.name;
+        for (const auto& var: ion.reads) {
+            if (type == BlockType::Ode && ion.is_ionic_conc(var) && info.state_variable(var)) {
+                continue;
+            }
+            auto variable_names = info.read_ion_variable_name(var);
+            create_read_statements(variable_names);
+        }
+        for (const auto& var: ion.writes) {
+            if (type == BlockType::Ode && ion.is_ionic_conc(var) && info.state_variable(var)) {
+                continue;
+            }
+            if (ion.is_ionic_conc(var)) {
+                auto variable_names = info.read_ion_variable_name(var);
+                create_read_statements(variable_names);
+            }
+        }
+    }
+}
+
+/**
+ * For a given block type, add write ion statements
+ *
+ * Depending upon the block type, we have to update write ion variables
+ * during code generation. Depending on the block/procedure being printed,
+ * this method adds the necessary write ion variable statements and also the
+ * corresponding index calculation statements. Note that index statements
+ * are added separately at the beginning just for readability.
+ *
+ * @param type The type of code block being generated
+ * @param int_variables Index variables to be created
+ * @param double_variables Floating point variables to be created
+ * @param index_statements Statements for loading indexes (typically for ions)
+ * @param body_statements Main compute/update statements
+ *
+ * \todo If an intra- or extra-cellular ionic concentration is written
+ * then it requires a call to `nrn_wrote_conc`. In the C backend this is
+ * implemented in `ion_write_statements()` itself but this is not
+ * handled yet.
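+ *
+ * For example (illustrative), accumulating the ionic current `ina` in the
+ * Equation (BREAKPOINT) block results in statements equivalent to:
+ * \code{.cpp}
+ * ion_ina_id = ion_ina_index[id]    // index statement
+ * ion_ina[ion_ina_id] += ina        // atomic body statement
+ * \endcode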
+ */
+void CodegenLLVMHelperVisitor::ion_write_statements(BlockType type,
+                                                    std::vector<std::string>& int_variables,
+                                                    std::vector<std::string>& double_variables,
+                                                    ast::StatementVector& index_statements,
+                                                    ast::StatementVector& body_statements) {
+    /// create write ion and corresponding index statements
+    auto create_write_statements = [&](std::string ion_varname, std::string op, std::string rhs) {
+        // index for writing ion variable
+        std::string index_varname = "{}_id"_format(ion_varname);
+        // load index
+        std::string index_statement = "{} = {}_index[id]"_format(index_varname, ion_varname);
+        // ion variable to write (with index)
+        std::string ion_to_write = "{}[{}]"_format(ion_varname, index_varname);
+        // push index definition, index statement and actual write statement
+        int_variables.push_back(index_varname);
+        index_statements.push_back(visitor::create_statement(index_statement));
+        body_statements.push_back(create_atomic_statement(ion_to_write, op, rhs));
+    };
+
+    /// iterate over all ions and create write ion statements for given block type
+    for (const auto& ion: info.ions) {
+        std::string concentration;
+        std::string name = ion.name;
+        for (const auto& var: ion.writes) {
+            auto variable_names = info.write_ion_variable_name(var);
+            /// ionic currents are accumulated
+            if (ion.is_ionic_current(var)) {
+                if (type == BlockType::Equation) {
+                    std::string current = info.breakpoint_current(var);
+                    std::string lhs = variable_names.first;
+                    std::string op = "+=";
+                    std::string rhs = current;
+                    // for synapse type
+                    if (info.point_process) {
+                        auto area = codegen::naming::NODE_AREA_VARIABLE;
+                        rhs += "*(1.e2/{})"_format(area);
+                    }
+                    create_write_statements(lhs, op, rhs);
+                }
+            } else {
+                if (!ion.is_rev_potential(var)) {
+                    concentration = var;
+                }
+                std::string lhs = variable_names.first;
+                std::string op = "=";
+                std::string rhs = variable_names.second;
+                create_write_statements(lhs, op, rhs);
+            }
+        }
+
+        /// \todo this case still needs to be handled; we need to define an easy to use API
+        if (type == BlockType::Initial && !concentration.empty()) {
+            int index = 0;
+            if (ion.is_intra_cell_conc(concentration)) {
+                index = 1;
+            } else if (ion.is_extra_cell_conc(concentration)) {
+                index = 2;
+            } else {
+                /// \todo Unhandled case also in neuron implementation
+                throw std::logic_error("codegen error for {} ion"_format(ion.name));
+            }
+            std::string ion_type_name = "{}_type"_format(ion.name);
+            std::string lhs = "int {}"_format(ion_type_name);
+            std::string op = "=";
+            std::string rhs = ion_type_name;
+            create_write_statements(lhs, op, rhs);
+            logger->error("conc_write_statement() call is required but it's not supported");
+        }
+    }
+}
+
+/**
+ * Convert variables in given node to instance variables
+ *
+ * For code generation, variables of type range, assigned, state or parameter+range
+ * need to be converted to instance variables, i.e. they need to be accessed with
+ * a loop index variable. For example, the `h` variable needs to be converted to `h[id]`.
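+ *
+ * For example (illustrative), with index variable `id`, the statement
+ * `m = m + h` becomes `m[id] = m[id] + h[id]` when `m` and `h` are
+ * instance variables.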
+ *
+ * @param node Ast node under which variables are to be converted to instance type
+ */
+void CodegenLLVMHelperVisitor::convert_to_instance_variable(ast::Node& node,
+                                                            std::string& index_var) {
+    /// collect all variables in the node of type ast::VarName
+    auto variables = collect_nodes(node, {ast::AstNodeType::VAR_NAME});
+    for (auto& v: variables) {
+        auto variable = std::dynamic_pointer_cast<ast::VarName>(v);
+        /// if variable is of instance type then convert it to indexed access
+        if (info.is_an_instance_variable(variable->get_node_name())) {
+            auto name = variable->get_name()->clone();
+            auto index = new ast::Name(new ast::String(index_var));
+            auto indexed_name = std::make_shared<ast::IndexedName>(name, index);
+            variable->set_name(indexed_name);
+        }
+    }
+}
+
+/**
+ * \brief Visit StatementBlock and convert Local statement for code generation
+ * @param node AST node representing Statement block
+ *
+ * Statement blocks can have a LOCAL statement and if it exists it's typically
+ * the first statement in the vector. We have to remove the LOCAL statement and
+ * convert it to a CodegenVarListStatement that will represent all variables as double.
+ */
+void CodegenLLVMHelperVisitor::visit_statement_block(ast::StatementBlock& node) {
+    /// first process all children blocks if any
+    node.visit_children(*this);
+
+    /// check if block contains LOCAL statement
+    const auto& local_statement = visitor::get_local_list_statement(node);
+    if (local_statement) {
+        /// create codegen variables from local variables
+        /// clone variable to make new independent statement
+        ast::CodegenVarVector variables;
+        for (const auto& var: local_statement->get_variables()) {
+            variables.emplace_back(new ast::CodegenVar(0, var->get_name()->clone()));
+        }
+
+        /// remove local list statement now
+        const auto& statements = node.get_statements();
+        node.erase_statement(statements.begin());
+
+        /// create new codegen variable statement and insert at the beginning of the block
+        auto type = new ast::CodegenVarType(FLOAT_TYPE);
+        auto statement = std::make_shared<ast::CodegenVarListStatement>(type, variables);
+        node.insert_statement(statements.begin(), statement);
+    }
+}
+
 void CodegenLLVMHelperVisitor::visit_procedure_block(ast::ProcedureBlock& node) {
     node.visit_children(*this);
-    add_function_procedure_node(node);
+    create_function_for_node(node);
 }
 
 void CodegenLLVMHelperVisitor::visit_function_block(ast::FunctionBlock& node) {
     node.visit_children(*this);
-    add_function_procedure_node(node);
+    create_function_for_node(node);
 }
 
+/**
+ * \brief Convert ast::NrnStateBlock to corresponding code generation function nrn_state
+ * @param node AST node representing ast::NrnStateBlock
+ *
+ * Solver passes convert the DERIVATIVE block from MOD into an ast::NrnStateBlock node
+ * that represents the `nrn_state` function in the generated CPP code. To help this
+ * code generation, we perform various transformations on ast::NrnStateBlock and
+ * create a new code generation function.
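+ *
+ * A sketch of the generated function printed as NMODL (illustrative, for a
+ * hypothetical mechanism with suffix `hh`):
+ * \code
+ * VOID nrn_state_hh() {
+ *     INTEGER id
+ *     for(id = 0; id<node_count; id = id+1) {
+ *         INTEGER node_id
+ *         DOUBLE v
+ *         node_id = node_index[id]
+ *         v = voltage[node_id]
+ *         : ... statements from the solved DERIVATIVE block ...
+ *     }
+ * }
+ * \endcode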
+ */
+void CodegenLLVMHelperVisitor::visit_nrn_state_block(ast::NrnStateBlock& node) {
+    /// statements for new function to be generated
+    ast::StatementVector function_statements;
+
+    /// create variable definition for loop index and insert at the beginning
+    std::string loop_index_var = "id";
+    std::vector<std::string> int_variables{"id"};
+    function_statements.push_back(create_local_variable_statement(int_variables, INTEGER_TYPE));
+
+    /// now create the main compute part : for loop over channel instances
+
+    /// loop constructs : initialization, condition and increment
+    const auto& initialization = create_statement_as_expression("id = 0");
+    const auto& condition = create_expression("id < node_count");
+    const auto& increment = create_statement_as_expression("id = id + 1");
+
+    /// loop body : initialization + solve blocks
+    ast::StatementVector loop_def_statements;
+    ast::StatementVector loop_index_statements;
+    ast::StatementVector loop_body_statements;
+    {
+        std::vector<std::string> int_variables{"node_id"};
+        std::vector<std::string> double_variables{"v"};
+
+        /// access node index and corresponding voltage
+        loop_index_statements.push_back(visitor::create_statement("node_id = node_index[id]"));
+        loop_body_statements.push_back(visitor::create_statement("v = voltage[node_id]"));
+
+        /// read ion variables
+        ion_read_statements(BlockType::State,
+                            int_variables,
+                            double_variables,
+                            loop_index_statements,
+                            loop_body_statements);
+
+        /// main compute node : extract solution expressions from the derivative block
+        const auto& solutions = collect_nodes(node, {ast::AstNodeType::SOLUTION_EXPRESSION});
+        for (const auto& statement: solutions) {
+            const auto& solution = std::dynamic_pointer_cast<ast::SolutionExpression>(statement);
+            const auto& block = std::dynamic_pointer_cast<ast::StatementBlock>(
+                solution->get_node_to_solve());
+            append_statements_from_block(loop_body_statements, block);
+        }
+
+        /// add breakpoint block if no current
+        if (info.currents.empty() && info.breakpoint_node != nullptr) {
+            auto block = info.breakpoint_node->get_statement_block();
+            append_statements_from_block(loop_body_statements, block);
+        }
+
+        /// write ion statements
+        ion_write_statements(BlockType::State,
+                             int_variables,
+                             double_variables,
+                             loop_index_statements,
+                             loop_body_statements);
+
+        loop_def_statements.push_back(create_local_variable_statement(int_variables, INTEGER_TYPE));
+        loop_def_statements.push_back(
+            create_local_variable_statement(double_variables, FLOAT_TYPE));
+
+        // \todo process_shadow_update_statement and wrote_conc_call are not handled yet
+    }
+
+    ast::StatementVector loop_body;
+    loop_body.insert(loop_body.end(), loop_def_statements.begin(), loop_def_statements.end());
+    loop_body.insert(loop_body.end(), loop_index_statements.begin(), loop_index_statements.end());
+    loop_body.insert(loop_body.end(), loop_body_statements.begin(), loop_body_statements.end());
+
+    /// now construct a new code block which will become the body of the loop
+    auto loop_block = std::make_shared<ast::StatementBlock>(loop_body);
+
+    /// convert all variables inside loop body to instance variables
+    convert_to_instance_variable(*loop_block, loop_index_var);
+
+    /// create for loop node
+    auto for_loop_statement = std::make_shared<ast::CodegenForStatement>(initialization,
+                                                                         condition,
+                                                                         increment,
+                                                                         loop_block);
+
+    /// the loop itself becomes one of the statements in the function
+    function_statements.push_back(for_loop_statement);
+
+    /// new block for the function
+    auto function_block = new ast::StatementBlock(function_statements);
+
+    /// name of the function and its return type
+    std::string function_name = "nrn_state_" + stringutils::tolower(info.mod_suffix);
+    auto name = new ast::Name(new ast::String(function_name));
+    auto return_type = new ast::CodegenVarType(ast::AstNodeType::VOID);
+
+    /// \todo : currently there are no arguments
+    ast::CodegenArgumentVector code_arguments;
+
+    /// finally, create new function
+    auto function =
+        std::make_shared<ast::CodegenFunction>(return_type, name, code_arguments, function_block);
+    codegen_functions.push_back(function);
+
+    std::cout << nmodl::to_nmodl(function);
+}
 
 void CodegenLLVMHelperVisitor::visit_program(ast::Program& node) {
+    /// run codegen helper visitor to collect information
+    CodegenHelperVisitor v;
+    info = v.analyze(node);
+
     logger->info("Running CodegenLLVMHelperVisitor");
     node.visit_children(*this);
     for (auto& fun: codegen_functions) {
diff --git a/src/codegen/llvm/codegen_llvm_helper_visitor.hpp b/src/codegen/llvm/codegen_llvm_helper_visitor.hpp
index b7ff57aec1..6b1684e7d1 100644
--- a/src/codegen/llvm/codegen_llvm_helper_visitor.hpp
+++ b/src/codegen/llvm/codegen_llvm_helper_visitor.hpp
@@ -28,19 +28,57 @@ namespace codegen {
 
 /**
  * \class CodegenLLVMHelperVisitor
- * \brief Helper visitor to gather AST information to help LLVM code generation
+ * \brief Helper visitor for AST information to help code generation backends
+ *
+ * Code generation backends convert the NMODL AST to C++ code. But during this
+ * C++ code generation, various transformations happen and the final generated
+ * code is quite different from / larger than the actual kernel represented in
+ * the MOD file or NMODL AST.
+ *
+ * Currently, these transformations are embedded into code generation backends
+ * like codegen::CodegenCVisitor. If we have to generate code for a new simulator,
+ * there will be duplication of these transformations. Also, for completely new
+ * backends like the NEURON simulator or a SIMD library, we will have code duplication.
+ *
+ * In order to avoid this, we perform as many transformations as possible in this
+ * visitor. Currently we focus on transformations that will help the LLVM backend
+ * but later these will be common across all backends.
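+ *
+ * Usage sketch (illustrative):
+ * \code{.cpp}
+ * CodegenLLVMHelperVisitor visitor;
+ * visitor.visit_program(node);  // appends generated CodegenFunction nodes to the AST
+ * \endcode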
*/ class CodegenLLVMHelperVisitor: public visitor::AstVisitor { + /// newly generated code generation specific functions std::vector> codegen_functions; - void add_function_procedure_node(ast::Block& node); + /// ast information for code generation + codegen::CodegenInfo info; + + /// default integer and float node type + const ast::AstNodeType INTEGER_TYPE = ast::AstNodeType::INTEGER; + const ast::AstNodeType FLOAT_TYPE = ast::AstNodeType::DOUBLE; + + /// create new function for FUNCTION or PROCEDURE block + void create_function_for_node(ast::Block& node); public: CodegenLLVMHelperVisitor() = default; + void ion_read_statements(BlockType type, + std::vector& int_variables, + std::vector& double_variables, + ast::StatementVector& index_statements, + ast::StatementVector& body_statements); + + void ion_write_statements(BlockType type, + std::vector& int_variables, + std::vector& double_variables, + ast::StatementVector& index_statements, + ast::StatementVector& body_statements); + + void convert_to_instance_variable(ast::Node& node, std::string& index_var); + void visit_statement_block(ast::StatementBlock& node) override; void visit_procedure_block(ast::ProcedureBlock& node) override; void visit_function_block(ast::FunctionBlock& node) override; + void visit_nrn_state_block(ast::NrnStateBlock& node) override; void visit_program(ast::Program& node) override; }; diff --git a/src/codegen/llvm/codegen_llvm_visitor.cpp b/src/codegen/llvm/codegen_llvm_visitor.cpp index 0fa0864d9a..6228b39d04 100644 --- a/src/codegen/llvm/codegen_llvm_visitor.cpp +++ b/src/codegen/llvm/codegen_llvm_visitor.cpp @@ -350,7 +350,7 @@ void CodegenLLVMVisitor::visit_program(const ast::Program& node) { // Keep this for easier development (maybe move to debug mode later). std::cout << print_module(); - // not used yet + // not used yet : this will be used at the beginning of this function { CodegenLLVMHelperVisitor v; v.visit_program(const_cast(node)); diff --git a/src/language/code_generator.cmake b/src/language/code_generator.cmake index f4847f8ec8..a2bece8b4a 100644 --- a/src/language/code_generator.cmake +++ b/src/language/code_generator.cmake @@ -66,6 +66,7 @@ set(AST_GENERATED_SOURCES ${PROJECT_BINARY_DIR}/src/ast/boolean.hpp ${PROJECT_BINARY_DIR}/src/ast/breakpoint_block.hpp ${PROJECT_BINARY_DIR}/src/ast/codegen_argument.hpp + ${PROJECT_BINARY_DIR}/src/ast/codegen_atomic_statement.hpp ${PROJECT_BINARY_DIR}/src/ast/codegen_for_statement.hpp ${PROJECT_BINARY_DIR}/src/ast/codegen_function.hpp ${PROJECT_BINARY_DIR}/src/ast/codegen_return_statement.hpp @@ -194,6 +195,7 @@ set(AST_GENERATED_SOURCES ${PROJECT_BINARY_DIR}/src/ast/valence.hpp ${PROJECT_BINARY_DIR}/src/ast/var_name.hpp ${PROJECT_BINARY_DIR}/src/ast/verbatim.hpp + ${PROJECT_BINARY_DIR}/src/ast/void.hpp ${PROJECT_BINARY_DIR}/src/ast/watch.hpp ${PROJECT_BINARY_DIR}/src/ast/watch_statement.hpp ${PROJECT_BINARY_DIR}/src/ast/while_statement.hpp diff --git a/src/language/codegen.yaml b/src/language/codegen.yaml index 6a3f7fbf83..104f41420e 100644 --- a/src/language/codegen.yaml +++ b/src/language/codegen.yaml @@ -28,6 +28,9 @@ - Expression: children: - Number: + - Void: + nmodl: "VOID" + brief: "Represent void type in code generation" - Identifier: children: - CodegenVarType: @@ -194,7 +197,7 @@ brief: "condition expression for the loop" type: Expression optional: true - prefix: {value: ";"} + prefix: {value: "; "} suffix: {value: "; "} - increment: brief: "increment or decrement expression for the loop" @@ -226,3 +229,35 @@ vector: true separator: ", " add: true + 
+        - CodegenAtomicStatement:
+            brief: "Represent atomic operation"
+            description: |
+                During code generation certain operations like ion updates, vec_rhs or
+                vec_d updates (for synapse) need to be atomic operations if executed by
+                multiple threads. In case of SIMD, there are conflicts for `vec_d` and
+                `vec_rhs` for synapse types. Here are some statements from the C++ backend:
+
+                \code{.cpp}
+                    vec_d[node_id] += g
+                    vec_rhs[node_id] -= rhs
+                    ion_ina[indexes[some_index]] += ina[id]
+                    ion_cai[indexes[some_index]] = cai[id]  // cai here is a state variable
+                \endcode
+
+                These operations will be represented by the atomic statement node type:
+                * `vec_d[node_id]` : lhs
+                * `+=` : atomic_op
+                * `g` : rhs
+
+            members:
+                - lhs:
+                    brief: "Variable to be updated atomically"
+                    type: Identifier
+                - atomic_op:
+                    brief: "Operator"
+                    type: BinaryOperator
+                    prefix: {value: " "}
+                    suffix: {value: " "}
+                - rhs:
+                    brief: "Expression for atomic operation"
+                    type: Expression
diff --git a/test/unit/CMakeLists.txt b/test/unit/CMakeLists.txt
index f9c76827fd..a580d32477 100644
--- a/test/unit/CMakeLists.txt
+++ b/test/unit/CMakeLists.txt
@@ -129,10 +129,8 @@ endif()
 set(testvisitor_env "PYTHONPATH=${PROJECT_BINARY_DIR}/lib:$ENV{PYTHONPATH}")
 if(NOT LINK_AGAINST_PYTHON)
   list(APPEND testvisitor_env "NMODL_PYLIB=$ENV{NMODL_PYLIB}")
-  list(
-    APPEND
-    testvisitor_env
-    "NMODL_WRAPLIB=${PROJECT_BINARY_DIR}/lib/nmodl/libpywrapper${CMAKE_SHARED_LIBRARY_SUFFIX}")
+  list(APPEND testvisitor_env
+       "NMODL_WRAPLIB=${PROJECT_BINARY_DIR}/lib/nmodl/libpywrapper${CMAKE_SHARED_LIBRARY_SUFFIX}")
 endif()
 
 foreach(

From 92eae9011f71cb219ebd1da352dff2441ae33d14 Mon Sep 17 00:00:00 2001
From: George Mitenkov
Date: Fri, 8 Jan 2021 10:58:58 +0300
Subject: [PATCH 122/331] Running functions from MOD files via LLVM JIT (#482)

This commit introduces functionality to execute functions from a MOD
file via LLVM JIT. For that, there is now:

- `JITDriver` class that, given an LLVM IR module, sets up the JIT
  compiler and is able to look up a function and execute it.
- `Runner` class that wraps around the JIT driver. It helps to
  initialise the JIT with an LLVM IR module only once, and then run
  multiple functions from it.

To execute functions, the `nmodl_llvm_runner` executable is used. It
takes a single MOD file and a specified entry-point function, and runs
it through the LLVM code generation pipeline and the JIT driver. Only
functions with double result types are supported at the moment.

For example, for MOD file `foo.mod`:
```
FUNCTION one() {
    one = 1
}

FUNCTION bar() {
    bar = one() + exp(1)
}
```
running `nmodl_llvm_runner -f foo.mod -e bar` gives
```
Result: 3.718282
```

Tests for execution of generated IR have been added as well.
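A minimal usage sketch of the new API (illustrative, assuming a parsed and
symtab-processed `ast` for the MOD file above):
```
codegen::CodegenLLVMVisitor visitor(filename, ".", /*opt_passes=*/false);
visitor.visit_program(*ast);
runner::Runner runner(visitor.get_module());
double result = runner.run<double>("bar");  // Result: 3.718282
```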
fixes #482 Co-authored-by: Pramod Kumbhar --- cmake/LLVMHelper.cmake | 2 +- src/codegen/llvm/CMakeLists.txt | 38 ++++- src/codegen/llvm/codegen_llvm_visitor.hpp | 10 +- src/codegen/llvm/jit_driver.cpp | 81 ++++++++++ src/codegen/llvm/jit_driver.hpp | 82 ++++++++++ src/codegen/llvm/main.cpp | 74 +++++++++ test/unit/CMakeLists.txt | 13 ++ test/unit/codegen/codegen_llvm_execution.cpp | 162 +++++++++++++++++++ 8 files changed, 457 insertions(+), 5 deletions(-) create mode 100644 src/codegen/llvm/jit_driver.cpp create mode 100644 src/codegen/llvm/jit_driver.hpp create mode 100644 src/codegen/llvm/main.cpp create mode 100644 test/unit/codegen/codegen_llvm_execution.cpp diff --git a/cmake/LLVMHelper.cmake b/cmake/LLVMHelper.cmake index 5d451697b9..a731fa0151 100644 --- a/cmake/LLVMHelper.cmake +++ b/cmake/LLVMHelper.cmake @@ -5,7 +5,7 @@ find_package(LLVM REQUIRED CONFIG) # include LLVM header and core library -llvm_map_components_to_libnames(LLVM_LIBS_TO_LINK core native) +llvm_map_components_to_libnames(LLVM_LIBS_TO_LINK core orcjit native) set(CMAKE_REQUIRED_INCLUDES ${LLVM_INCLUDE_DIRS}) set(CMAKE_REQUIRED_LIBRARIES ${LLVM_LIBS_TO_LINK}) diff --git a/src/codegen/llvm/CMakeLists.txt b/src/codegen/llvm/CMakeLists.txt index db16d4072c..bd54f4143d 100644 --- a/src/codegen/llvm/CMakeLists.txt +++ b/src/codegen/llvm/CMakeLists.txt @@ -5,12 +5,44 @@ set(LLVM_CODEGEN_SOURCE_FILES ${CMAKE_CURRENT_SOURCE_DIR}/codegen_llvm_visitor.cpp ${CMAKE_CURRENT_SOURCE_DIR}/codegen_llvm_visitor.hpp ${CMAKE_CURRENT_SOURCE_DIR}/codegen_llvm_helper_visitor.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/codegen_llvm_helper_visitor.hpp) + ${CMAKE_CURRENT_SOURCE_DIR}/codegen_llvm_helper_visitor.hpp + ${CMAKE_CURRENT_SOURCE_DIR}/jit_driver.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/jit_driver.hpp) # ============================================================================= -# LLVM codegen library +# LLVM codegen library and executable # ============================================================================= include_directories(${LLVM_INCLUDE_DIRS}) -add_library(llvm_codegen STATIC ${LLVM_CODEGEN_SOURCE_FILES}) +add_library(runner_obj OBJECT ${LLVM_CODEGEN_SOURCE_FILES}) +add_dependencies(runner_obj lexer_obj) +set_property(TARGET runner_obj PROPERTY POSITION_INDEPENDENT_CODE ON) + +add_library(llvm_codegen STATIC $) + add_dependencies(llvm_codegen lexer util visitor) + +if(NOT NMODL_AS_SUBPROJECT) + add_executable(nmodl_llvm_runner main.cpp) + + target_link_libraries( + nmodl_llvm_runner + llvm_codegen + codegen + visitor + symtab + lexer + util + test_util + printer + ${NMODL_WRAPPER_LIBS} + ${LLVM_LIBS_TO_LINK}) +endif() + +# ============================================================================= +# Install executable +# ============================================================================= + +if(NOT NMODL_AS_SUBPROJECT) + install(TARGETS nmodl_llvm_runner DESTINATION ${NMODL_INSTALL_DIR_SUFFIX}bin) +endif() diff --git a/src/codegen/llvm/codegen_llvm_visitor.hpp b/src/codegen/llvm/codegen_llvm_visitor.hpp index be4eb04867..599cfc7b58 100644 --- a/src/codegen/llvm/codegen_llvm_visitor.hpp +++ b/src/codegen/llvm/codegen_llvm_visitor.hpp @@ -154,6 +154,14 @@ class CodegenLLVMVisitor: public visitor::ConstAstVisitor { */ void emit_procedure_or_function_declaration(const ast::Block& node); + /** + * Return module pointer + * \return LLVM IR module pointer + */ + std::unique_ptr get_module() { + return std::move(module); + } + /** * Visit nmodl function or procedure * \param node the AST node representing the function or 
 procedure in NMODL
@@ -173,7 +181,7 @@
     void visit_unary_expression(const ast::UnaryExpression& node) override;
     void visit_var_name(const ast::VarName& node) override;
 
-    // TODO: use custom printer here
+    // \todo: move this to debug mode (e.g. -v option or --dump-ir)
     std::string print_module() const {
         std::string str;
         llvm::raw_string_ostream os(str);
diff --git a/src/codegen/llvm/jit_driver.cpp b/src/codegen/llvm/jit_driver.cpp
new file mode 100644
index 0000000000..a7673bb2ff
--- /dev/null
+++ b/src/codegen/llvm/jit_driver.cpp
@@ -0,0 +1,81 @@
+/*************************************************************************
+ * Copyright (C) 2018-2020 Blue Brain Project
+ *
+ * This file is part of NMODL distributed under the terms of the GNU
+ * Lesser General Public License. See top-level LICENSE file for details.
+ *************************************************************************/
+
+#include "jit_driver.hpp"
+#include "codegen/llvm/codegen_llvm_visitor.hpp"
+
+#include "llvm/ExecutionEngine/JITEventListener.h"
+#include "llvm/ExecutionEngine/ObjectCache.h"
+#include "llvm/ExecutionEngine/Orc/CompileUtils.h"
+#include "llvm/ExecutionEngine/Orc/IRCompileLayer.h"
+#include "llvm/ExecutionEngine/Orc/JITTargetMachineBuilder.h"
+#include "llvm/ExecutionEngine/Orc/LLJIT.h"
+#include "llvm/ExecutionEngine/SectionMemoryManager.h"
+#include "llvm/Support/Host.h"
+#include "llvm/Support/TargetRegistry.h"
+#include "llvm/Support/TargetSelect.h"
+
+namespace nmodl {
+namespace runner {
+
+void JITDriver::init() {
+    llvm::InitializeNativeTarget();
+    llvm::InitializeNativeTargetAsmPrinter();
+
+    set_target_triple(module.get());
+    auto data_layout = module->getDataLayout();
+
+    // Create IR compile function callback.
+    auto compile_function_creator = [&](llvm::orc::JITTargetMachineBuilder tm_builder)
+        -> llvm::Expected<std::unique_ptr<llvm::orc::IRCompileLayer::IRCompiler>> {
+        auto tm = tm_builder.createTargetMachine();
+        if (!tm)
+            return tm.takeError();
+        return std::make_unique<llvm::orc::ConcurrentIRCompiler>(std::move(*tm));
+    };
+
+    auto jit_instance = cantFail(
+        llvm::orc::LLJITBuilder().setCompileFunctionCreator(compile_function_creator).create());
+
+    // Add a ThreadSafeModule to the driver.
+    llvm::orc::ThreadSafeModule tsm(std::move(module), std::make_unique<llvm::LLVMContext>());
+    cantFail(jit_instance->addIRModule(std::move(tsm)));
+    jit = std::move(jit_instance);
+
+    // Resolve symbols.
+    llvm::orc::JITDylib& sym_tab = jit->getMainJITDylib();
+    sym_tab.addGenerator(cantFail(llvm::orc::DynamicLibrarySearchGenerator::GetForCurrentProcess(
+        data_layout.getGlobalPrefix())));
+}
+
+void JITDriver::set_target_triple(llvm::Module* module) {
+    auto target_triple = llvm::sys::getDefaultTargetTriple();
+    std::string error;
+    auto target = llvm::TargetRegistry::lookupTarget(target_triple, error);
+    if (!target)
+        throw std::runtime_error("Error: " + error + "\n");
+
+    std::string cpu(llvm::sys::getHostCPUName());
+    llvm::SubtargetFeatures features;
+    llvm::StringMap<bool> host_features;
+
+    if (llvm::sys::getHostCPUFeatures(host_features)) {
+        for (auto& f: host_features)
+            features.AddFeature(f.first(), f.second);
+    }
+
+    std::unique_ptr<llvm::TargetMachine> machine(
+        target->createTargetMachine(target_triple, cpu, features.getString(), {}, {}));
+    if (!machine)
+        throw std::runtime_error("Error: failed to create a target machine\n");
+
+    module->setDataLayout(machine->createDataLayout());
+    module->setTargetTriple(target_triple);
+}
+
+}  // namespace runner
+}  // namespace nmodl
diff --git a/src/codegen/llvm/jit_driver.hpp b/src/codegen/llvm/jit_driver.hpp
new file mode 100644
index 0000000000..d1e9a9412f
--- /dev/null
+++ b/src/codegen/llvm/jit_driver.hpp
@@ -0,0 +1,82 @@
+/*************************************************************************
+ * Copyright (C) 2018-2020 Blue Brain Project
+ *
+ * This file is part of NMODL distributed under the terms of the GNU
+ * Lesser General Public License. See top-level LICENSE file for details.
+ *************************************************************************/
+
+#pragma once
+
+/**
+ * \dir
+ * \brief Implementation of LLVM's JIT-based execution engine to run functions from MOD files
+ *
+ * \file
+ * \brief \copybrief nmodl::runner::JITDriver
+ */
+
+#include "llvm/ExecutionEngine/Orc/LLJIT.h"
+
+namespace nmodl {
+namespace runner {
+
+/**
+ * \class JITDriver
+ * \brief Driver to execute MOD file function via LLVM IR backend
+ */
+class JITDriver {
+  private:
+    std::unique_ptr<llvm::LLVMContext> context = std::make_unique<llvm::LLVMContext>();
+
+    std::unique_ptr<llvm::orc::LLJIT> jit;
+
+    std::unique_ptr<llvm::Module> module;
+
+  public:
+    JITDriver(std::unique_ptr<llvm::Module> m)
+        : module(std::move(m)) {}
+
+    /// Initialize the JIT.
+    void init();
+
+    /// Look up the entry-point in the JIT and execute it, returning the result.
+    template <typename T>
+    T execute(const std::string& entry_point) {
+        auto expected_symbol = jit->lookup(entry_point);
+        if (!expected_symbol)
+            throw std::runtime_error("Error: entry-point symbol not found in JIT\n");
+
+        auto (*res)() = (T(*)())(intptr_t) expected_symbol->getAddress();
+        T result = res();
+        return result;
+    }
+
+    /// Set the target triple on the module.
+    static void set_target_triple(llvm::Module* module);
+};
+
+/**
+ * \class Runner
+ * \brief A wrapper around JITDriver to execute an entry point in the LLVM IR module.
+ */
+class Runner {
+  private:
+    std::unique_ptr<llvm::Module> module;
+
+    std::unique_ptr<JITDriver> driver = std::make_unique<JITDriver>(std::move(module));
+
+  public:
+    Runner(std::unique_ptr<llvm::Module> m)
+        : module(std::move(m)) {
+        driver->init();
+    }
+
+    /// Run the entry-point function.
+ template + double run(const std::string& entry_point) { + return driver->execute(entry_point); + } +}; + +} // namespace runner +} // namespace nmodl diff --git a/src/codegen/llvm/main.cpp b/src/codegen/llvm/main.cpp new file mode 100644 index 0000000000..11ea178cb4 --- /dev/null +++ b/src/codegen/llvm/main.cpp @@ -0,0 +1,74 @@ +/************************************************************************* + * Copyright (C) 2018-2021 Blue Brain Project + * + * This file is part of NMODL distributed under the terms of the GNU + * Lesser General Public License. See top-level LICENSE file for details. + *************************************************************************/ + +#include + +#include "ast/program.hpp" +#include "codegen/llvm/codegen_llvm_visitor.hpp" +#include "jit_driver.hpp" +#include "parser/nmodl_driver.hpp" +#include "utils/logger.hpp" +#include "visitors/symtab_visitor.hpp" + +#include "llvm/Support/TargetRegistry.h" +#include "llvm/Support/TargetSelect.h" + +using namespace nmodl; +using namespace runner; + +int main(int argc, const char* argv[]) { + CLI::App app{ + "NMODL LLVM Runner : Executes functions from a MOD file via LLVM IR code generation"}; + + // Currently, only a single MOD file is supported, as well as an entry point with a double + // return type. While returning a double value is a general case in NMODL, it will be nice to + // have a more generic functionality. \todo: Add support for different return types (int, void). + + std::string filename; + std::string entry_point_name = "main"; + + app.add_option("-f,--file,file", filename, "A single MOD file source") + ->required() + ->check(CLI::ExistingFile); + app.add_option("-e,--entry-point,entry-point", + entry_point_name, + "An entry point function from the MOD file"); + + CLI11_PARSE(app, argc, argv); + + logger->info("Parsing MOD file to AST"); + parser::NmodlDriver driver; + const auto& ast = driver.parse_file(filename); + + logger->info("Running Symtab Visitor"); + visitor::SymtabVisitor().visit_program(*ast); + + logger->info("Running LLVM Visitor"); + codegen::CodegenLLVMVisitor llvm_visitor(filename, /*output_dir=*/".", /*opt_passes=*/false); + llvm_visitor.visit_program(*ast); + std::unique_ptr module = llvm_visitor.get_module(); + + // Check if the entry-point is valid for JIT driver to execute. + auto func = module->getFunction(entry_point_name); + if (!func) + throw std::runtime_error("Error: entry-point is not found\n"); + + if (func->getNumOperands() != 0) + throw std::runtime_error("Error: entry-point functions with arguments are not supported\n"); + + if (!func->getReturnType()->isDoubleTy()) + throw std::runtime_error( + "Error: entry-point functions with non-double return type are not supported\n"); + + Runner runner(std::move(module)); + + // Since only double type is supported, provide explicit double type to the running function. 
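+    // (that is, the call below instantiates run with double - the template
+    // argument fixes the return type the JIT'd entry point must produce)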
+ auto r = runner.run(entry_point_name); + fprintf(stderr, "Result: %f\n", r); + + return 0; +} diff --git a/test/unit/CMakeLists.txt b/test/unit/CMakeLists.txt index a580d32477..4c59e148c4 100644 --- a/test/unit/CMakeLists.txt +++ b/test/unit/CMakeLists.txt @@ -107,6 +107,7 @@ target_link_libraries( if(NMODL_ENABLE_LLVM) include_directories(${LLVM_INCLUDE_DIRS}) add_executable(testllvm visitor/main.cpp codegen/llvm.cpp) + add_executable(test_llvm_runner visitor/main.cpp codegen/codegen_llvm_execution.cpp) target_link_libraries( testllvm llvm_codegen @@ -119,6 +120,18 @@ if(NMODL_ENABLE_LLVM) printer ${NMODL_WRAPPER_LIBS} ${LLVM_LIBS_TO_LINK}) + target_link_libraries( + test_llvm_runner + llvm_codegen + codegen + visitor + symtab + lexer + util + test_util + printer + ${NMODL_WRAPPER_LIBS} + ${LLVM_LIBS_TO_LINK}) set(CODEGEN_TEST testllvm) endif() diff --git a/test/unit/codegen/codegen_llvm_execution.cpp b/test/unit/codegen/codegen_llvm_execution.cpp new file mode 100644 index 0000000000..6f1bf7b8ca --- /dev/null +++ b/test/unit/codegen/codegen_llvm_execution.cpp @@ -0,0 +1,162 @@ +/************************************************************************* + * Copyright (C) 2018-2020 Blue Brain Project + * + * This file is part of NMODL distributed under the terms of the GNU + * Lesser General Public License. See top-level LICENSE file for details. + *************************************************************************/ + +#include +#include + +#include "ast/program.hpp" +#include "codegen/llvm/codegen_llvm_visitor.hpp" +#include "codegen/llvm/jit_driver.hpp" +#include "parser/nmodl_driver.hpp" +#include "visitors/checkparent_visitor.hpp" +#include "visitors/symtab_visitor.hpp" + +using namespace nmodl; +using namespace runner; +using namespace visitor; +using nmodl::parser::NmodlDriver; + +static double EPSILON = 1e-15; + +//============================================================================= +// No optimisations +//============================================================================= + +SCENARIO("Arithmetic expression", "[llvm][runner]") { + GIVEN("Functions with some arithmetic expressions") { + std::string nmodl_text = R"( + FUNCTION exponential() { + LOCAL i + i = 1 + exponential = exp(i) + } + + FUNCTION constant() { + constant = 10 + } + + FUNCTION arithmetic() { + LOCAL x, y + x = 3 + y = 7 + arithmetic = x * y / (x + y) + } + + FUNCTION bar() { + LOCAL i, j + i = 2 + j = i + 2 + bar = 2 * 3 + j + } + + FUNCTION function_call() { + foo() + function_call = bar() / constant() + } + + PROCEDURE foo() {} + )"; + + + NmodlDriver driver; + const auto& ast = driver.parse_string(nmodl_text); + + SymtabVisitor().visit_program(*ast); + codegen::CodegenLLVMVisitor llvm_visitor(/*mod_filename=*/"unknown", + /*output_dir=*/".", + /*opt_passes=*/false); + llvm_visitor.visit_program(*ast); + + std::unique_ptr m = llvm_visitor.get_module(); + Runner runner(std::move(m)); + + THEN("functions are evaluated correctly") { + auto exp_result = runner.run("exponential"); + REQUIRE(fabs(exp_result - 2.718281828459045) < EPSILON); + + auto constant_result = runner.run("constant"); + REQUIRE(fabs(constant_result - 10.0) < EPSILON); + + auto arithmetic_result = runner.run("arithmetic"); + REQUIRE(fabs(arithmetic_result - 2.1) < EPSILON); + + auto function_call_result = runner.run("function_call"); + REQUIRE(fabs(function_call_result - 1.0) < EPSILON); + } + } +} + +//============================================================================= +// With optimisations 
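+// (largely the same functions as above, now compiled with opt_passes = true)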
+//============================================================================= + +SCENARIO("Optimised arithmetic expression", "[llvm][runner]") { + GIVEN("Functions with some arithmetic expressions") { + std::string nmodl_text = R"( + FUNCTION exponential() { + LOCAL i + i = 1 + exponential = exp(i) + } + + FUNCTION constant() { + constant = 10 * 2 - 100 / 50 * 5 + } + + FUNCTION arithmetic() { + LOCAL x, y + x = 3 + y = 7 + arithmetic = x * y / (x + y) + } + + FUNCTION bar() { + LOCAL i, j + i = 2 + j = i + 2 + bar = 2 * 3 + j + } + + FUNCTION function_call() { + foo() + function_call = bar() / constant() + } + + PROCEDURE foo() {} + )"; + + + NmodlDriver driver; + const auto& ast = driver.parse_string(nmodl_text); + + SymtabVisitor().visit_program(*ast); + codegen::CodegenLLVMVisitor llvm_visitor(/*mod_filename=*/"unknown", + /*output_dir=*/".", + /*opt_passes=*/true); + llvm_visitor.visit_program(*ast); + + std::unique_ptr m = llvm_visitor.get_module(); + Runner runner(std::move(m)); + + THEN("optimizations preserve function results") { + // Check exponential is turned into a constant. + auto exp_result = runner.run("exponential"); + REQUIRE(fabs(exp_result - 2.718281828459045) < EPSILON); + + // Check constant folding. + auto constant_result = runner.run("constant"); + REQUIRE(fabs(constant_result - 10.0) < EPSILON); + + // Check constant folding. + auto arithmetic_result = runner.run("arithmetic"); + REQUIRE(fabs(arithmetic_result - 2.1) < EPSILON); + + auto function_call_result = runner.run("function_call"); + REQUIRE(fabs(function_call_result - 1.0) < EPSILON); + } + } +} From 5698fd7beec4de5aedbc95ddacd989bb5daaa0f7 Mon Sep 17 00:00:00 2001 From: George Mitenkov Date: Tue, 12 Jan 2021 12:50:22 +0300 Subject: [PATCH 123/331] Extended support for binary ops and refactoring (#489) * Added more bin ops and refactored code - Now, there are code generation functions for all comparison and logical operators. - Code generation functions are now split based on the expression "type" (assignment, arithmetic, comparison, logical). Moreover, the lhs and rhs expression results can be both double and integer. This is important for control flow code generation and for the new AST node CodegenVarType. * Added support for NOT op * Added default type flag to switch between float and double * Added tests for single precision * Renames LLVM test file to codegen_llvm_ir.cpp to follow convention. * NOTE : Tests for new operators will be added when the first control flow node (most likely FOR node) will land. 
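
  A condensed view of the new dispatch in visit_binary_expression, taken
  from the diff below (each helper's DISPATCH macro then selects the
  floating-point or integer LLVM instruction):

      switch (op) {
      case ast::BOP_ADDITION:       // also DIV, MUL, SUB
          result = visit_arithmetic_bin_op(lhs, rhs, op);
          break;
      case ast::BOP_AND:            // also OR
          result = visit_logical_bin_op(lhs, rhs, op);
          break;
      case ast::BOP_EXACT_EQUAL:    // also GT, GE, LT, LE, NE
          result = visit_comparison_bin_op(lhs, rhs, op);
          break;
      default:
          throw std::runtime_error("Error: binary operator is not supported\n");
      }
      values.push_back(result);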
fixes #453 --- src/codegen/llvm/codegen_llvm_visitor.cpp | 158 ++++++++++++++---- src/codegen/llvm/codegen_llvm_visitor.hpp | 47 +++++- src/main.cpp | 8 +- test/unit/CMakeLists.txt | 2 +- .../codegen/{llvm.cpp => codegen_llvm_ir.cpp} | 20 ++- 5 files changed, 188 insertions(+), 47 deletions(-) rename test/unit/codegen/{llvm.cpp => codegen_llvm_ir.cpp} (95%) diff --git a/src/codegen/llvm/codegen_llvm_visitor.cpp b/src/codegen/llvm/codegen_llvm_visitor.cpp index 6228b39d04..6f134149e3 100644 --- a/src/codegen/llvm/codegen_llvm_visitor.cpp +++ b/src/codegen/llvm/codegen_llvm_visitor.cpp @@ -65,6 +65,12 @@ unsigned CodegenLLVMVisitor::get_array_index_or_length(const ast::IndexedName& i return static_cast(*macro->get_value()); } +llvm::Type* CodegenLLVMVisitor::get_default_fp_type() { + if (use_single_precision) + return llvm::Type::getFloatTy(*context); + return llvm::Type::getDoubleTy(*context); +} + void CodegenLLVMVisitor::run_llvm_opt_passes() { /// run some common optimisation passes that are commonly suggested fpm.add(llvm::createInstructionCombiningPass()); @@ -139,10 +145,10 @@ void CodegenLLVMVisitor::emit_procedure_or_function_declaration(const ast::Block // Procedure or function parameters are doubles by default. std::vector arg_types; for (size_t i = 0; i < parameters.size(); ++i) - arg_types.push_back(llvm::Type::getDoubleTy(*context)); + arg_types.push_back(get_default_fp_type()); // If visiting a function, the return type is a double by default. - llvm::Type* return_type = node.is_function_block() ? llvm::Type::getDoubleTy(*context) + llvm::Type* return_type = node.is_function_block() ? get_default_fp_type() : llvm::Type::getVoidTy(*context); // Create a function that is automatically inserted into module's symbol table. @@ -152,6 +158,90 @@ void CodegenLLVMVisitor::emit_procedure_or_function_declaration(const ast::Block *module); } +llvm::Value* CodegenLLVMVisitor::visit_arithmetic_bin_op(llvm::Value* lhs, + llvm::Value* rhs, + unsigned op) { + const auto& bin_op = static_cast(op); + llvm::Type* lhs_type = lhs->getType(); + llvm::Value* result; + + switch (bin_op) { +#define DISPATCH(binary_op, llvm_fp_op, llvm_int_op) \ + case binary_op: \ + if (lhs_type->isDoubleTy() || lhs_type->isFloatTy()) \ + result = llvm_fp_op(lhs, rhs); \ + else \ + result = llvm_int_op(lhs, rhs); \ + return result; + + DISPATCH(ast::BinaryOp::BOP_ADDITION, builder.CreateFAdd, builder.CreateAdd); + DISPATCH(ast::BinaryOp::BOP_DIVISION, builder.CreateFDiv, builder.CreateSDiv); + DISPATCH(ast::BinaryOp::BOP_MULTIPLICATION, builder.CreateFMul, builder.CreateMul); + DISPATCH(ast::BinaryOp::BOP_SUBTRACTION, builder.CreateFSub, builder.CreateSub); + +#undef DISPATCH + + default: + return nullptr; + } +} + +void CodegenLLVMVisitor::visit_assign_op(const ast::BinaryExpression& node, llvm::Value* rhs) { + auto var = dynamic_cast(node.get_lhs().get()); + if (!var) { + throw std::runtime_error("Error: only VarName assignment is currently supported.\n"); + } + + const auto& identifier = var->get_name(); + if (identifier->is_name()) { + llvm::Value* alloca = local_named_values->lookup(var->get_node_name()); + builder.CreateStore(rhs, alloca); + } else if (identifier->is_indexed_name()) { + auto indexed_name = std::dynamic_pointer_cast(identifier); + builder.CreateStore(rhs, codegen_indexed_name(*indexed_name)); + } else { + throw std::runtime_error("Error: Unsupported variable type"); + } +} + +llvm::Value* CodegenLLVMVisitor::visit_logical_bin_op(llvm::Value* lhs, + llvm::Value* rhs, + unsigned op) { + const auto& 
bin_op = static_cast(op); + return bin_op == ast::BinaryOp::BOP_AND ? builder.CreateAnd(lhs, rhs) + : builder.CreateOr(lhs, rhs); +} + +llvm::Value* CodegenLLVMVisitor::visit_comparison_bin_op(llvm::Value* lhs, + llvm::Value* rhs, + unsigned op) { + const auto& bin_op = static_cast(op); + llvm::Type* lhs_type = lhs->getType(); + llvm::Value* result; + + switch (bin_op) { +#define DISPATCH(binary_op, f_llvm_op, i_llvm_op) \ + case binary_op: \ + if (lhs_type->isDoubleTy() || lhs_type->isFloatTy()) \ + result = f_llvm_op(lhs, rhs); \ + else \ + result = i_llvm_op(lhs, rhs); \ + return result; + + DISPATCH(ast::BinaryOp::BOP_EXACT_EQUAL, builder.CreateICmpEQ, builder.CreateFCmpOEQ); + DISPATCH(ast::BinaryOp::BOP_GREATER, builder.CreateICmpSGT, builder.CreateFCmpOGT); + DISPATCH(ast::BinaryOp::BOP_GREATER_EQUAL, builder.CreateICmpSGE, builder.CreateFCmpOGE); + DISPATCH(ast::BinaryOp::BOP_LESS, builder.CreateICmpSLT, builder.CreateFCmpOLT); + DISPATCH(ast::BinaryOp::BOP_LESS_EQUAL, builder.CreateICmpSLE, builder.CreateFCmpOLE); + DISPATCH(ast::BinaryOp::BOP_NOT_EQUAL, builder.CreateICmpNE, builder.CreateFCmpONE); + +#undef DISPATCH + + default: + return nullptr; + } +} + void CodegenLLVMVisitor::visit_procedure_or_function(const ast::Block& node) { const auto& name = node.get_node_name(); const auto& parameters = node.get_parameters(); @@ -222,44 +312,39 @@ void CodegenLLVMVisitor::visit_binary_expression(const ast::BinaryExpression& no llvm::Value* rhs = values.back(); values.pop_back(); if (op == ast::BinaryOp::BOP_ASSIGN) { - auto var = dynamic_cast(node.get_lhs().get()); - if (!var) { - throw std::runtime_error("Error: only VarName assignment is currently supported.\n"); - } - - const auto& identifier = var->get_name(); - if (identifier->is_name()) { - llvm::Value* alloca = local_named_values->lookup(var->get_node_name()); - builder.CreateStore(rhs, alloca); - } else if (identifier->is_indexed_name()) { - auto indexed_name = std::dynamic_pointer_cast(identifier); - builder.CreateStore(rhs, codegen_indexed_name(*indexed_name)); - } else { - throw std::runtime_error("Error: Unsupported variable type"); - } + visit_assign_op(node, rhs); return; } node.get_lhs()->accept(*this); llvm::Value* lhs = values.back(); values.pop_back(); - llvm::Value* result; - // \todo: Support other binary operators + llvm::Value* result; switch (op) { -#define DISPATCH(binary_op, llvm_op) \ - case binary_op: \ - result = llvm_op(lhs, rhs); \ - values.push_back(result); \ + case ast::BOP_ADDITION: + case ast::BOP_DIVISION: + case ast::BOP_MULTIPLICATION: + case ast::BOP_SUBTRACTION: + result = visit_arithmetic_bin_op(lhs, rhs, op); break; - - DISPATCH(ast::BinaryOp::BOP_ADDITION, builder.CreateFAdd); - DISPATCH(ast::BinaryOp::BOP_DIVISION, builder.CreateFDiv); - DISPATCH(ast::BinaryOp::BOP_MULTIPLICATION, builder.CreateFMul); - DISPATCH(ast::BinaryOp::BOP_SUBTRACTION, builder.CreateFSub); - -#undef DISPATCH + case ast::BOP_AND: + case ast::BOP_OR: + result = visit_logical_bin_op(lhs, rhs, op); + break; + case ast::BOP_EXACT_EQUAL: + case ast::BOP_GREATER: + case ast::BOP_GREATER_EQUAL: + case ast::BOP_LESS: + case ast::BOP_LESS_EQUAL: + case ast::BOP_NOT_EQUAL: + result = visit_comparison_bin_op(lhs, rhs, op); + break; + default: + throw std::runtime_error("Error: binary operator is not supported\n"); } + + values.push_back(result); } void CodegenLLVMVisitor::visit_boolean(const ast::Boolean& node) { @@ -269,8 +354,7 @@ void CodegenLLVMVisitor::visit_boolean(const ast::Boolean& node) { } void 
CodegenLLVMVisitor::visit_double(const ast::Double& node) {
-    const auto& constant = llvm::ConstantFP::get(llvm::Type::getDoubleTy(*context),
-                                                 node.get_value());
+    const auto& constant = llvm::ConstantFP::get(get_default_fp_type(), node.get_value());
     values.push_back(constant);
 }

@@ -310,10 +394,10 @@ void CodegenLLVMVisitor::visit_local_list_statement(const ast::LocalListStatemen
     if (identifier->is_indexed_name()) {
         auto indexed_name = std::dynamic_pointer_cast<ast::IndexedName>(identifier);
         unsigned length = get_array_index_or_length(*indexed_name);
-        var_type = llvm::ArrayType::get(llvm::Type::getDoubleTy(*context), length);
+        var_type = llvm::ArrayType::get(get_default_fp_type(), length);
     } else if (identifier->is_name()) {
         // This case corresponds to a scalar local variable. Its type is double by default.
-        var_type = llvm::Type::getDoubleTy(*context);
+        var_type = get_default_fp_type();
     } else {
         throw std::runtime_error("Error: Unsupported local variable type");
     }
@@ -367,10 +451,10 @@ void CodegenLLVMVisitor::visit_unary_expression(const ast::UnaryExpression& node
     llvm::Value* value = values.back();
     values.pop_back();
     if (op == ast::UOP_NEGATION) {
-        llvm::Value* result = builder.CreateFNeg(value);
-        values.push_back(result);
+        values.push_back(builder.CreateFNeg(value));
+    } else if (op == ast::UOP_NOT) {
+        values.push_back(builder.CreateNot(value));
     } else {
-        // Support only `double` operators for now.
         throw std::runtime_error("Error: unsupported unary operator\n");
     }
 }
diff --git a/src/codegen/llvm/codegen_llvm_visitor.hpp b/src/codegen/llvm/codegen_llvm_visitor.hpp
index 599cfc7b58..066bdf35e3 100644
--- a/src/codegen/llvm/codegen_llvm_visitor.hpp
+++ b/src/codegen/llvm/codegen_llvm_visitor.hpp
@@ -76,6 +76,9 @@ class CodegenLLVMVisitor: public visitor::ConstAstVisitor {
     // Run optimisation passes if true.
     bool opt_passes;

+    // Use 32-bit floating-point type if true. Otherwise, use default 64-bit.
+ bool use_single_precision; + /** *\brief Run LLVM optimisation passes on generated IR * @@ -93,10 +96,12 @@ class CodegenLLVMVisitor: public visitor::ConstAstVisitor { */ CodegenLLVMVisitor(const std::string& mod_filename, const std::string& output_dir, - bool opt_passes) + bool opt_passes, + bool use_single_precision = false) : mod_filename(mod_filename) , output_dir(output_dir) , opt_passes(opt_passes) + , use_single_precision(use_single_precision) , builder(*context) , fpm(module.get()) {} @@ -129,6 +134,12 @@ class CodegenLLVMVisitor: public visitor::ConstAstVisitor { */ unsigned get_array_index_or_length(const ast::IndexedName& node); + /** + * Returns 64-bit or 32-bit LLVM floating type + * \return \c LLVM floating point type according to `use_single_precision` flag + */ + llvm::Type* get_default_fp_type(); + /** * Create a function call to an external method * \param name external method name @@ -162,6 +173,40 @@ class CodegenLLVMVisitor: public visitor::ConstAstVisitor { return std::move(module); } + /** + * Visit nmodl arithmetic binary operator + * \param lhs LLVM value of evaluated lhs expression + * \param rhs LLVM value of evaluated rhs expression + * \param op the AST binary operator (ADD, DIV, MUL, SUB) + * \return LLVM IR value result + */ + llvm::Value* visit_arithmetic_bin_op(llvm::Value* lhs, llvm::Value* rhs, unsigned op); + + /** + * Visit nmodl assignment operator (ASSIGN) + * \param node the AST node representing the binary expression in NMODL + * \param rhs LLVM value of evaluated rhs expression + */ + void visit_assign_op(const ast::BinaryExpression& node, llvm::Value* rhs); + + /** + * Visit nmodl logical binary operator + * \param lhs LLVM value of evaluated lhs expression + * \param rhs LLVM value of evaluated rhs expression + * \param op the AST binary operator (AND, OR) + * \return LLVM IR value result + */ + llvm::Value* visit_logical_bin_op(llvm::Value* lhs, llvm::Value* rhs, unsigned op); + + /** + * Visit nmodl comparison binary operator + * \param lhs LLVM value of evaluated lhs expression + * \param rhs LLVM value of evaluated rhs expression + * \param op the AST binary operator (EXACT_EQUAL, GREATER, GREATER_EQUAL, LESS, LESS_EQUAL, + * NOT_EQUAL) \return LLVM IR value result + */ + llvm::Value* visit_comparison_bin_op(llvm::Value* lhs, llvm::Value* rhs, unsigned op); + /** * Visit nmodl function or procedure * \param node the AST node representing the function or procedure in NMODL diff --git a/src/main.cpp b/src/main.cpp index c1b508e21e..5b97b9a7f8 100644 --- a/src/main.cpp +++ b/src/main.cpp @@ -172,6 +172,9 @@ int main(int argc, const char* argv[]) { /// generate llvm IR bool llvm_ir(false); + /// use single precision floating-point types + bool llvm_float_type(false); + /// run llvm optimisation passes bool llvm_opt_passes(false); #endif @@ -290,6 +293,9 @@ int main(int argc, const char* argv[]) { llvm_opt->add_flag("--opt", llvm_opt_passes, "Run LLVM optimisation passes ({})"_format(llvm_opt_passes))->ignore_case(); + llvm_opt->add_flag("--single-precision", + llvm_float_type, + "Use single precision floating-point types ({})"_format(llvm_float_type))->ignore_case(); #endif // clang-format on @@ -593,7 +599,7 @@ int main(int argc, const char* argv[]) { #ifdef NMODL_LLVM_BACKEND if (llvm_ir) { logger->info("Running LLVM backend code generator"); - CodegenLLVMVisitor visitor(modfile, output_dir, llvm_opt_passes); + CodegenLLVMVisitor visitor(modfile, output_dir, llvm_opt_passes, llvm_float_type); visitor.visit_program(*ast); ast_to_nmodl(*ast, 
filepath("llvm")); } diff --git a/test/unit/CMakeLists.txt b/test/unit/CMakeLists.txt index 4c59e148c4..e99f257f88 100644 --- a/test/unit/CMakeLists.txt +++ b/test/unit/CMakeLists.txt @@ -106,7 +106,7 @@ target_link_libraries( if(NMODL_ENABLE_LLVM) include_directories(${LLVM_INCLUDE_DIRS}) - add_executable(testllvm visitor/main.cpp codegen/llvm.cpp) + add_executable(testllvm visitor/main.cpp codegen/codegen_llvm_ir.cpp) add_executable(test_llvm_runner visitor/main.cpp codegen/codegen_llvm_execution.cpp) target_link_libraries( testllvm diff --git a/test/unit/codegen/llvm.cpp b/test/unit/codegen/codegen_llvm_ir.cpp similarity index 95% rename from test/unit/codegen/llvm.cpp rename to test/unit/codegen/codegen_llvm_ir.cpp index d644947e79..e44b2b15cd 100644 --- a/test/unit/codegen/llvm.cpp +++ b/test/unit/codegen/codegen_llvm_ir.cpp @@ -22,13 +22,18 @@ using nmodl::parser::NmodlDriver; // Utility to get LLVM module as a string //============================================================================= -std::string run_llvm_visitor(const std::string& text, bool opt = false) { +std::string run_llvm_visitor(const std::string& text, + bool opt = false, + bool use_single_precision = false) { NmodlDriver driver; const auto& ast = driver.parse_string(text); SymtabVisitor().visit_program(*ast); - codegen::CodegenLLVMVisitor llvm_visitor("unknown", ".", opt); + codegen::CodegenLLVMVisitor llvm_visitor(/*mod_filename=*/"unknown", + /*output_dir=*/".", + opt, + use_single_precision); llvm_visitor.visit_program(*ast); return llvm_visitor.print_module(); } @@ -47,14 +52,15 @@ SCENARIO("Binary expression", "[visitor][llvm]") { )"; THEN("variables are loaded and add instruction is created") { - std::string module_string = run_llvm_visitor(nmodl_text); + std::string module_string = + run_llvm_visitor(nmodl_text, /*opt=*/false, /*use_single_precision=*/true); std::smatch m; - std::regex rhs(R"(%1 = load double, double\* %b)"); - std::regex lhs(R"(%2 = load double, double\* %a)"); - std::regex res(R"(%3 = fadd double %2, %1)"); + std::regex rhs(R"(%1 = load float, float\* %b)"); + std::regex lhs(R"(%2 = load float, float\* %a)"); + std::regex res(R"(%3 = fadd float %2, %1)"); - // Check the values are loaded correctly and added + // Check the float values are loaded correctly and added REQUIRE(std::regex_search(module_string, m, rhs)); REQUIRE(std::regex_search(module_string, m, lhs)); REQUIRE(std::regex_search(module_string, m, res)); From a171af103eb349ad85fc596c286e493dcead2e03 Mon Sep 17 00:00:00 2001 From: Pramod Kumbhar Date: Tue, 12 Jan 2021 10:55:23 +0100 Subject: [PATCH 124/331] Avoid converting LOCAL statement in all StatementBlocks (#492) * visit_statement_block of all FUNCTION and PROCEDURE blocks was called resulting in changing LOCAL statement to DOUBLE statement * As statement block doesn't need to be visited for this purpose, rename function to convert_local_statement * Call convert_local_statement only when required i.e. only when codegen function creation time. 
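
  For example, a statement such as

      LOCAL a, b

  inside a FUNCTION or PROCEDURE is now rewritten into the typed
  CodegenVarListStatement (i.e. a DOUBLE statement) only when the
  corresponding codegen function is created, rather than in every
  visited StatementBlock.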
fixes #491 --- src/codegen/llvm/codegen_llvm_helper_visitor.cpp | 8 +++++++- src/codegen/llvm/codegen_llvm_helper_visitor.hpp | 3 ++- 2 files changed, 9 insertions(+), 2 deletions(-) diff --git a/src/codegen/llvm/codegen_llvm_helper_visitor.cpp b/src/codegen/llvm/codegen_llvm_helper_visitor.cpp index 341ab03fb6..4dec93c52e 100644 --- a/src/codegen/llvm/codegen_llvm_helper_visitor.cpp +++ b/src/codegen/llvm/codegen_llvm_helper_visitor.cpp @@ -123,6 +123,9 @@ void CodegenLLVMHelperVisitor::create_function_for_node(ast::Block& node) { auto block = node.get_statement_block()->clone(); const auto& statements = block->get_statements(); + /// convert local statement to codegenvar statement + convert_local_statement(*block); + /// insert return variable at the start of the block ast::CodegenVarVector codegen_vars; codegen_vars.emplace_back(new ast::CodegenVar(0, return_var->clone())); @@ -356,7 +359,7 @@ void CodegenLLVMHelperVisitor::convert_to_instance_variable(ast::Node& node, * first statement in the vector. We have to remove LOCAL statement and convert * it to CodegenVarListStatement that will represent all variables as double. */ -void CodegenLLVMHelperVisitor::visit_statement_block(ast::StatementBlock& node) { +void CodegenLLVMHelperVisitor::convert_local_statement(ast::StatementBlock& node) { /// first process all children blocks if any node.visit_children(*this); @@ -475,6 +478,9 @@ void CodegenLLVMHelperVisitor::visit_nrn_state_block(ast::NrnStateBlock& node) { /// convert all variables inside loop body to instance variables convert_to_instance_variable(*loop_block, loop_index_var); + /// convert local statement to codegenvar statement + convert_local_statement(*loop_block); + /// create for loop node auto for_loop_statement = std::make_shared(initialization, condition, diff --git a/src/codegen/llvm/codegen_llvm_helper_visitor.hpp b/src/codegen/llvm/codegen_llvm_helper_visitor.hpp index 6b1684e7d1..1db659c1b4 100644 --- a/src/codegen/llvm/codegen_llvm_helper_visitor.hpp +++ b/src/codegen/llvm/codegen_llvm_helper_visitor.hpp @@ -75,7 +75,8 @@ class CodegenLLVMHelperVisitor: public visitor::AstVisitor { void convert_to_instance_variable(ast::Node& node, std::string& index_var); - void visit_statement_block(ast::StatementBlock& node) override; + void convert_local_statement(ast::StatementBlock& node); + void visit_procedure_block(ast::ProcedureBlock& node) override; void visit_function_block(ast::FunctionBlock& node) override; void visit_nrn_state_block(ast::NrnStateBlock& node) override; From 0eab81c0c4fb62f795cad16ad3ab2702f1c74f9a Mon Sep 17 00:00:00 2001 From: Pramod Kumbhar Date: Wed, 13 Jan 2021 11:55:57 +0100 Subject: [PATCH 125/331] Handle CodegenVarType type in JSON printer (#494) * Handle CodegenVarType type in JSON printer - As AstNodeType is enum type and node itself, we need to print that explicitly * Indent json visitor jinja template - initially template was not indented as code generated was not looking good - now all generated code is automatically clang-formatted so it's less of a concern. Readability is important. 
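
  For example, a CodegenVarType node holding DOUBLE is now expected to
  print its enum value as a child node, with output shaped roughly like
  (illustrative, following the printer's existing leaf-node layout):

      {"CodegenVarType": [{"name": "DOUBLE"}]}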
fixes #493 --- src/language/node_info.py | 1 + src/language/nodes.py | 4 ++ .../templates/visitors/json_visitor.cpp | 47 +++++++++++-------- 3 files changed, 32 insertions(+), 20 deletions(-) diff --git a/src/language/node_info.py b/src/language/node_info.py index bd81a0d14a..8b4e5fe0a2 100644 --- a/src/language/node_info.py +++ b/src/language/node_info.py @@ -169,6 +169,7 @@ STRING_NODE = "String" UNIT_BLOCK = "UnitBlock" AST_NODETYPE_NODE= "AstNodeType" +CODEGEN_VAR_TYPE_NODE = "CodegenVarType" # name of variable in prime node which represent order of derivative ORDER_VAR_NAME = "order" diff --git a/src/language/nodes.py b/src/language/nodes.py index 84cddd8d12..4b520cb51b 100644 --- a/src/language/nodes.py +++ b/src/language/nodes.py @@ -140,6 +140,10 @@ def is_name_node(self): def is_ast_nodetype_node(self): return self.class_name == node_info.AST_NODETYPE_NODE + @property + def is_codegen_var_type_node(self): + return self.class_name == node_info.CODEGEN_VAR_TYPE_NODE + @property def is_enum_node(self): data_type = node_info.DATA_TYPES[self.class_name] diff --git a/src/language/templates/visitors/json_visitor.cpp b/src/language/templates/visitors/json_visitor.cpp index e96bcbf10c..2a0c6d68a9 100644 --- a/src/language/templates/visitors/json_visitor.cpp +++ b/src/language/templates/visitors/json_visitor.cpp @@ -22,33 +22,40 @@ using namespace ast; {% for node in nodes %} void JSONVisitor::visit_{{ node.class_name|snake_case }}(const {{ node.class_name }}& node) { {% if node.has_children() %} - printer->push_block(node.get_node_type_name()); - if (embed_nmodl) { - printer->add_block_property("nmodl", to_nmodl(node)); - } - node.visit_children(*this); - {% if node.is_data_type_node %} + printer->push_block(node.get_node_type_name()); + if (embed_nmodl) { + printer->add_block_property("nmodl", to_nmodl(node)); + } + node.visit_children(*this); + {% if node.is_data_type_node %} {% if node.is_integer_node %} - if(!node.get_macro()) { - std::stringstream ss; - ss << node.eval(); - printer->add_node(ss.str()); - } + if(!node.get_macro()) { + std::stringstream ss; + ss << node.eval(); + printer->add_node(ss.str()); + } {% else %} - std::stringstream ss; - ss << node.eval(); - printer->add_node(ss.str()); + std::stringstream ss; + ss << node.eval(); + printer->add_node(ss.str()); {% endif %} {% endif %} - printer->pop_block(); + + {% if node.is_codegen_var_type_node %} + printer->add_node(ast::to_string(node.get_type())); + {% endif %} + + printer->pop_block(); + {% if node.is_program_node %} - if (node.get_parent() == nullptr) { - flush(); - } + if (node.get_parent() == nullptr) { + flush(); + } {% endif %} + {% else %} - (void)node; - printer->add_node("{{ node.class_name }}"); + (void)node; + printer->add_node("{{ node.class_name }}"); {% endif %} } From 183acbe86b7131e91298ffc8a27eeb600d784210 Mon Sep 17 00:00:00 2001 From: George Mitenkov Date: Mon, 25 Jan 2021 15:59:04 +0300 Subject: [PATCH 126/331] Integrating LLVM helper into LLVM visitor (#497) * LLVM Helper visitor now can return a vector of `CodegenFunction`s. * LLVM Helper visitor has been integrated into LLVM visitor: - The type of variables is still double by default, but can also be inferred from `CodegenVarType` node. - Procedure's return type changed to int (so that error codes can be returned in the future). - New visitor functions added: for `CodegenReturn`, `CodegenFunction`, `CodegenVarList` and `CodegenVarType`. 
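
  With this change, the LLVM visitor's visit_program boils down to the
  following (condensed from the diff below):

      CodegenLLVMHelperVisitor v;
      const auto& functions = v.get_codegen_functions(node);

      // Declare every function up front so that calls can be looked up
      // in the module's symbol table, then generate each body.
      for (const auto& func: functions)
          emit_procedure_or_function_declaration(*func);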
--- .../llvm/codegen_llvm_helper_visitor.cpp | 6 +- .../llvm/codegen_llvm_helper_visitor.hpp | 8 +- src/codegen/llvm/codegen_llvm_visitor.cpp | 234 ++++++++++-------- src/codegen/llvm/codegen_llvm_visitor.hpp | 17 +- test/unit/codegen/codegen_llvm_ir.cpp | 39 ++- 5 files changed, 177 insertions(+), 127 deletions(-) diff --git a/src/codegen/llvm/codegen_llvm_helper_visitor.cpp b/src/codegen/llvm/codegen_llvm_helper_visitor.cpp index 4dec93c52e..751fecfc81 100644 --- a/src/codegen/llvm/codegen_llvm_helper_visitor.cpp +++ b/src/codegen/llvm/codegen_llvm_helper_visitor.cpp @@ -71,6 +71,11 @@ std::shared_ptr create_expression(const std::string& code) { return std::make_shared(rhs->clone()); } +CodegenFunctionVector CodegenLLVMHelperVisitor::get_codegen_functions(const ast::Program& node) { + const_cast(node).accept(*this); + return codegen_functions; +} + /** * \brief Add code generation function for FUNCTION or PROCEDURE block * @param node AST node representing FUNCTION or PROCEDURE @@ -98,7 +103,6 @@ std::shared_ptr create_expression(const std::string& code) { * * We perform following transformations so that code generation backends * will have minimum logic: - * - Add return type * - Add type for the function arguments * - Define variables and return variable * - Add return type (int for PROCEDURE and double for FUNCTION) diff --git a/src/codegen/llvm/codegen_llvm_helper_visitor.hpp b/src/codegen/llvm/codegen_llvm_helper_visitor.hpp index 1db659c1b4..0ec3792b9d 100644 --- a/src/codegen/llvm/codegen_llvm_helper_visitor.hpp +++ b/src/codegen/llvm/codegen_llvm_helper_visitor.hpp @@ -21,6 +21,9 @@ namespace nmodl { namespace codegen { + +typedef std::vector> CodegenFunctionVector; + /** * @addtogroup llvm_codegen_details * @{ @@ -46,7 +49,7 @@ namespace codegen { */ class CodegenLLVMHelperVisitor: public visitor::AstVisitor { /// newly generated code generation specific functions - std::vector> codegen_functions; + CodegenFunctionVector codegen_functions; /// ast information for code generation codegen::CodegenInfo info; @@ -61,6 +64,9 @@ class CodegenLLVMHelperVisitor: public visitor::AstVisitor { public: CodegenLLVMHelperVisitor() = default; + /// run visitor and return code generation functions + CodegenFunctionVector get_codegen_functions(const ast::Program& node); + void ion_read_statements(BlockType type, std::vector& int_variables, std::vector& double_variables, diff --git a/src/codegen/llvm/codegen_llvm_visitor.cpp b/src/codegen/llvm/codegen_llvm_visitor.cpp index 6f134149e3..2d762c0e92 100644 --- a/src/codegen/llvm/codegen_llvm_visitor.cpp +++ b/src/codegen/llvm/codegen_llvm_visitor.cpp @@ -9,7 +9,6 @@ #include "codegen/llvm/codegen_llvm_helper_visitor.hpp" #include "ast/all.hpp" -#include "codegen/codegen_helper_visitor.hpp" #include "visitors/rename_visitor.hpp" #include "llvm/IR/BasicBlock.h" @@ -28,8 +27,10 @@ namespace codegen { /****************************************************************************************/ bool CodegenLLVMVisitor::check_array_bounds(const ast::IndexedName& node, unsigned index) { - llvm::Type* array_type = - local_named_values->lookup(node.get_node_name())->getType()->getPointerElementType(); + llvm::Type* array_type = current_func->getValueSymbolTable() + ->lookup(node.get_node_name()) + ->getType() + ->getPointerElementType(); unsigned length = array_type->getArrayNumElements(); return 0 <= index && index < length; } @@ -40,7 +41,7 @@ llvm::Value* CodegenLLVMVisitor::create_gep(const std::string& name, unsigned in 
indices.push_back(llvm::ConstantInt::get(index_type, 0)); indices.push_back(llvm::ConstantInt::get(index_type, index)); - return builder.CreateInBoundsGEP(local_named_values->lookup(name), indices); + return builder.CreateInBoundsGEP(current_func->getValueSymbolTable()->lookup(name), indices); } llvm::Value* CodegenLLVMVisitor::codegen_indexed_name(const ast::IndexedName& node) { @@ -65,6 +66,21 @@ unsigned CodegenLLVMVisitor::get_array_index_or_length(const ast::IndexedName& i return static_cast(*macro->get_value()); } +llvm::Type* CodegenLLVMVisitor::get_codegen_var_type(const ast::CodegenVarType& node) { + switch (node.get_type()) { + case ast::AstNodeType::BOOLEAN: + return llvm::Type::getInt1Ty(*context); + case ast::AstNodeType::DOUBLE: + return get_default_fp_type(); + case ast::AstNodeType::INTEGER: + return llvm::Type::getInt32Ty(*context); + case ast::AstNodeType::VOID: + return llvm::Type::getVoidTy(*context); + default: + throw std::runtime_error("Error: expecting a type in CodegenVarType node\n"); + } +} + llvm::Type* CodegenLLVMVisitor::get_default_fp_type() { if (use_single_precision) return llvm::Type::getFloatTy(*context); @@ -138,18 +154,16 @@ void CodegenLLVMVisitor::create_function_call(llvm::Function* func, values.push_back(call); } -void CodegenLLVMVisitor::emit_procedure_or_function_declaration(const ast::Block& node) { +void CodegenLLVMVisitor::emit_procedure_or_function_declaration(const ast::CodegenFunction& node) { const auto& name = node.get_node_name(); - const auto& parameters = node.get_parameters(); + const auto& arguments = node.get_arguments(); // Procedure or function parameters are doubles by default. std::vector arg_types; - for (size_t i = 0; i < parameters.size(); ++i) - arg_types.push_back(get_default_fp_type()); + for (size_t i = 0; i < arguments.size(); ++i) + arg_types.push_back(get_codegen_var_type(*arguments[i]->get_type())); - // If visiting a function, the return type is a double by default. - llvm::Type* return_type = node.is_function_block() ? get_default_fp_type() - : llvm::Type::getVoidTy(*context); + llvm::Type* return_type = get_codegen_var_type(*node.get_return_type()); // Create a function that is automatically inserted into module's symbol table. llvm::Function::Create(llvm::FunctionType::get(return_type, arg_types, /*isVarArg=*/false), @@ -194,7 +208,7 @@ void CodegenLLVMVisitor::visit_assign_op(const ast::BinaryExpression& node, llvm const auto& identifier = var->get_name(); if (identifier->is_name()) { - llvm::Value* alloca = local_named_values->lookup(var->get_node_name()); + llvm::Value* alloca = current_func->getValueSymbolTable()->lookup(var->get_node_name()); builder.CreateStore(rhs, alloca); } else if (identifier->is_indexed_name()) { auto indexed_name = std::dynamic_pointer_cast(identifier); @@ -242,62 +256,6 @@ llvm::Value* CodegenLLVMVisitor::visit_comparison_bin_op(llvm::Value* lhs, } } -void CodegenLLVMVisitor::visit_procedure_or_function(const ast::Block& node) { - const auto& name = node.get_node_name(); - const auto& parameters = node.get_parameters(); - llvm::Function* func = module->getFunction(name); - - // Create the entry basic block of the function/procedure and point the local named values table - // to the symbol table. - llvm::BasicBlock* body = llvm::BasicBlock::Create(*context, /*Name=*/"", func); - builder.SetInsertPoint(body); - local_named_values = func->getValueSymbolTable(); - - // When processing a function, it returns a value named in NMODL. 
Therefore, we - // first run RenameVisitor to rename it into ret_. This will aid in avoiding - // symbolic conflicts. Then, allocate the return variable on the local stack. - std::string return_var_name = "ret_" + name; - const auto& block = node.get_statement_block(); - if (node.is_function_block()) { - visitor::RenameVisitor v(name, return_var_name); - block->accept(v); - builder.CreateAlloca(llvm::Type::getDoubleTy(*context), - /*ArraySize=*/nullptr, - return_var_name); - } - - // Allocate parameters on the stack and add them to the symbol table. - unsigned i = 0; - for (auto& arg: func->args()) { - std::string arg_name = parameters[i++].get()->get_node_name(); - llvm::Value* alloca = builder.CreateAlloca(arg.getType(), /*ArraySize=*/nullptr, arg_name); - arg.setName(arg_name); - builder.CreateStore(&arg, alloca); - } - - // Process function or procedure body. - const auto& statements = block->get_statements(); - for (const auto& statement: statements) { - // \todo: Support other statement types. - if (statement->is_local_list_statement() || statement->is_expression_statement()) - statement->accept(*this); - } - - // Add the terminator. If visiting function, we need to return the value specified by - // ret_. - if (node.is_function_block()) { - llvm::Value* return_var = builder.CreateLoad(local_named_values->lookup(return_var_name)); - builder.CreateRet(return_var); - } else { - builder.CreateRetVoid(); - } - - // Clear local values stack and remove the pointer to the local symbol table. - values.clear(); - local_named_values = nullptr; -} - - /****************************************************************************************/ /* Overloaded visitor routines */ /****************************************************************************************/ @@ -353,13 +311,101 @@ void CodegenLLVMVisitor::visit_boolean(const ast::Boolean& node) { values.push_back(constant); } +void CodegenLLVMVisitor::visit_codegen_function(const ast::CodegenFunction& node) { + const auto& name = node.get_node_name(); + const auto& arguments = node.get_arguments(); + llvm::Function* func = module->getFunction(name); + current_func = func; + + // Create the entry basic block of the function/procedure and point the local named values table + // to the symbol table. + llvm::BasicBlock* body = llvm::BasicBlock::Create(*context, /*Name=*/"", func); + builder.SetInsertPoint(body); + + // When processing a function, it returns a value named in NMODL. Therefore, we + // first run RenameVisitor to rename it into ret_. This will aid in avoiding + // symbolic conflicts. + std::string return_var_name = "ret_" + name; + const auto& block = node.get_statement_block(); + visitor::RenameVisitor v(name, return_var_name); + block->accept(v); + + + // Allocate parameters on the stack and add them to the symbol table. + unsigned i = 0; + for (auto& arg: func->args()) { + std::string arg_name = arguments[i++].get()->get_node_name(); + llvm::Value* alloca = builder.CreateAlloca(arg.getType(), /*ArraySize=*/nullptr, arg_name); + arg.setName(arg_name); + builder.CreateStore(&arg, alloca); + } + + // Process function or procedure body. The return statement is handled in a separate visitor. + const auto& statements = block->get_statements(); + for (const auto& statement: statements) { + // \todo: Support other statement types. 
+ if (statement->is_codegen_var_list_statement() || statement->is_expression_statement() || + statement->is_codegen_return_statement()) + statement->accept(*this); + } + + // If function has a void return type, add a terminator not handled by CodegenReturnVar. + if (node.is_void()) + builder.CreateRetVoid(); + + // Clear local values stack and remove the pointer to the local symbol table. + values.clear(); + current_func = nullptr; +} + +void CodegenLLVMVisitor::visit_codegen_return_statement(const ast::CodegenReturnStatement& node) { + if (!node.get_statement()->is_name()) + throw std::runtime_error("Error: CodegenReturnStatement must contain a name node\n"); + + std::string ret = "ret_" + current_func->getName().str(); + llvm::Value* ret_value = builder.CreateLoad(current_func->getValueSymbolTable()->lookup(ret)); + builder.CreateRet(ret_value); +} + +void CodegenLLVMVisitor::visit_codegen_var_list_statement( + const ast::CodegenVarListStatement& node) { + llvm::Type* scalar_var_type = get_codegen_var_type(*node.get_var_type()); + for (const auto& variable: node.get_variables()) { + std::string name = variable->get_node_name(); + const auto& identifier = variable->get_name(); + // Local variable can be a scalar (Node AST class) or an array (IndexedName AST class). For + // each case, create memory allocations with the corresponding LLVM type. + llvm::Type* var_type; + if (identifier->is_indexed_name()) { + auto indexed_name = std::dynamic_pointer_cast(identifier); + unsigned length = get_array_index_or_length(*indexed_name); + var_type = llvm::ArrayType::get(scalar_var_type, length); + } else if (identifier->is_name()) { + // This case corresponds to a scalar local variable. Its type is double by default. + var_type = scalar_var_type; + } else { + throw std::runtime_error("Error: Unsupported local variable type"); + } + llvm::Value* alloca = builder.CreateAlloca(var_type, /*ArraySize=*/nullptr, name); + + // Check if the variable we process is a procedure return variable (i.e. it has a name + // "ret_" and the function return type is integer). If so, initialise + // it to 0. + std::string ret_val_name = "ret_" + current_func->getName().str(); + if (name == ret_val_name && current_func->getReturnType()->isIntegerTy()) { + llvm::Value* zero = llvm::ConstantInt::get(llvm::Type::getInt32Ty(*context), 0); + builder.CreateStore(zero, alloca); + } + } +} + void CodegenLLVMVisitor::visit_double(const ast::Double& node) { const auto& constant = llvm::ConstantFP::get(get_default_fp_type(), node.get_value()); values.push_back(constant); } void CodegenLLVMVisitor::visit_function_block(const ast::FunctionBlock& node) { - visit_procedure_or_function(node); + // do nothing. \todo: remove old function blocks from ast. } void CodegenLLVMVisitor::visit_function_call(const ast::FunctionCall& node) { @@ -384,41 +430,19 @@ void CodegenLLVMVisitor::visit_integer(const ast::Integer& node) { values.push_back(constant); } -void CodegenLLVMVisitor::visit_local_list_statement(const ast::LocalListStatement& node) { - for (const auto& variable: node.get_variables()) { - std::string name = variable->get_node_name(); - const auto& identifier = variable->get_name(); - // Local variable can be a scalar (Node AST class) or an array (IndexedName AST class). For - // each case, create memory allocations with the corresponding LLVM type. 
- llvm::Type* var_type; - if (identifier->is_indexed_name()) { - auto indexed_name = std::dynamic_pointer_cast(identifier); - unsigned length = get_array_index_or_length(*indexed_name); - var_type = llvm::ArrayType::get(get_default_fp_type(), length); - } else if (identifier->is_name()) { - // This case corresponds to a scalar local variable. Its type is double by default. - var_type = get_default_fp_type(); - } else { - throw std::runtime_error("Error: Unsupported local variable type"); - } - builder.CreateAlloca(var_type, /*ArraySize=*/nullptr, name); - } -} - void CodegenLLVMVisitor::visit_program(const ast::Program& node) { - // Before generating LLVM, gather information about AST. For now, information about functions - // and procedures is used only. - CodegenHelperVisitor v; - CodegenInfo info = v.analyze(node); - - // For every function and procedure, generate its declaration. Thus, we can look up + // Before generating LLVM: + // - convert function and procedure blocks into CodegenFunctions + // - gather information about AST. For now, information about functions + // and procedures is used only. + CodegenLLVMHelperVisitor v; + const auto& functions = v.get_codegen_functions(node); + + // For every function, generate its declaration. Thus, we can look up // `llvm::Function` in the symbol table in the module. - for (const auto& func: info.functions) { + for (const auto& func: functions) { emit_procedure_or_function_declaration(*func); } - for (const auto& proc: info.procedures) { - emit_procedure_or_function_declaration(*proc); - } // Set the AST symbol table. sym_tab = node.get_symbol_table(); @@ -433,16 +457,10 @@ void CodegenLLVMVisitor::visit_program(const ast::Program& node) { // Keep this for easier development (maybe move to debug mode later). std::cout << print_module(); - - // not used yet : this will be used at the beginning of this function - { - CodegenLLVMHelperVisitor v; - v.visit_program(const_cast(node)); - } } void CodegenLLVMVisitor::visit_procedure_block(const ast::ProcedureBlock& node) { - visit_procedure_or_function(node); + // do nothing. \todo: remove old procedures from ast. } void CodegenLLVMVisitor::visit_unary_expression(const ast::UnaryExpression& node) { @@ -466,7 +484,7 @@ void CodegenLLVMVisitor::visit_var_name(const ast::VarName& node) { llvm::Value* ptr; if (identifier->is_name()) - ptr = local_named_values->lookup(node.get_node_name()); + ptr = current_func->getValueSymbolTable()->lookup(node.get_node_name()); if (identifier->is_indexed_name()) { auto indexed_name = std::dynamic_pointer_cast(identifier); diff --git a/src/codegen/llvm/codegen_llvm_visitor.hpp b/src/codegen/llvm/codegen_llvm_visitor.hpp index 066bdf35e3..c6123a040d 100644 --- a/src/codegen/llvm/codegen_llvm_visitor.hpp +++ b/src/codegen/llvm/codegen_llvm_visitor.hpp @@ -67,8 +67,8 @@ class CodegenLLVMVisitor: public visitor::ConstAstVisitor { // Stack to hold visited values std::vector values; - // Pointer to the local symbol table. - llvm::ValueSymbolTable* local_named_values = nullptr; + // Pointer to the current function. + llvm::Function* current_func = nullptr; // Pointer to AST symbol table. 
symtab::SymbolTable* sym_tab; @@ -134,6 +134,13 @@ class CodegenLLVMVisitor: public visitor::ConstAstVisitor { */ unsigned get_array_index_or_length(const ast::IndexedName& node); + /** + * Returns LLVM type for the given CodegenVarType node + * \param node CodegenVarType + * \return LLVM type + */ + llvm::Type* get_codegen_var_type(const ast::CodegenVarType& node); + /** * Returns 64-bit or 32-bit LLVM floating type * \return \c LLVM floating point type according to `use_single_precision` flag @@ -163,7 +170,7 @@ class CodegenLLVMVisitor: public visitor::ConstAstVisitor { * * \param node the AST node representing the function or procedure in NMODL */ - void emit_procedure_or_function_declaration(const ast::Block& node); + void emit_procedure_or_function_declaration(const ast::CodegenFunction& node); /** * Return module pointer @@ -216,11 +223,13 @@ class CodegenLLVMVisitor: public visitor::ConstAstVisitor { // Visitors void visit_binary_expression(const ast::BinaryExpression& node) override; void visit_boolean(const ast::Boolean& node) override; + void visit_codegen_function(const ast::CodegenFunction& node) override; + void visit_codegen_return_statement(const ast::CodegenReturnStatement& node) override; + void visit_codegen_var_list_statement(const ast::CodegenVarListStatement& node) override; void visit_double(const ast::Double& node) override; void visit_function_block(const ast::FunctionBlock& node) override; void visit_function_call(const ast::FunctionCall& node) override; void visit_integer(const ast::Integer& node) override; - void visit_local_list_statement(const ast::LocalListStatement& node) override; void visit_procedure_block(const ast::ProcedureBlock& node) override; void visit_program(const ast::Program& node) override; void visit_unary_expression(const ast::UnaryExpression& node) override; diff --git a/test/unit/codegen/codegen_llvm_ir.cpp b/test/unit/codegen/codegen_llvm_ir.cpp index e44b2b15cd..c328113f93 100644 --- a/test/unit/codegen/codegen_llvm_ir.cpp +++ b/test/unit/codegen/codegen_llvm_ir.cpp @@ -198,12 +198,12 @@ SCENARIO("Function call", "[visitor][llvm]") { } )"; - THEN("a void call instruction is created") { + THEN("an int call instruction is created") { std::string module_string = run_llvm_visitor(nmodl_text); std::smatch m; // Check for call instruction. - std::regex call(R"(call void @bar\(\))"); + std::regex call(R"(call i32 @bar\(\))"); REQUIRE(std::regex_search(module_string, m, call)); } } @@ -408,13 +408,20 @@ SCENARIO("Procedure", "[visitor][llvm]") { PROCEDURE empty() {} )"; - THEN("empty void function is produced") { + THEN("a function returning 0 integer is produced") { std::string module_string = run_llvm_visitor(nmodl_text); std::smatch m; - // Check procedure has empty body with a void return. - std::regex procedure(R"(define void @empty\(\) \{\n(\s)*ret void\n\})"); - REQUIRE(std::regex_search(module_string, m, procedure)); + // Check procedure has empty body with a dummy 0 allocation. 
+ std::regex signature(R"(define i32 @empty)"); + std::regex alloc(R"(%ret_empty = alloca i32)"); + std::regex store(R"(store i32 0, i32\* %ret_empty)"); + std::regex load(R"(%1 = load i32, i32\* %ret_empty)"); + std::regex ret(R"(ret i32 %1)"); + REQUIRE(std::regex_search(module_string, m, signature)); + REQUIRE(std::regex_search(module_string, m, alloc)); + REQUIRE(std::regex_search(module_string, m, store)); + REQUIRE(std::regex_search(module_string, m, ret)); } } @@ -423,23 +430,29 @@ SCENARIO("Procedure", "[visitor][llvm]") { PROCEDURE with_argument(x) {} )"; - THEN("void function is produced with arguments allocated on stack") { + THEN("int function is produced with arguments allocated on stack") { std::string module_string = run_llvm_visitor(nmodl_text); std::smatch m; // Check procedure signature. - std::regex function_signature(R"(define void @with_argument\(double %x1\) \{)"); + std::regex function_signature(R"(define i32 @with_argument\(double %x1\) \{)"); REQUIRE(std::regex_search(module_string, m, function_signature)); + // Check dummy return. + std::regex dummy_alloca(R"(%ret_with_argument = alloca i32)"); + std::regex dummy_store(R"(store i32 0, i32\* %ret_with_argument)"); + std::regex dummy_load(R"(%1 = load i32, i32\* %ret_with_argument)"); + std::regex ret(R"(ret i32 %1)"); + REQUIRE(std::regex_search(module_string, m, dummy_alloca)); + REQUIRE(std::regex_search(module_string, m, dummy_store)); + REQUIRE(std::regex_search(module_string, m, dummy_load)); + REQUIRE(std::regex_search(module_string, m, ret)); + // Check that procedure arguments are allocated on the local stack. std::regex alloca_instr(R"(%x = alloca double)"); std::regex store_instr(R"(store double %x1, double\* %x)"); REQUIRE(std::regex_search(module_string, m, alloca_instr)); REQUIRE(std::regex_search(module_string, m, store_instr)); - - // Check terminator. - std::regex terminator(R"(ret void)"); - REQUIRE(std::regex_search(module_string, m, terminator)); } } } @@ -493,7 +506,7 @@ SCENARIO("Dead code removal", "[visitor][llvm][opt]") { // Check if the values are optimised out std::regex empty_proc( - R"(define void @add\(double %a1, double %b2\) \{\n(\s)*ret void\n\})"); + R"(define i32 @add\(double %a1, double %b2\) \{\n(\s)*ret i32 0\n\})"); REQUIRE(std::regex_search(module_string, m, empty_proc)); } } From 0bf5e9bea0d935f15d5b087cbb85970b0991326c Mon Sep 17 00:00:00 2001 From: George Mitenkov Date: Mon, 25 Jan 2021 17:06:22 +0300 Subject: [PATCH 127/331] LLVM code generation for if/else statements (#499) * Added a new code generation function for conditional statements (`if`, `else if`, `else` and their nested variations). * Added tests for the new code generation: - IR unit tests. - Execution tests. * Fixed FP and integer comparison ordering in macros. 
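
  Concretely, DISPATCH was declared as DISPATCH(binary_op, f_llvm_op,
  i_llvm_op) while the comparison call sites passed the integer builder
  first, e.g.

      DISPATCH(ast::BinaryOp::BOP_EXACT_EQUAL, builder.CreateICmpEQ, builder.CreateFCmpOEQ);

  so floating-point comparisons selected integer instructions and vice
  versa; the macro parameter order is now (binary_op, i_llvm_op,
  f_llvm_op) to match the call sites.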
fixes #468 --- src/codegen/llvm/codegen_llvm_visitor.cpp | 86 +++++++- src/codegen/llvm/codegen_llvm_visitor.hpp | 1 + test/unit/codegen/codegen_llvm_execution.cpp | 28 +++ test/unit/codegen/codegen_llvm_ir.cpp | 203 +++++++++++++++++++ 4 files changed, 314 insertions(+), 4 deletions(-) diff --git a/src/codegen/llvm/codegen_llvm_visitor.cpp b/src/codegen/llvm/codegen_llvm_visitor.cpp index 2d762c0e92..bde36f3dd4 100644 --- a/src/codegen/llvm/codegen_llvm_visitor.cpp +++ b/src/codegen/llvm/codegen_llvm_visitor.cpp @@ -26,6 +26,11 @@ namespace codegen { /* Helper routines */ /****************************************************************************************/ +static bool is_supported_statement(const ast::Statement& statement) { + return statement.is_codegen_var_list_statement() || statement.is_expression_statement() || + statement.is_codegen_return_statement() || statement.is_if_statement(); +} + bool CodegenLLVMVisitor::check_array_bounds(const ast::IndexedName& node, unsigned index) { llvm::Type* array_type = current_func->getValueSymbolTable() ->lookup(node.get_node_name()) @@ -234,7 +239,7 @@ llvm::Value* CodegenLLVMVisitor::visit_comparison_bin_op(llvm::Value* lhs, llvm::Value* result; switch (bin_op) { -#define DISPATCH(binary_op, f_llvm_op, i_llvm_op) \ +#define DISPATCH(binary_op, i_llvm_op, f_llvm_op) \ case binary_op: \ if (lhs_type->isDoubleTy() || lhs_type->isFloatTy()) \ result = f_llvm_op(lhs, rhs); \ @@ -343,9 +348,7 @@ void CodegenLLVMVisitor::visit_codegen_function(const ast::CodegenFunction& node // Process function or procedure body. The return statement is handled in a separate visitor. const auto& statements = block->get_statements(); for (const auto& statement: statements) { - // \todo: Support other statement types. - if (statement->is_codegen_var_list_statement() || statement->is_expression_statement() || - statement->is_codegen_return_statement()) + if (is_supported_statement(*statement)) statement->accept(*this); } @@ -424,6 +427,81 @@ void CodegenLLVMVisitor::visit_function_call(const ast::FunctionCall& node) { } } +void CodegenLLVMVisitor::visit_if_statement(const ast::IfStatement& node) { + // Get the current and the next blocks within the function. + llvm::BasicBlock* curr_block = builder.GetInsertBlock(); + llvm::BasicBlock* next = curr_block->getNextNode(); + llvm::Function* func = curr_block->getParent(); + + // Add a true block and a merge block where the control flow merges. + llvm::BasicBlock* true_block = llvm::BasicBlock::Create(*context, /*Name=*/"", func, next); + llvm::BasicBlock* merge_block = llvm::BasicBlock::Create(*context, /*Name=*/"", func, next); + + // Add condition to the current block. + node.get_condition()->accept(*this); + llvm::Value* cond = values.back(); + values.pop_back(); + + // Process the true block. + builder.SetInsertPoint(true_block); + for (const auto& statement: node.get_statement_block()->get_statements()) { + if (is_supported_statement(*statement)) + statement->accept(*this); + } + builder.CreateBr(merge_block); + + // Save the merge block and proceed with codegen for `else if` statements. + llvm::BasicBlock* exit = merge_block; + for (const auto& else_if: node.get_elseifs()) { + // Link the current block to the true and else blocks. + llvm::BasicBlock* else_block = + llvm::BasicBlock::Create(*context, /*Name=*/"", func, merge_block); + builder.SetInsertPoint(curr_block); + builder.CreateCondBr(cond, true_block, else_block); + + // Process else block. 
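+        // (emit this `else if` condition into the else block; the matching
+        // statement body goes into a fresh true block created below)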
+ builder.SetInsertPoint(else_block); + else_if->get_condition()->accept(*this); + cond = values.back(); + values.pop_back(); + + // Reassign true and merge blocks respectively. Note that the new merge block has to be + // connected to the old merge block (tmp). + true_block = llvm::BasicBlock::Create(*context, /*Name=*/"", func, merge_block); + llvm::BasicBlock* tmp = merge_block; + merge_block = llvm::BasicBlock::Create(*context, /*Name=*/"", func, merge_block); + builder.SetInsertPoint(merge_block); + builder.CreateBr(tmp); + + // Process true block. + builder.SetInsertPoint(true_block); + for (const auto& statement: else_if->get_statement_block()->get_statements()) { + if (is_supported_statement(*statement)) + statement->accept(*this); + } + builder.CreateBr(merge_block); + curr_block = else_block; + } + + // Finally, generate code for `else` statement if it exists. + const auto& elses = node.get_elses(); + llvm::BasicBlock* else_block; + if (elses) { + else_block = llvm::BasicBlock::Create(*context, /*Name=*/"", func, merge_block); + builder.SetInsertPoint(else_block); + for (const auto& statement: elses->get_statement_block()->get_statements()) { + if (is_supported_statement(*statement)) + statement->accept(*this); + } + builder.CreateBr(merge_block); + } else { + else_block = merge_block; + } + builder.SetInsertPoint(curr_block); + builder.CreateCondBr(cond, true_block, else_block); + builder.SetInsertPoint(exit); +} + void CodegenLLVMVisitor::visit_integer(const ast::Integer& node) { const auto& constant = llvm::ConstantInt::get(llvm::Type::getInt32Ty(*context), node.get_value()); diff --git a/src/codegen/llvm/codegen_llvm_visitor.hpp b/src/codegen/llvm/codegen_llvm_visitor.hpp index c6123a040d..28129b2fb8 100644 --- a/src/codegen/llvm/codegen_llvm_visitor.hpp +++ b/src/codegen/llvm/codegen_llvm_visitor.hpp @@ -229,6 +229,7 @@ class CodegenLLVMVisitor: public visitor::ConstAstVisitor { void visit_double(const ast::Double& node) override; void visit_function_block(const ast::FunctionBlock& node) override; void visit_function_call(const ast::FunctionCall& node) override; + void visit_if_statement(const ast::IfStatement& node) override; void visit_integer(const ast::Integer& node) override; void visit_procedure_block(const ast::ProcedureBlock& node) override; void visit_program(const ast::Program& node) override; diff --git a/test/unit/codegen/codegen_llvm_execution.cpp b/test/unit/codegen/codegen_llvm_execution.cpp index 6f1bf7b8ca..34311bf2c3 100644 --- a/test/unit/codegen/codegen_llvm_execution.cpp +++ b/test/unit/codegen/codegen_llvm_execution.cpp @@ -114,6 +114,30 @@ SCENARIO("Optimised arithmetic expression", "[llvm][runner]") { arithmetic = x * y / (x + y) } + FUNCTION conditionals() { + LOCAL x, y, z + x = 100 + y = -100 + z = 0 + if (x == 200) { + conditionals = 1 + } else if (x == 400) { + conditionals = 2 + } else if (x == 100) { + if (y == -100 && z != 0) { + conditionals = 3 + } else { + if (y < -99 && z == 0) { + conditionals = 4 + } else { + conditionals = 5 + } + } + } else { + conditionals = 6 + } + } + FUNCTION bar() { LOCAL i, j i = 2 @@ -151,6 +175,10 @@ SCENARIO("Optimised arithmetic expression", "[llvm][runner]") { auto constant_result = runner.run("constant"); REQUIRE(fabs(constant_result - 10.0) < EPSILON); + // Check nested conditionals + auto conditionals_result = runner.run("conditionals"); + REQUIRE(fabs(conditionals_result - 4.0) < EPSILON); + // Check constant folding. 
auto arithmetic_result = runner.run("arithmetic"); REQUIRE(fabs(arithmetic_result - 2.1) < EPSILON); diff --git a/test/unit/codegen/codegen_llvm_ir.cpp b/test/unit/codegen/codegen_llvm_ir.cpp index c328113f93..292256193c 100644 --- a/test/unit/codegen/codegen_llvm_ir.cpp +++ b/test/unit/codegen/codegen_llvm_ir.cpp @@ -147,6 +147,209 @@ SCENARIO("Define", "[visitor][llvm]") { } } +//============================================================================= +// If/Else statements and comparison operators +//============================================================================= + +SCENARIO("Comparison", "[visitor][llvm]") { + GIVEN("Procedure with comparison operators") { + std::string nmodl_text = R"( + PROCEDURE foo(x) { + if (x < 10) { + + } else if (x >= 10 && x <= 100) { + + } else if (x == 120) { + + } else if (!(x != 200)) { + + } + } + )"; + + THEN("correct LLVM instructions are produced") { + std::string module_string = run_llvm_visitor(nmodl_text); + std::smatch m; + + // Check less than. + std::regex lt(R"(fcmp olt double %(.+), 1\.000000e\+01)"); + REQUIRE(std::regex_search(module_string, m, lt)); + + // Check greater or equal than and logical and. + std::regex ge(R"(fcmp ole double %(.+), 1\.000000e\+02)"); + std::regex logical_and(R"(and i1 %(.+), %(.+))"); + REQUIRE(std::regex_search(module_string, m, ge)); + REQUIRE(std::regex_search(module_string, m, logical_and)); + + // Check equals. + std::regex eq(R"(fcmp oeq double %(.+), 1\.200000e\+02)"); + REQUIRE(std::regex_search(module_string, m, eq)); + + // Check not equals. + std::regex ne(R"(fcmp one double %(.+), 2\.000000e\+02)"); + REQUIRE(std::regex_search(module_string, m, ne)); + } + } +} + +SCENARIO("If/Else", "[visitor][llvm]") { + GIVEN("Function with only if statement") { + std::string nmodl_text = R"( + FUNCTION foo(y) { + LOCAL x + x = 100 + if (y == 20) { + x = 20 + } + foo = x + y + } + )"; + + THEN("correct LLVM instructions are produced") { + std::string module_string = run_llvm_visitor(nmodl_text); + std::smatch m; + + std::regex cond_br( + "br i1 %2, label %3, label %4\n" + "\n" + "3:.*\n" + " store double 2\\.000000e\\+01, double\\* %x.*\n" + " br label %4\n" + "\n" + "4:"); + REQUIRE(std::regex_search(module_string, m, cond_br)); + } + } + + GIVEN("Function with both if and else statements") { + std::string nmodl_text = R"( + FUNCTION sign(x) { + LOCAL s + if (x < 0) { + s = -1 + } else { + s = 1 + } + sign = s + } + )"; + + THEN("correct LLVM instructions are produced") { + std::string module_string = run_llvm_visitor(nmodl_text); + std::smatch m; + + std::regex if_else_br( + "br i1 %2, label %3, label %4\n" + "\n" + "3:.*\n" + " store double -1\\.000000e\\+00, double\\* %s.*\n" + " br label %5\n" + "\n" + "4:.*\n" + " store double 1\\.000000e\\+00, double\\* %s.*\n" + " br label %5\n" + "\n" + "5:"); + REQUIRE(std::regex_search(module_string, m, if_else_br)); + } + } + + GIVEN("Function with both if and else if statements") { + std::string nmodl_text = R"( + FUNCTION bar(x) { + LOCAL s + s = -1 + if (x <= 0) { + s = 0 + } else if (0 < x && x <= 1) { + s = 1 + } + bar = s + } + )"; + + THEN("correct LLVM instructions are produced") { + std::string module_string = run_llvm_visitor(nmodl_text); + std::smatch m; + + std::regex if_else_if( + "br i1 %2, label %3, label %4\n" + "\n" + "3:.*\n" + " .*\n" + " br label %12\n" + "\n" + "4:.*\n" + " .*\n" + " .*\n" + " .*\n" + " .*\n" + " %.+ = and i1 %.+, %.+\n" + " br i1 %.+, label %10, label %11\n" + "\n" + "10:.*\n" + " .*\n" + " br label %11\n" + "\n" + 
"11:.*\n" + " br label %12\n" + "\n" + "12:"); + REQUIRE(std::regex_search(module_string, m, if_else_if)); + } + } + + GIVEN("Function with if, else if anf else statements") { + std::string nmodl_text = R"( + FUNCTION bar(x) { + LOCAL s + if (x <= 0) { + s = 0 + } else if (0 < x && x <= 1) { + s = 1 + } else { + s = 100 + } + bar = s + } + )"; + + THEN("correct LLVM instructions are produced") { + std::string module_string = run_llvm_visitor(nmodl_text); + std::smatch m; + + std::regex if_else_if_else( + "br i1 %2, label %3, label %4\n" + "\n" + "3:.*\n" + " .*\n" + " br label %13\n" + "\n" + "4:.*\n" + " .*\n" + " .*\n" + " .*\n" + " .*\n" + " %9 = and i1 %.+, %.+\n" + " br i1 %9, label %10, label %11\n" + "\n" + "10:.*\n" + " .*\n" + " br label %12\n" + "\n" + "11:.*\n" + " .*\n" + " br label %12\n" + "\n" + "12:.*\n" + " br label %13\n" + "\n" + "13:"); + REQUIRE(std::regex_search(module_string, m, if_else_if_else)); + } + } +} + //============================================================================= // FunctionBlock //============================================================================= From 5d7f2eff4c24588ad0573ee6b0d1ae4fede4aa76 Mon Sep 17 00:00:00 2001 From: George Mitenkov Date: Tue, 26 Jan 2021 09:27:31 +0300 Subject: [PATCH 128/331] Added error handling for values not in scope (#502) Added error handling when a non-scope value is looked up. Before, such a lookup would yield a nullptr, therefore leading to a segmentation fault. This PR adds a lookup function that wraps around value symbol lookup, and throws an error with a message if nullptr is returned. --- src/codegen/llvm/codegen_llvm_visitor.cpp | 18 +++++++++++------- src/codegen/llvm/codegen_llvm_visitor.hpp | 6 ++++++ 2 files changed, 17 insertions(+), 7 deletions(-) diff --git a/src/codegen/llvm/codegen_llvm_visitor.cpp b/src/codegen/llvm/codegen_llvm_visitor.cpp index bde36f3dd4..86619b899e 100644 --- a/src/codegen/llvm/codegen_llvm_visitor.cpp +++ b/src/codegen/llvm/codegen_llvm_visitor.cpp @@ -32,10 +32,7 @@ static bool is_supported_statement(const ast::Statement& statement) { } bool CodegenLLVMVisitor::check_array_bounds(const ast::IndexedName& node, unsigned index) { - llvm::Type* array_type = current_func->getValueSymbolTable() - ->lookup(node.get_node_name()) - ->getType() - ->getPointerElementType(); + llvm::Type* array_type = lookup(node.get_node_name())->getType()->getPointerElementType(); unsigned length = array_type->getArrayNumElements(); return 0 <= index && index < length; } @@ -46,7 +43,7 @@ llvm::Value* CodegenLLVMVisitor::create_gep(const std::string& name, unsigned in indices.push_back(llvm::ConstantInt::get(index_type, 0)); indices.push_back(llvm::ConstantInt::get(index_type, index)); - return builder.CreateInBoundsGEP(current_func->getValueSymbolTable()->lookup(name), indices); + return builder.CreateInBoundsGEP(lookup(name), indices); } llvm::Value* CodegenLLVMVisitor::codegen_indexed_name(const ast::IndexedName& node) { @@ -177,6 +174,13 @@ void CodegenLLVMVisitor::emit_procedure_or_function_declaration(const ast::Codeg *module); } +llvm::Value* CodegenLLVMVisitor::lookup(const std::string& name) { + auto val = current_func->getValueSymbolTable()->lookup(name); + if (!val) + throw std::runtime_error("Error: variable " + name + " is not in scope\n"); + return val; +} + llvm::Value* CodegenLLVMVisitor::visit_arithmetic_bin_op(llvm::Value* lhs, llvm::Value* rhs, unsigned op) { @@ -213,7 +217,7 @@ void CodegenLLVMVisitor::visit_assign_op(const ast::BinaryExpression& node, llvm const 
auto& identifier = var->get_name(); if (identifier->is_name()) { - llvm::Value* alloca = current_func->getValueSymbolTable()->lookup(var->get_node_name()); + llvm::Value* alloca = lookup(var->get_node_name()); builder.CreateStore(rhs, alloca); } else if (identifier->is_indexed_name()) { auto indexed_name = std::dynamic_pointer_cast(identifier); @@ -562,7 +566,7 @@ void CodegenLLVMVisitor::visit_var_name(const ast::VarName& node) { llvm::Value* ptr; if (identifier->is_name()) - ptr = current_func->getValueSymbolTable()->lookup(node.get_node_name()); + ptr = lookup(node.get_node_name()); if (identifier->is_indexed_name()) { auto indexed_name = std::dynamic_pointer_cast(identifier); diff --git a/src/codegen/llvm/codegen_llvm_visitor.hpp b/src/codegen/llvm/codegen_llvm_visitor.hpp index 28129b2fb8..82c0c038ca 100644 --- a/src/codegen/llvm/codegen_llvm_visitor.hpp +++ b/src/codegen/llvm/codegen_llvm_visitor.hpp @@ -180,6 +180,12 @@ class CodegenLLVMVisitor: public visitor::ConstAstVisitor { return std::move(module); } + /** + * Lookup the given name in the current function's symbol table + * \return LLVM value + */ + llvm::Value* lookup(const std::string& name); + /** * Visit nmodl arithmetic binary operator * \param lhs LLVM value of evaluated lhs expression From bb829342b4db179b27060503a3b59ae9b8245578 Mon Sep 17 00:00:00 2001 From: George Mitenkov Date: Tue, 26 Jan 2021 11:19:06 +0300 Subject: [PATCH 129/331] Added support for WHILE statement (#501) Added support for WHILE statement code generation. Corresponding tests for IR generation and execution were also added. Additional visitor for StatementBlock was added to reduce code duplication. fixes #500 --- src/codegen/llvm/codegen_llvm_visitor.cpp | 59 ++++++++++++++------ src/codegen/llvm/codegen_llvm_visitor.hpp | 2 + test/unit/codegen/codegen_llvm_execution.cpp | 21 +++++++ test/unit/codegen/codegen_llvm_ir.cpp | 44 +++++++++++++++ 4 files changed, 108 insertions(+), 18 deletions(-) diff --git a/src/codegen/llvm/codegen_llvm_visitor.cpp b/src/codegen/llvm/codegen_llvm_visitor.cpp index 86619b899e..831c43317a 100644 --- a/src/codegen/llvm/codegen_llvm_visitor.cpp +++ b/src/codegen/llvm/codegen_llvm_visitor.cpp @@ -28,7 +28,8 @@ namespace codegen { static bool is_supported_statement(const ast::Statement& statement) { return statement.is_codegen_var_list_statement() || statement.is_expression_statement() || - statement.is_codegen_return_statement() || statement.is_if_statement(); + statement.is_codegen_return_statement() || statement.is_if_statement() || + statement.is_while_statement(); } bool CodegenLLVMVisitor::check_array_bounds(const ast::IndexedName& node, unsigned index) { @@ -314,6 +315,14 @@ void CodegenLLVMVisitor::visit_binary_expression(const ast::BinaryExpression& no values.push_back(result); } +void CodegenLLVMVisitor::visit_statement_block(const ast::StatementBlock& node) { + const auto& statements = node.get_statements(); + for (const auto& statement: statements) { + if (is_supported_statement(*statement)) + statement->accept(*this); + } +} + void CodegenLLVMVisitor::visit_boolean(const ast::Boolean& node) { const auto& constant = llvm::ConstantInt::get(llvm::Type::getInt1Ty(*context), node.get_value()); @@ -350,11 +359,7 @@ void CodegenLLVMVisitor::visit_codegen_function(const ast::CodegenFunction& node } // Process function or procedure body. The return statement is handled in a separate visitor. 
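    // (The statement-block visitor added above applies the same
    // is_supported_statement filter, so the function body can simply
    // delegate to it instead of duplicating the loop.)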
- const auto& statements = block->get_statements(); - for (const auto& statement: statements) { - if (is_supported_statement(*statement)) - statement->accept(*this); - } + block->accept(*this); // If function has a void return type, add a terminator not handled by CodegenReturnVar. if (node.is_void()) @@ -448,10 +453,7 @@ void CodegenLLVMVisitor::visit_if_statement(const ast::IfStatement& node) { // Process the true block. builder.SetInsertPoint(true_block); - for (const auto& statement: node.get_statement_block()->get_statements()) { - if (is_supported_statement(*statement)) - statement->accept(*this); - } + node.get_statement_block()->accept(*this); builder.CreateBr(merge_block); // Save the merge block and proceed with codegen for `else if` statements. @@ -479,10 +481,7 @@ void CodegenLLVMVisitor::visit_if_statement(const ast::IfStatement& node) { // Process true block. builder.SetInsertPoint(true_block); - for (const auto& statement: else_if->get_statement_block()->get_statements()) { - if (is_supported_statement(*statement)) - statement->accept(*this); - } + else_if->get_statement_block()->accept(*this); builder.CreateBr(merge_block); curr_block = else_block; } @@ -493,10 +492,7 @@ void CodegenLLVMVisitor::visit_if_statement(const ast::IfStatement& node) { if (elses) { else_block = llvm::BasicBlock::Create(*context, /*Name=*/"", func, merge_block); builder.SetInsertPoint(else_block); - for (const auto& statement: elses->get_statement_block()->get_statements()) { - if (is_supported_statement(*statement)) - statement->accept(*this); - } + elses->get_statement_block()->accept(*this); builder.CreateBr(merge_block); } else { else_block = merge_block; @@ -578,5 +574,32 @@ void CodegenLLVMVisitor::visit_var_name(const ast::VarName& node) { values.push_back(var); } +void CodegenLLVMVisitor::visit_while_statement(const ast::WhileStatement& node) { + // Get the current and the next blocks within the function. + llvm::BasicBlock* curr_block = builder.GetInsertBlock(); + llvm::BasicBlock* next = curr_block->getNextNode(); + llvm::Function* func = curr_block->getParent(); + + // Add a header and the body blocks. + llvm::BasicBlock* header = llvm::BasicBlock::Create(*context, /*Name=*/"", func, next); + llvm::BasicBlock* body = llvm::BasicBlock::Create(*context, /*Name=*/"", func, next); + llvm::BasicBlock* exit = llvm::BasicBlock::Create(*context, /*Name=*/"", func, next); + + builder.CreateBr(header); + builder.SetInsertPoint(header); + + // Generate code for condition and create branch to the body block. 
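+    // (Control returns to the header after every body iteration, so the
+    // condition is re-evaluated on each pass; the conditional branch below
+    // selects between the body and the exit block.)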
+ node.get_condition()->accept(*this); + llvm::Value* condition = values.back(); + values.pop_back(); + builder.CreateCondBr(condition, body, exit); + + builder.SetInsertPoint(body); + node.get_statement_block()->accept(*this); + builder.CreateBr(header); + + builder.SetInsertPoint(exit); +} + } // namespace codegen } // namespace nmodl diff --git a/src/codegen/llvm/codegen_llvm_visitor.hpp b/src/codegen/llvm/codegen_llvm_visitor.hpp index 82c0c038ca..3003a119b5 100644 --- a/src/codegen/llvm/codegen_llvm_visitor.hpp +++ b/src/codegen/llvm/codegen_llvm_visitor.hpp @@ -229,6 +229,7 @@ class CodegenLLVMVisitor: public visitor::ConstAstVisitor { // Visitors void visit_binary_expression(const ast::BinaryExpression& node) override; void visit_boolean(const ast::Boolean& node) override; + void visit_statement_block(const ast::StatementBlock& node) override; void visit_codegen_function(const ast::CodegenFunction& node) override; void visit_codegen_return_statement(const ast::CodegenReturnStatement& node) override; void visit_codegen_var_list_statement(const ast::CodegenVarListStatement& node) override; @@ -241,6 +242,7 @@ class CodegenLLVMVisitor: public visitor::ConstAstVisitor { void visit_program(const ast::Program& node) override; void visit_unary_expression(const ast::UnaryExpression& node) override; void visit_var_name(const ast::VarName& node) override; + void visit_while_statement(const ast::WhileStatement& node) override; // \todo: move this to debug mode (e.g. -v option or --dump-ir) std::string print_module() const { diff --git a/test/unit/codegen/codegen_llvm_execution.cpp b/test/unit/codegen/codegen_llvm_execution.cpp index 34311bf2c3..90e8fb3cc2 100644 --- a/test/unit/codegen/codegen_llvm_execution.cpp +++ b/test/unit/codegen/codegen_llvm_execution.cpp @@ -59,6 +59,23 @@ SCENARIO("Arithmetic expression", "[llvm][runner]") { } PROCEDURE foo() {} + + FUNCTION loop() { + LOCAL i, j, sum, result + result = 0 + j = 0 + WHILE (j < 2) { + i = 0 + sum = 0 + WHILE (i < 10) { + sum = sum + i + i = i + 1 + } + j = j + 1 + result = result + sum + } + loop = result + } )"; @@ -86,6 +103,9 @@ SCENARIO("Arithmetic expression", "[llvm][runner]") { auto function_call_result = runner.run("function_call"); REQUIRE(fabs(function_call_result - 1.0) < EPSILON); + + auto loop_result = runner.run("loop"); + REQUIRE(fabs(loop_result - 90.0) < EPSILON); } } } @@ -151,6 +171,7 @@ SCENARIO("Optimised arithmetic expression", "[llvm][runner]") { } PROCEDURE foo() {} + )"; diff --git a/test/unit/codegen/codegen_llvm_ir.cpp b/test/unit/codegen/codegen_llvm_ir.cpp index 292256193c..d16b02b2f5 100644 --- a/test/unit/codegen/codegen_llvm_ir.cpp +++ b/test/unit/codegen/codegen_llvm_ir.cpp @@ -690,6 +690,50 @@ SCENARIO("Unary expression", "[visitor][llvm]") { } } +//============================================================================= +// WhileStatement +//============================================================================= + +SCENARIO("While", "[visitor][llvm]") { + GIVEN("Procedure with a simple while loop") { + std::string nmodl_text = R"( + FUNCTION loop() { + LOCAL i + i = 0 + WHILE (i < 10) { + i = i + 1 + } + loop = 0 + } + )"; + + THEN("correct loop is created") { + std::string module_string = run_llvm_visitor(nmodl_text); + std::smatch m; + + std::regex loop( + " br label %1\n" + "\n" + "1:.*\n" + " %2 = load double, double\\* %i.*\n" + " %3 = fcmp olt double %2, 1\\.000000e\\+01\n" + " br i1 %3, label %4, label %7\n" + "\n" + "4:.*\n" + " %5 = load double, double\\* %i.*\n" + " %6 = fadd 
double %5, 1\\.000000e\\+00\n" + " store double %6, double\\* %i.*\n" + " br label %1\n" + "\n" + "7:.*\n" + " store double 0\\.000000e\\+00, double\\* %ret_loop.*\n"); + // Check that 3 blocks are created: header, body and exit blocks. Also, there must be + // a backedge from the body to the header. + REQUIRE(std::regex_search(module_string, m, loop)); + } + } +} + //============================================================================= // Optimization : dead code removal //============================================================================= From 0dd8f72f288647358b8e35e9a9fa4c6f4560186d Mon Sep 17 00:00:00 2001 From: Ioannis Magkanaris Date: Mon, 1 Feb 2021 22:01:33 +0100 Subject: [PATCH 130/331] Create mechanism instance struct in LLVM IR (#507) * Moved info related function to codegen_info - Moved get_float_variables, codegen_int_variables, codegen_global_variables, codegen_shadow_variables into CodegenHelper - Move small utility functions from CodegenCVisitor to codeged_utils * Add proper variables to the mech_Instance * Adding LLVMStructBlock * Added test and visitor * Fix llvm codegen tests with x[0-9].* --- src/codegen/codegen_c_visitor.cpp | 294 ++---------------- src/codegen/codegen_c_visitor.hpp | 123 -------- src/codegen/codegen_helper_visitor.cpp | 5 + src/codegen/codegen_helper_visitor.hpp | 10 + src/codegen/codegen_info.cpp | 197 ++++++++++++ src/codegen/codegen_info.hpp | 135 ++++++++ src/codegen/codegen_ispc_visitor.cpp | 18 +- .../llvm/codegen_llvm_helper_visitor.cpp | 21 ++ .../llvm/codegen_llvm_helper_visitor.hpp | 3 + src/codegen/llvm/codegen_llvm_visitor.cpp | 17 + src/codegen/llvm/codegen_llvm_visitor.hpp | 10 + src/language/code_generator.cmake | 1 + src/language/nmodl.yaml | 12 + test/unit/codegen/codegen_llvm_ir.cpp | 46 ++- 14 files changed, 494 insertions(+), 398 deletions(-) diff --git a/src/codegen/codegen_c_visitor.cpp b/src/codegen/codegen_c_visitor.cpp index b7bfc5e1ee..79ee3967e0 100644 --- a/src/codegen/codegen_c_visitor.cpp +++ b/src/codegen/codegen_c_visitor.cpp @@ -356,49 +356,6 @@ bool CodegenCVisitor::statement_to_skip(const Statement& node) const { } -bool CodegenCVisitor::net_send_buffer_required() const noexcept { - if (net_receive_required() && !info.artificial_cell) { - if (info.net_event_used || info.net_send_used || info.is_watch_used()) { - return true; - } - } - return false; -} - - -bool CodegenCVisitor::net_receive_buffering_required() const noexcept { - return info.point_process && !info.artificial_cell && info.net_receive_node != nullptr; -} - - -bool CodegenCVisitor::nrn_state_required() const noexcept { - if (info.artificial_cell) { - return false; - } - return info.nrn_state_block != nullptr || breakpoint_exist(); -} - - -bool CodegenCVisitor::nrn_cur_required() const noexcept { - return info.breakpoint_node != nullptr && !info.currents.empty(); -} - - -bool CodegenCVisitor::net_receive_exist() const noexcept { - return info.net_receive_node != nullptr; -} - - -bool CodegenCVisitor::breakpoint_exist() const noexcept { - return info.breakpoint_node != nullptr; -} - - -bool CodegenCVisitor::net_receive_required() const noexcept { - return net_receive_exist(); -} - - /** * \details When floating point data type is not default (i.e. double) then we * have to copy old array to new type (for range variables). 
@@ -423,7 +380,7 @@ bool CodegenCVisitor::state_variable(const std::string& name) const { int CodegenCVisitor::position_of_float_var(const std::string& name) const { int index = 0; - for (const auto& var: codegen_float_variables) { + for (const auto& var: info.codegen_float_variables) { if (var->get_name() == name) { return index; } @@ -435,7 +392,7 @@ int CodegenCVisitor::position_of_float_var(const std::string& name) const { int CodegenCVisitor::position_of_int_var(const std::string& name) const { int index = 0; - for (const auto& var: codegen_int_variables) { + for (const auto& var: info.codegen_int_variables) { if (var.symbol->get_name() == name) { return index; } @@ -560,11 +517,11 @@ int CodegenCVisitor::float_variables_size() const { float_size++; } /// for g_unused variable - if (breakpoint_exist()) { + if (info.breakpoint_exist()) { float_size++; } /// for tsave variable - if (net_receive_exist()) { + if (info.net_receive_exist()) { float_size++; } return float_size; @@ -828,188 +785,6 @@ void CodegenCVisitor::update_index_semantics() { } -std::vector CodegenCVisitor::get_float_variables() { - // sort with definition order - auto comparator = [](const SymbolType& first, const SymbolType& second) -> bool { - return first->get_definition_order() < second->get_definition_order(); - }; - - auto assigned = info.assigned_vars; - auto states = info.state_vars; - - // each state variable has corresponding Dstate variable - for (auto& state: states) { - auto name = "D" + state->get_name(); - auto symbol = make_symbol(name); - if (state->is_array()) { - symbol->set_as_array(state->get_length()); - } - symbol->set_definition_order(state->get_definition_order()); - assigned.push_back(symbol); - } - std::sort(assigned.begin(), assigned.end(), comparator); - - auto variables = info.range_parameter_vars; - variables.insert(variables.end(), - info.range_assigned_vars.begin(), - info.range_assigned_vars.end()); - variables.insert(variables.end(), info.range_state_vars.begin(), info.range_state_vars.end()); - variables.insert(variables.end(), assigned.begin(), assigned.end()); - - if (info.vectorize) { - variables.push_back(make_symbol(naming::VOLTAGE_UNUSED_VARIABLE)); - } - if (breakpoint_exist()) { - std::string name = info.vectorize ? naming::CONDUCTANCE_UNUSED_VARIABLE - : naming::CONDUCTANCE_VARIABLE; - variables.push_back(make_symbol(name)); - } - if (net_receive_exist()) { - variables.push_back(make_symbol(naming::T_SAVE_VARIABLE)); - } - return variables; -} - - -/** - * IndexVariableInfo has following constructor arguments: - * - symbol - * - is_vdata (false) - * - is_index (false - * - is_integer (false) - * - * Which variables are constant qualified? - * - * - node area is read only - * - read ion variables are read only - * - style_ionname is index / offset - */ -std::vector CodegenCVisitor::get_int_variables() { - std::vector variables; - if (info.point_process) { - variables.emplace_back(make_symbol(naming::NODE_AREA_VARIABLE)); - variables.back().is_constant = true; - /// note that this variable is not printed in neuron implementation - if (info.artificial_cell) { - variables.emplace_back(make_symbol(naming::POINT_PROCESS_VARIABLE), true); - } else { - variables.emplace_back(make_symbol(naming::POINT_PROCESS_VARIABLE), false, false, true); - variables.back().is_constant = true; - } - } - - for (const auto& ion: info.ions) { - bool need_style = false; - std::unordered_map ion_vars; // used to keep track of the variables to - // not have doubles between read/write. 
Same - // name variables are allowed - for (const auto& var: ion.reads) { - const std::string name = naming::ION_VARNAME_PREFIX + var; - variables.emplace_back(make_symbol(name)); - variables.back().is_constant = true; - ion_vars[name] = variables.size() - 1; - } - - /// symbol for di_ion_dv var - std::shared_ptr ion_di_dv_var = nullptr; - - for (const auto& var: ion.writes) { - const std::string name = naming::ION_VARNAME_PREFIX + var; - - const auto ion_vars_it = ion_vars.find(name); - if (ion_vars_it != ion_vars.end()) { - variables[ion_vars_it->second].is_constant = false; - } else { - variables.emplace_back(make_symbol(naming::ION_VARNAME_PREFIX + var)); - } - if (ion.is_ionic_current(var)) { - ion_di_dv_var = make_symbol(std::string(naming::ION_VARNAME_PREFIX) + "di" + - ion.name + "dv"); - } - if (ion.is_intra_cell_conc(var) || ion.is_extra_cell_conc(var)) { - need_style = true; - } - } - - /// insert after read/write variables but before style ion variable - if (ion_di_dv_var != nullptr) { - variables.emplace_back(ion_di_dv_var); - } - - if (need_style) { - variables.emplace_back(make_symbol("style_" + ion.name), false, true); - variables.back().is_constant = true; - } - } - - for (const auto& var: info.pointer_variables) { - auto name = var->get_name(); - if (var->has_any_property(NmodlType::pointer_var)) { - variables.emplace_back(make_symbol(name)); - } else { - variables.emplace_back(make_symbol(name), true); - } - } - - if (info.diam_used) { - variables.emplace_back(make_symbol(naming::DIAM_VARIABLE)); - } - - if (info.area_used) { - variables.emplace_back(make_symbol(naming::AREA_VARIABLE)); - } - - // for non-artificial cell, when net_receive buffering is enabled - // then tqitem is an offset - if (info.net_send_used) { - if (info.artificial_cell) { - variables.emplace_back(make_symbol(naming::TQITEM_VARIABLE), true); - } else { - variables.emplace_back(make_symbol(naming::TQITEM_VARIABLE), false, false, true); - variables.back().is_constant = true; - } - info.tqitem_index = variables.size() - 1; - } - - /** - * \note Variables for watch statements : there is one extra variable - * used in coreneuron compared to actual watch statements for compatibility - * with neuron (which uses one extra Datum variable) - */ - if (!info.watch_statements.empty()) { - for (int i = 0; i < info.watch_statements.size() + 1; i++) { - variables.emplace_back(make_symbol("watch{}"_format(i)), false, false, true); - } - } - return variables; -} - - -/** - * \details When we enable fine level parallelism at channel level, we have do updates - * to ion variables in atomic way. As cpus don't have atomic instructions in - * simd loop, we have to use shadow vectors for every ion variables. Here - * we return list of all such variables. 
- * - * \todo If conductances are specified, we don't need all below variables - */ -std::vector CodegenCVisitor::get_shadow_variables() { - std::vector variables; - for (const auto& ion: info.ions) { - for (const auto& var: ion.writes) { - variables.push_back({make_symbol(shadow_varname(naming::ION_VARNAME_PREFIX + var))}); - if (ion.is_ionic_current(var)) { - variables.push_back({make_symbol(shadow_varname( - std::string(naming::ION_VARNAME_PREFIX) + "di" + ion.name + "dv"))}); - } - } - } - variables.push_back({make_symbol("ml_rhs")}); - variables.push_back({make_symbol("ml_d")}); - return variables; -} - - /****************************************************************************************/ /* Routines must be overloaded in backend */ /****************************************************************************************/ @@ -1136,7 +911,7 @@ bool CodegenCVisitor::nrn_cur_reduction_loop_required() { bool CodegenCVisitor::shadow_vector_setup_required() { - return (channel_task_dependency_enabled() && !codegen_shadow_variables.empty()); + return (channel_task_dependency_enabled() && !info.codegen_shadow_variables.empty()); } @@ -2081,8 +1856,8 @@ std::string CodegenCVisitor::process_verbatim_text(std::string text) { std::string CodegenCVisitor::register_mechanism_arguments() const { - auto nrn_cur = nrn_cur_required() ? method_name(naming::NRN_CUR_METHOD) : "NULL"; - auto nrn_state = nrn_state_required() ? method_name(naming::NRN_STATE_METHOD) : "NULL"; + auto nrn_cur = info.nrn_cur_required() ? method_name(naming::NRN_CUR_METHOD) : "NULL"; + auto nrn_state = info.nrn_state_required() ? method_name(naming::NRN_STATE_METHOD) : "NULL"; auto nrn_alloc = method_name(naming::NRN_ALLOC_METHOD); auto nrn_init = method_name(naming::NRN_INIT_METHOD); return "mechanism, {}, {}, NULL, {}, {}, first_pointer_var_index()" @@ -2200,7 +1975,7 @@ void CodegenCVisitor::print_num_variable_getter() { void CodegenCVisitor::print_net_receive_arg_size_getter() { - if (!net_receive_exist()) { + if (!info.net_receive_exist()) { return; } printer->add_newline(2); @@ -2391,17 +2166,18 @@ std::string CodegenCVisitor::get_variable_name(const std::string& name, bool use // clang-format on // float variable - auto f = std::find_if(codegen_float_variables.begin(), - codegen_float_variables.end(), + auto f = std::find_if(info.codegen_float_variables.begin(), + info.codegen_float_variables.end(), symbol_comparator); - if (f != codegen_float_variables.end()) { + if (f != info.codegen_float_variables.end()) { return float_variable_name(*f, use_instance); } // integer variable - auto i = - std::find_if(codegen_int_variables.begin(), codegen_int_variables.end(), index_comparator); - if (i != codegen_int_variables.end()) { + auto i = std::find_if(info.codegen_int_variables.begin(), + info.codegen_int_variables.end(), + index_comparator); + if (i != info.codegen_int_variables.end()) { return int_variable_name(*i, varname, use_instance); } @@ -2414,10 +2190,10 @@ std::string CodegenCVisitor::get_variable_name(const std::string& name, bool use } // shadow variable - auto s = std::find_if(codegen_shadow_variables.begin(), - codegen_shadow_variables.end(), + auto s = std::find_if(info.codegen_shadow_variables.begin(), + info.codegen_shadow_variables.end(), symbol_comparator); - if (s != codegen_shadow_variables.end()) { + if (s != info.codegen_shadow_variables.end()) { return ion_shadow_variable_name(*s); } @@ -2898,7 +2674,7 @@ void CodegenCVisitor::print_mechanism_register() { if (info.artificial_cell) { 
printer->add_line("add_nrn_artcell(mech_type, {});"_format(info.tqitem_index)); } - if (net_receive_buffering_required()) { + if (info.net_receive_buffering_required()) { printer->add_line("hoc_register_net_receive_buffering({}, mech_type);"_format( method_name("net_buf_receive"))); } @@ -3008,13 +2784,13 @@ void CodegenCVisitor::print_mechanism_range_var_structure() { printer->add_newline(2); printer->add_line("/** all mechanism instance variables */"); printer->start_block("struct {} "_format(instance_struct())); - for (auto& var: codegen_float_variables) { + for (auto& var: info.codegen_float_variables) { auto name = var->get_name(); auto type = get_range_var_float_type(var); auto qualifier = is_constant_variable(name) ? k_const() : ""; printer->add_line("{}{}* {}{};"_format(qualifier, type, ptr_type_qualifier(), name)); } - for (auto& var: codegen_int_variables) { + for (auto& var: info.codegen_int_variables) { auto name = var.symbol->get_name(); if (var.is_index || var.is_integer) { auto qualifier = var.is_constant ? k_const() : ""; @@ -3027,7 +2803,7 @@ void CodegenCVisitor::print_mechanism_range_var_structure() { } } if (channel_task_dependency_enabled()) { - for (auto& var: codegen_shadow_variables) { + for (auto& var: info.codegen_shadow_variables) { auto name = var->get_name(); printer->add_line("{}* {}{};"_format(float_type, ptr_type_qualifier(), name)); } @@ -3245,7 +3021,7 @@ void CodegenCVisitor::print_shadow_vector_setup() { printer->start_block("static inline void setup_shadow_vectors({}) "_format(args)); if (channel_task_dependency_enabled()) { printer->add_line("int nodecount = ml->nodecount;"); - for (auto& var: codegen_shadow_variables) { + for (auto& var: info.codegen_shadow_variables) { auto name = var->get_name(); auto type = default_float_data_type(); auto allocation = "({0}*) mem_alloc(nodecount, sizeof({0}))"_format(type); @@ -3258,7 +3034,7 @@ void CodegenCVisitor::print_shadow_vector_setup() { args = "{}* inst"_format(instance_struct()); printer->start_block("static inline void free_shadow_vectors({}) "_format(args)); if (channel_task_dependency_enabled()) { - for (auto& var: codegen_shadow_variables) { + for (auto& var: info.codegen_shadow_variables) { auto name = var->get_name(); printer->add_line("mem_free(inst->{});"_format(name)); } @@ -3325,7 +3101,7 @@ void CodegenCVisitor::print_instance_variable_setup() { printer->add_line("/** initialize mechanism instance variables */"); printer->start_block("static inline void setup_instance(NrnThread* nt, Memb_list* ml) "); printer->add_line("{0}* inst = ({0}*) mem_alloc(1, sizeof({0}));"_format(instance_struct())); - if (channel_task_dependency_enabled() && !codegen_shadow_variables.empty()) { + if (channel_task_dependency_enabled() && !info.codegen_shadow_variables.empty()) { printer->add_line("setup_shadow_vectors(inst, ml);"); } @@ -3343,7 +3119,7 @@ void CodegenCVisitor::print_instance_variable_setup() { int id = 0; std::vector variables_to_free; - for (auto& var: codegen_float_variables) { + for (auto& var: info.codegen_float_variables) { auto name = var->get_name(); auto range_var_type = get_range_var_float_type(var); if (float_type == range_var_type) { @@ -3358,7 +3134,7 @@ void CodegenCVisitor::print_instance_variable_setup() { id += var->get_length(); } - for (auto& var: codegen_int_variables) { + for (auto& var: info.codegen_int_variables) { auto name = var.symbol->get_name(); std::string variable = name; std::string type = ""; @@ -4024,7 +3800,7 @@ void CodegenCVisitor::print_net_receive_loop_end() { 
void CodegenCVisitor::print_net_receive_buffering(bool need_mech_inst) { - if (!net_receive_required() || info.artificial_cell) { + if (!info.net_receive_required() || info.artificial_cell) { return; } printer->add_newline(2); @@ -4076,7 +3852,7 @@ void CodegenCVisitor::print_net_send_buffering_grow() { } void CodegenCVisitor::print_net_send_buffering() { - if (!net_send_buffer_required()) { + if (!info.net_send_buffer_required()) { return; } @@ -4140,7 +3916,7 @@ void CodegenCVisitor::visit_for_netcon(const ast::ForNetcon& node) { } void CodegenCVisitor::print_net_receive_kernel() { - if (!net_receive_required()) { + if (!info.net_receive_required()) { return; } codegen = true; @@ -4203,7 +3979,7 @@ void CodegenCVisitor::print_net_receive_kernel() { void CodegenCVisitor::print_net_receive() { - if (!net_receive_required()) { + if (!info.net_receive_required()) { return; } codegen = true; @@ -4351,7 +4127,7 @@ void CodegenCVisitor::visit_solution_expression(const SolutionExpression& node) void CodegenCVisitor::print_nrn_state() { - if (!nrn_state_required()) { + if (!info.nrn_state_required()) { return; } codegen = true; @@ -4565,7 +4341,7 @@ void CodegenCVisitor::print_fast_imem_calculation() { } void CodegenCVisitor::print_nrn_cur() { - if (!nrn_cur_required()) { + if (!info.nrn_cur_required()) { return; } @@ -4729,10 +4505,6 @@ void CodegenCVisitor::setup(const Program& node) { logger->warn("CodegenCVisitor : MOD file uses non-thread safe constructs of NMODL"); } - codegen_float_variables = get_float_variables(); - codegen_int_variables = get_int_variables(); - codegen_shadow_variables = get_shadow_variables(); - update_index_semantics(); rename_function_arguments(); } diff --git a/src/codegen/codegen_c_visitor.hpp b/src/codegen/codegen_c_visitor.hpp index a1eda7497b..096b0b845d 100644 --- a/src/codegen/codegen_c_visitor.hpp +++ b/src/codegen/codegen_c_visitor.hpp @@ -66,41 +66,6 @@ enum class MemberType { thread }; - -/** - * \class IndexVariableInfo - * \brief Helper to represent information about index/int variables - * - */ -struct IndexVariableInfo { - /// symbol for the variable - const std::shared_ptr symbol; - - /// if variable reside in vdata field of NrnThread - /// typically true for bbcore pointer - bool is_vdata = false; - - /// if this is pure index (e.g. style_ion) variables is directly - /// index and shouldn't be printed with data/vdata - bool is_index = false; - - /// if this is an integer (e.g. 
tqitem, point_process) variable which - /// is printed as array accesses - bool is_integer = false; - - /// if the variable is qualified as constant (this is property of IndexVariable) - bool is_constant = false; - - IndexVariableInfo(std::shared_ptr symbol, - bool is_vdata = false, - bool is_index = false, - bool is_integer = false) - : symbol(std::move(symbol)) - , is_vdata(is_vdata) - , is_index(is_index) - , is_integer(is_integer) {} -}; - /** @} */ // end of codegen_details @@ -164,11 +129,6 @@ class CodegenCVisitor: public visitor::ConstAstVisitor { */ symtab::SymbolTable* program_symtab = nullptr; - /** - * All float variables for the model - */ - std::vector codegen_float_variables; - /** * All int variables for the model */ @@ -357,26 +317,6 @@ class CodegenCVisitor: public visitor::ConstAstVisitor { } - /** - * Constructs a shadow variable name - * \param name The name of the variable - * \return The name of the variable prefixed with \c shadow_ - */ - std::string shadow_varname(const std::string& name) const { - return "shadow_" + name; - } - - - /** - * Creates a temporary symbol - * \param name The name of the symbol - * \return A symbol based on the given name - */ - SymbolType make_symbol(const std::string& name) const { - return std::make_shared(name, ModToken()); - } - - /** * Checks if the given variable name belongs to a state variable * \param name The variable name @@ -385,36 +325,6 @@ class CodegenCVisitor: public visitor::ConstAstVisitor { bool state_variable(const std::string& name) const; - /** - * Check if net receive/send buffering kernels required - */ - bool net_receive_buffering_required() const noexcept; - - - /** - * Check if nrn_state function is required - */ - bool nrn_state_required() const noexcept; - - - /** - * Check if nrn_cur function is required - */ - bool nrn_cur_required() const noexcept; - - - /** - * Check if net_receive function is required - */ - bool net_receive_required() const noexcept; - - - /** - * Check if net_send_buffer is required - */ - bool net_send_buffer_required() const noexcept; - - /** * Check if setup_range_variable function is required * \return @@ -422,18 +332,6 @@ class CodegenCVisitor: public visitor::ConstAstVisitor { bool range_variable_setup_required() const noexcept; - /** - * Check if net_receive node exist - */ - bool net_receive_exist() const noexcept; - - - /** - * Check if breakpoint node exist - */ - bool breakpoint_exist() const noexcept; - - /** * Check if given method is defined in this model * \param name The name of the method to check @@ -599,27 +497,6 @@ class CodegenCVisitor: public visitor::ConstAstVisitor { void update_index_semantics(); - /** - * Determine all \c float variables required during code generation - * \return A \c vector of \c float variables - */ - std::vector get_float_variables(); - - - /** - * Determine all \c int variables required during code generation - * \return A \c vector of \c int variables - */ - std::vector get_int_variables(); - - - /** - * Determine all ion write variables that require shadow vectors during code generation - * \return A \c vector of ion variables - */ - std::vector get_shadow_variables(); - - /** * Print the items in a vector as a list * diff --git a/src/codegen/codegen_helper_visitor.cpp b/src/codegen/codegen_helper_visitor.cpp index 236ff79a83..169a093abb 100644 --- a/src/codegen/codegen_helper_visitor.cpp +++ b/src/codegen/codegen_helper_visitor.cpp @@ -24,6 +24,7 @@ using namespace ast; using symtab::syminfo::NmodlType; using symtab::syminfo::Status; 
+ /** * How symbols are stored in NEURON? See notes written in markdown file. * @@ -284,6 +285,7 @@ void CodegenHelperVisitor::find_non_range_variables() { // clang-format on } + /** * Find range variables i.e. ones that are belong to per instance allocation * @@ -696,6 +698,9 @@ void CodegenHelperVisitor::visit_program(const ast::Program& node) { find_range_variables(); find_non_range_variables(); find_table_variables(); + info.get_int_variables(); + info.get_shadow_variables(); + info.get_float_variables(); } diff --git a/src/codegen/codegen_helper_visitor.hpp b/src/codegen/codegen_helper_visitor.hpp index 11008668b5..0906c9a7de 100644 --- a/src/codegen/codegen_helper_visitor.hpp +++ b/src/codegen/codegen_helper_visitor.hpp @@ -75,6 +75,16 @@ class CodegenHelperVisitor: public visitor::ConstAstVisitor { void find_non_range_variables(); void sort_with_mod2c_symbol_order(std::vector& symbols) const; + /** + * Check if breakpoint node exist + */ + bool breakpoint_exist() const noexcept; + + /** + * Check if net_receive node exist + */ + bool net_receive_exist() const noexcept; + public: CodegenHelperVisitor() = default; diff --git a/src/codegen/codegen_info.cpp b/src/codegen/codegen_info.cpp index 2219a18913..26696fbc18 100644 --- a/src/codegen/codegen_info.cpp +++ b/src/codegen/codegen_info.cpp @@ -20,6 +20,16 @@ using namespace fmt::literals; using symtab::syminfo::NmodlType; using visitor::VarUsageVisitor; +SymbolType make_symbol(const std::string& name) { + return std::make_shared(name, ModToken()); +} + + +std::string shadow_varname(const std::string& name) { + return "shadow_" + name; +} + + /// if any ion has write variable bool CodegenInfo::ion_has_write_variable() const { for (const auto& ion: ions) { @@ -205,5 +215,192 @@ bool CodegenInfo::is_an_instance_variable(const std::string& varname) const { return false; } + +/** + * IndexVariableInfo has following constructor arguments: + * - symbol + * - is_vdata (false) + * - is_index (false + * - is_integer (false) + * + * Which variables are constant qualified? + * + * - node area is read only + * - read ion variables are read only + * - style_ionname is index / offset + */ +void CodegenInfo::get_int_variables() { + if (point_process) { + codegen_int_variables.emplace_back(make_symbol(naming::NODE_AREA_VARIABLE)); + codegen_int_variables.back().is_constant = true; + /// note that this variable is not printed in neuron implementation + if (artificial_cell) { + codegen_int_variables.emplace_back(make_symbol(naming::POINT_PROCESS_VARIABLE), true); + } else { + codegen_int_variables.emplace_back(make_symbol(naming::POINT_PROCESS_VARIABLE), + false, + false, + true); + codegen_int_variables.back().is_constant = true; + } + } + + for (const auto& ion: ions) { + bool need_style = false; + std::unordered_map ion_vars; // used to keep track of the variables to + // not have doubles between read/write. 
Same + // name variables are allowed + for (const auto& var: ion.reads) { + const std::string name = "ion_" + var; + codegen_int_variables.emplace_back(make_symbol(name)); + codegen_int_variables.back().is_constant = true; + ion_vars[name] = codegen_int_variables.size() - 1; + } + + /// symbol for di_ion_dv var + std::shared_ptr ion_di_dv_var = nullptr; + + for (const auto& var: ion.writes) { + const std::string name = "ion_" + var; + + const auto ion_vars_it = ion_vars.find(name); + if (ion_vars_it != ion_vars.end()) { + codegen_int_variables[ion_vars_it->second].is_constant = false; + } else { + codegen_int_variables.emplace_back(make_symbol("ion_" + var)); + } + if (ion.is_ionic_current(var)) { + ion_di_dv_var = make_symbol("ion_di" + ion.name + "dv"); + } + if (ion.is_intra_cell_conc(var) || ion.is_extra_cell_conc(var)) { + need_style = true; + } + } + + /// insert after read/write variables but before style ion variable + if (ion_di_dv_var != nullptr) { + codegen_int_variables.emplace_back(ion_di_dv_var); + } + + if (need_style) { + codegen_int_variables.emplace_back(make_symbol("style_" + ion.name), false, true); + codegen_int_variables.back().is_constant = true; + } + } + + for (const auto& var: pointer_variables) { + auto name = var->get_name(); + if (var->has_any_property(NmodlType::pointer_var)) { + codegen_int_variables.emplace_back(make_symbol(name)); + } else { + codegen_int_variables.emplace_back(make_symbol(name), true); + } + } + + if (diam_used) { + codegen_int_variables.emplace_back(make_symbol(naming::DIAM_VARIABLE)); + } + + if (area_used) { + codegen_int_variables.emplace_back(make_symbol(naming::AREA_VARIABLE)); + } + + // for non-artificial cell, when net_receive buffering is enabled + // then tqitem is an offset + if (net_send_used) { + if (artificial_cell) { + codegen_int_variables.emplace_back(make_symbol(naming::TQITEM_VARIABLE), true); + } else { + codegen_int_variables.emplace_back(make_symbol(naming::TQITEM_VARIABLE), + false, + false, + true); + codegen_int_variables.back().is_constant = true; + } + tqitem_index = codegen_int_variables.size() - 1; + } + + /** + * \note Variables for watch statements : there is one extra variable + * used in coreneuron compared to actual watch statements for compatibility + * with neuron (which uses one extra Datum variable) + */ + if (!watch_statements.empty()) { + for (int i = 0; i < watch_statements.size() + 1; i++) { + codegen_int_variables.emplace_back(make_symbol("watch{}"_format(i)), + false, + false, + true); + } + } +} + + +/** + * \details When we enable fine level parallelism at channel level, we have do updates + * to ion variables in atomic way. As cpus don't have atomic instructions in + * simd loop, we have to use shadow vectors for every ion variables. Here + * we return list of all such variables. 
+ * + * \todo If conductances are specified, we don't need all below variables + */ +void CodegenInfo::get_shadow_variables() { + for (const auto& ion: ions) { + for (const auto& var: ion.writes) { + codegen_shadow_variables.push_back({make_symbol(shadow_varname("ion_" + var))}); + if (ion.is_ionic_current(var)) { + codegen_shadow_variables.push_back( + {make_symbol(shadow_varname("ion_di" + ion.name + "dv"))}); + } + } + } + codegen_shadow_variables.push_back({make_symbol("ml_rhs")}); + codegen_shadow_variables.push_back({make_symbol("ml_d")}); +} + + +void CodegenInfo::get_float_variables() { + // sort with definition order + auto comparator = [](const SymbolType& first, const SymbolType& second) -> bool { + return first->get_definition_order() < second->get_definition_order(); + }; + + auto assigned = assigned_vars; + auto states = state_vars; + + // each state variable has corresponding Dstate variable + for (auto& state: states) { + auto name = "D" + state->get_name(); + auto symbol = make_symbol(name); + if (state->is_array()) { + symbol->set_as_array(state->get_length()); + } + symbol->set_definition_order(state->get_definition_order()); + assigned.push_back(symbol); + } + std::sort(assigned.begin(), assigned.end(), comparator); + + codegen_float_variables = range_parameter_vars; + codegen_float_variables.insert(codegen_float_variables.end(), + range_assigned_vars.begin(), + range_assigned_vars.end()); + codegen_float_variables.insert(codegen_float_variables.end(), + range_state_vars.begin(), + range_state_vars.end()); + codegen_float_variables.insert(codegen_float_variables.end(), assigned.begin(), assigned.end()); + + if (vectorize) { + codegen_float_variables.push_back(make_symbol(naming::VOLTAGE_UNUSED_VARIABLE)); + } + if (breakpoint_exist()) { + std::string name = vectorize ? naming::CONDUCTANCE_UNUSED_VARIABLE + : naming::CONDUCTANCE_VARIABLE; + codegen_float_variables.push_back(make_symbol(name)); + } + if (net_receive_exist()) { + codegen_float_variables.push_back(make_symbol(naming::T_SAVE_VARIABLE)); + } +} + } // namespace codegen } // namespace nmodl diff --git a/src/codegen/codegen_info.hpp b/src/codegen/codegen_info.hpp index 2cd3c7b98f..17e4102700 100644 --- a/src/codegen/codegen_info.hpp +++ b/src/codegen/codegen_info.hpp @@ -22,6 +22,56 @@ namespace nmodl { namespace codegen { +using SymbolType = std::shared_ptr; + +/** + * Creates a temporary symbol + * \param name The name of the symbol + * \return A symbol based on the given name + */ +SymbolType make_symbol(const std::string& name); + +/** + * Constructs a shadow variable name + * \param name The name of the variable + * \return The name of the variable prefixed with \c shadow_ + */ +std::string shadow_varname(const std::string& name); + +/** + * \class IndexVariableInfo + * \brief Helper to represent information about index/int variables + * + */ +struct IndexVariableInfo { + /// symbol for the variable + const std::shared_ptr symbol; + + /// if variable reside in vdata field of NrnThread + /// typically true for bbcore pointer + bool is_vdata = false; + + /// if this is pure index (e.g. style_ion) variables is directly + /// index and shouldn't be printed with data/vdata + bool is_index = false; + + /// if this is an integer (e.g. 
tqitem, point_process) variable which + /// is printed as array accesses + bool is_integer = false; + + /// if the variable is qualified as constant (this is property of IndexVariable) + bool is_constant = false; + + IndexVariableInfo(std::shared_ptr symbol, + bool is_vdata = false, + bool is_index = false, + bool is_integer = false) + : symbol(std::move(symbol)) + , is_vdata(is_vdata) + , is_index(is_index) + , is_integer(is_integer) {} +}; + /** * @addtogroup codegen_details * @{ @@ -389,6 +439,15 @@ struct CodegenInfo { /// new one used in print_ion_types std::vector use_ion_variables; + /// all int variables for the model + std::vector codegen_int_variables; + + /// all ion variables that could be possibly written + std::vector codegen_shadow_variables; + + /// all float variables for the model + std::vector codegen_float_variables; + /// this is the order in which they appear in derivative block /// this is required while printing them in initlist function std::vector prime_variables_by_order; @@ -473,6 +532,64 @@ struct CodegenInfo { /// true if WatchStatement uses voltage v variable bool is_voltage_used_by_watch_statements() const; + /** + * Check if net_send_buffer is required + */ + bool net_send_buffer_required() const noexcept { + if (net_receive_required() && !artificial_cell) { + if (net_event_used || net_send_used || is_watch_used()) { + return true; + } + } + return false; + } + + /** + * Check if net receive/send buffering kernels required + */ + bool net_receive_buffering_required() const noexcept { + return point_process && !artificial_cell && net_receive_node != nullptr; + } + + /** + * Check if nrn_state function is required + */ + bool nrn_state_required() const noexcept { + if (artificial_cell) { + return false; + } + return nrn_state_block != nullptr || currents.empty(); + } + + /** + * Check if nrn_cur function is required + */ + bool nrn_cur_required() const noexcept { + return breakpoint_node != nullptr && !currents.empty(); + } + + /** + * Check if net_receive node exist + */ + bool net_receive_exist() const noexcept { + return net_receive_node != nullptr; + } + + /** + * Check if breakpoint node exist + */ + bool breakpoint_exist() const noexcept { + return breakpoint_node != nullptr; + } + + + /** + * Check if net_receive function is required + */ + bool net_receive_required() const noexcept { + return net_receive_exist(); + } + /** * Checks if the given variable name belongs to a state variable * \param name The variable name @@ -515,6 +632,24 @@ struct CodegenInfo { /// if we need a call back to wrote_conc in neuron/coreneuron bool require_wrote_conc = false; + + /** + * Determine all \c int variables required during code generation + * \return A \c vector of \c int variables + */ + void get_int_variables(); + + /** + * Determine all ion write variables that require shadow vectors during code generation + * \return A \c vector of ion variables + */ + void get_shadow_variables(); + + /** + * Determine all \c float variables required during code generation + * \return A \c vector of \c float variables + */ + void get_float_variables(); }; /** @} */ // end of codegen_backends diff --git a/src/codegen/codegen_ispc_visitor.cpp b/src/codegen/codegen_ispc_visitor.cpp index dca97d426c..5d1c2de485 100644 --- a/src/codegen/codegen_ispc_visitor.cpp +++ b/src/codegen/codegen_ispc_visitor.cpp @@ -449,7 +449,7 @@ void CodegenIspcVisitor::print_ion_variable() { /****************************************************************************************/ void 
CodegenIspcVisitor::print_net_receive_buffering_wrapper() { - if (!net_receive_required() || info.artificial_cell) { + if (!info.net_receive_required() || info.artificial_cell) { return; } printer->add_newline(2); @@ -527,19 +527,19 @@ void CodegenIspcVisitor::print_backend_compute_routine_decl() { "extern \"C\" void {}({});"_format(compute_function, get_parameter_str(params))); } - if (nrn_cur_required() && !emit_fallback[BlockType::Equation]) { + if (info.nrn_cur_required() && !emit_fallback[BlockType::Equation]) { compute_function = compute_method_name(BlockType::Equation); printer->add_line( "extern \"C\" void {}({});"_format(compute_function, get_parameter_str(params))); } - if (nrn_state_required() && !emit_fallback[BlockType::State]) { + if (info.nrn_state_required() && !emit_fallback[BlockType::State]) { compute_function = compute_method_name(BlockType::State); printer->add_line( "extern \"C\" void {}({});"_format(compute_function, get_parameter_str(params))); } - if (net_receive_required()) { + if (info.net_receive_required()) { auto net_recv_params = ParamVector(); net_recv_params.emplace_back("", "{}*"_format(instance_struct()), "", "inst"); net_recv_params.emplace_back("", "NrnThread*", "", "nt"); @@ -559,7 +559,7 @@ bool CodegenIspcVisitor::check_incompatibilities() { }; // instance vars - if (check_incompatible_var_name(codegen_float_variables, + if (check_incompatible_var_name(info.codegen_float_variables, get_name_from_symbol_type_vector)) { return true; } @@ -626,11 +626,11 @@ bool CodegenIspcVisitor::check_incompatibilities() { visitor::calls_function(*info.net_receive_node, "net_send"))); emit_fallback[BlockType::Equation] = emit_fallback[BlockType::Equation] || - (nrn_cur_required() && info.breakpoint_node && + (info.nrn_cur_required() && info.breakpoint_node && has_incompatible_nodes(*info.breakpoint_node)); emit_fallback[BlockType::State] = emit_fallback[BlockType::State] || - (nrn_state_required() && info.nrn_state_block && + (info.nrn_state_required() && info.nrn_state_block && has_incompatible_nodes(*info.nrn_state_block)); @@ -687,7 +687,7 @@ void CodegenIspcVisitor::print_block_wrappers_initial_equation_state() { print_wrapper_routine(naming::NRN_INIT_METHOD, BlockType::Initial); } - if (nrn_cur_required()) { + if (info.nrn_cur_required()) { if (emit_fallback[BlockType::Equation]) { logger->warn("Falling back to C backend for emitting breakpoint block"); fallback_codegen.print_nrn_cur(); @@ -696,7 +696,7 @@ void CodegenIspcVisitor::print_block_wrappers_initial_equation_state() { } } - if (nrn_state_required()) { + if (info.nrn_state_required()) { if (emit_fallback[BlockType::State]) { logger->warn("Falling back to C backend for emitting state block"); fallback_codegen.print_nrn_state(); diff --git a/src/codegen/llvm/codegen_llvm_helper_visitor.cpp b/src/codegen/llvm/codegen_llvm_helper_visitor.cpp index 751fecfc81..fc8fda3d04 100644 --- a/src/codegen/llvm/codegen_llvm_helper_visitor.cpp +++ b/src/codegen/llvm/codegen_llvm_helper_visitor.cpp @@ -158,6 +158,23 @@ void CodegenLLVMHelperVisitor::create_function_for_node(ast::Block& node) { codegen_functions.push_back(function); } +std::shared_ptr CodegenLLVMHelperVisitor::create_instance_struct() { + ast::CodegenVarVector codegen_vars; + /// float variables are standard pointers to float vectors + for (auto& float_var: info.codegen_float_variables) { + auto name = new ast::Name(new ast::String(float_var->get_name())); + auto codegen_var = new ast::CodegenVar(1, name); + codegen_vars.emplace_back(codegen_var); + } + 
/// int variables are pointers to indexes for other vectors + for (auto& int_var: info.codegen_int_variables) { + auto name = new ast::Name(new ast::String(int_var.symbol->get_name())); + auto codegen_var = new ast::CodegenVar(1, name); + codegen_vars.emplace_back(codegen_var); + } + return std::make_shared(codegen_vars); +} + static void append_statements_from_block(ast::StatementVector& statements, const std::shared_ptr& block) { const auto& block_statements = block->get_statements(); @@ -523,7 +540,11 @@ void CodegenLLVMHelperVisitor::visit_program(ast::Program& node) { for (auto& fun: codegen_functions) { node.emplace_back_node(fun); } + + auto llvm_instance_struct = create_instance_struct(); + node.emplace_back_node(llvm_instance_struct); } + } // namespace codegen } // namespace nmodl diff --git a/src/codegen/llvm/codegen_llvm_helper_visitor.hpp index 0ec3792b9d..5634d39bd8 100644 --- a/src/codegen/llvm/codegen_llvm_helper_visitor.hpp +++ b/src/codegen/llvm/codegen_llvm_helper_visitor.hpp @@ -61,6 +61,9 @@ class CodegenLLVMHelperVisitor: public visitor::AstVisitor { /// create new function for FUNCTION or PROCEDURE block void create_function_for_node(ast::Block& node); + /// create new InstanceStruct + std::shared_ptr create_instance_struct(); + public: CodegenLLVMHelperVisitor() = default; diff --git a/src/codegen/llvm/codegen_llvm_visitor.cpp index 831c43317a..1433b5a648 100644 --- a/src/codegen/llvm/codegen_llvm_visitor.cpp +++ b/src/codegen/llvm/codegen_llvm_visitor.cpp @@ -90,6 +90,12 @@ llvm::Type* CodegenLLVMVisitor::get_default_fp_type() { return llvm::Type::getDoubleTy(*context); } +llvm::Type* CodegenLLVMVisitor::get_default_fp_ptr_type() { + if (use_single_precision) + return llvm::Type::getFloatPtrTy(*context); + return llvm::Type::getDoublePtrTy(*context); +} + void CodegenLLVMVisitor::run_llvm_opt_passes() { /// run some optimisation passes that are commonly suggested fpm.add(llvm::createInstructionCombiningPass()); @@ -574,6 +580,17 @@ void CodegenLLVMVisitor::visit_var_name(const ast::VarName& node) { values.push_back(var); } +void CodegenLLVMVisitor::visit_instance_struct(const ast::InstanceStruct& node) { + std::vector members; + for (const auto& variable: node.get_codegen_vars()) { + members.push_back(get_default_fp_ptr_type()); + } + + llvm_struct = llvm::StructType::create(*context, mod_filename + "_Instance"); + llvm_struct->setBody(members); + module->getOrInsertGlobal("inst", llvm_struct); +} + void CodegenLLVMVisitor::visit_while_statement(const ast::WhileStatement& node) { // Get the current and the next blocks within the function. llvm::BasicBlock* curr_block = builder.GetInsertBlock(); diff --git a/src/codegen/llvm/codegen_llvm_visitor.hpp index 3003a119b5..7a5488de43 100644 --- a/src/codegen/llvm/codegen_llvm_visitor.hpp +++ b/src/codegen/llvm/codegen_llvm_visitor.hpp @@ -79,6 +79,9 @@ class CodegenLLVMVisitor: public visitor::ConstAstVisitor { // Use 32-bit floating-point type if true. Otherwise, use default 64-bit. 
bool use_single_precision; + // LLVM mechanism struct + llvm::StructType* llvm_struct; + /** *\brief Run LLVM optimisation passes on generated IR * @@ -147,6 +150,12 @@ class CodegenLLVMVisitor: public visitor::ConstAstVisitor { */ llvm::Type* get_default_fp_type(); + /** + * Returns pointer to 64-bit or 32-bit LLVM floating type + * \return \c LLVM pointer to floating point type according to `use_single_precision` flag + */ + llvm::Type* get_default_fp_ptr_type(); + /** * Create a function call to an external method * \param name external method name @@ -242,6 +251,7 @@ class CodegenLLVMVisitor: public visitor::ConstAstVisitor { void visit_program(const ast::Program& node) override; void visit_unary_expression(const ast::UnaryExpression& node) override; void visit_var_name(const ast::VarName& node) override; + void visit_instance_struct(const ast::InstanceStruct& node) override; void visit_while_statement(const ast::WhileStatement& node) override; // \todo: move this to debug mode (e.g. -v option or --dump-ir) diff --git a/src/language/code_generator.cmake b/src/language/code_generator.cmake index a2bece8b4a..4c7f27046a 100644 --- a/src/language/code_generator.cmake +++ b/src/language/code_generator.cmake @@ -117,6 +117,7 @@ set(AST_GENERATED_SOURCES ${PROJECT_BINARY_DIR}/src/ast/independent_definition.hpp ${PROJECT_BINARY_DIR}/src/ast/indexed_name.hpp ${PROJECT_BINARY_DIR}/src/ast/initial_block.hpp + ${PROJECT_BINARY_DIR}/src/ast/instance_struct.hpp ${PROJECT_BINARY_DIR}/src/ast/integer.hpp ${PROJECT_BINARY_DIR}/src/ast/kinetic_block.hpp ${PROJECT_BINARY_DIR}/src/ast/lag_statement.hpp diff --git a/src/language/nmodl.yaml b/src/language/nmodl.yaml index 2bafd00af5..23dcb7bd10 100644 --- a/src/language/nmodl.yaml +++ b/src/language/nmodl.yaml @@ -438,6 +438,18 @@ is base class and defines common interface for these nodes. children: + - InstanceStruct: + nmodl: "INSTANCE_STRUCT " + members: + - codegen_vars: + brief: "Vector of CodegenVars" + type: CodegenVar + vector: true + add: true + separator: "\\n " + prefix: {value: "{\\n ", force: true} + suffix: {value: "\\n}", force: true} + brief: "LLVM IR Struct that holds the mechanism instance's variables" - ParamBlock: nmodl: "PARAMETER " diff --git a/test/unit/codegen/codegen_llvm_ir.cpp b/test/unit/codegen/codegen_llvm_ir.cpp index d16b02b2f5..2d5ca7ef2a 100644 --- a/test/unit/codegen/codegen_llvm_ir.cpp +++ b/test/unit/codegen/codegen_llvm_ir.cpp @@ -367,12 +367,12 @@ SCENARIO("Function", "[visitor][llvm]") { std::smatch m; // Check function signature. The return type should be the default double type. - std::regex function_signature(R"(define double @foo\(double %x1\) \{)"); + std::regex function_signature(R"(define double @foo\(double %x[0-9].*\) \{)"); REQUIRE(std::regex_search(module_string, m, function_signature)); // Check that function arguments are allocated on the local stack. std::regex alloca_instr(R"(%x = alloca double)"); - std::regex store_instr(R"(store double %x1, double\* %x)"); + std::regex store_instr(R"(store double %x[0-9].*, double\* %x)"); REQUIRE(std::regex_search(module_string, m, alloca_instr)); REQUIRE(std::regex_search(module_string, m, store_instr)); @@ -638,7 +638,7 @@ SCENARIO("Procedure", "[visitor][llvm]") { std::smatch m; // Check procedure signature. - std::regex function_signature(R"(define i32 @with_argument\(double %x1\) \{)"); + std::regex function_signature(R"(define i32 @with_argument\(double %x[0-9].*\) \{)"); REQUIRE(std::regex_search(module_string, m, function_signature)); // Check dummy return. 
@@ -653,7 +653,7 @@ SCENARIO("Procedure", "[visitor][llvm]") { // Check that procedure arguments are allocated on the local stack. std::regex alloca_instr(R"(%x = alloca double)"); - std::regex store_instr(R"(store double %x1, double\* %x)"); + std::regex store_instr(R"(store double %x[0-9].*, double\* %x)"); REQUIRE(std::regex_search(module_string, m, alloca_instr)); REQUIRE(std::regex_search(module_string, m, store_instr)); } @@ -753,8 +753,44 @@ SCENARIO("Dead code removal", "[visitor][llvm][opt]") { // Check if the values are optimised out std::regex empty_proc( - R"(define i32 @add\(double %a1, double %b2\) \{\n(\s)*ret i32 0\n\})"); + R"(define i32 @add\(double %a[0-9].*, double %b[0-9].*\) \{\n(\s)*ret i32 0\n\})"); REQUIRE(std::regex_search(module_string, m, empty_proc)); } } } + +//============================================================================= +// Create Instance Struct +//============================================================================= + +SCENARIO("Creation of Instance Struct", "[visitor][llvm][instance_struct]") { + GIVEN("NEURON block with RANGE variables and IONS") { + std::string nmodl_text = R"( + NEURON { + USEION na READ ena WRITE ina + NONSPECIFIC_CURRENT il + RANGE minf, hinf + } + + STATE { + m + } + + ASSIGNED { + v (mV) + celsius (degC) + minf + hinf + } + )"; + + THEN("create struct with the declared variables") { + std::string module_string = run_llvm_visitor(nmodl_text, true); + std::smatch m; + + std::regex instance_struct_declaration( + R"(%unknown_Instance = type \{ double\*, double\*, double\*, double\*, double\*, double\*, double\*, double\*, double\*, double\* \})"); + REQUIRE(std::regex_search(module_string, m, instance_struct_declaration)); + } + } +} From 11a186ee28c43d8cd872889180aa84c2cda0c600 Mon Sep 17 00:00:00 2001 From: George Mitenkov Date: Wed, 3 Feb 2021 22:45:41 +0300 Subject: [PATCH 131/331] Printf support in LLVM IR codegen (#510) - Added support for string function arguments. These are converted into global `i8` array values. - Added support for `printf` function call with variable number of arguments. - Refactored function/procedure call argument processing into a separate function. fixes #510 --- src/codegen/llvm/codegen_llvm_visitor.cpp | 57 +++++++++++++++++++---- src/codegen/llvm/codegen_llvm_visitor.hpp | 18 +++++-- test/unit/codegen/codegen_llvm_ir.cpp | 36 ++++++++++++++ 3 files changed, 96 insertions(+), 15 deletions(-) diff --git a/src/codegen/llvm/codegen_llvm_visitor.cpp b/src/codegen/llvm/codegen_llvm_visitor.cpp index 1433b5a648..3bb3b38dfc 100644 --- a/src/codegen/llvm/codegen_llvm_visitor.cpp +++ b/src/codegen/llvm/codegen_llvm_visitor.cpp @@ -116,6 +116,11 @@ void CodegenLLVMVisitor::run_llvm_opt_passes() { void CodegenLLVMVisitor::create_external_method_call(const std::string& name, const ast::ExpressionVector& arguments) { + if (name == "printf") { + create_printf_call(arguments); + return; + } + std::vector argument_values; std::vector argument_types; for (const auto& arg: arguments) { @@ -145,24 +150,39 @@ void CodegenLLVMVisitor::create_function_call(llvm::Function* func, const std::string& name, const ast::ExpressionVector& arguments) { // Check that function is called with the expected number of arguments. - if (arguments.size() != func->arg_size()) { + if (!func->isVarArg() && arguments.size() != func->arg_size()) { throw std::runtime_error("Error: Incorrect number of arguments passed"); } - // Process each argument and add it to a vector to pass to the function call instruction. 
Note - // that type checks are not needed here as NMODL operates on doubles by default. + // Pack function call arguments to vector and create a call instruction. std::vector argument_values; - for (const auto& arg: arguments) { - arg->accept(*this); - llvm::Value* value = values.back(); - values.pop_back(); - argument_values.push_back(value); - } - + argument_values.reserve(arguments.size()); + pack_function_call_arguments(arguments, argument_values); llvm::Value* call = builder.CreateCall(func, argument_values); values.push_back(call); } +void CodegenLLVMVisitor::create_printf_call(const ast::ExpressionVector& arguments) { + // First, get the printf declaration, creating and inserting it if it does not exist. + std::string name = "printf"; + llvm::Function* printf = module->getFunction(name); + if (!printf) { + llvm::Type* ptr_type = llvm::Type::getInt8PtrTy(*context); + llvm::Type* i32_type = llvm::Type::getInt32Ty(*context); + llvm::FunctionType* printf_type = + llvm::FunctionType::get(i32_type, ptr_type, /*isVarArg=*/true); + + printf = + llvm::Function::Create(printf_type, llvm::Function::ExternalLinkage, name, *module); + } + + // Create a call instruction. + std::vector argument_values; + argument_values.reserve(arguments.size()); + pack_function_call_arguments(arguments, argument_values); + builder.CreateCall(printf, argument_values); +} + void CodegenLLVMVisitor::emit_procedure_or_function_declaration(const ast::CodegenFunction& node) { const auto& name = node.get_node_name(); const auto& arguments = node.get_arguments(); @@ -188,6 +208,23 @@ llvm::Value* CodegenLLVMVisitor::lookup(const std::string& name) { return val; } +void CodegenLLVMVisitor::pack_function_call_arguments(const ast::ExpressionVector& arguments, + std::vector& arg_values) { + for (const auto& arg: arguments) { + if (arg->is_string()) { + // If the argument is a string, create a global i8* variable with it. 
+ auto string_arg = std::dynamic_pointer_cast(arg); + llvm::Value* str = builder.CreateGlobalStringPtr(string_arg->get_value()); + arg_values.push_back(str); + } else { + arg->accept(*this); + llvm::Value* value = values.back(); + values.pop_back(); + arg_values.push_back(value); + } + } +} + llvm::Value* CodegenLLVMVisitor::visit_arithmetic_bin_op(llvm::Value* lhs, llvm::Value* rhs, unsigned op) { diff --git a/src/codegen/llvm/codegen_llvm_visitor.hpp b/src/codegen/llvm/codegen_llvm_visitor.hpp index 7a5488de43..9bdbdef7e9 100644 --- a/src/codegen/llvm/codegen_llvm_visitor.hpp +++ b/src/codegen/llvm/codegen_llvm_visitor.hpp @@ -173,6 +173,11 @@ class CodegenLLVMVisitor: public visitor::ConstAstVisitor { void create_function_call(llvm::Function* func, const std::string& name, const ast::ExpressionVector& arguments); + /** + * Create a function call to printf function + * \param arguments expressions passed as arguments to the printf call + */ + void create_printf_call(const ast::ExpressionVector& arguments); /** * Emit function or procedure declaration in LLVM given the node @@ -195,6 +200,14 @@ class CodegenLLVMVisitor: public visitor::ConstAstVisitor { */ llvm::Value* lookup(const std::string& name); + /** + * Fills values vector with processed NMODL function call arguments + * \param arguments expression vector + * \param arg_values vector of LLVM IR values to fill + */ + void pack_function_call_arguments(const ast::ExpressionVector& arguments, + std::vector& arg_values); + /** * Visit nmodl arithmetic binary operator * \param lhs LLVM value of evaluated lhs expression @@ -229,11 +242,6 @@ class CodegenLLVMVisitor: public visitor::ConstAstVisitor { */ llvm::Value* visit_comparison_bin_op(llvm::Value* lhs, llvm::Value* rhs, unsigned op); - /** - * Visit nmodl function or procedure - * \param node the AST node representing the function or procedure in NMODL - */ - void visit_procedure_or_function(const ast::Block& node); // Visitors void visit_binary_expression(const ast::BinaryExpression& node) override; diff --git a/test/unit/codegen/codegen_llvm_ir.cpp b/test/unit/codegen/codegen_llvm_ir.cpp index 2d5ca7ef2a..ba0c725c0c 100644 --- a/test/unit/codegen/codegen_llvm_ir.cpp +++ b/test/unit/codegen/codegen_llvm_ir.cpp @@ -474,6 +474,42 @@ SCENARIO("Function call", "[visitor][llvm]") { } } + GIVEN("A call to printf") { + std::string nmodl_text = R"( + PROCEDURE bar() { + LOCAL i + i = 0 + printf("foo") + printf("bar %d", i) + } + )"; + + THEN("printf is declared and global string values are created") { + std::string module_string = run_llvm_visitor(nmodl_text); + std::smatch m; + + // Check for global string values. + std::regex str1( + R"(@[0-9]+ = private unnamed_addr constant \[6 x i8\] c\"\\22foo\\22\\00\")"); + std::regex str2( + R"(@[0-9]+ = private unnamed_addr constant \[9 x i8\] c\"\\22bar %d\\22\\00\")"); + REQUIRE(std::regex_search(module_string, m, str1)); + REQUIRE(std::regex_search(module_string, m, str2)); + + // Check for printf declaration. + std::regex declaration(R"(declare i32 @printf\(i8\*, \.\.\.\))"); + REQUIRE(std::regex_search(module_string, m, declaration)); + + // Check the correct calls are made. 
+ std::regex call1( + R"(call i32 \(i8\*, \.\.\.\) @printf\(i8\* getelementptr inbounds \(\[6 x i8\], \[6 x i8\]\* @[0-9]+, i32 0, i32 0\)\))"); + std::regex call2( + R"(call i32 \(i8\*, \.\.\.\) @printf\(i8\* getelementptr inbounds \(\[9 x i8\], \[9 x i8\]\* @[0-9]+, i32 0, i32 0\), double %[0-9]+\))"); + REQUIRE(std::regex_search(module_string, m, call1)); + REQUIRE(std::regex_search(module_string, m, call2)); + } + } + GIVEN("A call to function with the wrong number of arguments") { std::string nmodl_text = R"( FUNCTION foo(x, y) { From fa01b013cc9762b96be991b3781f53c5bb1dfd1e Mon Sep 17 00:00:00 2001 From: Ioannis Magkanaris Date: Mon, 15 Feb 2021 11:21:58 +0100 Subject: [PATCH 132/331] =?UTF-8?q?Fix=20issue=20error:=20=E2=80=98runtime?= =?UTF-8?q?=5Ferror=E2=80=99=20is=20not=20a=20member=20of=20=E2=80=98std?= =?UTF-8?q?=E2=80=99=20(#512)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- src/language/templates/ast/ast_decl.hpp | 1 + 1 file changed, 1 insertion(+) diff --git a/src/language/templates/ast/ast_decl.hpp b/src/language/templates/ast/ast_decl.hpp index 17faa90d6f..50383dc401 100644 --- a/src/language/templates/ast/ast_decl.hpp +++ b/src/language/templates/ast/ast_decl.hpp @@ -14,6 +14,7 @@ #include #include #include +#include /// \file /// \brief Auto generated AST node types and aliases declaration From 748bfefa6f2866ec092379da59691afb04ee9baa Mon Sep 17 00:00:00 2001 From: Pramod Kumbhar Date: Fri, 5 Mar 2021 19:56:29 +0100 Subject: [PATCH 133/331] Move code gen specific InstanceStruct node to codegen.yaml (#526) * Move code gen specific InstanceStruct node to codegen.yaml - nmodl.yaml file is more for language constructs - InstanceStruct is specific for code generation and hence move it to codegen.yaml * Update CI scripts * fix cmake-format with v==0.6.13 --- src/language/codegen.yaml | 12 ++++++++++++ src/language/nmodl.yaml | 12 ------------ 2 files changed, 12 insertions(+), 12 deletions(-) diff --git a/src/language/codegen.yaml b/src/language/codegen.yaml index 104f41420e..67ef48c371 100644 --- a/src/language/codegen.yaml +++ b/src/language/codegen.yaml @@ -143,6 +143,18 @@ brief: "Body of the function" type: StatementBlock getter: {override: true} + - InstanceStruct: + nmodl: "INSTANCE_STRUCT " + members: + - codegen_vars: + brief: "Vector of CodegenVars" + type: CodegenVar + vector: true + add: true + separator: "\\n " + prefix: {value: "{\\n ", force: true} + suffix: {value: "\\n}", force: true} + brief: "LLVM IR Struct that holds the mechanism instance's variables" - WrappedExpression: brief: "Wrap any other expression type" members: diff --git a/src/language/nmodl.yaml b/src/language/nmodl.yaml index 23dcb7bd10..2bafd00af5 100644 --- a/src/language/nmodl.yaml +++ b/src/language/nmodl.yaml @@ -438,18 +438,6 @@ is base class and defines common interface for these nodes. 
children: - - InstanceStruct: - nmodl: "INSTANCE_STRUCT " - members: - - codegen_vars: - brief: "Vector of CodegenVars" - type: CodegenVar - vector: true - add: true - separator: "\\n " - prefix: {value: "{\\n ", force: true} - suffix: {value: "\\n}", force: true} - brief: "LLVM IR Struct that holds the mechanism instance's variables" - ParamBlock: nmodl: "PARAMETER " From 7c8e16f019c00d96a02e56cc7e74f331dc74e639 Mon Sep 17 00:00:00 2001 From: Pramod S Kumbhar Date: Sat, 27 Feb 2021 13:15:09 +0100 Subject: [PATCH 134/331] * Improvements to codegen helper (Part I) - instance structure now contains all global variables - instance structure now contains index variables for ions - nrn_state kernel now has all variables converted to instance - InstanceVarHelper added to query a variable and its location * Support for codegen variable with type * Add nmodl_to_json helper in main.cpp * Added --vector-width CLI option * Add instance struct argument to nrn_state_hh * Add comments as TODOs to support LLVM IR generation Note that this commit and the next commit (Part II) are required to make LLVM IR code generation work. Vector IR generation is working except indirect indexes. See comment in #531. --- src/codegen/codegen_naming.hpp | 6 ++ .../llvm/codegen_llvm_helper_visitor.cpp | 78 ++++++++++++++----- .../llvm/codegen_llvm_helper_visitor.hpp | 71 ++++++++++++++++- src/codegen/llvm/codegen_llvm_visitor.cpp | 39 +++++++++- src/codegen/llvm/codegen_llvm_visitor.hpp | 8 ++ src/language/code_generator.cmake | 3 +- src/language/codegen.yaml | 25 ++++-- src/language/node_info.py | 1 + src/language/nodes.py | 4 + .../templates/visitors/nmodl_visitor.cpp | 7 +- src/main.cpp | 66 +++++++++------- 11 files changed, 250 insertions(+), 58 deletions(-)
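As a sketch of the variable conversion mentioned above (illustrative only, for a hypothetical RANGE variable minf; the actual rewriting is done by convert_to_instance_variable in the diff below), a reference inside the state kernel is rewritten in two steps:

    minf        -->  minf[id]           (indexed by the loop induction variable)
    minf[id]    -->  mech->minf[id]     (accessed via the mechanism instance argument)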
diff --git a/src/codegen/codegen_naming.hpp index 73c09df055..910d35e4c1 100644 --- a/src/codegen/codegen_naming.hpp +++ b/src/codegen/codegen_naming.hpp @@ -80,6 +80,12 @@ static constexpr char VOLTAGE_UNUSED_VARIABLE[] = "v_unused"; /// variable t indicating last execution time of net receive block static constexpr char T_SAVE_VARIABLE[] = "tsave"; +/// global variable celsius +static constexpr char CELSIUS_VARIABLE[] = "celsius"; + +/// global variable second_order +static constexpr char SECOND_ORDER_VARIABLE[] = "secondorder"; + /// shadow rhs variable in neuron thread structure static constexpr char NTHREAD_RHS_SHADOW[] = "_shadow_rhs"; diff --git a/src/codegen/llvm/codegen_llvm_helper_visitor.cpp index fc8fda3d04..b3f75b9372 100644 --- a/src/codegen/llvm/codegen_llvm_helper_visitor.cpp +++ b/src/codegen/llvm/codegen_llvm_helper_visitor.cpp @@ -141,12 +141,12 @@ void CodegenLLVMHelperVisitor::create_function_for_node(ast::Block& node) { block->emplace_back_statement(return_statement); /// prepare function arguments based original node arguments - ast::CodegenArgumentVector arguments; + ast::CodegenVarWithTypeVector arguments; for (const auto& param: node.get_parameters()) { /// create new type and name for creating new ast node auto type = new ast::CodegenVarType(FLOAT_TYPE); auto var = param->get_name()->clone(); - arguments.emplace_back(new ast::CodegenArgument(type, var)); + arguments.emplace_back(new ast::CodegenVarWithType(type, 0, var)); } /// return type of the function is same as return variable type @@ -159,19 +159,43 @@ std::shared_ptr CodegenLLVMHelperVisitor::create_instance_struct() { - ast::CodegenVarVector codegen_vars; + ast::CodegenVarWithTypeVector codegen_vars; + + auto add_var_with_type = + [&](const std::string& name, const ast::AstNodeType type, int is_pointer) { + auto var_name = new ast::Name(new ast::String(name)); + auto var_type = new ast::CodegenVarType(type); + auto codegen_var = new ast::CodegenVarWithType(var_type, is_pointer, var_name); + codegen_vars.emplace_back(codegen_var); + }; + /// float variables are standard pointers to float vectors for (auto& float_var: info.codegen_float_variables) { - auto name = new ast::Name(new ast::String(float_var->get_name())); - auto codegen_var = new ast::CodegenVar(1, name); - codegen_vars.emplace_back(codegen_var); + add_var_with_type(float_var->get_name(), FLOAT_TYPE, 1); } + /// int variables are pointers to indexes for other vectors for (auto& int_var: info.codegen_int_variables) { - auto name = new ast::Name(new ast::String(int_var.symbol->get_name())); - auto codegen_var = new ast::CodegenVar(1, name); - codegen_vars.emplace_back(codegen_var); + add_var_with_type(int_var.symbol->get_name(), FLOAT_TYPE, 1); + } + + // for each integer variable, there should be a corresponding index variable + for (auto& int_var: info.codegen_int_variables) { + std::string var_name = int_var.symbol->get_name() + "_index"; + add_var_with_type(var_name, INTEGER_TYPE, 1); } + + // add voltage and node index + add_var_with_type("voltage", FLOAT_TYPE, 1); + add_var_with_type("node_index", INTEGER_TYPE, 1); + + // add t, dt, celsius, secondorder and node count + add_var_with_type(naming::NTHREAD_T_VARIABLE, FLOAT_TYPE, 0); + add_var_with_type(naming::NTHREAD_DT_VARIABLE, FLOAT_TYPE, 0); + add_var_with_type(naming::CELSIUS_VARIABLE, FLOAT_TYPE, 0); + add_var_with_type(naming::SECOND_ORDER_VARIABLE, INTEGER_TYPE, 0); + add_var_with_type(MECH_NODECOUNT_VAR, INTEGER_TYPE, 0); + return std::make_shared(codegen_vars); } @@ -362,13 +386,24 @@ void CodegenLLVMHelperVisitor::convert_to_instance_variable(ast::Node& node, std::string& index_var) { /// collect all variables in the node of type ast::VarName auto variables = collect_nodes(node, {ast::AstNodeType::VAR_NAME}); for (auto& v: variables) { auto variable = std::dynamic_pointer_cast(v); - /// if variable is of type instance then convert it to index - if (info.is_an_instance_variable(variable->get_node_name())) { + auto variable_name = variable->get_node_name(); + + /// all instance variables defined in the mod file should be converted to + /// indexed variables based on the loop iteration variable + if (info.is_an_instance_variable(variable_name)) { auto name = variable->get_name()->clone(); auto index = new ast::Name(new ast::String(index_var)); auto indexed_name = std::make_shared(name, index); variable->set_name(indexed_name); } + + /// instance_var_helper checks instance variables from the mod file as well + /// as extra variables like ion index variables added for code generation + if (instance_var_helper.is_an_instance_variable(variable_name)) { + auto name = new ast::Name(new ast::String(MECH_INSTANCE_VAR)); + auto var = std::make_shared(name, variable->clone()); + variable->set_name(var); + } } } @@ -438,7 +473,7 @@ void CodegenLLVMHelperVisitor::visit_nrn_state_block(ast::NrnStateBlock& node) { /// loop constructs : initialization, condition and increment const auto& initialization = create_statement_as_expression("id = 0"); const auto& condition = create_expression("id < node_count"); - const auto& increment = create_statement_as_expression("id = id + 1"); + const auto& increment = create_statement_as_expression("id = id + {}"_format(vector_width)); /// loop body : initialization + solve blocks ast::StatementVector 
loop_def_statements; ast::StatementVector loop_index_statements; ast::StatementVector loop_body_statements; { std::vector int_variables{"node_id"}; std::vector double_variables{"v"}; /// access node index and corresponding voltage loop_index_statements.push_back(visitor::create_statement("node_id = node_index[id]")); loop_body_statements.push_back(visitor::create_statement("v = voltage[node_id]")); /// read ion variables @@ -496,9 +531,6 @@ void CodegenLLVMHelperVisitor::visit_nrn_state_block(ast::NrnStateBlock& node) { /// now construct a new code block which will become the body of the loop auto loop_block = std::make_shared(loop_body); - /// convert all variables inside loop body to instance variables - convert_to_instance_variable(*loop_block, loop_index_var); - /// convert local statement to codegenvar statement convert_local_statement(*loop_block); @@ -508,6 +540,9 @@ void CodegenLLVMHelperVisitor::visit_nrn_state_block(ast::NrnStateBlock& node) { increment, loop_block); + /// convert all variables inside loop body to instance variables + convert_to_instance_variable(*for_loop_statement, loop_index_var); + /// the loop itself becomes one of the statements in the function function_statements.push_back(for_loop_statement); @@ -520,7 +555,12 @@ void CodegenLLVMHelperVisitor::visit_nrn_state_block(ast::NrnStateBlock& node) { auto return_type = new ast::CodegenVarType(ast::AstNodeType::VOID); /// \todo : currently there are no arguments - ast::CodegenArgumentVector code_arguments; + ast::CodegenVarWithTypeVector code_arguments; + + auto instance_var_type = new ast::CodegenVarType(ast::AstNodeType::INSTANCE_STRUCT); + auto instance_var_name = new ast::Name(new ast::String("mech")); + auto instance_var = new ast::CodegenVarWithType(instance_var_type, 1, instance_var_name); + code_arguments.emplace_back(instance_var); /// finally, create new function auto function = std::make_shared(return_type, name, code_arguments, function_block); codegen_functions.push_back(function); + std::cout << nmodl::to_nmodl(function); } void CodegenLLVMHelperVisitor::visit_program(ast::Program& node) { CodegenHelperVisitor v; info = v.analyze(node); + instance_var_helper.instance = create_instance_struct(); + node.emplace_back_node(instance_var_helper.instance); + logger->info("Running CodegenLLVMHelperVisitor"); node.visit_children(*this); for (auto& fun: codegen_functions) { node.emplace_back_node(fun); } - auto llvm_instance_struct = create_instance_struct(); - node.emplace_back_node(llvm_instance_struct); + std::cout << nmodl::to_nmodl(node); }
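Taken together, the helper now emits one kernel per mod file of roughly the following shape (NMODL-like form as printed by the std::cout call above; the kernel name and body depend on the mod file, sketched here for a hypothetical hh mechanism with body statements elided):

    VOID nrn_state_hh (INSTANCE_STRUCT *mech) {
        INTEGER id
        for (id = 0; id < mech->node_count; id = id + 1) {
            INTEGER node_id
            DOUBLE v
            node_id = mech->node_index[id]
            v = mech->voltage[node_id]
            ...
        }
    }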
diff --git a/src/codegen/llvm/codegen_llvm_helper_visitor.hpp index 5634d39bd8..981372b4d5 100644 --- a/src/codegen/llvm/codegen_llvm_helper_visitor.hpp +++ b/src/codegen/llvm/codegen_llvm_helper_visitor.hpp @@ -14,6 +14,7 @@ #include +#include "ast/instance_struct.hpp" #include "codegen/codegen_info.hpp" #include "symtab/symbol_table.hpp" #include "visitors/ast_visitor.hpp" @@ -21,7 +22,7 @@ namespace nmodl { namespace codegen { - +using namespace fmt::literals; typedef std::vector> CodegenFunctionVector; /** @@ -29,6 +30,57 @@ typedef std::vector> CodegenFunctionVector * @{ */ +/** + * \class InstanceVarHelper + * \brief Helper to query instance variables information + * + * For LLVM IR generation we need to know the variable, its type and + * location in the instance structure. This helper provides convenient + * functions to query this information. + */ +struct InstanceVarHelper { + /// pointer to instance node in the AST + std::shared_ptr instance; + + /// find variable with given name and return the iterator + ast::CodegenVarWithTypeVector::const_iterator find_variable( + const ast::CodegenVarWithTypeVector& vars, + const std::string& name) { + return find_if(vars.begin(), + vars.end(), + [&](const std::shared_ptr& v) { + return v->get_node_name() == name; + }); + } + + /// check if given variable is an instance variable + bool is_an_instance_variable(const std::string& name) { + const auto& vars = instance->get_codegen_vars(); + return find_variable(vars, name) != vars.end(); + } + + /// return codegen variable with a given name + const std::shared_ptr& get_variable(const std::string& name) { + const auto& vars = instance->get_codegen_vars(); + auto it = find_variable(vars, name); + if (it == vars.end()) { + throw std::runtime_error("Cannot find variable with name {}"_format(name)); + } + return *it; + } + + /// return position of the variable in the instance structure + int get_variable_index(const std::string& name) { + const auto& vars = instance->get_codegen_vars(); + auto it = find_variable(vars, name); + if (it == vars.end()) { + throw std::runtime_error("Cannot find codegen variable with name {}"_format(name)); + } + return (it - vars.begin()); + } +}; + + /** * \class CodegenLLVMHelperVisitor * \brief Helper visitor for AST information to help code generation backends @@ -48,16 +100,26 @@ typedef std::vector> CodegenFunctionVector * these will be common across all backends. */ class CodegenLLVMHelperVisitor: public visitor::AstVisitor { + // explicit vectorisation width + int vector_width; + /// newly generated code generation specific functions CodegenFunctionVector codegen_functions; /// ast information for code generation codegen::CodegenInfo info; + /// mechanism data helper + InstanceVarHelper instance_var_helper; + /// default integer and float node type const ast::AstNodeType INTEGER_TYPE = ast::AstNodeType::INTEGER; const ast::AstNodeType FLOAT_TYPE = ast::AstNodeType::DOUBLE; + /// name of the mechanism instance parameter + const std::string MECH_INSTANCE_VAR = "mech"; + const std::string MECH_NODECOUNT_VAR = "node_count"; + /// create new function for FUNCTION or PROCEDURE block void create_function_for_node(ast::Block& node); @@ -65,7 +127,12 @@ class CodegenLLVMHelperVisitor: public visitor::AstVisitor { std::shared_ptr create_instance_struct(); public: - CodegenLLVMHelperVisitor() = default; + CodegenLLVMHelperVisitor(int vector_width) + : vector_width(vector_width){}; + + const InstanceVarHelper& get_instance_var_helper() { + return instance_var_helper; + } + /// run visitor and return code generation functions CodegenFunctionVector get_codegen_functions(const ast::Program& node);
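As a minimal usage sketch (assuming the helper visitor has already run and the instance struct contains a member named minf), a code generation backend can query a member before emitting IR:

    // look up the member, its position in the struct, and whether it is a pointer
    const auto& var = instance_var_helper.get_variable("minf");
    int position = instance_var_helper.get_variable_index("minf");
    auto is_pointer = var->get_is_pointer();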
diff --git a/src/codegen/llvm/codegen_llvm_visitor.cpp index 3bb3b38dfc..80bdfd20e3 100644 --- a/src/codegen/llvm/codegen_llvm_visitor.cpp +++ b/src/codegen/llvm/codegen_llvm_visitor.cpp @@ -6,7 +6,6 @@ *************************************************************************/ #include "codegen/llvm/codegen_llvm_visitor.hpp" -#include "codegen/llvm/codegen_llvm_helper_visitor.hpp" #include "ast/all.hpp" #include "visitors/rename_visitor.hpp" @@ -79,6 +78,8 @@ llvm::Type* CodegenLLVMVisitor::get_codegen_var_type(const ast::CodegenVarType& return llvm::Type::getInt32Ty(*context); case ast::AstNodeType::VOID: return llvm::Type::getVoidTy(*context); + // TODO :: George/Ioannis : Here we have to also return INSTANCE_STRUCT type + // as it is used as an argument to nrn_state function default: throw std::runtime_error("Error: expecting a type in CodegenVarType node\n"); } } @@ -556,8 +557,13 @@ void CodegenLLVMVisitor::visit_program(const ast::Program& node) { // - convert function and procedure blocks into CodegenFunctions // - gather information about AST. For now, information about functions // and procedures is used only. - CodegenLLVMHelperVisitor v; + CodegenLLVMHelperVisitor v{vector_width}; const auto& functions = v.get_codegen_functions(node); + instance_var_helper = v.get_instance_var_helper(); + + // TODO :: George / Ioannis :: before emitting procedures, we have + // to emit INSTANCE_STRUCT type as it's used as an argument. + // Currently it's done in node.visit_children which is late. // For every function, generate its declaration. Thus, we can look up // `llvm::Function` in the symbol table in the module. @@ -603,6 +609,16 @@ void CodegenLLVMVisitor::visit_var_name(const ast::VarName& node) { if (!identifier->is_name() && !identifier->is_indexed_name()) throw std::runtime_error("Error: Unsupported variable type"); + // TODO :: George :: here instance_var_helper can be used to query + // variable type and its index into the structure + auto name = node.get_node_name(); + + auto codegen_var_with_type = instance_var_helper.get_variable(name); + auto codegen_var_index = instance_var_helper.get_variable_index(name); + // this will be INTEGER or DOUBLE + auto var_type = codegen_var_with_type->get_type()->get_type(); + auto is_pointer = codegen_var_with_type->get_is_pointer(); + llvm::Value* ptr; if (identifier->is_name()) ptr = lookup(node.get_node_name()); @@ -620,7 +636,24 @@ void CodegenLLVMVisitor::visit_instance_struct(const ast::InstanceStruct& node) { std::vector members; for (const auto& variable: node.get_codegen_vars()) { - members.push_back(get_default_fp_ptr_type()); + // TODO :: Ioannis / George :: we now have double*, int*, double and int + // variables in the instance structure. Each variable is of type + // ast::CodegenVarWithType. So we can query the variable type and whether + // it's a pointer. + auto is_pointer = variable->get_is_pointer(); + auto type = variable->get_type()->get_type(); + + // todo : clean up ? + if (type == ast::AstNodeType::DOUBLE) { + auto llvm_type = is_pointer ? get_default_fp_ptr_type() : get_default_fp_type(); + members.push_back(llvm_type); + } else { + if (is_pointer) { + members.push_back(llvm::Type::getInt32PtrTy(*context)); + } else { + members.push_back(llvm::Type::getInt32Ty(*context)); + } + } } llvm_struct = llvm::StructType::create(*context, mod_filename + "_Instance");
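For a mod file named hh this produces an IR struct type along these lines (illustrative only; assuming two double members m and h, one ion variable with its i32* index, followed by the voltage, node index and scalar members added by the helper):

    %hh_Instance = type { double*, double*, double*, i32*, double*, i32*, double, double, double, i32, i32 }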
diff --git a/src/codegen/llvm/codegen_llvm_visitor.hpp index 9bdbdef7e9..b20a19bac7 100644 --- a/src/codegen/llvm/codegen_llvm_visitor.hpp +++ b/src/codegen/llvm/codegen_llvm_visitor.hpp @@ -79,6 +82,9 @@ class CodegenLLVMVisitor: public visitor::ConstAstVisitor { // Use 32-bit floating-point type if true. Otherwise, use default 64-bit. bool use_single_precision; + // explicit vectorisation width + int vector_width; + // LLVM mechanism struct llvm::StructType* llvm_struct; @@ -100,11 +106,13 @@ class CodegenLLVMVisitor: public visitor::ConstAstVisitor { CodegenLLVMVisitor(const std::string& mod_filename, const std::string& output_dir, bool opt_passes, + int vector_width = 1, bool use_single_precision = false) : mod_filename(mod_filename) , output_dir(output_dir) , opt_passes(opt_passes) , use_single_precision(use_single_precision) + , vector_width(vector_width) , builder(*context) , fpm(module.get()) {} diff --git a/src/language/code_generator.cmake index 4c7f27046a..17123fc833 100644 --- a/src/language/code_generator.cmake +++ b/src/language/code_generator.cmake @@ -65,15 +65,16 @@ set(AST_GENERATED_SOURCES ${PROJECT_BINARY_DIR}/src/ast/block_comment.hpp ${PROJECT_BINARY_DIR}/src/ast/boolean.hpp ${PROJECT_BINARY_DIR}/src/ast/breakpoint_block.hpp - ${PROJECT_BINARY_DIR}/src/ast/codegen_argument.hpp ${PROJECT_BINARY_DIR}/src/ast/codegen_atomic_statement.hpp ${PROJECT_BINARY_DIR}/src/ast/codegen_for_statement.hpp ${PROJECT_BINARY_DIR}/src/ast/codegen_function.hpp + ${PROJECT_BINARY_DIR}/src/ast/codegen_instance_var.hpp ${PROJECT_BINARY_DIR}/src/ast/codegen_return_statement.hpp ${PROJECT_BINARY_DIR}/src/ast/codegen_struct.hpp ${PROJECT_BINARY_DIR}/src/ast/codegen_var.hpp ${PROJECT_BINARY_DIR}/src/ast/codegen_var_list_statement.hpp ${PROJECT_BINARY_DIR}/src/ast/codegen_var_type.hpp + ${PROJECT_BINARY_DIR}/src/ast/codegen_var_with_type.hpp ${PROJECT_BINARY_DIR}/src/ast/compartment.hpp ${PROJECT_BINARY_DIR}/src/ast/conductance_hint.hpp ${PROJECT_BINARY_DIR}/src/ast/conserve.hpp diff --git a/src/language/codegen.yaml index 67ef48c371..30bae4c5c5 100644 --- a/src/language/codegen.yaml +++ b/src/language/codegen.yaml @@ -49,17 +49,30 @@ brief: "Name of the variable" type: Identifier node_name: true - - CodegenArgument: - brief: "Represent argument to a function" + - CodegenVarWithType: + brief: "Represent variable used for code generation" members: - type: - brief: "Type of the argument" + brief: "Type of the variable" type: CodegenVarType suffix: {value: " "} + - is_pointer: + brief: "If variable is pointer type" + type: int - name: - brief: "Name of the argument" + brief: "Name of the variable" type: Identifier node_name: true + - CodegenInstanceVar: + brief: "Represent instance variable" + members: + - instance_var: + brief: "Instance variable" + type: Name + suffix: {value: "->"} + - member_var: + brief: "Member variable within instance" + type: Identifier - Block: children: - NrnStateBlock: @@ -134,7 +147,7 @@ node_name: true - arguments: brief: "Vector of the parameters to the function" - type: CodegenArgument + type: CodegenVarWithType vector: true prefix: {value: "(", force: true} suffix: {value: ")", force: true} @@ -148,7 +161,7 @@ members: - codegen_vars: brief: "Vector of CodegenVars" - type: CodegenVar + type: CodegenVarWithType vector: true add: true separator: "\\n " diff --git a/src/language/node_info.py index 8b4e5fe0a2..57833af229 100644 --- a/src/language/node_info.py +++ b/src/language/node_info.py @@ -170,6 +170,7 @@ UNIT_BLOCK = "UnitBlock" AST_NODETYPE_NODE= "AstNodeType" CODEGEN_VAR_TYPE_NODE = "CodegenVarType" +CODEGEN_VAR_WITH_TYPE_NODE = "CodegenVarWithType" # name of variable in prime node which represent order of derivative ORDER_VAR_NAME = "order" diff --git 
a/src/language/nodes.py index 4b520cb51b..fbb7f07c65 100644 --- a/src/language/nodes.py +++ b/src/language/nodes.py @@ -144,6 +144,10 @@ def is_ast_nodetype_node(self): def is_codegen_var_type_node(self): return self.class_name == node_info.CODEGEN_VAR_TYPE_NODE + @property + def is_codegen_var_with_type_node(self): + return self.class_name == node_info.CODEGEN_VAR_WITH_TYPE_NODE + @property def is_enum_node(self): data_type = node_info.DATA_TYPES[self.class_name] diff --git a/src/language/templates/visitors/nmodl_visitor.cpp index f7bb8279ca..01b470e70d 100644 --- a/src/language/templates/visitors/nmodl_visitor.cpp +++ b/src/language/templates/visitors/nmodl_visitor.cpp @@ -115,7 +115,12 @@ void NmodlPrintVisitor::visit_{{ node.class_name|snake_case}}(const {{ node.clas {% endif %} {% for child in node.children %} {% call guard(child.force_prefix, child.force_suffix) -%} - {% if child.is_base_type_node %} + + {% if node.is_codegen_var_with_type_node and child.varname == "is_pointer" %} + if(node.get_{{ child.varname }}()) { + printer->add_element("*"); + } + {% elif child.is_base_type_node %} {% if child.is_ast_nodetype_node %} printer->add_element(ast::to_string(node.get_{{child.varname}}())); {% endif %} diff --git a/src/main.cpp index 5b97b9a7f8..248cb2b5ad 100644 --- a/src/main.cpp +++ b/src/main.cpp @@ -177,6 +177,9 @@ int main(int argc, const char* argv[]) { /// run llvm optimisation passes bool llvm_opt_passes(false); + + /// llvm vector width + int llvm_vec_width = 1; #endif app.get_formatter()->column_width(40); @@ -296,6 +299,9 @@ int main(int argc, const char* argv[]) { llvm_opt->add_flag("--single-precision", llvm_float_type, "Use single precision floating-point types ({})"_format(llvm_float_type))->ignore_case(); + llvm_opt->add_option("--vector-width", + llvm_vec_width, + "LLVM explicit vectorisation width ({})"_format(llvm_vec_width))->ignore_case(); #endif // clang-format on @@ -325,15 +331,24 @@ int main(int argc, const char* argv[]) { } }; + /// write ast to json + const auto ast_to_json = [json_ast](ast::Program& ast, const std::string& filepath) { + if (json_ast) { + JSONVisitor(filepath).write(ast); + logger->info("AST to JSON transformation written to {}", filepath); + } + }; + for (const auto& file: mod_files) { logger->info("Processing {}", file); const auto modfile = utils::remove_extension(utils::base_name(file)); /// create file path for nmodl file - auto filepath = [scratch_dir, modfile](const std::string& suffix) { + auto filepath = [scratch_dir, modfile](const std::string& suffix, const std::string& ext) { static int count = 0; - return "{}/{}.{}.{}.mod"_format(scratch_dir, modfile, std::to_string(count++), suffix); + return "{}/{}.{}.{}.{}"_format( + scratch_dir, modfile, std::to_string(count++), suffix, ext); };
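For example, for hh.mod the intermediate files dumped by the passes below are named along these lines (the counter value depends on how many passes have already run; <scratch_dir> stands for the configured scratch directory):

    <scratch_dir>/hh.0.after_cvode_to_cnexp.mod
    <scratch_dir>/hh.7.ast.mod
    <scratch_dir>/hh.8.ast.json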
filepath("ispc_double_rename")); + ast_to_nmodl(*ast, filepath("ispc_double_rename", "mod")); } /// GLOBAL to RANGE rename visitor @@ -388,7 +403,7 @@ int main(int argc, const char* argv[]) { logger->info("Running GlobalToRange visitor"); GlobalToRangeVisitor(ast).visit_program(*ast); SymtabVisitor(update_symtab).visit_program(*ast); - ast_to_nmodl(*ast, filepath("global_to_range")); + ast_to_nmodl(*ast, filepath("global_to_range", "mod")); } /// LOCAL to ASSIGNED visitor @@ -397,7 +412,7 @@ int main(int argc, const char* argv[]) { PerfVisitor().visit_program(*ast); LocalToAssignedVisitor().visit_program(*ast); SymtabVisitor(update_symtab).visit_program(*ast); - ast_to_nmodl(*ast, filepath("local_to_assigned")); + ast_to_nmodl(*ast, filepath("local_to_assigned", "mod")); } { @@ -423,31 +438,26 @@ int main(int argc, const char* argv[]) { symtab->print(std::cout); } - ast_to_nmodl(*ast, filepath("ast")); - - if (json_ast) { - auto file = scratch_dir + "/" + modfile + ".ast.json"; - logger->info("Writing AST into {}", file); - JSONVisitor(file).write(*ast); - } + ast_to_nmodl(*ast, filepath("ast", "mod")); + ast_to_json(*ast, filepath("ast", "json")); if (verbatim_rename) { logger->info("Running verbatim rename visitor"); VerbatimVarRenameVisitor().visit_program(*ast); - ast_to_nmodl(*ast, filepath("verbatim_rename")); + ast_to_nmodl(*ast, filepath("verbatim_rename", "mod")); } if (nmodl_const_folding) { logger->info("Running nmodl constant folding visitor"); ConstantFolderVisitor().visit_program(*ast); - ast_to_nmodl(*ast, filepath("constfold")); + ast_to_nmodl(*ast, filepath("constfold", "mod")); } if (nmodl_unroll) { logger->info("Running nmodl loop unroll visitor"); LoopUnrollVisitor().visit_program(*ast); ConstantFolderVisitor().visit_program(*ast); - ast_to_nmodl(*ast, filepath("unroll")); + ast_to_nmodl(*ast, filepath("unroll", "mod")); SymtabVisitor(update_symtab).visit_program(*ast); } @@ -459,7 +469,7 @@ int main(int argc, const char* argv[]) { auto kineticBlockVisitor = KineticBlockVisitor(); kineticBlockVisitor.visit_program(*ast); SymtabVisitor(update_symtab).visit_program(*ast); - const auto filename = filepath("kinetic"); + const auto filename = filepath("kinetic", "mod"); ast_to_nmodl(*ast, filename); if (nmodl_ast && kineticBlockVisitor.get_conserve_statement_count()) { logger->warn( @@ -472,7 +482,7 @@ int main(int argc, const char* argv[]) { logger->info("Running STEADYSTATE visitor"); SteadystateVisitor().visit_program(*ast); SymtabVisitor(update_symtab).visit_program(*ast); - ast_to_nmodl(*ast, filepath("steadystate")); + ast_to_nmodl(*ast, filepath("steadystate", "mod")); } /// Parsing units fron "nrnunits.lib" and mod files @@ -489,14 +499,14 @@ int main(int argc, const char* argv[]) { if (nmodl_inline) { logger->info("Running nmodl inline visitor"); InlineVisitor().visit_program(*ast); - ast_to_nmodl(*ast, filepath("inline")); + ast_to_nmodl(*ast, filepath("inline", "mod")); } if (local_rename) { logger->info("Running local variable rename visitor"); LocalVarRenameVisitor().visit_program(*ast); SymtabVisitor(update_symtab).visit_program(*ast); - ast_to_nmodl(*ast, filepath("local_rename")); + ast_to_nmodl(*ast, filepath("local_rename", "mod")); } if (nmodl_localize) { @@ -505,14 +515,14 @@ int main(int argc, const char* argv[]) { LocalizeVisitor(localize_verbatim).visit_program(*ast); LocalVarRenameVisitor().visit_program(*ast); SymtabVisitor(update_symtab).visit_program(*ast); - ast_to_nmodl(*ast, filepath("localize")); + ast_to_nmodl(*ast, filepath("localize", 
"mod")); } if (sympy_conductance) { logger->info("Running sympy conductance visitor"); SympyConductanceVisitor().visit_program(*ast); SymtabVisitor(update_symtab).visit_program(*ast); - ast_to_nmodl(*ast, filepath("sympy_conductance")); + ast_to_nmodl(*ast, filepath("sympy_conductance", "mod")); } if (sympy_analytic || sparse_solver_exists(*ast)) { @@ -523,19 +533,19 @@ int main(int argc, const char* argv[]) { logger->info("Running sympy solve visitor"); SympySolverVisitor(sympy_pade, sympy_cse).visit_program(*ast); SymtabVisitor(update_symtab).visit_program(*ast); - ast_to_nmodl(*ast, filepath("sympy_solve")); + ast_to_nmodl(*ast, filepath("sympy_solve", "mod")); } { logger->info("Running cnexp visitor"); NeuronSolveVisitor().visit_program(*ast); - ast_to_nmodl(*ast, filepath("cnexp")); + ast_to_nmodl(*ast, filepath("cnexp", "mod")); } { SolveBlockVisitor().visit_program(*ast); SymtabVisitor(update_symtab).visit_program(*ast); - ast_to_nmodl(*ast, filepath("solveblock")); + ast_to_nmodl(*ast, filepath("solveblock", "mod")); } if (json_perfstat) { @@ -599,9 +609,11 @@ int main(int argc, const char* argv[]) { #ifdef NMODL_LLVM_BACKEND if (llvm_ir) { logger->info("Running LLVM backend code generator"); - CodegenLLVMVisitor visitor(modfile, output_dir, llvm_opt_passes, llvm_float_type); + CodegenLLVMVisitor visitor( + modfile, output_dir, llvm_opt_passes, llvm_vec_width, llvm_float_type); visitor.visit_program(*ast); - ast_to_nmodl(*ast, filepath("llvm")); + ast_to_nmodl(*ast, filepath("llvm", "mod")); + ast_to_json(*ast, filepath("llvm", "json")); } #endif } From 06e274418d97befa476f1476da1053ee60d68a6a Mon Sep 17 00:00:00 2001 From: George Mitenkov Date: Sun, 7 Mar 2021 01:10:30 +0300 Subject: [PATCH 135/331] Addressing TODOs for Instance struct (#533) Part II - remove undefined visit_codegen_instance_var - Improved member creation for instance struct - Instance struct type generation for kernel arguments - Proper integration of instance struct - Added scalar code generation for the kernel - Removed instance test since it is not created explicitly anymore - Fixed ordering for precision and width in LLVM Visitor - Added vector induction variable - Vectorised code for compute with direct loads fully functional - Instance naming fixed - (LLVM IR) Fixed compute vector code generation types - refactoring : improve coversion of double to int for the loop expressions --- .../llvm/codegen_llvm_helper_visitor.cpp | 83 ++-- .../llvm/codegen_llvm_helper_visitor.hpp | 7 + src/codegen/llvm/codegen_llvm_visitor.cpp | 387 ++++++++++++++---- src/codegen/llvm/codegen_llvm_visitor.hpp | 48 ++- src/main.cpp | 2 +- test/unit/codegen/codegen_llvm_ir.cpp | 36 -- 6 files changed, 419 insertions(+), 144 deletions(-) diff --git a/src/codegen/llvm/codegen_llvm_helper_visitor.cpp b/src/codegen/llvm/codegen_llvm_helper_visitor.cpp index b3f75b9372..c34ae2c873 100644 --- a/src/codegen/llvm/codegen_llvm_helper_visitor.cpp +++ b/src/codegen/llvm/codegen_llvm_helper_visitor.cpp @@ -146,7 +146,7 @@ void CodegenLLVMHelperVisitor::create_function_for_node(ast::Block& node) { /// create new type and name for creating new ast node auto type = new ast::CodegenVarType(FLOAT_TYPE); auto var = param->get_name()->clone(); - arguments.emplace_back(new ast::CodegenVarWithType(type, 0, var)); + arguments.emplace_back(new ast::CodegenVarWithType(type, /*is_pointer=*/0, var)); } /// return type of the function is same as return variable type @@ -170,31 +170,31 @@ std::shared_ptr CodegenLLVMHelperVisitor::create_instance_s }; /// float 
variables are standard pointers to float vectors - for (auto& float_var: info.codegen_float_variables) { - add_var_with_type(float_var->get_name(), FLOAT_TYPE, 1); + for (const auto& float_var: info.codegen_float_variables) { + add_var_with_type(float_var->get_name(), FLOAT_TYPE, /*is_pointer=*/1); } /// int variables are pointers to indexes for other vectors - for (auto& int_var: info.codegen_int_variables) { - add_var_with_type(int_var.symbol->get_name(), FLOAT_TYPE, 1); + for (const auto& int_var: info.codegen_int_variables) { + add_var_with_type(int_var.symbol->get_name(), FLOAT_TYPE, /*is_pointer=*/1); } // for integer variables, there should be index - for (auto& int_var: info.codegen_int_variables) { + for (const auto& int_var: info.codegen_int_variables) { std::string var_name = int_var.symbol->get_name() + "_index"; - add_var_with_type(var_name, INTEGER_TYPE, 1); + add_var_with_type(var_name, INTEGER_TYPE, /*is_pointer=*/1); } // add voltage and node index - add_var_with_type("voltage", FLOAT_TYPE, 1); - add_var_with_type("node_index", INTEGER_TYPE, 1); + add_var_with_type("voltage", FLOAT_TYPE, /*is_pointer=*/1); + add_var_with_type("node_index", INTEGER_TYPE, /*is_pointer=*/1); // add dt, t, celsius - add_var_with_type(naming::NTHREAD_T_VARIABLE, FLOAT_TYPE, 0); - add_var_with_type(naming::NTHREAD_DT_VARIABLE, FLOAT_TYPE, 0); - add_var_with_type(naming::CELSIUS_VARIABLE, FLOAT_TYPE, 0); - add_var_with_type(naming::SECOND_ORDER_VARIABLE, INTEGER_TYPE, 0); - add_var_with_type(MECH_NODECOUNT_VAR, INTEGER_TYPE, 0); + add_var_with_type(naming::NTHREAD_T_VARIABLE, FLOAT_TYPE, /*is_pointer=*/0); + add_var_with_type(naming::NTHREAD_DT_VARIABLE, FLOAT_TYPE, /*is_pointer=*/0); + add_var_with_type(naming::CELSIUS_VARIABLE, FLOAT_TYPE, /*is_pointer=*/0); + add_var_with_type(naming::SECOND_ORDER_VARIABLE, INTEGER_TYPE, /*is_pointer=*/0); + add_var_with_type(MECH_NODECOUNT_VAR, INTEGER_TYPE, /*is_pointer=*/0); return std::make_shared(codegen_vars); } @@ -384,7 +384,7 @@ void CodegenLLVMHelperVisitor::convert_to_instance_variable(ast::Node& node, std::string& index_var) { /// collect all variables in the node of type ast::VarName auto variables = collect_nodes(node, {ast::AstNodeType::VAR_NAME}); - for (auto& v: variables) { + for (const auto& v: variables) { auto variable = std::dynamic_pointer_cast(v); auto variable_name = variable->get_node_name(); @@ -450,6 +450,44 @@ void CodegenLLVMHelperVisitor::visit_function_block(ast::FunctionBlock& node) { create_function_for_node(node); } +/// Create asr::Varname node with given a given variable name +static ast::VarName* create_varname(const std::string& varname) { + return new ast::VarName(new ast::Name(new ast::String(varname)), nullptr, nullptr); +} + +/** + * Create for loop initialization expression + * @param code Usually "id = 0" as a string + * @return Expression representing code + * \todo : we can not use `create_statement_as_expression` function because + * NMODL parser is using `ast::Double` type to represent all variables + * including Integer. See #542. 
+ */ +static std::shared_ptr loop_initialization_expression( + const std::string& induction_var) { + // create id = 0 + const auto& id = create_varname(induction_var); + const auto& zero = new ast::Integer(0, nullptr); + return std::make_shared(id, ast::BinaryOperator(ast::BOP_ASSIGN), zero); +} + +/** + * Create loop increment expression `id = id + width` + * \todo : same as loop_initialization_expression() + */ +static std::shared_ptr loop_increment_expression(const std::string& induction_var, + int vector_width) { + // first create id + x + const auto& id = create_varname(induction_var); + const auto& inc = new ast::Integer(vector_width, nullptr); + const auto& inc_expr = + new ast::BinaryExpression(id, ast::BinaryOperator(ast::BOP_ADDITION), inc); + // now create id = id + x + return std::make_shared(id->clone(), + ast::BinaryOperator(ast::BOP_ASSIGN), + inc_expr); +} + /** * \brief Convert ast::NrnStateBlock to corresponding code generation function nrn_state * @param node AST node representing ast::NrnStateBlock @@ -471,9 +509,9 @@ void CodegenLLVMHelperVisitor::visit_nrn_state_block(ast::NrnStateBlock& node) { /// create now main compute part : for loop over channel instances /// loop constructs : initialization, condition and increment - const auto& initialization = create_statement_as_expression("id = 0"); - const auto& condition = create_expression("id < node_count"); - const auto& increment = create_statement_as_expression("id = id + {}"_format(vector_width)); + const auto& initialization = loop_initialization_expression(INDUCTION_VAR); + const auto& condition = create_expression("{} < {}"_format(INDUCTION_VAR, MECH_NODECOUNT_VAR)); + const auto& increment = loop_increment_expression(INDUCTION_VAR, vector_width); /// loop body : initialization + solve blocks ast::StatementVector loop_def_statements; @@ -484,7 +522,8 @@ void CodegenLLVMHelperVisitor::visit_nrn_state_block(ast::NrnStateBlock& node) { std::vector double_variables{"v"}; /// access node index and corresponding voltage - loop_index_statements.push_back(visitor::create_statement("node_id = node_index[id]")); + loop_index_statements.push_back( + visitor::create_statement("node_id = node_index[{}]"_format(INDUCTION_VAR))); loop_body_statements.push_back(visitor::create_statement("v = voltage[node_id]")); /// read ion variables @@ -558,7 +597,7 @@ void CodegenLLVMHelperVisitor::visit_nrn_state_block(ast::NrnStateBlock& node) { ast::CodegenVarWithTypeVector code_arguments; auto instance_var_type = new ast::CodegenVarType(ast::AstNodeType::INSTANCE_STRUCT); - auto instance_var_name = new ast::Name(new ast::String("mech")); + auto instance_var_name = new ast::Name(new ast::String(MECH_INSTANCE_VAR)); auto instance_var = new ast::CodegenVarWithType(instance_var_type, 1, instance_var_name); code_arguments.emplace_back(instance_var); @@ -567,7 +606,7 @@ void CodegenLLVMHelperVisitor::visit_nrn_state_block(ast::NrnStateBlock& node) { std::make_shared(return_type, name, code_arguments, function_block); codegen_functions.push_back(function); - std::cout << nmodl::to_nmodl(function); + std::cout << nmodl::to_nmodl(function) << std::endl; } void CodegenLLVMHelperVisitor::visit_program(ast::Program& node) { @@ -583,8 +622,6 @@ void CodegenLLVMHelperVisitor::visit_program(ast::Program& node) { for (auto& fun: codegen_functions) { node.emplace_back_node(fun); } - - std::cout << nmodl::to_nmodl(node); } diff --git a/src/codegen/llvm/codegen_llvm_helper_visitor.hpp b/src/codegen/llvm/codegen_llvm_helper_visitor.hpp index 
981372b4d5..b67aa7ee09 100644 --- a/src/codegen/llvm/codegen_llvm_helper_visitor.hpp +++ b/src/codegen/llvm/codegen_llvm_helper_visitor.hpp @@ -120,6 +120,9 @@ class CodegenLLVMHelperVisitor: public visitor::AstVisitor { const std::string MECH_INSTANCE_VAR = "mech"; const std::string MECH_NODECOUNT_VAR = "node_count"; + /// name of induction variable used in the kernel. + const std::string INDUCTION_VAR = "id"; + /// create new function for FUNCTION or PROCEDURE block void create_function_for_node(ast::Block& node); @@ -134,6 +137,10 @@ class CodegenLLVMHelperVisitor: public visitor::AstVisitor { return instance_var_helper; } + std::string get_kernel_id() { + return INDUCTION_VAR; + } + /// run visitor and return code generation functions CodegenFunctionVector get_codegen_functions(const ast::Program& node); diff --git a/src/codegen/llvm/codegen_llvm_visitor.cpp b/src/codegen/llvm/codegen_llvm_visitor.cpp index 80bdfd20e3..62e69449b7 100644 --- a/src/codegen/llvm/codegen_llvm_visitor.cpp +++ b/src/codegen/llvm/codegen_llvm_visitor.cpp @@ -21,14 +21,22 @@ namespace nmodl { namespace codegen { +static constexpr const char instance_struct_type_name[] = "__instance_var__type"; + +// The prefix is used to create a vectorised id that can be used as index to GEPs. However, for +// simple aligned vector loads and stores vector id is not needed. This is because we can bitcast +// the pointer to the vector pointer! \todo: Consider removing this. +static constexpr const char kernel_id_prefix[] = "__vec_"; + + /****************************************************************************************/ /* Helper routines */ /****************************************************************************************/ static bool is_supported_statement(const ast::Statement& statement) { return statement.is_codegen_var_list_statement() || statement.is_expression_statement() || - statement.is_codegen_return_statement() || statement.is_if_statement() || - statement.is_while_statement(); + statement.is_codegen_for_statement() || statement.is_codegen_return_statement() || + statement.is_if_statement() || statement.is_while_statement(); } bool CodegenLLVMVisitor::check_array_bounds(const ast::IndexedName& node, unsigned index) { @@ -56,10 +64,82 @@ llvm::Value* CodegenLLVMVisitor::codegen_indexed_name(const ast::IndexedName& no return create_gep(node.get_node_name(), index); } +llvm::Value* CodegenLLVMVisitor::codegen_instance_var(const ast::CodegenInstanceVar& node) { + const auto& member_node = node.get_member_var(); + const auto& instance_name = node.get_instance_var()->get_node_name(); + const auto& member_name = member_node->get_node_name(); + + if (!instance_var_helper.is_an_instance_variable(member_name)) + throw std::runtime_error("Error: " + member_name + " is not a member of the instance!"); + + // Load the instance struct given its name from the ValueSymbolTable. + llvm::Value* instance_ptr = builder.CreateLoad(lookup(instance_name)); + + // Create a GEP instruction to get a pointer to the member. + int member_index = instance_var_helper.get_variable_index(member_name); + llvm::Type* index_type = llvm::Type::getInt32Ty(*context); + + std::vector indices; + indices.push_back(llvm::ConstantInt::get(index_type, 0)); + indices.push_back(llvm::ConstantInt::get(index_type, member_index)); + llvm::Value* member_ptr = builder.CreateInBoundsGEP(instance_ptr, indices); + + // Get the member AST node from the instance AST node, for which we proceed with the code + // generation. 
If the member is scalar, return the pointer to it straight away. + auto codegen_var_with_type = instance_var_helper.get_variable(member_name); + if (!codegen_var_with_type->get_is_pointer()) { + return member_ptr; + } + + // Otherwise, the codegen variable is a pointer, and the member AST node must be an IndexedName. + auto member_var_name = std::dynamic_pointer_cast(member_node); + if (!member_var_name->get_name()->is_indexed_name()) + throw std::runtime_error("Error: " + member_name + " is not an IndexedName!"); + + // Proceed to creating a GEP instruction to get the pointer to the member's element. While LLVM + // Helper set the indices to be Name nodes, a sanity check is added here. Note that this step + // can be avoided if using `get_array_index_or_length()`. However, it does not support indexing + // with Name/Expression at the moment. \todo: Reuse `get_array_index_or_length()` here. + auto member_indexed_name = std::dynamic_pointer_cast( + member_var_name->get_name()); + if (!member_indexed_name->get_length()->is_name()) + throw std::runtime_error("Error: " + member_name + " has a non-Name index!"); + + // Load the index variable that will be used to access the member's element. Since we index a + // pointer variable, we need to extend the 32-bit integer index variable to 64-bit. + llvm::Value* i32_index = builder.CreateLoad( + lookup(member_indexed_name->get_length()->get_node_name())); + llvm::Value* i64_index = builder.CreateSExt(i32_index, llvm::Type::getInt64Ty(*context)); + + // Create a indices vector for GEP to return the pointer to the element at the specified index. + std::vector member_indices; + member_indices.push_back(i64_index); + + // The codegen variable type is always a scalar, so we need to transform it to a pointer. Then + // load the member which would be indexed later. + llvm::Type* type = get_codegen_var_type(*codegen_var_with_type->get_type()); + llvm::Value* instance_member = + builder.CreateLoad(llvm::PointerType::get(type, /*AddressSpace=*/0), member_ptr); + + + // If the code is vectorised, then bitcast to a vector pointer. + if (is_kernel_code && vector_width > 1) { + llvm::Type* vector_type = + llvm::PointerType::get(llvm::FixedVectorType::get(type, vector_width), + /*AddressSpace=*/0); + llvm::Value* instance_member_bitcasted = builder.CreateBitCast(instance_member, + vector_type); + return builder.CreateInBoundsGEP(instance_member_bitcasted, member_indices); + } + + return builder.CreateInBoundsGEP(instance_member, member_indices); +} + unsigned CodegenLLVMVisitor::get_array_index_or_length(const ast::IndexedName& indexed_name) { + // \todo: Support indices with expressions and names: k[i + j] = ... auto integer = std::dynamic_pointer_cast(indexed_name.get_length()); if (!integer) - throw std::runtime_error("Error: expecting integer index or length"); + throw std::runtime_error("Error: only integer indices/length are supported!"); // Check if integer value is taken from a macro. 
if (!integer->get_macro()) @@ -74,6 +154,8 @@ llvm::Type* CodegenLLVMVisitor::get_codegen_var_type(const ast::CodegenVarType& return llvm::Type::getInt1Ty(*context); case ast::AstNodeType::DOUBLE: return get_default_fp_type(); + case ast::AstNodeType::INSTANCE_STRUCT: + return get_instance_struct_type(); case ast::AstNodeType::INTEGER: return llvm::Type::getInt32Ty(*context); case ast::AstNodeType::VOID: @@ -85,6 +167,26 @@ llvm::Type* CodegenLLVMVisitor::get_codegen_var_type(const ast::CodegenVarType& } } +llvm::Value* CodegenLLVMVisitor::get_constant_int_vector(int value) { + llvm::Type* i32_type = llvm::Type::getInt32Ty(*context); + std::vector constants; + for (unsigned i = 0; i < vector_width; ++i) { + const auto& element = llvm::ConstantInt::get(i32_type, value); + constants.push_back(element); + } + return llvm::ConstantVector::get(constants); +} + +llvm::Value* CodegenLLVMVisitor::get_constant_fp_vector(const std::string& value) { + llvm::Type* fp_type = get_default_fp_type(); + std::vector constants; + for (unsigned i = 0; i < vector_width; ++i) { + const auto& element = llvm::ConstantFP::get(fp_type, value); + constants.push_back(element); + } + return llvm::ConstantVector::get(constants); +} + llvm::Type* CodegenLLVMVisitor::get_default_fp_type() { if (use_single_precision) return llvm::Type::getFloatTy(*context); @@ -97,6 +199,59 @@ llvm::Type* CodegenLLVMVisitor::get_default_fp_ptr_type() { return llvm::Type::getDoublePtrTy(*context); } +llvm::Type* CodegenLLVMVisitor::get_instance_struct_type() { + std::vector members; + for (const auto& variable: instance_var_helper.instance->get_codegen_vars()) { + auto is_pointer = variable->get_is_pointer(); + auto nmodl_type = variable->get_type()->get_type(); + + llvm::Type* i32_type = llvm::Type::getInt32Ty(*context); + llvm::Type* i32ptr_type = llvm::Type::getInt32PtrTy(*context); + + switch (nmodl_type) { +#define DISPATCH(type, llvm_ptr_type, llvm_type) \ + case type: \ + members.push_back(is_pointer ? 
(llvm_ptr_type) : (llvm_type)); \ + break; + + DISPATCH(ast::AstNodeType::DOUBLE, get_default_fp_ptr_type(), get_default_fp_type()); + DISPATCH(ast::AstNodeType::INTEGER, i32ptr_type, i32_type); + +#undef DISPATCH + default: + throw std::runtime_error("Error: unsupported type found in instance struct"); + } + } + + llvm::StructType* llvm_struct_type = + llvm::StructType::create(*context, mod_filename + instance_struct_type_name); + llvm_struct_type->setBody(members); + return llvm::PointerType::get(llvm_struct_type, /*AddressSpace=*/0); +} + +llvm::Value* CodegenLLVMVisitor::get_variable_ptr(const ast::VarName& node) { + const auto& identifier = node.get_name(); + if (!identifier->is_name() && !identifier->is_indexed_name() && + !identifier->is_codegen_instance_var()) { + throw std::runtime_error("Error: Unsupported variable type - " + node.get_node_name()); + } + + llvm::Value* ptr; + if (identifier->is_name()) + ptr = lookup(node.get_node_name()); + + if (identifier->is_indexed_name()) { + auto indexed_name = std::dynamic_pointer_cast(identifier); + ptr = codegen_indexed_name(*indexed_name); + } + + if (identifier->is_codegen_instance_var()) { + auto instance_var = std::dynamic_pointer_cast(identifier); + ptr = codegen_instance_var(*instance_var); + } + return ptr; +} + void CodegenLLVMVisitor::run_llvm_opt_passes() { /// run some common optimisation passes that are commonly suggested fpm.add(llvm::createInstructionCombiningPass()); @@ -134,7 +289,7 @@ void CodegenLLVMVisitor::create_external_method_call(const std::string& name, } #define DISPATCH(method_name, intrinsic) \ - if (name == method_name) { \ + if (name == (method_name)) { \ llvm::Value* result = builder.CreateIntrinsic(intrinsic, argument_types, argument_values); \ values.push_back(result); \ return; \ @@ -234,12 +389,12 @@ llvm::Value* CodegenLLVMVisitor::visit_arithmetic_bin_op(llvm::Value* lhs, llvm::Value* result; switch (bin_op) { -#define DISPATCH(binary_op, llvm_fp_op, llvm_int_op) \ - case binary_op: \ - if (lhs_type->isDoubleTy() || lhs_type->isFloatTy()) \ - result = llvm_fp_op(lhs, rhs); \ - else \ - result = llvm_int_op(lhs, rhs); \ +#define DISPATCH(binary_op, llvm_fp_op, llvm_int_op) \ + case binary_op: \ + if (lhs_type->isIntOrIntVectorTy()) \ + result = llvm_int_op(lhs, rhs); \ + else \ + result = llvm_fp_op(lhs, rhs); \ return result; DISPATCH(ast::BinaryOp::BOP_ADDITION, builder.CreateFAdd, builder.CreateAdd); @@ -256,20 +411,11 @@ llvm::Value* CodegenLLVMVisitor::visit_arithmetic_bin_op(llvm::Value* lhs, void CodegenLLVMVisitor::visit_assign_op(const ast::BinaryExpression& node, llvm::Value* rhs) { auto var = dynamic_cast(node.get_lhs().get()); - if (!var) { - throw std::runtime_error("Error: only VarName assignment is currently supported.\n"); - } + if (!var) + throw std::runtime_error("Error: only VarName assignment is supported!"); - const auto& identifier = var->get_name(); - if (identifier->is_name()) { - llvm::Value* alloca = lookup(var->get_node_name()); - builder.CreateStore(rhs, alloca); - } else if (identifier->is_indexed_name()) { - auto indexed_name = std::dynamic_pointer_cast(identifier); - builder.CreateStore(rhs, codegen_indexed_name(*indexed_name)); - } else { - throw std::runtime_error("Error: Unsupported variable type"); - } + llvm::Value* ptr = get_variable_ptr(*var); + builder.CreateStore(rhs, ptr); } llvm::Value* CodegenLLVMVisitor::visit_logical_bin_op(llvm::Value* lhs, @@ -373,6 +519,117 @@ void CodegenLLVMVisitor::visit_boolean(const ast::Boolean& node) { values.push_back(constant); 
 }
 
+// Generating FOR loop in LLVM IR creates the following structure:
+//
+//  +---------------------------+
+//  | <code before the loop>    |
+//  | <loop initialisation>     |
+//  |          br %cond         |
+//  +---------------------------+
+//                |
+//                V
+//  +-----------------------------+
+//  | <condition code>            |
+//  | %cond = ...                 |<------+
+//  | cond_br %cond, %body, %exit |       |
+//  +-----------------------------+       |
+//      |                |               |
+//      |                V               |
+//      |    +------------------------+  |
+//      |    | <loop body code>       |  |
+//      |    | br %inc                |  |
+//      |    +------------------------+  |
+//      |                |               |
+//      |                V               |
+//      |    +------------------------+  |
+//      |    | <increment code>       |  |
+//      |    | br %cond               |  |
+//      |    +------------------------+  |
+//      |                |               |
+//      |                +---------------+
+//      V
+//  +---------------------------+
+//  | <code after the loop>     |
+//  +---------------------------+
+void CodegenLLVMVisitor::visit_codegen_for_statement(const ast::CodegenForStatement& node) {
+    // Get the current and the next blocks within the function.
+    llvm::BasicBlock* curr_block = builder.GetInsertBlock();
+    llvm::BasicBlock* next = curr_block->getNextNode();
+    llvm::Function* func = curr_block->getParent();
+
+    // Create the basic blocks for FOR loop.
+    llvm::BasicBlock* for_cond =
+        llvm::BasicBlock::Create(*context, /*Name=*/"for.cond", func, next);
+    llvm::BasicBlock* for_body =
+        llvm::BasicBlock::Create(*context, /*Name=*/"for.body", func, next);
+    llvm::BasicBlock* for_inc = llvm::BasicBlock::Create(*context, /*Name=*/"for.inc", func, next);
+    llvm::BasicBlock* exit = llvm::BasicBlock::Create(*context, /*Name=*/"for.exit", func, next);
+
+    // First, initialise the loop in the same basic block.
+    node.get_initialization()->accept(*this);
+
+    // If the loop is to be vectorised, create a separate vector induction variable.
+    // \todo: See the comment for `kernel_id_prefix`.
+    if (vector_width > 1) {
+        // First, create a vector type and alloca for it.
+        llvm::Type* i32_type = llvm::Type::getInt32Ty(*context);
+        llvm::Type* vec_type = llvm::FixedVectorType::get(i32_type, vector_width);
+        llvm::Value* vec_alloca = builder.CreateAlloca(vec_type,
+                                                       /*ArraySize=*/nullptr,
+                                                       /*Name=*/kernel_id_prefix + kernel_id);
+
+        // Then, store the initial value of <0, 1, ..., [W-1]> to the alloca pointer, where W
+        // is the vector width.
+        std::vector<llvm::Constant*> constants;
+        for (unsigned i = 0; i < vector_width; ++i) {
+            const auto& element = llvm::ConstantInt::get(i32_type, i);
+            constants.push_back(element);
+        }
+        llvm::Value* vector_id = llvm::ConstantVector::get(constants);
+        builder.CreateStore(vector_id, vec_alloca);
+    }
+    // Branch to condition basic block and insert condition code there.
+    builder.CreateBr(for_cond);
+    builder.SetInsertPoint(for_cond);
+    node.get_condition()->accept(*this);
+
+    // Extract the condition to decide whether to branch to the loop body or loop exit.
+    llvm::Value* cond = values.back();
+    values.pop_back();
+    builder.CreateCondBr(cond, for_body, exit);
+
+    // Generate code for the loop body and create the basic block for the increment.
+    builder.SetInsertPoint(for_body);
+    is_kernel_code = true;
+    const auto& statement_block = node.get_statement_block();
+    statement_block->accept(*this);
+    is_kernel_code = false;
+    builder.CreateBr(for_inc);
+
+    // Process increment.
+    builder.SetInsertPoint(for_inc);
+    node.get_increment()->accept(*this);
+
+    // If the code is vectorised, then increment the vector id by <W, W, ..., W>, where W is the
+    // vector width.
+    // \todo: See the comment for `kernel_id_prefix`.
+    if (vector_width > 1) {
+        // First, create an increment vector.
+        llvm::Value* vector_inc = get_constant_int_vector(vector_width);
+
+        // Increment the kernel id elements by a constant vector width.
+ llvm::Value* vector_id_ptr = lookup(kernel_id_prefix + kernel_id); + llvm::Value* vector_id = builder.CreateLoad(vector_id_ptr); + llvm::Value* incremented = builder.CreateAdd(vector_id, vector_inc); + builder.CreateStore(incremented, vector_id_ptr); + } + + // Create a branch to condition block, then generate exit code out of the loop. + builder.CreateBr(for_cond); + builder.SetInsertPoint(exit); +} + + void CodegenLLVMVisitor::visit_codegen_function(const ast::CodegenFunction& node) { const auto& name = node.get_node_name(); const auto& arguments = node.get_arguments(); @@ -406,7 +663,7 @@ void CodegenLLVMVisitor::visit_codegen_function(const ast::CodegenFunction& node block->accept(*this); // If function has a void return type, add a terminator not handled by CodegenReturnVar. - if (node.is_void()) + if (node.get_return_type()->get_type() == ast::AstNodeType::VOID) builder.CreateRetVoid(); // Clear local values stack and remove the pointer to the local symbol table. @@ -419,7 +676,7 @@ void CodegenLLVMVisitor::visit_codegen_return_statement(const ast::CodegenReturn throw std::runtime_error("Error: CodegenReturnStatement must contain a name node\n"); std::string ret = "ret_" + current_func->getName().str(); - llvm::Value* ret_value = builder.CreateLoad(current_func->getValueSymbolTable()->lookup(ret)); + llvm::Value* ret_value = builder.CreateLoad(lookup(ret)); builder.CreateRet(ret_value); } @@ -456,6 +713,10 @@ void CodegenLLVMVisitor::visit_codegen_var_list_statement( } void CodegenLLVMVisitor::visit_double(const ast::Double& node) { + if (is_kernel_code && vector_width > 1) { + values.push_back(get_constant_fp_vector(node.get_value())); + return; + } const auto& constant = llvm::ConstantFP::get(get_default_fp_type(), node.get_value()); values.push_back(constant); } @@ -547,6 +808,10 @@ void CodegenLLVMVisitor::visit_if_statement(const ast::IfStatement& node) { } void CodegenLLVMVisitor::visit_integer(const ast::Integer& node) { + if (is_kernel_code && vector_width > 1) { + values.push_back(get_constant_int_vector(node.get_value())); + return; + } const auto& constant = llvm::ConstantInt::get(llvm::Type::getInt32Ty(*context), node.get_value()); values.push_back(constant); @@ -561,9 +826,7 @@ void CodegenLLVMVisitor::visit_program(const ast::Program& node) { const auto& functions = v.get_codegen_functions(node); instance_var_helper = v.get_instance_var_helper(); - // TODO :: George / Ioannis :: before emitting procedures, we have - // to emmit INSTANCE_STRUCT type as it's used as an argument. - // Currently it's done in node.visit_children which is late. + kernel_id = v.get_kernel_id(); // For every function, generate its declaration. Thus, we can look up // `llvm::Function` in the symbol table in the module. @@ -574,8 +837,15 @@ void CodegenLLVMVisitor::visit_program(const ast::Program& node) { // Set the AST symbol table. sym_tab = node.get_symbol_table(); - // Proceed with code generation. - node.visit_children(*this); + // Proceed with code generation. Right now, we do not do + // node.visit_children(*this); + // The reason is that the node may contain AST nodes for which the visitor functions have been + // defined. In our implementation we assume that the code generation is happening within the + // function scope. To avoid generating code outside of functions, visit only them for now. + // \todo: Handle what is mentioned here. 
+    for (const auto& func: functions) {
+        visit_codegen_function(*func);
+    }
 
     if (opt_passes) {
         logger->info("Running LLVM optimisation passes");
@@ -605,60 +875,21 @@ void CodegenLLVMVisitor::visit_unary_expression(const ast::UnaryExpression& node
 }
 
 void CodegenLLVMVisitor::visit_var_name(const ast::VarName& node) {
-    const auto& identifier = node.get_name();
-    if (!identifier->is_name() && !identifier->is_indexed_name())
-        throw std::runtime_error("Error: Unsupported variable type");
-
-    // TODO :: George :: here instance_var_helper can be used to query
-    // variable type and it's index into structure
-    auto name = node.get_node_name();
-
-    auto codegen_var_with_type = instance_var_helper.get_variable(name);
-    auto codegen_var_index = instance_var_helper.get_variable_index(name);
-    // this will be INTEGER or DOUBLE
-    auto var_type = codegen_var_with_type->get_type()->get_type();
-    auto is_pointer = codegen_var_with_type->get_is_pointer();
-
-    llvm::Value* ptr;
-    if (identifier->is_name())
-        ptr = lookup(node.get_node_name());
-
-    if (identifier->is_indexed_name()) {
-        auto indexed_name = std::dynamic_pointer_cast<ast::IndexedName>(identifier);
-        ptr = codegen_indexed_name(*indexed_name);
-    }
+    llvm::Value* ptr = get_variable_ptr(node);
 
     // Finally, load the variable from the pointer value.
     llvm::Value* var = builder.CreateLoad(ptr);
-    values.push_back(var);
-}
 
-void CodegenLLVMVisitor::visit_instance_struct(const ast::InstanceStruct& node) {
-    std::vector<llvm::Type*> members;
-    for (const auto& variable: node.get_codegen_vars()) {
-        // TODO :: Ioannis / George :: we have now double*, int*, double and int
-        // variables in the instance structure. Each variable is of type
-        // ast::CodegenVarWithType. So we can query variable type and if
-        // it's pointer.
-        auto is_pointer = variable->get_is_pointer();
-        auto type = variable->get_type()->get_type();
-
-        // todo : clean up ?
-        if (type == ast::AstNodeType::DOUBLE) {
-            auto llvm_type = is_pointer ? get_default_fp_ptr_type() : get_default_fp_type();
-            members.push_back(llvm_type);
-        } else {
-            if (is_pointer) {
-                members.push_back(llvm::Type::getInt32PtrTy(*context));
-            } else {
-                members.push_back(llvm::Type::getInt32Ty(*context));
-            }
-        }
+    // If the value should not be vectorised, or it is already a vector, add it to the stack.
+    if (!is_kernel_code || vector_width <= 1 || var->getType()->isVectorTy()) {
+        values.push_back(var);
+        return;
     }
-    llvm_struct = llvm::StructType::create(*context, mod_filename + "_Instance");
-    llvm_struct->setBody(members);
-    module->getOrInsertGlobal("inst", llvm_struct);
+
+    // Otherwise, if we are generating vectorised code inside the loop, replicate the value to
+    // form a vector of `vector_width`.
+    llvm::Value* vector_var = builder.CreateVectorSplat(vector_width, var);
+    values.push_back(vector_var);
 }
 
 void CodegenLLVMVisitor::visit_while_statement(const ast::WhileStatement& node) {
diff --git a/src/codegen/llvm/codegen_llvm_visitor.hpp b/src/codegen/llvm/codegen_llvm_visitor.hpp
index b20a19bac7..c93b76b1d6 100644
--- a/src/codegen/llvm/codegen_llvm_visitor.hpp
+++ b/src/codegen/llvm/codegen_llvm_visitor.hpp
@@ -82,11 +82,14 @@ class CodegenLLVMVisitor: public visitor::ConstAstVisitor {
     // Use 32-bit floating-point type if true. Otherwise, use default 64-bit.
     bool use_single_precision;
 
-    // explicit vectorisation width
+    // Explicit vectorisation width.
     int vector_width;
 
-    // LLVM mechanism struct
-    llvm::StructType* llvm_struct;
+    // The name of the induction variable used in the kernel functions.
+ std::string kernel_id; + + // A flag to indicate that the code is generated for the kernel. + bool is_kernel_code = false; /** *\brief Run LLVM optimisation passes on generated IR @@ -106,8 +109,8 @@ class CodegenLLVMVisitor: public visitor::ConstAstVisitor { CodegenLLVMVisitor(const std::string& mod_filename, const std::string& output_dir, bool opt_passes, - int vector_width = 1, - bool use_single_precision = false) + bool use_single_precision = false, + int vector_width = 1) : mod_filename(mod_filename) , output_dir(output_dir) , opt_passes(opt_passes) @@ -130,6 +133,13 @@ class CodegenLLVMVisitor: public visitor::ConstAstVisitor { */ llvm::Value* codegen_indexed_name(const ast::IndexedName& node); + /** + * Generates LLVM code for the given Instance variable + * \param node CodegenInstanceVar NMODL AST node + * \return LLVM code generated for this AST node + */ + llvm::Value* codegen_instance_var(const ast::CodegenInstanceVar& node); + /** * Returns GEP instruction to 1D array * \param name 1D array name @@ -152,6 +162,20 @@ class CodegenLLVMVisitor: public visitor::ConstAstVisitor { */ llvm::Type* get_codegen_var_type(const ast::CodegenVarType& node); + /** + * Returns LLVM vector with `vector_width` int values. + * \param int value to replicate + * \return LLVM value + */ + llvm::Value* get_constant_int_vector(int value); + + /** + * Returns LLVM vector with `vector_width` double values. + * \param string a double value to replicate + * \return LLVM value + */ + llvm::Value* get_constant_fp_vector(const std::string& value); + /** * Returns 64-bit or 32-bit LLVM floating type * \return \c LLVM floating point type according to `use_single_precision` flag @@ -164,6 +188,18 @@ class CodegenLLVMVisitor: public visitor::ConstAstVisitor { */ llvm::Type* get_default_fp_ptr_type(); + /** + * Returns a pointer to LLVM struct type + * \return LLVM pointer type + */ + llvm::Type* get_instance_struct_type(); + + /** + * Returns a LLVM value corresponding to the VarName node + * \return LLVM value + */ + llvm::Value* get_variable_ptr(const ast::VarName& node); + /** * Create a function call to an external method * \param name external method name @@ -255,6 +291,7 @@ class CodegenLLVMVisitor: public visitor::ConstAstVisitor { void visit_binary_expression(const ast::BinaryExpression& node) override; void visit_boolean(const ast::Boolean& node) override; void visit_statement_block(const ast::StatementBlock& node) override; + void visit_codegen_for_statement(const ast::CodegenForStatement& node) override; void visit_codegen_function(const ast::CodegenFunction& node) override; void visit_codegen_return_statement(const ast::CodegenReturnStatement& node) override; void visit_codegen_var_list_statement(const ast::CodegenVarListStatement& node) override; @@ -267,7 +304,6 @@ class CodegenLLVMVisitor: public visitor::ConstAstVisitor { void visit_program(const ast::Program& node) override; void visit_unary_expression(const ast::UnaryExpression& node) override; void visit_var_name(const ast::VarName& node) override; - void visit_instance_struct(const ast::InstanceStruct& node) override; void visit_while_statement(const ast::WhileStatement& node) override; // \todo: move this to debug mode (e.g. 
-v option or --dump-ir) diff --git a/src/main.cpp b/src/main.cpp index 248cb2b5ad..62ed2b2251 100644 --- a/src/main.cpp +++ b/src/main.cpp @@ -610,7 +610,7 @@ int main(int argc, const char* argv[]) { if (llvm_ir) { logger->info("Running LLVM backend code generator"); CodegenLLVMVisitor visitor( - modfile, output_dir, llvm_opt_passes, llvm_vec_width, llvm_float_type); + modfile, output_dir, llvm_opt_passes, llvm_float_type, llvm_vec_width); visitor.visit_program(*ast); ast_to_nmodl(*ast, filepath("llvm", "mod")); ast_to_json(*ast, filepath("llvm", "json")); diff --git a/test/unit/codegen/codegen_llvm_ir.cpp b/test/unit/codegen/codegen_llvm_ir.cpp index ba0c725c0c..a376bd3f5c 100644 --- a/test/unit/codegen/codegen_llvm_ir.cpp +++ b/test/unit/codegen/codegen_llvm_ir.cpp @@ -794,39 +794,3 @@ SCENARIO("Dead code removal", "[visitor][llvm][opt]") { } } } - -//============================================================================= -// Create Instance Struct -//============================================================================= - -SCENARIO("Creation of Instance Struct", "[visitor][llvm][instance_struct]") { - GIVEN("NEURON block with RANGE variables and IONS") { - std::string nmodl_text = R"( - NEURON { - USEION na READ ena WRITE ina - NONSPECIFIC_CURRENT il - RANGE minf, hinf - } - - STATE { - m - } - - ASSIGNED { - v (mV) - celsius (degC) - minf - hinf - } - )"; - - THEN("create struct with the declared variables") { - std::string module_string = run_llvm_visitor(nmodl_text, true); - std::smatch m; - - std::regex instance_struct_declaration( - R"(%unknown_Instance = type \{ double\*, double\*, double\*, double\*, double\*, double\*, double\*, double\*, double\*, double\* \})"); - REQUIRE(std::regex_search(module_string, m, instance_struct_declaration)); - } - } -} From d4355002f2e19641cd8b00b53ec3b9229bedc605 Mon Sep 17 00:00:00 2001 From: George Mitenkov Date: Tue, 9 Mar 2021 11:50:53 +0300 Subject: [PATCH 136/331] Unit test for scalar state kernel generation in LLVM (#547) This PR adds a unit test to check LLVM instructions generated for the scalar kernel, particularly: - FOR loop blocks. - Induction variable increments and comparisons. - Correct loads through GEPs from the struct. Test for vectorised code generation would be added in a separate PR or when full vectorisation support (indirect indexing) would land. 
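For orientation, the scalar kernel that these checks pin down is semantically close to the
following C++ sketch. This is an illustration only: the real kernel is emitted as LLVM IR over
the generated `__instance_var__type` struct, and `InstanceSketch`/`nrn_state_hh_sketch` here
are hypothetical names with the struct abbreviated to the fields the loop touches.

```cpp
// Rough C++ rendering of the generated nrn_state_hh kernel; field names
// follow the instance struct layout built by CodegenLLVMHelperVisitor.
struct InstanceSketch {
    double* minf;
    double* mtau;
    double* m;
    double* voltage;   // per-compartment voltage
    int* node_index;   // mechanism instance -> compartment index
    int node_count;    // number of mechanism instances
};

void nrn_state_hh_sketch(InstanceSketch* mech) {
    for (int id = 0; id < mech->node_count; id = id + 1) {  // for.cond / for.inc
        int node_id = mech->node_index[id];                 // load node index
        double v = mech->voltage[node_id];                  // indirect voltage load
        (void) v;  // v is not used by this particular DERIVATIVE block
        mech->m[id] = (mech->minf[id] - mech->m[id]) / mech->mtau[id];
    }
}
```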
--- test/unit/codegen/codegen_llvm_ir.cpp | 112 +++++++++++++++++++++++++- 1 file changed, 110 insertions(+), 2 deletions(-) diff --git a/test/unit/codegen/codegen_llvm_ir.cpp b/test/unit/codegen/codegen_llvm_ir.cpp index a376bd3f5c..d5b531c5d5 100644 --- a/test/unit/codegen/codegen_llvm_ir.cpp +++ b/test/unit/codegen/codegen_llvm_ir.cpp @@ -12,6 +12,8 @@ #include "codegen/llvm/codegen_llvm_visitor.hpp" #include "parser/nmodl_driver.hpp" #include "visitors/checkparent_visitor.hpp" +#include "visitors/neuron_solve_visitor.hpp" +#include "visitors/solve_block_visitor.hpp" #include "visitors/symtab_visitor.hpp" using namespace nmodl; @@ -24,16 +26,20 @@ using nmodl::parser::NmodlDriver; std::string run_llvm_visitor(const std::string& text, bool opt = false, - bool use_single_precision = false) { + bool use_single_precision = false, + int vector_width = 1) { NmodlDriver driver; const auto& ast = driver.parse_string(text); SymtabVisitor().visit_program(*ast); + NeuronSolveVisitor().visit_program(*ast); + SolveBlockVisitor().visit_program(*ast); codegen::CodegenLLVMVisitor llvm_visitor(/*mod_filename=*/"unknown", /*output_dir=*/".", opt, - use_single_precision); + use_single_precision, + vector_width); llvm_visitor.visit_program(*ast); return llvm_visitor.print_module(); } @@ -770,6 +776,108 @@ SCENARIO("While", "[visitor][llvm]") { } } +//============================================================================= +// State scalar kernel +//============================================================================= + +SCENARIO("Scalar state kernel", "[visitor][llvm]") { + GIVEN("A neuron state update") { + std::string nmodl_text = R"( + NEURON { + SUFFIX hh + NONSPECIFIC_CURRENT il + RANGE minf, mtau, gl, el + } + + STATE { + m + } + + ASSIGNED { + v (mV) + minf + mtau (ms) + } + + BREAKPOINT { + SOLVE states METHOD cnexp + il = gl * (v - el) + } + + DERIVATIVE states { + m = (minf-m) / mtau + } + )"; + + THEN("a kernel with instance struct as an argument and a FOR loop is created") { + std::string module_string = run_llvm_visitor(nmodl_text); + std::smatch m; + + // Check the struct type and the kernel declaration. + std::regex struct_type( + "%.*__instance_var__type = type \\{ double\\*, double\\*, double\\*, double\\*, " + "double\\*, double\\*, double\\*, i32\\*, double, double, double, i32, i32 \\}"); + std::regex kernel_declaration( + R"(define void @nrn_state_hh\(%.*__instance_var__type\* .*\))"); + REQUIRE(std::regex_search(module_string, m, struct_type)); + REQUIRE(std::regex_search(module_string, m, kernel_declaration)); + + // Check for correct induction variable initialisation and a branch to condition block. + std::regex alloca_instr(R"(%id = alloca i32)"); + std::regex br(R"(br label %for\.cond)"); + REQUIRE(std::regex_search(module_string, m, alloca_instr)); + REQUIRE(std::regex_search(module_string, m, br)); + + // Check condition block: id < mech->node_count, and a conditional branch to loop body + // or exit. 
+ std::regex condition( + " %.* = load %.*__instance_var__type\\*, %.*__instance_var__type\\*\\* %.*,.*\n" + " %.* = getelementptr inbounds %.*__instance_var__type, " + "%.*__instance_var__type\\* " + "%.*, i32 0, i32 [0-9]+\n" + " %.* = load i32, i32\\* %.*,.*\n" + " %.* = load i32, i32\\* %id,.*\n" + " %.* = icmp slt i32 %.*, %.*"); + std::regex cond_br(R"(br i1 %.*, label %for\.body, label %for\.exit)"); + REQUIRE(std::regex_search(module_string, m, condition)); + REQUIRE(std::regex_search(module_string, m, cond_br)); + + // In the body block, `node_id` and voltage `v` are initialised with the data from the + // struct. Check for variable allocations and correct loads from the struct with GEPs. + std::regex initialisation( + "for\\.body:.*\n" + " %node_id = alloca i32,.*\n" + " %v = alloca double,.*"); + std::regex load_from_struct( + " %.* = load %.*__instance_var__type\\*, %.*__instance_var__type\\*\\* %.*\n" + " %.* = getelementptr inbounds %.*__instance_var__type, " + "%.*__instance_var__type\\* %.*, i32 0, i32 [0-9]+\n" + " %.* = load i32, i32\\* %id,.*\n" + " %.* = sext i32 %.* to i64\n" + " %.* = load (i32|double)\\*, (i32|double)\\*\\* %.*\n" + " %.* = getelementptr inbounds (i32|double), (i32|double)\\* %.*, i64 %.*\n" + " %.* = load (i32|double), (i32|double)\\* %.*"); + REQUIRE(std::regex_search(module_string, m, initialisation)); + REQUIRE(std::regex_search(module_string, m, load_from_struct)); + + // Check induction variable is incremented in increment block. + std::regex increment( + "for.inc:.*\n" + " %.* = load i32, i32\\* %id,.*\n" + " %.* = add i32 %.*, 1\n" + " store i32 %.*, i32\\* %id,.*\n" + " br label %for\\.cond"); + REQUIRE(std::regex_search(module_string, m, increment)); + + // Check exit block. + std::regex exit( + "for\\.exit:.*\n" + " ret void"); + REQUIRE(std::regex_search(module_string, m, exit)); + } + } +} + //============================================================================= // Optimization : dead code removal //============================================================================= From 660cd31059169d70edaa6c68225f70384d6f43c1 Mon Sep 17 00:00:00 2001 From: George Mitenkov Date: Fri, 12 Mar 2021 04:50:38 -0800 Subject: [PATCH 137/331] Indexed name codegen improvements (#550) Improved index code generation within the LLVM pipeline. The following issues were addressed: Array indices are i64 per LLVM's addressing convention. This means that if the value is not a constant, an additional sext instruction must be created. Bounds check is removed since it requires a certain analysis on the index value. This can be addressed in a separate PR. `IndexedName` code generation is separated into 2 functions The first, `get_array_length()` is responsible for array initialisation, the second, `get_array_index()`, for indexing. In latter case, we support the following cases: ``` ... // Indexing with an integer constant k[0] = ... // Indexing with an integer expression k[10 - 10] // Indexing with a `Name` AST node that is an integer // (in our case a FOR loop induction variable or a variable // with `CodegenVarType` == `Integer` k[id] = ... k[ena_id] = ... ``` Note that the case: ``` // id := loop integer induction variable k[id + 1] = ... ``` is not supported for 2 reasons: On the AST level, as per #545 the expression would contain a Name and not VarName node that fails the code generation. The case only arises in the kernel functions like state_update, where indexing is "artificially" created with indexing by a Name only. 
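The i64 convention itself boils down to a single normalisation step. Below is a condensed
sketch of that step against the LLVM C++ API; the free-standing helper name and signature are
illustrative only, as the actual implementation lives inside the visitor's `get_array_index()`:

```cpp
#include <stdexcept>

#include <llvm/IR/IRBuilder.h>

// Sketch: normalise an already-computed integer index to the i64 type
// that LLVM GEPs conventionally expect.
llvm::Value* to_i64_index(llvm::IRBuilder<>& builder, llvm::Value* index_value) {
    // Only integer indexing is supported; no implicit double -> int casts.
    if (!index_value->getType()->isIntegerTy())
        throw std::runtime_error("Error: only integer indexing is supported!");

    // Sign-extend (or truncate) unless the index is already 64 bits wide.
    llvm::Type* i64_type = llvm::Type::getInt64Ty(builder.getContext());
    if (index_value->getType() == i64_type)
        return index_value;
    return builder.CreateSExtOrTrunc(index_value, i64_type);
}
```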
fixes #541 --- src/codegen/llvm/codegen_llvm_visitor.cpp | 71 +++++++++++++---------- src/codegen/llvm/codegen_llvm_visitor.hpp | 21 +++---- test/unit/codegen/codegen_llvm_ir.cpp | 37 +++++------- 3 files changed, 65 insertions(+), 64 deletions(-) diff --git a/src/codegen/llvm/codegen_llvm_visitor.cpp b/src/codegen/llvm/codegen_llvm_visitor.cpp index 62e69449b7..cd2af2af69 100644 --- a/src/codegen/llvm/codegen_llvm_visitor.cpp +++ b/src/codegen/llvm/codegen_llvm_visitor.cpp @@ -39,28 +39,17 @@ static bool is_supported_statement(const ast::Statement& statement) { statement.is_if_statement() || statement.is_while_statement(); } -bool CodegenLLVMVisitor::check_array_bounds(const ast::IndexedName& node, unsigned index) { - llvm::Type* array_type = lookup(node.get_node_name())->getType()->getPointerElementType(); - unsigned length = array_type->getArrayNumElements(); - return 0 <= index && index < length; -} - -llvm::Value* CodegenLLVMVisitor::create_gep(const std::string& name, unsigned index) { - llvm::Type* index_type = llvm::Type::getInt32Ty(*context); +llvm::Value* CodegenLLVMVisitor::create_gep(const std::string& name, llvm::Value* index) { + llvm::Type* index_type = llvm::Type::getInt64Ty(*context); std::vector indices; indices.push_back(llvm::ConstantInt::get(index_type, 0)); - indices.push_back(llvm::ConstantInt::get(index_type, index)); + indices.push_back(index); return builder.CreateInBoundsGEP(lookup(name), indices); } llvm::Value* CodegenLLVMVisitor::codegen_indexed_name(const ast::IndexedName& node) { - unsigned index = get_array_index_or_length(node); - - // Check if index is within array bounds. - if (!check_array_bounds(node, index)) - throw std::runtime_error("Error: Index is out of bounds"); - + llvm::Value* index = get_array_index(node); return create_gep(node.get_node_name(), index); } @@ -96,20 +85,11 @@ llvm::Value* CodegenLLVMVisitor::codegen_instance_var(const ast::CodegenInstance if (!member_var_name->get_name()->is_indexed_name()) throw std::runtime_error("Error: " + member_name + " is not an IndexedName!"); - // Proceed to creating a GEP instruction to get the pointer to the member's element. While LLVM - // Helper set the indices to be Name nodes, a sanity check is added here. Note that this step - // can be avoided if using `get_array_index_or_length()`. However, it does not support indexing - // with Name/Expression at the moment. \todo: Reuse `get_array_index_or_length()` here. + // Proceed to creating a GEP instruction to get the pointer to the member's element. auto member_indexed_name = std::dynamic_pointer_cast( member_var_name->get_name()); - if (!member_indexed_name->get_length()->is_name()) - throw std::runtime_error("Error: " + member_name + " has a non-Name index!"); + llvm::Value* i64_index = get_array_index(*member_indexed_name); - // Load the index variable that will be used to access the member's element. Since we index a - // pointer variable, we need to extend the 32-bit integer index variable to 64-bit. - llvm::Value* i32_index = builder.CreateLoad( - lookup(member_indexed_name->get_length()->get_node_name())); - llvm::Value* i64_index = builder.CreateSExt(i32_index, llvm::Type::getInt64Ty(*context)); // Create a indices vector for GEP to return the pointer to the element at the specified index. 
     std::vector<llvm::Value*> member_indices;
@@ -135,17 +115,44 @@ llvm::Value* CodegenLLVMVisitor::codegen_instance_var(const ast::CodegenInstance
     return builder.CreateInBoundsGEP(instance_member, member_indices);
 }
 
-unsigned CodegenLLVMVisitor::get_array_index_or_length(const ast::IndexedName& indexed_name) {
-    // \todo: Support indices with expressions and names: k[i + j] = ...
-    auto integer = std::dynamic_pointer_cast<ast::Integer>(indexed_name.get_length());
+llvm::Value* CodegenLLVMVisitor::get_array_index(const ast::IndexedName& node) {
+    // Process the index expression. It can either be a Name node:
+    //     k[id]     // id is an integer
+    // or an integer expression.
+    llvm::Value* index_value;
+    if (node.get_length()->is_name()) {
+        llvm::Value* ptr = lookup(node.get_length()->get_node_name());
+        index_value = builder.CreateLoad(ptr);
+    } else {
+        node.get_length()->accept(*this);
+        index_value = values.back();
+        values.pop_back();
+    }
+
+    // Check if index is a double. While it is possible to use casting from double to integer
+    // values, we choose not to support these cases.
+    if (!index_value->getType()->isIntOrIntVectorTy())
+        throw std::runtime_error("Error: only integer indexing is supported!");
+
+    // Conventionally, in LLVM array indices are 64 bit.
+    auto index_type = llvm::cast<llvm::IntegerType>(index_value->getType());
+    llvm::Type* i64_type = llvm::Type::getInt64Ty(*context);
+    if (index_type->getBitWidth() == i64_type->getIntegerBitWidth())
+        return index_value;
+
+    return builder.CreateSExtOrTrunc(index_value, i64_type);
+}
+
+int CodegenLLVMVisitor::get_array_length(const ast::IndexedName& node) {
+    auto integer = std::dynamic_pointer_cast<ast::Integer>(node.get_length());
     if (!integer)
-        throw std::runtime_error("Error: only integer indices/length are supported!");
+        throw std::runtime_error("Error: only integer length is supported!");
 
     // Check if integer value is taken from a macro.
     if (!integer->get_macro())
         return integer->get_value();
     const auto& macro = sym_tab->lookup(integer->get_macro()->get_node_name());
-    return static_cast<unsigned>(*macro->get_value());
+    return static_cast<int>(*macro->get_value());
 }
 
 llvm::Type* CodegenLLVMVisitor::get_codegen_var_type(const ast::CodegenVarType& node) {
@@ -691,7 +698,7 @@ void CodegenLLVMVisitor::visit_codegen_var_list_statement(
         llvm::Type* var_type;
         if (identifier->is_indexed_name()) {
             auto indexed_name = std::dynamic_pointer_cast<ast::IndexedName>(identifier);
-            unsigned length = get_array_index_or_length(*indexed_name);
+            int length = get_array_length(*indexed_name);
             var_type = llvm::ArrayType::get(scalar_var_type, length);
         } else if (identifier->is_name()) {
             // This case corresponds to a scalar local variable. Its type is double by default.
diff --git a/src/codegen/llvm/codegen_llvm_visitor.hpp b/src/codegen/llvm/codegen_llvm_visitor.hpp index c93b76b1d6..1477e0d66d 100644 --- a/src/codegen/llvm/codegen_llvm_visitor.hpp +++ b/src/codegen/llvm/codegen_llvm_visitor.hpp @@ -119,12 +119,6 @@ class CodegenLLVMVisitor: public visitor::ConstAstVisitor { , builder(*context) , fpm(module.get()) {} - /** - * Checks if array index specified by the given IndexedName is within bounds - * \param node IndexedName representing array - * \return \c true if the index is within bounds - */ - bool check_array_bounds(const ast::IndexedName& node, unsigned index); /** * Generates LLVM code for the given IndexedName @@ -146,14 +140,21 @@ class CodegenLLVMVisitor: public visitor::ConstAstVisitor { * \param index element index * \return GEP instruction value */ - llvm::Value* create_gep(const std::string& name, unsigned index); + llvm::Value* create_gep(const std::string& name, llvm::Value* index); + + /** + * Returns array index from given IndexedName + * \param node IndexedName representing array + * \return array index + */ + llvm::Value* get_array_index(const ast::IndexedName& node); /** - * Returns array index or length from given IndexedName + * Returns array length from given IndexedName * \param node IndexedName representing array - * \return array index or length + * \return array length */ - unsigned get_array_index_or_length(const ast::IndexedName& node); + int get_array_length(const ast::IndexedName& node); /** * Returns LLVM type for the given CodegenVarType node diff --git a/test/unit/codegen/codegen_llvm_ir.cpp b/test/unit/codegen/codegen_llvm_ir.cpp index d5b531c5d5..58c1e2a7eb 100644 --- a/test/unit/codegen/codegen_llvm_ir.cpp +++ b/test/unit/codegen/codegen_llvm_ir.cpp @@ -557,6 +557,7 @@ SCENARIO("Indexed name", "[visitor][llvm]") { std::string nmodl_text = R"( PROCEDURE foo() { LOCAL x[2] + x[10 - 10] = 1 x[1] = 3 } )"; @@ -565,14 +566,19 @@ SCENARIO("Indexed name", "[visitor][llvm]") { std::string module_string = run_llvm_visitor(nmodl_text); std::smatch m; - // Check GEP is created correctly to pint at array element. - std::regex GEP( - R"(%1 = getelementptr inbounds \[2 x double\], \[2 x double\]\* %x, i32 0, i32 1)"); - REQUIRE(std::regex_search(module_string, m, GEP)); - - // Check the value is stored to the pointer. - std::regex store(R"(store double 3.000000e\+00, double\* %1)"); - REQUIRE(std::regex_search(module_string, m, store)); + // Check GEPs are created correctly to get the addresses of array elements. + std::regex GEP1( + R"(%1 = getelementptr inbounds \[2 x double\], \[2 x double\]\* %x, i64 0, i64 0)"); + std::regex GEP2( + R"(%2 = getelementptr inbounds \[2 x double\], \[2 x double\]\* %x, i64 0, i64 1)"); + REQUIRE(std::regex_search(module_string, m, GEP1)); + REQUIRE(std::regex_search(module_string, m, GEP2)); + + // Check the value is stored to the correct addresses. + std::regex store1(R"(store double 1.000000e\+00, double\* %1)"); + std::regex store2(R"(store double 3.000000e\+00, double\* %2)"); + REQUIRE(std::regex_search(module_string, m, store1)); + REQUIRE(std::regex_search(module_string, m, store2)); } } @@ -591,7 +597,7 @@ SCENARIO("Indexed name", "[visitor][llvm]") { // Check GEP is created correctly to pint at array element. 
std::regex GEP( - R"(%2 = getelementptr inbounds \[2 x double\], \[2 x double\]\* %x, i32 0, i32 1)"); + R"(%2 = getelementptr inbounds \[2 x double\], \[2 x double\]\* %x, i64 0, i64 1)"); REQUIRE(std::regex_search(module_string, m, GEP)); // Check the value is loaded from the pointer. @@ -603,19 +609,6 @@ SCENARIO("Indexed name", "[visitor][llvm]") { REQUIRE(std::regex_search(module_string, m, store)); } } - - GIVEN("Array with out of bounds access") { - std::string nmodl_text = R"( - PROCEDURE foo() { - LOCAL x[2] - x[5] = 3 - } - )"; - - THEN("error is thrown") { - REQUIRE_THROWS_AS(run_llvm_visitor(nmodl_text), std::runtime_error); - } - } } //============================================================================= From 1643f7d026ea47f7889a3ac2d4ec9abc65c9fa19 Mon Sep 17 00:00:00 2001 From: Ioannis Magkanaris Date: Sat, 13 Mar 2021 12:35:18 +0100 Subject: [PATCH 138/331] Add InstanceStruct test data generation helper and unit test (#546) * CodegenLLVMHelperVisitor improved without hardcoded parameters * Added get_instance_struct_ptr to get instance structure for variable information * test/unit/codegen/codegen_data_helper.cpp : first draft implementation of codegen data helper * Added test for typecasting to the proper struct type Co-authored-by: Pramod Kumbhar --- .../llvm/codegen_llvm_helper_visitor.cpp | 25 ++- .../llvm/codegen_llvm_helper_visitor.hpp | 14 +- src/codegen/llvm/codegen_llvm_visitor.cpp | 4 + src/codegen/llvm/codegen_llvm_visitor.hpp | 6 + test/unit/CMakeLists.txt | 5 +- test/unit/codegen/codegen_data_helper.cpp | 186 ++++++++++++++++++ test/unit/codegen/codegen_data_helper.hpp | 111 +++++++++++ .../codegen/codegen_llvm_instance_struct.cpp | 174 ++++++++++++++++ 8 files changed, 512 insertions(+), 13 deletions(-) create mode 100644 test/unit/codegen/codegen_data_helper.cpp create mode 100644 test/unit/codegen/codegen_data_helper.hpp create mode 100644 test/unit/codegen/codegen_llvm_instance_struct.cpp diff --git a/src/codegen/llvm/codegen_llvm_helper_visitor.cpp b/src/codegen/llvm/codegen_llvm_helper_visitor.cpp index c34ae2c873..c8143ac393 100644 --- a/src/codegen/llvm/codegen_llvm_helper_visitor.cpp +++ b/src/codegen/llvm/codegen_llvm_helper_visitor.cpp @@ -18,6 +18,13 @@ namespace codegen { using namespace fmt::literals; +/// initialize static member variables +const ast::AstNodeType CodegenLLVMHelperVisitor::INTEGER_TYPE = ast::AstNodeType::INTEGER; +const ast::AstNodeType CodegenLLVMHelperVisitor::FLOAT_TYPE = ast::AstNodeType::DOUBLE; +const std::string CodegenLLVMHelperVisitor::NODECOUNT_VAR = "node_count"; +const std::string CodegenLLVMHelperVisitor::VOLTAGE_VAR = "voltage"; +const std::string CodegenLLVMHelperVisitor::NODE_INDEX_VAR = "node_index"; + /** * \brief Create variable definition statement * @@ -157,7 +164,12 @@ void CodegenLLVMHelperVisitor::create_function_for_node(ast::Block& node) { auto function = std::make_shared(fun_ret_type, name, arguments, block); codegen_functions.push_back(function); } - +/** + * \note : Order of variables is not important but we assume all pointers + * are added first and then scalar variables like t, dt, second_order etc. + * This order is assumed when we allocate data for integration testing + * and benchmarking purpose. See CodegenDataHelper::create_data(). 
+ */ std::shared_ptr CodegenLLVMHelperVisitor::create_instance_struct() { ast::CodegenVarWithTypeVector codegen_vars; @@ -186,15 +198,15 @@ std::shared_ptr CodegenLLVMHelperVisitor::create_instance_s } // add voltage and node index - add_var_with_type("voltage", FLOAT_TYPE, /*is_pointer=*/1); - add_var_with_type("node_index", INTEGER_TYPE, /*is_pointer=*/1); + add_var_with_type(VOLTAGE_VAR, FLOAT_TYPE, /*is_pointer=*/1); + add_var_with_type(NODE_INDEX_VAR, INTEGER_TYPE, /*is_pointer=*/1); // add dt, t, celsius add_var_with_type(naming::NTHREAD_T_VARIABLE, FLOAT_TYPE, /*is_pointer=*/0); add_var_with_type(naming::NTHREAD_DT_VARIABLE, FLOAT_TYPE, /*is_pointer=*/0); add_var_with_type(naming::CELSIUS_VARIABLE, FLOAT_TYPE, /*is_pointer=*/0); add_var_with_type(naming::SECOND_ORDER_VARIABLE, INTEGER_TYPE, /*is_pointer=*/0); - add_var_with_type(MECH_NODECOUNT_VAR, INTEGER_TYPE, /*is_pointer=*/0); + add_var_with_type(NODECOUNT_VAR, INTEGER_TYPE, /*is_pointer=*/0); return std::make_shared(codegen_vars); } @@ -510,7 +522,7 @@ void CodegenLLVMHelperVisitor::visit_nrn_state_block(ast::NrnStateBlock& node) { /// loop constructs : initialization, condition and increment const auto& initialization = loop_initialization_expression(INDUCTION_VAR); - const auto& condition = create_expression("{} < {}"_format(INDUCTION_VAR, MECH_NODECOUNT_VAR)); + const auto& condition = create_expression("{} < {}"_format(INDUCTION_VAR, NODECOUNT_VAR)); const auto& increment = loop_increment_expression(INDUCTION_VAR, vector_width); /// loop body : initialization + solve blocks @@ -524,7 +536,8 @@ void CodegenLLVMHelperVisitor::visit_nrn_state_block(ast::NrnStateBlock& node) { /// access node index and corresponding voltage loop_index_statements.push_back( visitor::create_statement("node_id = node_index[{}]"_format(INDUCTION_VAR))); - loop_body_statements.push_back(visitor::create_statement("v = voltage[node_id]")); + loop_body_statements.push_back( + visitor::create_statement("v = {}[node_id]"_format(VOLTAGE_VAR))); /// read ion variables ion_read_statements(BlockType::State, diff --git a/src/codegen/llvm/codegen_llvm_helper_visitor.hpp b/src/codegen/llvm/codegen_llvm_helper_visitor.hpp index b67aa7ee09..446d5a6fd9 100644 --- a/src/codegen/llvm/codegen_llvm_helper_visitor.hpp +++ b/src/codegen/llvm/codegen_llvm_helper_visitor.hpp @@ -112,13 +112,8 @@ class CodegenLLVMHelperVisitor: public visitor::AstVisitor { /// mechanism data helper InstanceVarHelper instance_var_helper; - /// default integer and float node type - const ast::AstNodeType INTEGER_TYPE = ast::AstNodeType::INTEGER; - const ast::AstNodeType FLOAT_TYPE = ast::AstNodeType::DOUBLE; - /// name of the mechanism instance parameter const std::string MECH_INSTANCE_VAR = "mech"; - const std::string MECH_NODECOUNT_VAR = "node_count"; /// name of induction variable used in the kernel. 
const std::string INDUCTION_VAR = "id"; @@ -130,6 +125,15 @@ class CodegenLLVMHelperVisitor: public visitor::AstVisitor { std::shared_ptr create_instance_struct(); public: + /// default integer and float node type + static const ast::AstNodeType INTEGER_TYPE; + static const ast::AstNodeType FLOAT_TYPE; + + // node count, voltage and node index variables + static const std::string NODECOUNT_VAR; + static const std::string VOLTAGE_VAR; + static const std::string NODE_INDEX_VAR; + CodegenLLVMHelperVisitor(int vector_width) : vector_width(vector_width){}; diff --git a/src/codegen/llvm/codegen_llvm_visitor.cpp b/src/codegen/llvm/codegen_llvm_visitor.cpp index cd2af2af69..b1182d36b9 100644 --- a/src/codegen/llvm/codegen_llvm_visitor.cpp +++ b/src/codegen/llvm/codegen_llvm_visitor.cpp @@ -259,6 +259,10 @@ llvm::Value* CodegenLLVMVisitor::get_variable_ptr(const ast::VarName& node) { return ptr; } +std::shared_ptr CodegenLLVMVisitor::get_instance_struct_ptr() { + return instance_var_helper.instance; +} + void CodegenLLVMVisitor::run_llvm_opt_passes() { /// run some common optimisation passes that are commonly suggested fpm.add(llvm::createInstructionCombiningPass()); diff --git a/src/codegen/llvm/codegen_llvm_visitor.hpp b/src/codegen/llvm/codegen_llvm_visitor.hpp index 1477e0d66d..41235a1ff0 100644 --- a/src/codegen/llvm/codegen_llvm_visitor.hpp +++ b/src/codegen/llvm/codegen_llvm_visitor.hpp @@ -201,6 +201,12 @@ class CodegenLLVMVisitor: public visitor::ConstAstVisitor { */ llvm::Value* get_variable_ptr(const ast::VarName& node); + /** + * Returns shared_ptr to generated ast::InstanceStruct + * \return std::shared_ptr + */ + std::shared_ptr get_instance_struct_ptr(); + /** * Create a function call to an external method * \param name external method name diff --git a/test/unit/CMakeLists.txt b/test/unit/CMakeLists.txt index e99f257f88..47b525767a 100644 --- a/test/unit/CMakeLists.txt +++ b/test/unit/CMakeLists.txt @@ -105,8 +105,9 @@ target_link_libraries( ${NMODL_WRAPPER_LIBS}) if(NMODL_ENABLE_LLVM) - include_directories(${LLVM_INCLUDE_DIRS}) - add_executable(testllvm visitor/main.cpp codegen/codegen_llvm_ir.cpp) + include_directories(${LLVM_INCLUDE_DIRS} codegen) + add_executable(testllvm visitor/main.cpp codegen/codegen_llvm_ir.cpp + codegen/codegen_data_helper.cpp codegen/codegen_llvm_instance_struct.cpp) add_executable(test_llvm_runner visitor/main.cpp codegen/codegen_llvm_execution.cpp) target_link_libraries( testllvm diff --git a/test/unit/codegen/codegen_data_helper.cpp b/test/unit/codegen/codegen_data_helper.cpp new file mode 100644 index 0000000000..e42cfe01f3 --- /dev/null +++ b/test/unit/codegen/codegen_data_helper.cpp @@ -0,0 +1,186 @@ +#include + +#include "ast/codegen_var_type.hpp" +#include "codegen/llvm/codegen_llvm_helper_visitor.hpp" + +#include "codegen_data_helper.hpp" + +namespace nmodl { +namespace codegen { + +// scalar variables with default values +const double default_nthread_dt_value = 0.025; +const double default_nthread_t_value = 100.0; +const double default_celsius_value = 34.0; +const int default_second_order_value = 0; + +// cleanup all members and struct base pointer +CodegenInstanceData::~CodegenInstanceData() { + // first free num_ptr_members members which are pointers + for (size_t i = 0; i < num_ptr_members; i++) { + free(members[i]); + } + // and then pointer to container struct + free(base_ptr); +} + +/** + * \todo : various things can be improved here + * - if variable is voltage then initialization range could be -65 to +65 + * - if variable is double or 
float then those could be initialized with
+ *   "some" floating point value in a range like 1.0 to 100.0. Note
+ *   it would be nice to have unique values to avoid errors like division
+ *   by zero. We have a simple implementation that takes care of this.
+ * - if variable is integer then initialization range must be between
+ *   0 and num_elements. In practice, num_elements is the number of instances
+ *   of a particular mechanism. This would be <= number of compartments
+ *   in the cell. For now, just initialize integer variables from 0 to
+ *   num_elements - 1.
+ */
+void initialize_variable(const std::shared_ptr<ast::CodegenVarWithType>& var,
+                         void* ptr,
+                         size_t initial_value,
+                         size_t num_elements) {
+    ast::AstNodeType type = var->get_type()->get_type();
+    const std::string& name = var->get_name()->get_node_name();
+
+    if (type == ast::AstNodeType::DOUBLE) {
+        const auto& generated_double_data = generate_dummy_data<double>(initial_value,
+                                                                        num_elements);
+        double* data = (double*) ptr;
+        for (size_t i = 0; i < num_elements; i++) {
+            data[i] = generated_double_data[i];
+        }
+    } else if (type == ast::AstNodeType::FLOAT) {
+        const auto& generated_float_data = generate_dummy_data<float>(initial_value, num_elements);
+        float* data = (float*) ptr;
+        for (size_t i = 0; i < num_elements; i++) {
+            data[i] = generated_float_data[i];
+        }
+    } else if (type == ast::AstNodeType::INTEGER) {
+        const auto& generated_int_data = generate_dummy_data<int>(initial_value, num_elements);
+        int* data = (int*) ptr;
+        for (size_t i = 0; i < num_elements; i++) {
+            data[i] = generated_int_data[i];
+        }
+    } else {
+        throw std::runtime_error("Unhandled data type during initialize_variable");
+    };
+}
+
+CodegenInstanceData CodegenDataHelper::create_data(size_t num_elements, size_t seed) {
+    // alignment with 64-byte to generate aligned loads/stores
+    const unsigned NBYTE_ALIGNMENT = 64;
+
+    // get variable information
+    const auto& variables = instance->get_codegen_vars();
+
+    // start building data
+    CodegenInstanceData data;
+    data.num_elements = num_elements;
+
+    // base pointer to instance object
+    void* base = nullptr;
+
+    // max size of each member : pointer / double has maximum size
+    size_t member_size = std::max(sizeof(double), sizeof(double*));
+
+    // allocate instance object with memory alignment
+    posix_memalign(&base, NBYTE_ALIGNMENT, member_size * variables.size());
+    data.base_ptr = base;
+
+    size_t offset = 0;
+    void* ptr = base;
+    size_t variable_index = 0;
+
+    // allocate each variable and allocate memory at particular offset in base pointer
+    for (auto& var: variables) {
+        // only process until first non-pointer variable
+        if (!var->get_is_pointer()) {
+            break;
+        }
+
+        // check type of variable and its size
+        size_t member_size = 0;
+        ast::AstNodeType type = var->get_type()->get_type();
+        if (type == ast::AstNodeType::DOUBLE) {
+            member_size = sizeof(double);
+        } else if (type == ast::AstNodeType::FLOAT) {
+            member_size = sizeof(float);
+        } else if (type == ast::AstNodeType::INTEGER) {
+            member_size = sizeof(int);
+        }
+
+        // allocate memory and setup a pointer
+        void* member;
+        posix_memalign(&member, NBYTE_ALIGNMENT, member_size * num_elements);
+        initialize_variable(var, member, variable_index, num_elements);
+
+        // copy address at specific location in the struct
+        memcpy(ptr, &member, sizeof(double*));
+
+        data.offsets.push_back(offset);
+        data.members.push_back(member);
+        data.num_ptr_members++;
+
+        // all pointer types are of same size, so just use double*
+        offset += sizeof(double*);
+        ptr = (char*) base + offset;
+
+        variable_index++;
+    }
+ // we are now switching from pointer type to next member type (e.g. double) + // ideally we should use padding but switching from double* to double should + // already meet alignment requirements + for (auto& var: variables) { + // process only scalar elements + if (var->get_is_pointer()) { + continue; + } + ast::AstNodeType type = var->get_type()->get_type(); + const std::string& name = var->get_name()->get_node_name(); + + // some default values for standard parameters + double value = 0; + if (name == naming::NTHREAD_DT_VARIABLE) { + value = default_nthread_dt_value; + } else if (name == naming::NTHREAD_T_VARIABLE) { + value = default_nthread_t_value; + } else if (name == naming::CELSIUS_VARIABLE) { + value = default_celsius_value; + } else if (name == CodegenLLVMHelperVisitor::NODECOUNT_VAR) { + value = num_elements; + } else if (name == naming::SECOND_ORDER_VARIABLE) { + value = default_second_order_value; + } + + if (type == ast::AstNodeType::DOUBLE) { + *((double*) ptr) = value; + data.offsets.push_back(offset); + data.members.push_back(ptr); + offset += sizeof(double); + ptr = (char*) base + offset; + } else if (type == ast::AstNodeType::FLOAT) { + *((float*) ptr) = float(value); + data.offsets.push_back(offset); + data.members.push_back(ptr); + offset += sizeof(float); + ptr = (char*) base + offset; + } else if (type == ast::AstNodeType::INTEGER) { + *((int*) ptr) = int(value); + data.offsets.push_back(offset); + data.members.push_back(ptr); + offset += sizeof(int); + ptr = (char*) base + offset; + } else { + throw std::runtime_error( + "Unhandled type while allocating data in CodegenDataHelper::create_data()"); + } + } + + return data; +} + +} // namespace codegen +} // namespace nmodl diff --git a/test/unit/codegen/codegen_data_helper.hpp b/test/unit/codegen/codegen_data_helper.hpp new file mode 100644 index 0000000000..368b964147 --- /dev/null +++ b/test/unit/codegen/codegen_data_helper.hpp @@ -0,0 +1,111 @@ +/************************************************************************* + * Copyright (C) 2018-2021 Blue Brain Project + * + * This file is part of NMODL distributed under the terms of the GNU + * Lesser General Public License. See top-level LICENSE file for details. + *************************************************************************/ + +#pragma once + +#include + +#include "ast/ast.hpp" + +/// \file +/// \brief Generate test data for testing and benchmarking compute kernels + +namespace nmodl { +namespace codegen { + +/// common scalar variables +extern const double default_nthread_dt_value; +extern const double default_nthread_t_value; +extern const double default_celsius_value; +extern const int default_second_order_value; + +/** + * \class CodegenInstanceData + * \brief Wrapper class to pack data allocate for instance + */ +struct CodegenInstanceData { + /// base pointer which can be type casted + /// to instance struct at run time + void* base_ptr = nullptr; + + /// length of each member of pointer type + size_t num_elements = 0; + + /// number of pointer members + size_t num_ptr_members = 0; + + /// offset relative to base_ptr to locate + /// each member variable in instance struct + std::vector offsets; + + /// pointer to array allocated for each member variable + /// i.e. 
*(base_ptr + offsets[0]) will be members[0]
+    std::vector<void*> members;
+
+    // cleanup all memory allocated for type and member variables
+    ~CodegenInstanceData();
+};
+
+
+/**
+ * Generate vector of dummy data according to the template type specified
+ *
+ * For double type: generate vector starting from (initial_value + 1e-15)
+ * with increments of 1e-15
+ * For float type: generate vector starting from (initial_value + 1e-6)
+ * with increments of 1e-6
+ * For int type: generate vector starting from (initial_value + 1) with
+ * increments of 1
+ *
+ * \param initial_value Base value for initializing the data
+ * \param num_elements Number of elements of the generated vector
+ * \return std::vector of dummy data for testing purposes
+ */
+template <typename T>
+std::vector<T> generate_dummy_data(size_t initial_value, size_t num_elements) {
+    std::vector<T> data(num_elements);
+    T precision;
+    if (std::is_same<T, double>::value) {
+        precision = 1e-15;
+    } else if (std::is_same<T, float>::value) {
+        precision = 1e-6;
+    } else {
+        precision = 1;
+    }
+    for (size_t i = 0; i < num_elements; i++) {
+        data[i] = initial_value + precision * (i + 1);
+    }
+    return data;
+}
+
+/**
+ * \class CodegenDataHelper
+ * \brief Helper to allocate and initialize data for benchmarking
+ *
+ * The `ast::InstanceStruct` has a different number of member
+ * variables for different MOD files and hence we can't instantiate
+ * it at compile time. This class helps to inspect the variables
+ * information gathered from AST and allocate memory block that
+ * can be type cast to the `ast::InstanceStruct` corresponding
+ * to the MOD file.
+ */
+class CodegenDataHelper {
+    std::shared_ptr<ast::Program> program;
+    std::shared_ptr<ast::InstanceStruct> instance;
+
+  public:
+    CodegenDataHelper() = delete;
+    CodegenDataHelper(const std::shared_ptr<ast::Program>& program,
+                      const std::shared_ptr<ast::InstanceStruct>& instance)
+        : program(program)
+        , instance(instance) {}
+
+    CodegenInstanceData create_data(size_t num_elements, size_t seed);
+};
+
+}  // namespace codegen
+}  // namespace nmodl
diff --git a/test/unit/codegen/codegen_llvm_instance_struct.cpp b/test/unit/codegen/codegen_llvm_instance_struct.cpp
new file mode 100644
index 0000000000..4bfa1cd31c
--- /dev/null
+++ b/test/unit/codegen/codegen_llvm_instance_struct.cpp
@@ -0,0 +1,174 @@
+/*************************************************************************
+ * Copyright (C) 2018-2020 Blue Brain Project
+ *
+ * This file is part of NMODL distributed under the terms of the GNU
+ * Lesser General Public License. See top-level LICENSE file for details.
+ *************************************************************************/ + +#include + +#include "ast/all.hpp" +#include "ast/program.hpp" +#include "codegen/llvm/codegen_llvm_visitor.hpp" +#include "codegen_data_helper.hpp" +#include "parser/nmodl_driver.hpp" +#include "visitors/checkparent_visitor.hpp" +#include "visitors/neuron_solve_visitor.hpp" +#include "visitors/solve_block_visitor.hpp" +#include "visitors/symtab_visitor.hpp" + +using namespace nmodl; +using namespace codegen; +using namespace visitor; +using nmodl::parser::NmodlDriver; + +//============================================================================= +// Utility to get initialized Struct Instance data +//============================================================================= + +codegen::CodegenInstanceData generate_instance_data(const std::string& text, + bool opt = false, + bool use_single_precision = false, + int vector_width = 1, + size_t num_elements = 100, + size_t seed = 1) { + NmodlDriver driver; + const auto& ast = driver.parse_string(text); + + // Generate full AST and solve the BREAKPOINT block to be able to generate the Instance Struct + SymtabVisitor().visit_program(*ast); + NeuronSolveVisitor().visit_program(*ast); + + codegen::CodegenLLVMVisitor llvm_visitor(/*mod_filename=*/"test", + /*output_dir=*/".", + opt, + use_single_precision, + vector_width); + llvm_visitor.visit_program(*ast); + llvm_visitor.print_module(); + const auto& generated_instance_struct = llvm_visitor.get_instance_struct_ptr(); + auto codegen_data = codegen::CodegenDataHelper(ast, generated_instance_struct); + auto instance_data = codegen_data.create_data(num_elements, seed); + return instance_data; +} + +template +bool compare(void* instance_struct_data_ptr, const std::vector& generated_data) { + std::vector instance_struct_vector; + std::cout << "Generated data size: " << generated_data.size() << std::endl; + instance_struct_vector.assign(static_cast(instance_struct_data_ptr), + static_cast(instance_struct_data_ptr) + + generated_data.size()); + for (auto value: instance_struct_vector) { + std::cout << value << std::endl; + } + return instance_struct_vector == generated_data; +} + +//============================================================================= +// Simple Instance Struct creation +//============================================================================= + +SCENARIO("Instance Struct creation", "[visitor][llvm][instance_struct]") { + GIVEN("Instantiate simple Instance Struct") { + std::string nmodl_text = R"( + NEURON { + SUFFIX test + USEION na READ ena + RANGE minf, mtau + } + + STATE { + m + } + + ASSIGNED { + v (mV) + celsius (degC) + ena (mV) + minf + mtau + } + + BREAKPOINT { + SOLVE states METHOD cnexp + } + + DERIVATIVE states { + m' = (minf-m)/mtau + } + )"; + + + THEN("instance struct elements are properly initialized") { + const size_t num_elements = 10; + constexpr static double seed = 42; + auto instance_data = generate_instance_data(nmodl_text, + /*opt=*/false, + /*use_single_precision=*/true, + /*vector_width*/ 1, + num_elements, + seed); + size_t minf_index = 0; + size_t mtau_index = 1; + size_t m_index = 2; + size_t Dm_index = 3; + size_t ena_index = 4; + size_t v_unused_index = 5; + size_t g_unused_index = 6; + size_t ion_ena_index = 7; + size_t ion_ena_index_index = 8; + size_t voltage_index = 9; + size_t node_index_index = 10; + size_t t_index = 11; + size_t dt_index = 12; + size_t celsius_index = 13; + size_t secondorder_index = 14; + size_t node_count_index = 15; + // Check if 
the various instance struct fields are properly initialized + REQUIRE(compare(instance_data.members[minf_index], + generate_dummy_data(minf_index, num_elements))); + REQUIRE(compare(instance_data.members[ena_index], + generate_dummy_data(ena_index, num_elements))); + REQUIRE(compare(instance_data.members[ion_ena_index], + generate_dummy_data(ion_ena_index, num_elements))); + REQUIRE(compare(instance_data.members[node_index_index], + generate_dummy_data(node_index_index, num_elements))); + REQUIRE(*static_cast(instance_data.members[t_index]) == + default_nthread_t_value); + REQUIRE(*static_cast(instance_data.members[node_count_index]) == num_elements); + + // Hard code TestInstanceType struct + struct TestInstanceType { + double* minf; + double* mtau; + double* m; + double* Dm; + double* ena; + double* v_unused; + double* g_unused; + double* ion_ena; + int* ion_ena_index; + double* voltage; + int* node_index; + double t; + double dt; + double celsius; + int secondorder; + int node_count; + }; + // Test if TestInstanceType struct is properly initialized + // Cast void ptr instance_data.base_ptr to TestInstanceType* + TestInstanceType* instance = (TestInstanceType*) instance_data.base_ptr; + REQUIRE(compare(instance->minf, generate_dummy_data(minf_index, num_elements))); + REQUIRE(compare(instance->ena, generate_dummy_data(ena_index, num_elements))); + REQUIRE(compare(instance->ion_ena, + generate_dummy_data(ion_ena_index, num_elements))); + REQUIRE(compare(instance->node_index, + generate_dummy_data(node_index_index, num_elements))); + REQUIRE(instance->t == default_nthread_t_value); + REQUIRE(instance->celsius == default_celsius_value); + REQUIRE(instance->secondorder == default_second_order_value); + } + } +} From 46133d0984bb40de978c7408a59cf875843b9f98 Mon Sep 17 00:00:00 2001 From: Nicolas Cornu Date: Wed, 17 Mar 2021 12:57:02 +0100 Subject: [PATCH 139/331] Add the remainder loop for vectorization of DERIVATIVE block (#534) * Implement remainder loop along with main vector loop * Add unit test for the same fixes #532 --- .../llvm/codegen_llvm_helper_visitor.cpp | 56 ++++++++---- src/codegen/llvm/codegen_llvm_visitor.cpp | 6 +- test/unit/codegen/codegen_llvm_ir.cpp | 89 ++++++++++++++++++- 3 files changed, 132 insertions(+), 19 deletions(-) diff --git a/src/codegen/llvm/codegen_llvm_helper_visitor.cpp b/src/codegen/llvm/codegen_llvm_helper_visitor.cpp index c8143ac393..0173664a8c 100644 --- a/src/codegen/llvm/codegen_llvm_helper_visitor.cpp +++ b/src/codegen/llvm/codegen_llvm_helper_visitor.cpp @@ -520,11 +520,6 @@ void CodegenLLVMHelperVisitor::visit_nrn_state_block(ast::NrnStateBlock& node) { /// create now main compute part : for loop over channel instances - /// loop constructs : initialization, condition and increment - const auto& initialization = loop_initialization_expression(INDUCTION_VAR); - const auto& condition = create_expression("{} < {}"_format(INDUCTION_VAR, NODECOUNT_VAR)); - const auto& increment = loop_increment_expression(INDUCTION_VAR, vector_width); - /// loop body : initialization + solve blocks ast::StatementVector loop_def_statements; ast::StatementVector loop_index_statements; @@ -583,20 +578,49 @@ void CodegenLLVMHelperVisitor::visit_nrn_state_block(ast::NrnStateBlock& node) { /// now construct a new code block which will become the body of the loop auto loop_block = std::make_shared(loop_body); - /// convert local statement to codegenvar statement - convert_local_statement(*loop_block); + /// main loop possibly vectorized on vector_width + { + /// loop 
constructs : initialization, condition and increment + const auto& initialization = loop_initialization_expression(INDUCTION_VAR); + const auto& condition = create_expression("{} < {}"_format(INDUCTION_VAR, NODECOUNT_VAR)); + const auto& increment = loop_increment_expression(INDUCTION_VAR, vector_width); + + /// clone it + auto local_loop_block = std::shared_ptr(loop_block->clone()); - /// create for loop node - auto for_loop_statement = std::make_shared(initialization, - condition, - increment, - loop_block); + /// convert local statement to codegenvar statement + convert_local_statement(*local_loop_block); - /// convert all variables inside loop body to instance variables - convert_to_instance_variable(*for_loop_statement, loop_index_var); + auto for_loop_statement_main = std::make_shared(initialization, + condition, + increment, + local_loop_block); + + /// convert all variables inside loop body to instance variables + convert_to_instance_variable(*for_loop_statement_main, loop_index_var); + + /// loop itself becomes one of the statement in the function + function_statements.push_back(for_loop_statement_main); + } - /// loop itself becomes one of the statement in the function - function_statements.push_back(for_loop_statement); + /// remainder loop possibly vectorized on vector_width + { + /// loop constructs : initialization, condition and increment + const auto& condition = create_expression("{} < {}"_format(INDUCTION_VAR, NODECOUNT_VAR)); + const auto& increment = loop_increment_expression(INDUCTION_VAR, 1); + + /// convert local statement to codegenvar statement + convert_local_statement(*loop_block); + + auto for_loop_statement_remainder = + std::make_shared(nullptr, condition, increment, loop_block); + + /// convert all variables inside loop body to instance variables + convert_to_instance_variable(*for_loop_statement_remainder, loop_index_var); + + /// loop itself becomes one of the statement in the function + function_statements.push_back(for_loop_statement_remainder); + } /// new block for the function auto function_block = new ast::StatementBlock(function_statements); diff --git a/src/codegen/llvm/codegen_llvm_visitor.cpp b/src/codegen/llvm/codegen_llvm_visitor.cpp index b1182d36b9..bed88046a7 100644 --- a/src/codegen/llvm/codegen_llvm_visitor.cpp +++ b/src/codegen/llvm/codegen_llvm_visitor.cpp @@ -576,8 +576,10 @@ void CodegenLLVMVisitor::visit_codegen_for_statement(const ast::CodegenForStatem llvm::BasicBlock* for_inc = llvm::BasicBlock::Create(*context, /*Name=*/"for.inc", func, next); llvm::BasicBlock* exit = llvm::BasicBlock::Create(*context, /*Name=*/"for.exit", func, next); - // First, initialise the loop in the same basic block. - node.get_initialization()->accept(*this); + // First, initialise the loop in the same basic block. This block is optional. + if (node.get_initialization()) { + node.get_initialization()->accept(*this); + } // If the loop is to be vectorised, create a separate vector induction variable. // \todo: See the comment for `kernel_id_prefix`. 
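The shape of the generated compute function after this change, a vectorised main loop followed by a scalar remainder loop over the same body, can be sketched roughly as follows (illustrative C++ only; `mech`, `node_count` and the kernel body stand in for the generated code, whose exact NMODL form is checked by the unit test below):

    int id;
    // main loop: each iteration processes vector_width instances
    for (id = 0; id < mech->node_count; id = id + vector_width) {
        // vectorised kernel body
    }
    // remainder loop: scalar pass over the instances left over when
    // node_count is not a multiple of vector_width
    for (; id < mech->node_count; id = id + 1) {
        // scalar kernel body
    }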
diff --git a/test/unit/codegen/codegen_llvm_ir.cpp b/test/unit/codegen/codegen_llvm_ir.cpp index 58c1e2a7eb..3ab0c8d929 100644 --- a/test/unit/codegen/codegen_llvm_ir.cpp +++ b/test/unit/codegen/codegen_llvm_ir.cpp @@ -8,16 +8,25 @@ #include #include +#include "test/unit/utils/test_utils.hpp" + #include "ast/program.hpp" +#include "ast/statement_block.hpp" +#include "codegen/llvm/codegen_llvm_helper_visitor.hpp" #include "codegen/llvm/codegen_llvm_visitor.hpp" #include "parser/nmodl_driver.hpp" #include "visitors/checkparent_visitor.hpp" #include "visitors/neuron_solve_visitor.hpp" #include "visitors/solve_block_visitor.hpp" #include "visitors/symtab_visitor.hpp" +#include "visitors/visitor_utils.hpp" using namespace nmodl; +using namespace codegen; using namespace visitor; + +using namespace test_utils; + using nmodl::parser::NmodlDriver; //============================================================================= @@ -44,6 +53,24 @@ std::string run_llvm_visitor(const std::string& text, return llvm_visitor.print_module(); } +//============================================================================= +// Utility to get specific LLVM nodes +//============================================================================= + +std::vector> run_codegen_visitor_helper(const std::string& text) { + NmodlDriver driver; + const auto& ast = driver.parse_string(text); + + /// construct symbol table and run codegen helper visitor + SymtabVisitor().visit_program(*ast); + SolveBlockVisitor().visit_program(*ast); + CodegenLLVMHelperVisitor(8).visit_program(*ast); + + const auto& nodes = collect_nodes(*ast, {ast::AstNodeType::CODEGEN_FOR_STATEMENT}); + + return nodes; +} + //============================================================================= // BinaryExpression and Double //============================================================================= @@ -864,13 +891,73 @@ SCENARIO("Scalar state kernel", "[visitor][llvm]") { // Check exit block. 
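        // (exit blocks may carry a numeric suffix, e.g. for.exit1, once a
        // function contains more than one loop)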
std::regex exit( - "for\\.exit:.*\n" + "for\\.exit[0-9]*:.*\n" " ret void"); REQUIRE(std::regex_search(module_string, m, exit)); } } } +//============================================================================= +// Derivative block : test optimization +//============================================================================= + +SCENARIO("Derivative block", "[visitor][llvm][derivative]") { + GIVEN("After helper visitor") { + std::string nmodl_text = R"( + NEURON { + SUFFIX hh + RANGE minf, mtau + } + STATE { + m + } + ASSIGNED { + v (mV) + minf + mtau (ms) + } + BREAKPOINT { + SOLVE states METHOD cnexp + } + DERIVATIVE states { + m = (minf-m)/mtau + } + )"; + + std::string expected_main_loop = R"( + for(id = 0; idnode_count; id = id+8) { + INTEGER node_id + DOUBLE v + node_id = mech->node_index[id] + v = mech->voltage[node_id] + mech->m[id] = (mech->minf[id]-mech->m[id])/mech->mtau[id] + SOLVE states METHOD cnexp + })"; + std::string expected_reminder_loop = R"( + for(; idnode_count; id = id+1) { + INTEGER node_id + DOUBLE v + node_id = mech->node_index[id] + v = mech->voltage[node_id] + mech->m[id] = (mech->minf[id]-mech->m[id])/mech->mtau[id] + SOLVE states METHOD cnexp + })"; + + + THEN("should contains 2 for loops") { + auto result = run_codegen_visitor_helper(nmodl_text); + REQUIRE(result.size() == 2); + + auto main_loop = reindent_text(to_nmodl(result[0])); + REQUIRE(main_loop == reindent_text(expected_main_loop)); + + auto reminder_loop = reindent_text(to_nmodl(result[1])); + REQUIRE(reminder_loop == reindent_text(expected_reminder_loop)); + } + } +} + //============================================================================= // Optimization : dead code removal //============================================================================= From a72e2f23d3a30f6ff9ea99733f2c2d839d09d39b Mon Sep 17 00:00:00 2001 From: Nicolas Cornu Date: Fri, 19 Mar 2021 20:59:19 +0100 Subject: [PATCH 140/331] Always initialize return variable in function block (#554) * return value in PROCEDURE block was not initialised * do the initialisation as part of ASTR transformation * remove initialisation specific code from LLVM visitor fixes #530 --- .../llvm/codegen_llvm_helper_visitor.cpp | 55 +++++++++++-------- src/codegen/llvm/codegen_llvm_visitor.cpp | 9 --- test/unit/codegen/codegen_llvm_ir.cpp | 1 + 3 files changed, 32 insertions(+), 33 deletions(-) diff --git a/src/codegen/llvm/codegen_llvm_helper_visitor.cpp b/src/codegen/llvm/codegen_llvm_helper_visitor.cpp index 0173664a8c..ceced6dc77 100644 --- a/src/codegen/llvm/codegen_llvm_helper_visitor.cpp +++ b/src/codegen/llvm/codegen_llvm_helper_visitor.cpp @@ -25,6 +25,28 @@ const std::string CodegenLLVMHelperVisitor::NODECOUNT_VAR = "node_count"; const std::string CodegenLLVMHelperVisitor::VOLTAGE_VAR = "voltage"; const std::string CodegenLLVMHelperVisitor::NODE_INDEX_VAR = "node_index"; +/// Create asr::Varname node with given a given variable name +static ast::VarName* create_varname(const std::string& varname) { + return new ast::VarName(new ast::Name(new ast::String(varname)), nullptr, nullptr); +} + +/** + * Create initialization expression + * @param code Usually "id = 0" as a string + * @return Expression representing code + * \todo : we can not use `create_statement_as_expression` function because + * NMODL parser is using `ast::Double` type to represent all variables + * including Integer. See #542. 
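+ * For example, int_initialization_expression("id", 5) builds the AST for `id = 5`.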
+ */ +static std::shared_ptr int_initialization_expression( + const std::string& induction_var, + int value = 0) { + // create id = 0 + const auto& id = create_varname(induction_var); + const auto& zero = new ast::Integer(value, nullptr); + return std::make_shared(id, ast::BinaryOperator(ast::BOP_ASSIGN), zero); +} + /** * \brief Create variable definition statement * @@ -120,7 +142,8 @@ void CodegenLLVMHelperVisitor::create_function_for_node(ast::Block& node) { auto name = new ast::Name(new ast::String(function_name)); /// return variable name has "ret_" prefix - auto return_var = new ast::Name(new ast::String("ret_" + function_name)); + std::string return_var_name = "ret_{}"_format(function_name); + auto return_var = new ast::Name(new ast::String(return_var_name)); /// return type based on node type ast::CodegenVarType* ret_var_type = nullptr; @@ -137,6 +160,11 @@ void CodegenLLVMHelperVisitor::create_function_for_node(ast::Block& node) { /// convert local statement to codegenvar statement convert_local_statement(*block); + if (node.get_node_type() == ast::AstNodeType::PROCEDURE_BLOCK) { + block->insert_statement(statements.begin(), + std::make_shared( + int_initialization_expression(return_var_name))); + } /// insert return variable at the start of the block ast::CodegenVarVector codegen_vars; codegen_vars.emplace_back(new ast::CodegenVar(0, return_var->clone())); @@ -462,30 +490,9 @@ void CodegenLLVMHelperVisitor::visit_function_block(ast::FunctionBlock& node) { create_function_for_node(node); } -/// Create asr::Varname node with given a given variable name -static ast::VarName* create_varname(const std::string& varname) { - return new ast::VarName(new ast::Name(new ast::String(varname)), nullptr, nullptr); -} - -/** - * Create for loop initialization expression - * @param code Usually "id = 0" as a string - * @return Expression representing code - * \todo : we can not use `create_statement_as_expression` function because - * NMODL parser is using `ast::Double` type to represent all variables - * including Integer. See #542. 
- */ -static std::shared_ptr loop_initialization_expression( - const std::string& induction_var) { - // create id = 0 - const auto& id = create_varname(induction_var); - const auto& zero = new ast::Integer(0, nullptr); - return std::make_shared(id, ast::BinaryOperator(ast::BOP_ASSIGN), zero); -} - /** * Create loop increment expression `id = id + width` - * \todo : same as loop_initialization_expression() + * \todo : same as int_initialization_expression() */ static std::shared_ptr loop_increment_expression(const std::string& induction_var, int vector_width) { @@ -581,7 +588,7 @@ void CodegenLLVMHelperVisitor::visit_nrn_state_block(ast::NrnStateBlock& node) { /// main loop possibly vectorized on vector_width { /// loop constructs : initialization, condition and increment - const auto& initialization = loop_initialization_expression(INDUCTION_VAR); + const auto& initialization = int_initialization_expression(INDUCTION_VAR); const auto& condition = create_expression("{} < {}"_format(INDUCTION_VAR, NODECOUNT_VAR)); const auto& increment = loop_increment_expression(INDUCTION_VAR, vector_width); diff --git a/src/codegen/llvm/codegen_llvm_visitor.cpp b/src/codegen/llvm/codegen_llvm_visitor.cpp index bed88046a7..37b2e7fc67 100644 --- a/src/codegen/llvm/codegen_llvm_visitor.cpp +++ b/src/codegen/llvm/codegen_llvm_visitor.cpp @@ -713,15 +713,6 @@ void CodegenLLVMVisitor::visit_codegen_var_list_statement( throw std::runtime_error("Error: Unsupported local variable type"); } llvm::Value* alloca = builder.CreateAlloca(var_type, /*ArraySize=*/nullptr, name); - - // Check if the variable we process is a procedure return variable (i.e. it has a name - // "ret_" and the function return type is integer). If so, initialise - // it to 0. - std::string ret_val_name = "ret_" + current_func->getName().str(); - if (name == ret_val_name && current_func->getReturnType()->isIntegerTy()) { - llvm::Value* zero = llvm::ConstantInt::get(llvm::Type::getInt32Ty(*context), 0); - builder.CreateStore(zero, alloca); - } } } diff --git a/test/unit/codegen/codegen_llvm_ir.cpp b/test/unit/codegen/codegen_llvm_ir.cpp index 3ab0c8d929..4a0e440aaf 100644 --- a/test/unit/codegen/codegen_llvm_ir.cpp +++ b/test/unit/codegen/codegen_llvm_ir.cpp @@ -686,6 +686,7 @@ SCENARIO("Procedure", "[visitor][llvm]") { REQUIRE(std::regex_search(module_string, m, signature)); REQUIRE(std::regex_search(module_string, m, alloc)); REQUIRE(std::regex_search(module_string, m, store)); + REQUIRE(std::regex_search(module_string, m, load)); REQUIRE(std::regex_search(module_string, m, ret)); } } From 99d6a03ba8896b01e1a58b1950794fabc9e1bb41 Mon Sep 17 00:00:00 2001 From: George Mitenkov Date: Fri, 9 Apr 2021 19:57:54 +0300 Subject: [PATCH 141/331] Running a kernel with NMODL-LLVM JIT (#549) * Added support for arguments in the JIT llvm runner * Adjusted tests and added a simple kernel test * Removed printfs from the kernel * Fixed kernel number of arguments check * Initial integration of dataHelper for kernel tests * Implemented a test to check the scalar kernel execution --- src/codegen/llvm/codegen_llvm_visitor.cpp | 36 +++++ src/codegen/llvm/codegen_llvm_visitor.hpp | 14 ++ src/codegen/llvm/jit_driver.hpp | 36 +++-- src/codegen/llvm/main.cpp | 2 +- test/unit/CMakeLists.txt | 3 +- test/unit/codegen/codegen_llvm_execution.cpp | 151 +++++++++++++++++-- 6 files changed, 218 insertions(+), 24 deletions(-) diff --git a/src/codegen/llvm/codegen_llvm_visitor.cpp b/src/codegen/llvm/codegen_llvm_visitor.cpp index 37b2e7fc67..5fdd906480 100644 --- 
a/src/codegen/llvm/codegen_llvm_visitor.cpp +++ b/src/codegen/llvm/codegen_llvm_visitor.cpp @@ -923,5 +923,41 @@ void CodegenLLVMVisitor::visit_while_statement(const ast::WhileStatement& node) builder.SetInsertPoint(exit); } +void CodegenLLVMVisitor::wrap_kernel_function(const std::string& kernel_name) { + // Get the kernel function and the instance struct type. + auto kernel = module->getFunction(kernel_name); + if (!kernel) + throw std::runtime_error("Kernel " + kernel_name + " is not found!"); + + if (std::distance(kernel->args().begin(), kernel->args().end()) != 1) + throw std::runtime_error("Kernel " + kernel_name + " must have a single argument!"); + + auto instance_struct_ptr_type = llvm::dyn_cast(kernel->getArg(0)->getType()); + if (!instance_struct_ptr_type) + throw std::runtime_error("Kernel " + kernel_name + + " does not have an instance struct pointer argument!"); + + // Create a wrapper void function that takes a void pointer as a single argument. + llvm::Type* void_type = llvm::Type::getVoidTy(*context); + llvm::Type* i32_type = llvm::Type::getInt32Ty(*context); + llvm::Type* void_ptr_type = llvm::PointerType::get(void_type, /*AddressSpace=*/0); + llvm::Function* wrapper_func = llvm::Function::Create( + llvm::FunctionType::get(i32_type, {void_ptr_type}, /*isVarArg=*/false), + llvm::Function::ExternalLinkage, + "__" + kernel_name + "_wrapper", + *module); + llvm::BasicBlock* body = llvm::BasicBlock::Create(*context, /*Name=*/"", wrapper_func); + builder.SetInsertPoint(body); + + // Proceed with bitcasting the void pointer to the struct pointer type, calling the kernel and + // adding a terminator. + llvm::Value* bitcasted = builder.CreateBitCast(wrapper_func->getArg(0), + instance_struct_ptr_type); + std::vector args; + args.push_back(bitcasted); + builder.CreateCall(kernel, args); + builder.CreateRet(llvm::ConstantInt::get(i32_type, 0)); +} + } // namespace codegen } // namespace nmodl diff --git a/src/codegen/llvm/codegen_llvm_visitor.hpp b/src/codegen/llvm/codegen_llvm_visitor.hpp index 41235a1ff0..b099646b07 100644 --- a/src/codegen/llvm/codegen_llvm_visitor.hpp +++ b/src/codegen/llvm/codegen_llvm_visitor.hpp @@ -237,6 +237,14 @@ class CodegenLLVMVisitor: public visitor::ConstAstVisitor { */ void emit_procedure_or_function_declaration(const ast::CodegenFunction& node); + /** + * Return InstanceVarHelper + * \return InstanceVarHelper + */ + InstanceVarHelper get_instance_var_helper() { + return instance_var_helper; + } + /** * Return module pointer * \return LLVM IR module pointer @@ -321,6 +329,12 @@ class CodegenLLVMVisitor: public visitor::ConstAstVisitor { os.flush(); return str; } + + /** + * For the given kernel function, wraps it into another function that uses void* to pass the + * data to the kernel \param kernel_name kernel name to be wrapped + */ + void wrap_kernel_function(const std::string& kernel_name); }; /** \} */ // end of llvm_backends diff --git a/src/codegen/llvm/jit_driver.hpp b/src/codegen/llvm/jit_driver.hpp index d1e9a9412f..23c8fca612 100644 --- a/src/codegen/llvm/jit_driver.hpp +++ b/src/codegen/llvm/jit_driver.hpp @@ -39,15 +39,27 @@ class JITDriver { /// Initialize the JIT. void init(); - /// Lookup the entry-point in the JIT and execute it, returning the result. - template - T execute(const std::string& entry_point) { + /// Lookup the entry-point without arguments in the JIT and execute it, returning the result. 
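+    /// e.g. `driver.execute_without_arguments<double>("exponential")` runs the
+    /// function and returns its double result.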
+ template + ReturnType execute_without_arguments(const std::string& entry_point) { auto expected_symbol = jit->lookup(entry_point); if (!expected_symbol) throw std::runtime_error("Error: entry-point symbol not found in JIT\n"); - auto (*res)() = (T(*)())(intptr_t) expected_symbol->getAddress(); - T result = res(); + auto (*res)() = (ReturnType(*)())(intptr_t) expected_symbol->getAddress(); + ReturnType result = res(); + return result; + } + + /// Lookup the entry-point with an argument in the JIT and execute it, returning the result. + template + ReturnType execute_with_arguments(const std::string& entry_point, ArgType arg) { + auto expected_symbol = jit->lookup(entry_point); + if (!expected_symbol) + throw std::runtime_error("Error: entry-point symbol not found in JIT\n"); + + auto (*res)(ArgType) = (ReturnType(*)(ArgType))(intptr_t) expected_symbol->getAddress(); + ReturnType result = res(arg); return result; } @@ -71,10 +83,16 @@ class Runner { driver->init(); } - /// Run the entry-point function. - template - double run(const std::string& entry_point) { - return driver->execute(entry_point); + /// Run the entry-point function without arguments. + template + ReturnType run_without_arguments(const std::string& entry_point) { + return driver->template execute_without_arguments(entry_point); + } + + /// Run the entry-point function with a pointer to the data as an argument. + template + ReturnType run_with_argument(const std::string& entry_point, ArgType arg) { + return driver->template execute_with_arguments(entry_point, arg); } }; diff --git a/src/codegen/llvm/main.cpp b/src/codegen/llvm/main.cpp index 11ea178cb4..acbdc37f19 100644 --- a/src/codegen/llvm/main.cpp +++ b/src/codegen/llvm/main.cpp @@ -67,7 +67,7 @@ int main(int argc, const char* argv[]) { Runner runner(std::move(module)); // Since only double type is supported, provide explicit double type to the running function. 
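    // (i.e. ReturnType is instantiated as double in the call below)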
- auto r = runner.run(entry_point_name); + auto r = runner.run_without_arguments(entry_point_name); fprintf(stderr, "Result: %f\n", r); return 0; diff --git a/test/unit/CMakeLists.txt b/test/unit/CMakeLists.txt index 47b525767a..5b0e93e196 100644 --- a/test/unit/CMakeLists.txt +++ b/test/unit/CMakeLists.txt @@ -108,7 +108,8 @@ if(NMODL_ENABLE_LLVM) include_directories(${LLVM_INCLUDE_DIRS} codegen) add_executable(testllvm visitor/main.cpp codegen/codegen_llvm_ir.cpp codegen/codegen_data_helper.cpp codegen/codegen_llvm_instance_struct.cpp) - add_executable(test_llvm_runner visitor/main.cpp codegen/codegen_llvm_execution.cpp) + add_executable(test_llvm_runner visitor/main.cpp codegen/codegen_data_helper.cpp + codegen/codegen_llvm_execution.cpp) target_link_libraries( testllvm llvm_codegen diff --git a/test/unit/codegen/codegen_llvm_execution.cpp b/test/unit/codegen/codegen_llvm_execution.cpp index 90e8fb3cc2..c0764c7897 100644 --- a/test/unit/codegen/codegen_llvm_execution.cpp +++ b/test/unit/codegen/codegen_llvm_execution.cpp @@ -6,13 +6,15 @@ *************************************************************************/ #include -#include #include "ast/program.hpp" #include "codegen/llvm/codegen_llvm_visitor.hpp" #include "codegen/llvm/jit_driver.hpp" +#include "codegen_data_helper.hpp" #include "parser/nmodl_driver.hpp" #include "visitors/checkparent_visitor.hpp" +#include "visitors/neuron_solve_visitor.hpp" +#include "visitors/solve_block_visitor.hpp" #include "visitors/symtab_visitor.hpp" using namespace nmodl; @@ -23,7 +25,43 @@ using nmodl::parser::NmodlDriver; static double EPSILON = 1e-15; //============================================================================= -// No optimisations +// Utilities for testing. +//============================================================================= + +struct InstanceTestInfo { + codegen::CodegenInstanceData& instance; + codegen::CodegenLLVMVisitor& visitor; + int num_elements; +}; + +template +bool check_instance_variable(InstanceTestInfo& instance_info, + std::vector& expected, + const std::string& variable_name) { + std::vector actual; + int variable_index = instance_info.visitor.get_instance_var_helper().get_variable_index( + variable_name); + actual.assign(static_cast(instance_info.instance.members[variable_index]), + static_cast(instance_info.instance.members[variable_index]) + + instance_info.num_elements); + // While we are comparing double types as well, for simplicity the test cases are hand-crafted + // so that no floating-point arithmetic is really involved. 
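+    // (an epsilon-based comparison would be needed here if real floating-point
+    // arithmetic were exercised by the kernels)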
+ return actual == expected; +} + +template +void initialise_instance_variable(InstanceTestInfo& instance_info, + std::vector& data, + const std::string& variable_name) { + int variable_index = instance_info.visitor.get_instance_var_helper().get_variable_index( + variable_name); + T* data_start = static_cast(instance_info.instance.members[variable_index]); + for (int i = 0; i < instance_info.num_elements; ++i) + *(data_start + i) = data[i]; +} + +//============================================================================= +// Simple functions: no optimisations //============================================================================= SCENARIO("Arithmetic expression", "[llvm][runner]") { @@ -60,6 +98,10 @@ SCENARIO("Arithmetic expression", "[llvm][runner]") { PROCEDURE foo() {} + FUNCTION with_argument(x) { + with_argument = x + } + FUNCTION loop() { LOCAL i, j, sum, result result = 0 @@ -92,26 +134,31 @@ SCENARIO("Arithmetic expression", "[llvm][runner]") { Runner runner(std::move(m)); THEN("functions are evaluated correctly") { - auto exp_result = runner.run("exponential"); + auto exp_result = runner.run_without_arguments("exponential"); REQUIRE(fabs(exp_result - 2.718281828459045) < EPSILON); - auto constant_result = runner.run("constant"); + auto constant_result = runner.run_without_arguments("constant"); REQUIRE(fabs(constant_result - 10.0) < EPSILON); - auto arithmetic_result = runner.run("arithmetic"); + auto arithmetic_result = runner.run_without_arguments("arithmetic"); REQUIRE(fabs(arithmetic_result - 2.1) < EPSILON); - auto function_call_result = runner.run("function_call"); + auto function_call_result = runner.run_without_arguments("function_call"); REQUIRE(fabs(function_call_result - 1.0) < EPSILON); - auto loop_result = runner.run("loop"); + double data = 10.0; + auto with_argument_result = runner.run_with_argument("with_argument", + data); + REQUIRE(fabs(with_argument_result - 10.0) < EPSILON); + + auto loop_result = runner.run_without_arguments("loop"); REQUIRE(fabs(loop_result - 90.0) < EPSILON); } } } //============================================================================= -// With optimisations +// Simple functions: with optimisations //============================================================================= SCENARIO("Optimised arithmetic expression", "[llvm][runner]") { @@ -189,23 +236,101 @@ SCENARIO("Optimised arithmetic expression", "[llvm][runner]") { THEN("optimizations preserve function results") { // Check exponential is turned into a constant. - auto exp_result = runner.run("exponential"); + auto exp_result = runner.run_without_arguments("exponential"); REQUIRE(fabs(exp_result - 2.718281828459045) < EPSILON); // Check constant folding. - auto constant_result = runner.run("constant"); + auto constant_result = runner.run_without_arguments("constant"); REQUIRE(fabs(constant_result - 10.0) < EPSILON); // Check nested conditionals - auto conditionals_result = runner.run("conditionals"); + auto conditionals_result = runner.run_without_arguments("conditionals"); REQUIRE(fabs(conditionals_result - 4.0) < EPSILON); // Check constant folding. 
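            // (with optimisation passes enabled, the arithmetic body is expected
            // to fold to the constant 2.1 checked below)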
- auto arithmetic_result = runner.run("arithmetic"); + auto arithmetic_result = runner.run_without_arguments("arithmetic"); REQUIRE(fabs(arithmetic_result - 2.1) < EPSILON); - auto function_call_result = runner.run("function_call"); + auto function_call_result = runner.run_without_arguments("function_call"); REQUIRE(fabs(function_call_result - 1.0) < EPSILON); } } } + +//============================================================================= +// State scalar kernel. +//============================================================================= + +SCENARIO("Simple scalar kernel", "[llvm][runner]") { + GIVEN("Simple MOD file with a state update") { + std::string nmodl_text = R"( + NEURON { + SUFFIX test + NONSPECIFIC_CURRENT i + RANGE x0, x1 + } + + STATE { + x + } + + ASSIGNED { + v + x0 + x1 + } + + BREAKPOINT { + SOLVE states METHOD cnexp + i = 0 + } + + DERIVATIVE states { + x = (x0 - x) / x1 + } + )"; + + + NmodlDriver driver; + const auto& ast = driver.parse_string(nmodl_text); + + // Run passes on the AST to generate LLVM. + SymtabVisitor().visit_program(*ast); + NeuronSolveVisitor().visit_program(*ast); + SolveBlockVisitor().visit_program(*ast); + codegen::CodegenLLVMVisitor llvm_visitor(/*mod_filename=*/"unknown", + /*output_dir=*/".", + /*opt_passes=*/false, + /*use_single_precision=*/false, + /*vector_width=*/1); + llvm_visitor.visit_program(*ast); + llvm_visitor.wrap_kernel_function("nrn_state_test"); + + // Create the instance struct data. + int num_elements = 4; + const auto& generated_instance_struct = llvm_visitor.get_instance_struct_ptr(); + auto codegen_data = codegen::CodegenDataHelper(ast, generated_instance_struct); + auto instance_data = codegen_data.create_data(num_elements, /*seed=*/1); + + // Fill the instance struct data with some values. + std::vector x = {1.0, 2.0, 3.0, 4.0}; + std::vector x0 = {5.0, 5.0, 5.0, 5.0}; + std::vector x1 = {1.0, 1.0, 1.0, 1.0}; + + InstanceTestInfo instance_info{instance_data, llvm_visitor, num_elements}; + initialise_instance_variable(instance_info, x, "x"); + initialise_instance_variable(instance_info, x0, "x0"); + initialise_instance_variable(instance_info, x1, "x1"); + + // Set up the JIT runner. 
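+        // (the Runner constructor calls JITDriver::init(), so the module is
+        // compiled before the first run_* call)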
+ std::unique_ptr module = llvm_visitor.get_module(); + Runner runner(std::move(module)); + + THEN("Values in struct have changed according to the formula") { + runner.run_with_argument("__nrn_state_test_wrapper", + instance_data.base_ptr); + std::vector x_expected = {4.0, 3.0, 2.0, 1.0}; + REQUIRE(check_instance_variable(instance_info, x_expected, "x")); + } + } +} From d134ad40969b3169d2fbeb5ab1b878a7786f934a Mon Sep 17 00:00:00 2001 From: George Mitenkov Date: Fri, 9 Apr 2021 21:42:09 +0300 Subject: [PATCH 142/331] Loop epilogue fix for LLVM visitor helper (#567) * Added renaming for loop local variables in CodegenForStatement * Fixed trip count in main loop and removed epilogue loop for scalar case * Refactored loop remainder tests and added a scalar case * Change `reminder` to `epilogue` in the test --- .../llvm/codegen_llvm_helper_visitor.cpp | 68 +++++++++++++-- test/unit/codegen/codegen_llvm_ir.cpp | 84 +++++++++++++++---- 2 files changed, 126 insertions(+), 26 deletions(-) diff --git a/src/codegen/llvm/codegen_llvm_helper_visitor.cpp b/src/codegen/llvm/codegen_llvm_helper_visitor.cpp index ceced6dc77..c3e9159dfa 100644 --- a/src/codegen/llvm/codegen_llvm_helper_visitor.cpp +++ b/src/codegen/llvm/codegen_llvm_helper_visitor.cpp @@ -11,6 +11,7 @@ #include "ast/all.hpp" #include "codegen/codegen_helper_visitor.hpp" #include "utils/logger.hpp" +#include "visitors/rename_visitor.hpp" #include "visitors/visitor_utils.hpp" namespace nmodl { @@ -25,6 +26,8 @@ const std::string CodegenLLVMHelperVisitor::NODECOUNT_VAR = "node_count"; const std::string CodegenLLVMHelperVisitor::VOLTAGE_VAR = "voltage"; const std::string CodegenLLVMHelperVisitor::NODE_INDEX_VAR = "node_index"; +static constexpr const char epilogue_variable_prefix[] = "epilogue_"; + /// Create asr::Varname node with given a given variable name static ast::VarName* create_varname(const std::string& varname) { return new ast::VarName(new ast::Name(new ast::String(varname)), nullptr, nullptr); @@ -507,6 +510,39 @@ static std::shared_ptr loop_increment_expression(const std::str inc_expr); } +/** + * Create loop count comparison expression + * + * Based on if loop is vectorised or not, the condition for loop + * is different. 
For example: + * - serial loop : `id < node_count` + * - vector loop : `id < (node_count - vector_width + 1)` + * + * \todo : same as int_initialization_expression() + */ +static std::shared_ptr loop_count_expression(const std::string& induction_var, + const std::string& node_count, + int vector_width) { + const auto& id = create_varname(induction_var); + const auto& mech_node_count = create_varname(node_count); + + // For non-vectorised loop, the condition is id < mech->node_count + if (vector_width == 1) { + return std::make_shared(id->clone(), + ast::BinaryOperator(ast::BOP_LESS), + mech_node_count); + } + + // For vectorised loop, the condition is id < mech->node_count - vector_width + 1 + const auto& remainder = new ast::Integer(vector_width - 1, /*macro=*/nullptr); + const auto& count = new ast::BinaryExpression(mech_node_count, + ast::BinaryOperator(ast::BOP_SUBTRACTION), + remainder); + return std::make_shared(id->clone(), + ast::BinaryOperator(ast::BOP_LESS), + count); +} + /** * \brief Convert ast::NrnStateBlock to corresponding code generation function nrn_state * @param node AST node representing ast::NrnStateBlock @@ -522,8 +558,9 @@ void CodegenLLVMHelperVisitor::visit_nrn_state_block(ast::NrnStateBlock& node) { /// create variable definition for loop index and insert at the beginning std::string loop_index_var = "id"; - std::vector int_variables{"id"}; - function_statements.push_back(create_local_variable_statement(int_variables, INTEGER_TYPE)); + std::vector induction_variables{"id"}; + function_statements.push_back( + create_local_variable_statement(induction_variables, INTEGER_TYPE)); /// create now main compute part : for loop over channel instances @@ -531,10 +568,10 @@ void CodegenLLVMHelperVisitor::visit_nrn_state_block(ast::NrnStateBlock& node) { ast::StatementVector loop_def_statements; ast::StatementVector loop_index_statements; ast::StatementVector loop_body_statements; - { - std::vector int_variables{"node_id"}; - std::vector double_variables{"v"}; + std::vector int_variables{"node_id"}; + std::vector double_variables{"v"}; + { /// access node index and corresponding voltage loop_index_statements.push_back( visitor::create_statement("node_id = node_index[{}]"_format(INDUCTION_VAR))); @@ -589,7 +626,7 @@ void CodegenLLVMHelperVisitor::visit_nrn_state_block(ast::NrnStateBlock& node) { { /// loop constructs : initialization, condition and increment const auto& initialization = int_initialization_expression(INDUCTION_VAR); - const auto& condition = create_expression("{} < {}"_format(INDUCTION_VAR, NODECOUNT_VAR)); + const auto& condition = loop_count_expression(INDUCTION_VAR, NODECOUNT_VAR, vector_width); const auto& increment = loop_increment_expression(INDUCTION_VAR, vector_width); /// clone it @@ -611,10 +648,11 @@ void CodegenLLVMHelperVisitor::visit_nrn_state_block(ast::NrnStateBlock& node) { } /// remainder loop possibly vectorized on vector_width - { + if (vector_width > 1) { /// loop constructs : initialization, condition and increment - const auto& condition = create_expression("{} < {}"_format(INDUCTION_VAR, NODECOUNT_VAR)); - const auto& increment = loop_increment_expression(INDUCTION_VAR, 1); + const auto& condition = + loop_count_expression(INDUCTION_VAR, NODECOUNT_VAR, /*vector_width=*/1); + const auto& increment = loop_increment_expression(INDUCTION_VAR, /*vector_width=*/1); /// convert local statement to codegenvar statement convert_local_statement(*loop_block); @@ -622,6 +660,18 @@ void 
CodegenLLVMHelperVisitor::visit_nrn_state_block(ast::NrnStateBlock& node) { auto for_loop_statement_remainder = std::make_shared(nullptr, condition, increment, loop_block); + const auto& loop_statements = for_loop_statement_remainder->get_statement_block(); + // \todo: Change RenameVisitor to take a vector of names to which it would append a single + // prefix. + for (const auto& name: int_variables) { + visitor::RenameVisitor v(name, epilogue_variable_prefix + name); + loop_statements->accept(v); + } + for (const auto& name: double_variables) { + visitor::RenameVisitor v(name, epilogue_variable_prefix + name); + loop_statements->accept(v); + } + /// convert all variables inside loop body to instance variables convert_to_instance_variable(*for_loop_statement_remainder, loop_index_var); diff --git a/test/unit/codegen/codegen_llvm_ir.cpp b/test/unit/codegen/codegen_llvm_ir.cpp index 4a0e440aaf..b51a4e3d58 100644 --- a/test/unit/codegen/codegen_llvm_ir.cpp +++ b/test/unit/codegen/codegen_llvm_ir.cpp @@ -54,19 +54,21 @@ std::string run_llvm_visitor(const std::string& text, } //============================================================================= -// Utility to get specific LLVM nodes +// Utility to get specific NMODL AST nodes //============================================================================= -std::vector> run_codegen_visitor_helper(const std::string& text) { +std::vector> run_llvm_visitor_helper( + const std::string& text, + int vector_width, + const std::vector& nodes_to_collect) { NmodlDriver driver; const auto& ast = driver.parse_string(text); - /// construct symbol table and run codegen helper visitor SymtabVisitor().visit_program(*ast); SolveBlockVisitor().visit_program(*ast); - CodegenLLVMHelperVisitor(8).visit_program(*ast); + CodegenLLVMHelperVisitor(vector_width).visit_program(*ast); - const auto& nodes = collect_nodes(*ast, {ast::AstNodeType::CODEGEN_FOR_STATEMENT}); + const auto& nodes = collect_nodes(*ast, nodes_to_collect); return nodes; } @@ -903,11 +905,12 @@ SCENARIO("Scalar state kernel", "[visitor][llvm]") { // Derivative block : test optimization //============================================================================= -SCENARIO("Derivative block", "[visitor][llvm][derivative]") { - GIVEN("After helper visitor") { +SCENARIO("Scalar derivative block", "[visitor][llvm][derivative]") { + GIVEN("After LLVM helper visitor transformations") { std::string nmodl_text = R"( NEURON { SUFFIX hh + NONSPECIFIC_CURRENT il RANGE minf, mtau } STATE { @@ -920,41 +923,88 @@ SCENARIO("Derivative block", "[visitor][llvm][derivative]") { } BREAKPOINT { SOLVE states METHOD cnexp + il = 2 } DERIVATIVE states { m = (minf-m)/mtau } )"; - std::string expected_main_loop = R"( - for(id = 0; idnode_count; id = id+8) { + std::string expected_loop = R"( + for(id = 0; idnode_count; id = id+1) { INTEGER node_id DOUBLE v node_id = mech->node_index[id] v = mech->voltage[node_id] mech->m[id] = (mech->minf[id]-mech->m[id])/mech->mtau[id] - SOLVE states METHOD cnexp })"; - std::string expected_reminder_loop = R"( - for(; idnode_count; id = id+1) { + + THEN("a single scalar loops is constructed") { + auto result = run_llvm_visitor_helper(nmodl_text, + /*vector_width=*/1, + {ast::AstNodeType::CODEGEN_FOR_STATEMENT}); + REQUIRE(result.size() == 1); + + auto main_loop = reindent_text(to_nmodl(result[0])); + REQUIRE(main_loop == reindent_text(expected_loop)); + } + } +} + +SCENARIO("Vectorised derivative block", "[visitor][llvm][derivative]") { + GIVEN("After LLVM helper visitor 
transformations") { + std::string nmodl_text = R"( + NEURON { + SUFFIX hh + NONSPECIFIC_CURRENT il + RANGE minf, mtau + } + STATE { + m + } + ASSIGNED { + v (mV) + minf + mtau (ms) + } + BREAKPOINT { + SOLVE states METHOD cnexp + il = 2 + } + DERIVATIVE states { + m = (minf-m)/mtau + } + )"; + + std::string expected_main_loop = R"( + for(id = 0; idnode_count-7; id = id+8) { INTEGER node_id DOUBLE v node_id = mech->node_index[id] v = mech->voltage[node_id] mech->m[id] = (mech->minf[id]-mech->m[id])/mech->mtau[id] - SOLVE states METHOD cnexp + })"; + std::string expected_epilogue_loop = R"( + for(; idnode_count; id = id+1) { + INTEGER epilogue_node_id + DOUBLE epilogue_v + epilogue_node_id = mech->node_index[id] + epilogue_v = mech->voltage[epilogue_node_id] + mech->m[id] = (mech->minf[id]-mech->m[id])/mech->mtau[id] })"; - THEN("should contains 2 for loops") { - auto result = run_codegen_visitor_helper(nmodl_text); + THEN("vector and epilogue scalar loops are constructed") { + auto result = run_llvm_visitor_helper(nmodl_text, + /*vector_width=*/8, + {ast::AstNodeType::CODEGEN_FOR_STATEMENT}); REQUIRE(result.size() == 2); auto main_loop = reindent_text(to_nmodl(result[0])); REQUIRE(main_loop == reindent_text(expected_main_loop)); - auto reminder_loop = reindent_text(to_nmodl(result[1])); - REQUIRE(reminder_loop == reindent_text(expected_reminder_loop)); + auto epilogue_loop = reindent_text(to_nmodl(result[1])); + REQUIRE(epilogue_loop == reindent_text(expected_epilogue_loop)); } } } From 2b4a7b730b5dd93a55cfa23a0d4bfd6945dcc6a9 Mon Sep 17 00:00:00 2001 From: George Mitenkov Date: Sat, 10 Apr 2021 22:00:50 +0300 Subject: [PATCH 143/331] Gather support and vectorisation fixes for LLVM code generation (#568) * Add gather support * Fixed vectorisation patterns and added simple JIT tests * Added IR regex test for gather --- src/codegen/llvm/codegen_llvm_visitor.cpp | 103 ++++++++----------- test/unit/codegen/codegen_llvm_execution.cpp | 103 +++++++++++++++++-- test/unit/codegen/codegen_llvm_ir.cpp | 55 ++++++++++ 3 files changed, 191 insertions(+), 70 deletions(-) diff --git a/src/codegen/llvm/codegen_llvm_visitor.cpp b/src/codegen/llvm/codegen_llvm_visitor.cpp index 5fdd906480..a42201824c 100644 --- a/src/codegen/llvm/codegen_llvm_visitor.cpp +++ b/src/codegen/llvm/codegen_llvm_visitor.cpp @@ -23,11 +23,6 @@ namespace codegen { static constexpr const char instance_struct_type_name[] = "__instance_var__type"; -// The prefix is used to create a vectorised id that can be used as index to GEPs. However, for -// simple aligned vector loads and stores vector id is not needed. This is because we can bitcast -// the pointer to the vector pointer! \todo: Consider removing this. -static constexpr const char kernel_id_prefix[] = "__vec_"; - /****************************************************************************************/ /* Helper routines */ @@ -88,12 +83,11 @@ llvm::Value* CodegenLLVMVisitor::codegen_instance_var(const ast::CodegenInstance // Proceed to creating a GEP instruction to get the pointer to the member's element. auto member_indexed_name = std::dynamic_pointer_cast( member_var_name->get_name()); - llvm::Value* i64_index = get_array_index(*member_indexed_name); + if (!member_indexed_name->get_length()->is_name()) + throw std::runtime_error("Error: " + member_name + " must be indexed with a variable!"); - // Create a indices vector for GEP to return the pointer to the element at the specified index. 
- std::vector member_indices; - member_indices.push_back(i64_index); + llvm::Value* i64_index = get_array_index(*member_indexed_name); // The codegen variable type is always a scalar, so we need to transform it to a pointer. Then // load the member which would be indexed later. @@ -101,18 +95,25 @@ llvm::Value* CodegenLLVMVisitor::codegen_instance_var(const ast::CodegenInstance llvm::Value* instance_member = builder.CreateLoad(llvm::PointerType::get(type, /*AddressSpace=*/0), member_ptr); + // Check if the code is vectorised and the index is indirect. + std::string id = member_indexed_name->get_length()->get_node_name(); + if (id != kernel_id && is_kernel_code && vector_width > 1) { + // Calculate a vector of addresses via GEP instruction, and then created a gather to load + // indirectly. + llvm::Value* addresses = builder.CreateInBoundsGEP(instance_member, {i64_index}); + return builder.CreateMaskedGather(addresses, llvm::Align()); + } + + llvm::Value* member_addr = builder.CreateInBoundsGEP(instance_member, {i64_index}); // If the code is vectorised, then bitcast to a vector pointer. if (is_kernel_code && vector_width > 1) { llvm::Type* vector_type = llvm::PointerType::get(llvm::FixedVectorType::get(type, vector_width), /*AddressSpace=*/0); - llvm::Value* instance_member_bitcasted = builder.CreateBitCast(instance_member, - vector_type); - return builder.CreateInBoundsGEP(instance_member_bitcasted, member_indices); + return builder.CreateBitCast(member_addr, vector_type); } - - return builder.CreateInBoundsGEP(instance_member, member_indices); + return member_addr; } llvm::Value* CodegenLLVMVisitor::get_array_index(const ast::IndexedName& node) { @@ -135,12 +136,19 @@ llvm::Value* CodegenLLVMVisitor::get_array_index(const ast::IndexedName& node) { throw std::runtime_error("Error: only integer indexing is supported!"); // Conventionally, in LLVM array indices are 64 bit. 
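    // (both scalar i32 indices and <N x i32> index vectors are sign-extended
    // to 64 bits below)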
- auto index_type = llvm::cast(index_value->getType()); llvm::Type* i64_type = llvm::Type::getInt64Ty(*context); - if (index_type->getBitWidth() == i64_type->getIntegerBitWidth()) - return index_value; + if (auto index_type = llvm::dyn_cast(index_value->getType())) { + if (index_type->getBitWidth() == i64_type->getIntegerBitWidth()) + return index_value; + return builder.CreateSExtOrTrunc(index_value, i64_type); + } - return builder.CreateSExtOrTrunc(index_value, i64_type); + auto vector_type = llvm::cast(index_value->getType()); + auto element_type = llvm::cast(vector_type->getElementType()); + if (element_type->getBitWidth() == i64_type->getIntegerBitWidth()) + return index_value; + return builder.CreateSExtOrTrunc(index_value, + llvm::FixedVectorType::get(i64_type, vector_width)); } int CodegenLLVMVisitor::get_array_length(const ast::IndexedName& node) { @@ -167,8 +175,6 @@ llvm::Type* CodegenLLVMVisitor::get_codegen_var_type(const ast::CodegenVarType& return llvm::Type::getInt32Ty(*context); case ast::AstNodeType::VOID: return llvm::Type::getVoidTy(*context); - // TODO :: George/Ioannis : Here we have to also return INSTANCE_STRUCT type - // as it is used as an argument to nrn_state function default: throw std::runtime_error("Error: expecting a type in CodegenVarType node\n"); } @@ -576,31 +582,15 @@ void CodegenLLVMVisitor::visit_codegen_for_statement(const ast::CodegenForStatem llvm::BasicBlock* for_inc = llvm::BasicBlock::Create(*context, /*Name=*/"for.inc", func, next); llvm::BasicBlock* exit = llvm::BasicBlock::Create(*context, /*Name=*/"for.exit", func, next); - // First, initialise the loop in the same basic block. This block is optional. + // First, initialise the loop in the same basic block. This block is optional. Also, reset + // vector width to 1 if processing the remainder of the loop. + int tmp_vector_width = vector_width; if (node.get_initialization()) { node.get_initialization()->accept(*this); + } else { + vector_width = 1; } - // If the loop is to be vectorised, create a separate vector induction variable. - // \todo: See the comment for `kernel_id_prefix`. - if (vector_width > 1) { - // First, create a vector type and alloca for it. - llvm::Type* i32_type = llvm::Type::getInt32Ty(*context); - llvm::Type* vec_type = llvm::FixedVectorType::get(i32_type, vector_width); - llvm::Value* vec_alloca = builder.CreateAlloca(vec_type, - /*ArraySize=*/nullptr, - /*Name=*/kernel_id_prefix + kernel_id); - - // Then, store the initial value of <0, 1, ..., [W-1]> o the alloca pointer, where W is the - // vector width. - std::vector constants; - for (unsigned i = 0; i < vector_width; ++i) { - const auto& element = llvm::ConstantInt::get(i32_type, i); - constants.push_back(element); - } - llvm::Value* vector_id = llvm::ConstantVector::get(constants); - builder.CreateStore(vector_id, vec_alloca); - } // Branch to condition basic block and insert condition code there. builder.CreateBr(for_cond); builder.SetInsertPoint(for_cond); @@ -623,23 +613,11 @@ void CodegenLLVMVisitor::visit_codegen_for_statement(const ast::CodegenForStatem builder.SetInsertPoint(for_inc); node.get_increment()->accept(*this); - // If the code is vectorised, then increment the vector id by where W is the + // Create a branch to condition block, then generate exit code out of the loop. Restore the // vector width. - // \todo: See the comment for `kernel_id_prefix`. - if (vector_width > 1) { - // First, create an increment vector. 
- llvm::Value* vector_inc = get_constant_int_vector(vector_width); - - // Increment the kernel id elements by a constant vector width. - llvm::Value* vector_id_ptr = lookup(kernel_id_prefix + kernel_id); - llvm::Value* vector_id = builder.CreateLoad(vector_id_ptr); - llvm::Value* incremented = builder.CreateAdd(vector_id, vector_inc); - builder.CreateStore(incremented, vector_id_ptr); - } - - // Create a branch to condition block, then generate exit code out of the loop. builder.CreateBr(for_cond); builder.SetInsertPoint(exit); + vector_width = tmp_vector_width; } @@ -707,8 +685,12 @@ void CodegenLLVMVisitor::visit_codegen_var_list_statement( int length = get_array_length(*indexed_name); var_type = llvm::ArrayType::get(scalar_var_type, length); } else if (identifier->is_name()) { - // This case corresponds to a scalar local variable. Its type is double by default. - var_type = scalar_var_type; + // This case corresponds to a scalar or vector local variable. + if (is_kernel_code && vector_width > 1) { + var_type = llvm::FixedVectorType::get(scalar_var_type, vector_width); + } else { + var_type = scalar_var_type; + } } else { throw std::runtime_error("Error: Unsupported local variable type"); } @@ -881,10 +863,11 @@ void CodegenLLVMVisitor::visit_unary_expression(const ast::UnaryExpression& node void CodegenLLVMVisitor::visit_var_name(const ast::VarName& node) { llvm::Value* ptr = get_variable_ptr(node); - // Finally, load the variable from the pointer value. - llvm::Value* var = builder.CreateLoad(ptr); + // Finally, load the variable from the pointer value unless it has already been loaded (e.g. via + // gather instruction). + llvm::Value* var = ptr->getType()->isPointerTy() ? builder.CreateLoad(ptr) : ptr; - // If the vale should not be vectorised, or it is already a vector, add it to the stack. + // If the value should not be vectorised, or it is already a vector, add it to the stack. if (!is_kernel_code || vector_width <= 1 || var->getType()->isVectorTy()) { values.push_back(var); return; diff --git a/test/unit/codegen/codegen_llvm_execution.cpp b/test/unit/codegen/codegen_llvm_execution.cpp index c0764c7897..782a3374b8 100644 --- a/test/unit/codegen/codegen_llvm_execution.cpp +++ b/test/unit/codegen/codegen_llvm_execution.cpp @@ -29,8 +29,8 @@ static double EPSILON = 1e-15; //============================================================================= struct InstanceTestInfo { - codegen::CodegenInstanceData& instance; - codegen::CodegenLLVMVisitor& visitor; + codegen::CodegenInstanceData* instance; + codegen::InstanceVarHelper helper; int num_elements; }; @@ -39,11 +39,11 @@ bool check_instance_variable(InstanceTestInfo& instance_info, std::vector& expected, const std::string& variable_name) { std::vector actual; - int variable_index = instance_info.visitor.get_instance_var_helper().get_variable_index( - variable_name); - actual.assign(static_cast(instance_info.instance.members[variable_index]), - static_cast(instance_info.instance.members[variable_index]) + + int variable_index = instance_info.helper.get_variable_index(variable_name); + actual.assign(static_cast(instance_info.instance->members[variable_index]), + static_cast(instance_info.instance->members[variable_index]) + instance_info.num_elements); + // While we are comparing double types as well, for simplicity the test cases are hand-crafted // so that no floating-point arithmetic is really involved. 
return actual == expected; @@ -53,9 +53,8 @@ template void initialise_instance_variable(InstanceTestInfo& instance_info, std::vector& data, const std::string& variable_name) { - int variable_index = instance_info.visitor.get_instance_var_helper().get_variable_index( - variable_name); - T* data_start = static_cast(instance_info.instance.members[variable_index]); + int variable_index = instance_info.helper.get_variable_index(variable_name); + T* data_start = static_cast(instance_info.instance->members[variable_index]); for (int i = 0; i < instance_info.num_elements; ++i) *(data_start + i) = data[i]; } @@ -317,7 +316,9 @@ SCENARIO("Simple scalar kernel", "[llvm][runner]") { std::vector x0 = {5.0, 5.0, 5.0, 5.0}; std::vector x1 = {1.0, 1.0, 1.0, 1.0}; - InstanceTestInfo instance_info{instance_data, llvm_visitor, num_elements}; + InstanceTestInfo instance_info{&instance_data, + llvm_visitor.get_instance_var_helper(), + num_elements}; initialise_instance_variable(instance_info, x, "x"); initialise_instance_variable(instance_info, x0, "x0"); initialise_instance_variable(instance_info, x1, "x1"); @@ -334,3 +335,85 @@ SCENARIO("Simple scalar kernel", "[llvm][runner]") { } } } + +//============================================================================= +// State vectorised kernel with optimisations on. +//============================================================================= + +SCENARIO("Simple vectorised kernel", "[llvm][runner]") { + GIVEN("Simple MOD file with a state update") { + std::string nmodl_text = R"( + NEURON { + SUFFIX test + NONSPECIFIC_CURRENT i + RANGE x0, x1 + } + + STATE { + x + } + + ASSIGNED { + v + x0 + x1 + } + + BREAKPOINT { + SOLVE states METHOD cnexp + i = 0 + } + + DERIVATIVE states { + x = (x0 - x) / x1 + } + )"; + + + NmodlDriver driver; + const auto& ast = driver.parse_string(nmodl_text); + + // Run passes on the AST to generate LLVM. + SymtabVisitor().visit_program(*ast); + NeuronSolveVisitor().visit_program(*ast); + SolveBlockVisitor().visit_program(*ast); + codegen::CodegenLLVMVisitor llvm_visitor(/*mod_filename=*/"unknown", + /*output_dir=*/".", + /*opt_passes=*/true, + /*use_single_precision=*/false, + /*vector_width=*/4); + llvm_visitor.visit_program(*ast); + llvm_visitor.wrap_kernel_function("nrn_state_test"); + + // Create the instance struct data. + int num_elements = 10; + const auto& generated_instance_struct = llvm_visitor.get_instance_struct_ptr(); + auto codegen_data = codegen::CodegenDataHelper(ast, generated_instance_struct); + auto instance_data = codegen_data.create_data(num_elements, /*seed=*/1); + + // Fill the instance struct data with some values for unit testing. + std::vector x = {1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0}; + std::vector x0 = {11.0, 11.0, 11.0, 11.0, 11.0, 11.0, 11.0, 11.0, 11.0, 11.0}; + std::vector x1 = {1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0}; + + InstanceTestInfo instance_info{&instance_data, + llvm_visitor.get_instance_var_helper(), + num_elements}; + initialise_instance_variable(instance_info, x, "x"); + initialise_instance_variable(instance_info, x0, "x0"); + initialise_instance_variable(instance_info, x1, "x1"); + + // Set up the JIT runner. 
+            std::unique_ptr<llvm::Module> module = llvm_visitor.get_module();
+            Runner runner(std::move(module));
+
+            THEN("Values in struct have changed according to the formula") {
+                runner.run_with_argument<int, void*>("__nrn_state_test_wrapper",
+                                                     instance_data.base_ptr);
+                std::vector<double> x_expected = {10.0, 9.0, 8.0, 7.0, 6.0, 5.0, 4.0, 3.0, 2.0, 1.0};
+
+                // Check that the main and remainder loops correctly change the data stored in x.
+                REQUIRE(check_instance_variable(instance_info, x_expected, "x"));
+            }
+        }
+    }
+}
diff --git a/test/unit/codegen/codegen_llvm_ir.cpp b/test/unit/codegen/codegen_llvm_ir.cpp
index b51a4e3d58..dfa6d271dc 100644
--- a/test/unit/codegen/codegen_llvm_ir.cpp
+++ b/test/unit/codegen/codegen_llvm_ir.cpp
@@ -901,6 +901,61 @@ SCENARIO("Scalar state kernel", "[visitor][llvm]") {
     }
 }

+//=============================================================================
+// Gather for vectorised kernel
+//=============================================================================
+
+SCENARIO("Vectorised simple kernel", "[visitor][llvm]") {
+    GIVEN("An indirect indexing of voltage") {
+        std::string nmodl_text = R"(
+            NEURON {
+                SUFFIX hh
+                NONSPECIFIC_CURRENT i
+            }
+
+            STATE {}
+
+            ASSIGNED {
+                v (mV)
+            }
+
+            BREAKPOINT {
+                SOLVE states METHOD cnexp
+                i = 2
+            }
+
+            DERIVATIVE states {}
+        )";
+
+        THEN("a gather instruction is created") {
+            std::string module_string = run_llvm_visitor(nmodl_text,
+                                                         /*opt=*/false,
+                                                         /*use_single_precision=*/false,
+                                                         /*vector_width=*/4);
+            std::smatch m;
+
+            // Check gather intrinsic is correctly declared.
+            std::regex declaration(
+                R"(declare <4 x double> @llvm\.masked\.gather\.v4f64\.v4p0f64\(<4 x double\*>, i32 immarg, <4 x i1>, <4 x double>\) )");
+            REQUIRE(std::regex_search(module_string, m, declaration));
+
+            // Check that the indices vector is created correctly and extended to i64.
+            std::regex index_load(R"(load <4 x i32>, <4 x i32>\* %node_id)");
+            std::regex sext(R"(sext <4 x i32> %.* to <4 x i64>)");
+            REQUIRE(std::regex_search(module_string, m, index_load));
+            REQUIRE(std::regex_search(module_string, m, sext));
+
+            // Check that the access to `voltage` is performed via gather instruction.
+            //   v = mech->voltage[node_id]
+            std::regex gather(
+                "call <4 x double> @llvm\\.masked\\.gather\\.v4f64\\.v4p0f64\\("
+                "<4 x double\\*> %.*, i32 1, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x "
+                "double> undef\\)");
+            REQUIRE(std::regex_search(module_string, m, gather));
+        }
+    }
+}
+
 //=============================================================================
 // Derivative block : test optimization
 //=============================================================================

From ecebcc0b4b31c11e93e4c9eb6e600e9aeb46f0b7 Mon Sep 17 00:00:00 2001
From: George Mitenkov
Date: Tue, 13 Apr 2021 08:31:28 +0300
Subject: [PATCH 144/331] Verification and file utilities for LLVM IR codegen
 (#582)

Added several minor improvements to the current pipeline infrastructure.
Particularly, the following was addressed:
- The generated IR module is now verified after running the visitor
- The kernel is checked to see whether it can be vectorised
- The generated IR can be dumped to `.ll` file with `-o `
- Printing LLVM IR is moved to debug mode
---
 src/codegen/llvm/codegen_llvm_visitor.cpp     | 58 ++++++++++++++++++-
 src/codegen/llvm/codegen_llvm_visitor.hpp     |  6 +-
 .../codegen/codegen_llvm_instance_struct.cpp  |  2 +-
 test/unit/codegen/codegen_llvm_ir.cpp         |  2 +-
 4 files changed, 61 insertions(+), 7 deletions(-)

diff --git a/src/codegen/llvm/codegen_llvm_visitor.cpp b/src/codegen/llvm/codegen_llvm_visitor.cpp
index a42201824c..b080a1638f 100644
--- a/src/codegen/llvm/codegen_llvm_visitor.cpp
+++ b/src/codegen/llvm/codegen_llvm_visitor.cpp
@@ -9,13 +9,17 @@
 #include "ast/all.hpp"
 #include "visitors/rename_visitor.hpp"
+#include "visitors/visitor_utils.hpp"

+#include "llvm/IR/AssemblyAnnotationWriter.h"
 #include "llvm/IR/BasicBlock.h"
 #include "llvm/IR/Constants.h"
 #include "llvm/IR/Function.h"
 #include "llvm/IR/LLVMContext.h"
 #include "llvm/IR/Type.h"
 #include "llvm/IR/ValueSymbolTable.h"
+#include "llvm/Support/FileSystem.h"
+#include "llvm/Support/ToolOutputFile.h"

 namespace nmodl {
 namespace codegen {
@@ -28,12 +32,31 @@ static constexpr const char instance_struct_type_name[] = "__instance_var__type"
 /*                                 Helper routines                                      */
 /****************************************************************************************/

+/// A utility to check for supported Statement AST nodes.
 static bool is_supported_statement(const ast::Statement& statement) {
     return statement.is_codegen_var_list_statement() || statement.is_expression_statement() ||
            statement.is_codegen_for_statement() || statement.is_codegen_return_statement() ||
            statement.is_if_statement() || statement.is_while_statement();
 }

+/// A utility to check if the kernel body can be vectorised.
+static bool can_vectorise(const ast::CodegenForStatement& statement, symtab::SymbolTable* sym_tab) {
+    // Check that function calls are made to external methods only.
+    const auto& function_calls = collect_nodes(statement, {ast::AstNodeType::FUNCTION_CALL});
+    for (const auto& call: function_calls) {
+        const auto& name = call->get_node_name();
+        auto symbol = sym_tab->lookup(name);
+        if (symbol && !symbol->has_any_property(symtab::syminfo::NmodlType::extern_method))
+            return false;
+    }
+
+    // Check there is no control flow in the kernel.
+    const std::vector<ast::AstNodeType> unsupported_nodes = {ast::AstNodeType::IF_STATEMENT};
+    const auto& collected = collect_nodes(statement, unsupported_nodes);
+
+    return collected.empty();
+}
+
 llvm::Value* CodegenLLVMVisitor::create_gep(const std::string& name, llvm::Value* index) {
     llvm::Type* index_type = llvm::Type::getInt64Ty(*context);
     std::vector<llvm::Value*> indices;
@@ -582,9 +605,18 @@ void CodegenLLVMVisitor::visit_codegen_for_statement(const ast::CodegenForStatem
     llvm::BasicBlock* for_inc = llvm::BasicBlock::Create(*context, /*Name=*/"for.inc", func, next);
     llvm::BasicBlock* exit = llvm::BasicBlock::Create(*context, /*Name=*/"for.exit", func, next);

+    // Save the vector width.
+    int tmp_vector_width = vector_width;
+
+    // Check if the kernel can be vectorised. If not, generate scalar code.
+    if (!can_vectorise(node, sym_tab)) {
+        logger->info("Cannot vectorise the for loop in '" + current_func->getName().str() + "'");
+        logger->info("Generating scalar code...");
+        vector_width = 1;
+    }
+
     // First, initialise the loop in the same basic block. This block is optional.
Also, reset // vector width to 1 if processing the remainder of the loop. - int tmp_vector_width = vector_width; if (node.get_initialization()) { node.get_initialization()->accept(*this); } else { @@ -833,13 +865,33 @@ void CodegenLLVMVisitor::visit_program(const ast::Program& node) { visit_codegen_function(*func); } + // Verify the generated LLVM IR module. + std::string error; + llvm::raw_string_ostream ostream(error); + if (verifyModule(*module, &ostream)) { + throw std::runtime_error("Error: incorrect IR has been generated!\n" + ostream.str()); + } + if (opt_passes) { logger->info("Running LLVM optimisation passes"); run_llvm_opt_passes(); } - // Keep this for easier development (maybe move to debug mode later). - std::cout << print_module(); + // If the output directory is specified, save the IR to .ll file. + // \todo: Consider saving the generated LLVM IR to bytecode (.bc) file instead. + if (output_dir != ".") { + std::error_code error_code; + std::unique_ptr out = std::make_unique( + output_dir + "/" + mod_filename + ".ll", error_code, llvm::sys::fs::OF_Text); + if (error_code) + throw std::runtime_error("Error: " + error_code.message()); + + std::unique_ptr annotator; + module->print(out->os(), annotator.get()); + out->keep(); + } + + logger->debug("Dumping generated IR...\n" + dump_module()); } void CodegenLLVMVisitor::visit_procedure_block(const ast::ProcedureBlock& node) { diff --git a/src/codegen/llvm/codegen_llvm_visitor.hpp b/src/codegen/llvm/codegen_llvm_visitor.hpp index b099646b07..f001c2c2fe 100644 --- a/src/codegen/llvm/codegen_llvm_visitor.hpp +++ b/src/codegen/llvm/codegen_llvm_visitor.hpp @@ -321,8 +321,10 @@ class CodegenLLVMVisitor: public visitor::ConstAstVisitor { void visit_var_name(const ast::VarName& node) override; void visit_while_statement(const ast::WhileStatement& node) override; - // \todo: move this to debug mode (e.g. -v option or --dump-ir) - std::string print_module() const { + /** + * Dumps the generated LLVM IR module to string. 
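+     * The string form is mainly consumed by the unit tests, which match the
+     * emitted IR against regular expressions.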
+ */ + std::string dump_module() const { std::string str; llvm::raw_string_ostream os(str); os << *module; diff --git a/test/unit/codegen/codegen_llvm_instance_struct.cpp b/test/unit/codegen/codegen_llvm_instance_struct.cpp index 4bfa1cd31c..52b9bb9868 100644 --- a/test/unit/codegen/codegen_llvm_instance_struct.cpp +++ b/test/unit/codegen/codegen_llvm_instance_struct.cpp @@ -45,7 +45,7 @@ codegen::CodegenInstanceData generate_instance_data(const std::string& text, use_single_precision, vector_width); llvm_visitor.visit_program(*ast); - llvm_visitor.print_module(); + llvm_visitor.dump_module(); const auto& generated_instance_struct = llvm_visitor.get_instance_struct_ptr(); auto codegen_data = codegen::CodegenDataHelper(ast, generated_instance_struct); auto instance_data = codegen_data.create_data(num_elements, seed); diff --git a/test/unit/codegen/codegen_llvm_ir.cpp b/test/unit/codegen/codegen_llvm_ir.cpp index dfa6d271dc..83807fedbf 100644 --- a/test/unit/codegen/codegen_llvm_ir.cpp +++ b/test/unit/codegen/codegen_llvm_ir.cpp @@ -50,7 +50,7 @@ std::string run_llvm_visitor(const std::string& text, use_single_precision, vector_width); llvm_visitor.visit_program(*ast); - return llvm_visitor.print_module(); + return llvm_visitor.dump_module(); } //============================================================================= From 4c13787863e079aa9945e0c86e3f3654544aa0f7 Mon Sep 17 00:00:00 2001 From: George Mitenkov Date: Fri, 16 Apr 2021 19:20:29 +0300 Subject: [PATCH 145/331] Add gather execution test (#591) --- test/unit/codegen/codegen_llvm_execution.cpp | 17 ++++++++++++++--- 1 file changed, 14 insertions(+), 3 deletions(-) diff --git a/test/unit/codegen/codegen_llvm_execution.cpp b/test/unit/codegen/codegen_llvm_execution.cpp index 782a3374b8..b191f350df 100644 --- a/test/unit/codegen/codegen_llvm_execution.cpp +++ b/test/unit/codegen/codegen_llvm_execution.cpp @@ -350,7 +350,7 @@ SCENARIO("Simple vectorised kernel", "[llvm][runner]") { } STATE { - x + x y } ASSIGNED { @@ -366,6 +366,7 @@ SCENARIO("Simple vectorised kernel", "[llvm][runner]") { DERIVATIVE states { x = (x0 - x) / x1 + y = v } )"; @@ -396,6 +397,9 @@ SCENARIO("Simple vectorised kernel", "[llvm][runner]") { std::vector x0 = {11.0, 11.0, 11.0, 11.0, 11.0, 11.0, 11.0, 11.0, 11.0, 11.0}; std::vector x1 = {1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0}; + std::vector voltage = {3.0, 4.0, 7.0, 1.0, 2.0, 5.0, 8.0, 6.0, 10.0, 9.0}; + std::vector node_index = {3, 4, 0, 1, 5, 7, 2, 6, 9, 8}; + InstanceTestInfo instance_info{&instance_data, llvm_visitor.get_instance_var_helper(), num_elements}; @@ -403,6 +407,9 @@ SCENARIO("Simple vectorised kernel", "[llvm][runner]") { initialise_instance_variable(instance_info, x0, "x0"); initialise_instance_variable(instance_info, x1, "x1"); + initialise_instance_variable(instance_info, voltage, "voltage"); + initialise_instance_variable(instance_info, node_index, "node_index"); + // Set up the JIT runner. std::unique_ptr module = llvm_visitor.get_module(); Runner runner(std::move(module)); @@ -410,10 +417,14 @@ SCENARIO("Simple vectorised kernel", "[llvm][runner]") { THEN("Values in struct have changed according to the formula") { runner.run_with_argument("__nrn_state_test_wrapper", instance_data.base_ptr); - std::vector x_expected = {10.0, 9.0, 8.0, 7.0, 6.0, 5.0, 4.0, 3.0, 2.0, 1.0}; - // Check that the main and remainder loops correctly change the data stored in x. 
+            std::vector<double> x_expected = {10.0, 9.0, 8.0, 7.0, 6.0, 5.0, 4.0, 3.0, 2.0, 1.0};
             REQUIRE(check_instance_variable(instance_info, x_expected, "x"));
+
+            // Check that the gather load produces correct results in y:
+            //   y[id] = voltage[node_index[id]]
+            std::vector<double> y_expected = {1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0};
+            REQUIRE(check_instance_variable(instance_info, y_expected, "y"));
         }
     }
 }

From 566ea70fa157ca035dee9f516862fa5be0a21469 Mon Sep 17 00:00:00 2001
From: George Mitenkov
Date: Sat, 17 Apr 2021 09:34:46 +0300
Subject: [PATCH 146/331] Fixed loop allocations (#590)

* avoid local variables inside the loop so they do not become allocas
* this was causing stack overflow for large instance counts
---
 .../llvm/codegen_llvm_helper_visitor.cpp  | 32 ++++++++++++++-----
 src/codegen/llvm/codegen_llvm_visitor.cpp | 28 +++++++++++++---
 test/unit/codegen/codegen_llvm_ir.cpp     | 24 +++++---------
 3 files changed, 55 insertions(+), 29 deletions(-)

diff --git a/src/codegen/llvm/codegen_llvm_helper_visitor.cpp b/src/codegen/llvm/codegen_llvm_helper_visitor.cpp
index c3e9159dfa..eec79370f6 100644
--- a/src/codegen/llvm/codegen_llvm_helper_visitor.cpp
+++ b/src/codegen/llvm/codegen_llvm_helper_visitor.cpp
@@ -562,15 +562,16 @@ void CodegenLLVMHelperVisitor::visit_nrn_state_block(ast::NrnStateBlock& node) {
     function_statements.push_back(
         create_local_variable_statement(induction_variables, INTEGER_TYPE));

+    /// create vectors of local variables that would be used in compute part
+    std::vector<std::string> int_variables{"node_id"};
+    std::vector<std::string> double_variables{"v"};
+
     /// create now main compute part : for loop over channel instances

     /// loop body : initialization + solve blocks
     ast::StatementVector loop_def_statements;
     ast::StatementVector loop_index_statements;
     ast::StatementVector loop_body_statements;
-
-    std::vector<std::string> int_variables{"node_id"};
-    std::vector<std::string> double_variables{"v"};
     {
         /// access node index and corresponding voltage
         loop_index_statements.push_back(
@@ -597,6 +598,7 @@ void CodegenLLVMHelperVisitor::visit_nrn_state_block(ast::NrnStateBlock& node) {
         /// add breakpoint block if no current
         if (info.currents.empty() && info.breakpoint_node != nullptr) {
             auto block = info.breakpoint_node->get_statement_block();
+            // \todo this automatically adds `SOLVE states METHOD ...`
             append_statements_from_block(loop_body_statements, block);
         }

@@ -607,10 +609,6 @@ void CodegenLLVMHelperVisitor::visit_nrn_state_block(ast::NrnStateBlock& node) {
                                      loop_index_statements,
                                      loop_body_statements);

-        loop_def_statements.push_back(create_local_variable_statement(int_variables, INTEGER_TYPE));
-        loop_def_statements.push_back(
-            create_local_variable_statement(double_variables, FLOAT_TYPE));
-
         // \todo handle process_shadow_update_statement and wrote_conc_call yet
     }

@@ -622,6 +620,10 @@ void CodegenLLVMHelperVisitor::visit_nrn_state_block(ast::NrnStateBlock& node) {
     /// now construct a new code block which will become the body of the loop
     auto loop_block = std::make_shared<ast::StatementBlock>(loop_body);

+    /// declare main FOR loop local variables
+    function_statements.push_back(create_local_variable_statement(int_variables, INTEGER_TYPE));
+    function_statements.push_back(create_local_variable_statement(double_variables, FLOAT_TYPE));
+
     /// main loop possibly vectorized on vector_width
     {
         /// loop constructs : initialization, condition and increment
@@ -647,6 +649,10 @@ void CodegenLLVMHelperVisitor::visit_nrn_state_block(ast::NrnStateBlock& node) {
         function_statements.push_back(for_loop_statement_main);
     }

+    /// vectors containing renamed FOR loop local
variables + std::vector renamed_int_variables; + std::vector renamed_double_variables; + /// remainder loop possibly vectorized on vector_width if (vector_width > 1) { /// loop constructs : initialization, condition and increment @@ -664,14 +670,24 @@ void CodegenLLVMHelperVisitor::visit_nrn_state_block(ast::NrnStateBlock& node) { // \todo: Change RenameVisitor to take a vector of names to which it would append a single // prefix. for (const auto& name: int_variables) { - visitor::RenameVisitor v(name, epilogue_variable_prefix + name); + std::string new_name = epilogue_variable_prefix + name; + renamed_int_variables.push_back(new_name); + visitor::RenameVisitor v(name, new_name); loop_statements->accept(v); } for (const auto& name: double_variables) { + std::string new_name = epilogue_variable_prefix + name; + renamed_double_variables.push_back(new_name); visitor::RenameVisitor v(name, epilogue_variable_prefix + name); loop_statements->accept(v); } + /// declare remainder FOR loop local variables + function_statements.push_back( + create_local_variable_statement(renamed_int_variables, INTEGER_TYPE)); + function_statements.push_back( + create_local_variable_statement(renamed_double_variables, FLOAT_TYPE)); + /// convert all variables inside loop body to instance variables convert_to_instance_variable(*for_loop_statement_remainder, loop_index_var); diff --git a/src/codegen/llvm/codegen_llvm_visitor.cpp b/src/codegen/llvm/codegen_llvm_visitor.cpp index b080a1638f..3a165e465a 100644 --- a/src/codegen/llvm/codegen_llvm_visitor.cpp +++ b/src/codegen/llvm/codegen_llvm_visitor.cpp @@ -592,6 +592,9 @@ void CodegenLLVMVisitor::visit_boolean(const ast::Boolean& node) { // | | // +---------------------------+ void CodegenLLVMVisitor::visit_codegen_for_statement(const ast::CodegenForStatement& node) { + // Disable vector code generation for condition and increment blocks. + is_kernel_code = false; + // Get the current and the next blocks within the function. llvm::BasicBlock* curr_block = builder.GetInsertBlock(); llvm::BasicBlock* next = curr_block->getNextNode(); @@ -650,6 +653,7 @@ void CodegenLLVMVisitor::visit_codegen_for_statement(const ast::CodegenForStatem builder.CreateBr(for_cond); builder.SetInsertPoint(exit); vector_width = tmp_vector_width; + is_kernel_code = true; } @@ -682,11 +686,19 @@ void CodegenLLVMVisitor::visit_codegen_function(const ast::CodegenFunction& node builder.CreateStore(&arg, alloca); } - // Process function or procedure body. The return statement is handled in a separate visitor. - block->accept(*this); + // Process function or procedure body. If the function is a compute kernel, then set the + // corresponding flags. The return statement is handled in a separate visitor. + bool has_void_ret_type = node.get_return_type()->get_type() == ast::AstNodeType::VOID; + if (has_void_ret_type) { + is_kernel_code = true; + block->accept(*this); + is_kernel_code = false; + } else { + block->accept(*this); + } // If function has a void return type, add a terminator not handled by CodegenReturnVar. - if (node.get_return_type()->get_type() == ast::AstNodeType::VOID) + if (has_void_ret_type) builder.CreateRetVoid(); // Clear local values stack and remove the pointer to the local symbol table. @@ -718,7 +730,13 @@ void CodegenLLVMVisitor::visit_codegen_var_list_statement( var_type = llvm::ArrayType::get(scalar_var_type, length); } else if (identifier->is_name()) { // This case corresponds to a scalar or vector local variable. 
- if (is_kernel_code && vector_width > 1) { + const auto& identifier_name = identifier->get_node_name(); + + // Even if generating vectorised code, some variables still need to be scalar. + // Particularly, the induction variable "id" and remainder loop variables (that start + // with "epilogue"). + if (is_kernel_code && vector_width > 1 && identifier_name != kernel_id && + identifier_name.rfind("epilogue", 0)) { var_type = llvm::FixedVectorType::get(scalar_var_type, vector_width); } else { var_type = scalar_var_type; @@ -726,7 +744,7 @@ void CodegenLLVMVisitor::visit_codegen_var_list_statement( } else { throw std::runtime_error("Error: Unsupported local variable type"); } - llvm::Value* alloca = builder.CreateAlloca(var_type, /*ArraySize=*/nullptr, name); + builder.CreateAlloca(var_type, /*ArraySize=*/nullptr, name); } } diff --git a/test/unit/codegen/codegen_llvm_ir.cpp b/test/unit/codegen/codegen_llvm_ir.cpp index 83807fedbf..207548ee46 100644 --- a/test/unit/codegen/codegen_llvm_ir.cpp +++ b/test/unit/codegen/codegen_llvm_ir.cpp @@ -845,10 +845,14 @@ SCENARIO("Scalar state kernel", "[visitor][llvm]") { REQUIRE(std::regex_search(module_string, m, struct_type)); REQUIRE(std::regex_search(module_string, m, kernel_declaration)); - // Check for correct induction variable initialisation and a branch to condition block. - std::regex alloca_instr(R"(%id = alloca i32)"); + // Check for correct variables initialisation and a branch to condition block. + std::regex id_initialisation(R"(%id = alloca i32)"); + std::regex node_id_initialisation(R"(%node_id = alloca i32)"); + std::regex v_initialisation(R"(%v = alloca double)"); std::regex br(R"(br label %for\.cond)"); - REQUIRE(std::regex_search(module_string, m, alloca_instr)); + REQUIRE(std::regex_search(module_string, m, id_initialisation)); + REQUIRE(std::regex_search(module_string, m, node_id_initialisation)); + REQUIRE(std::regex_search(module_string, m, v_initialisation)); REQUIRE(std::regex_search(module_string, m, br)); // Check condition block: id < mech->node_count, and a conditional branch to loop body @@ -865,12 +869,7 @@ SCENARIO("Scalar state kernel", "[visitor][llvm]") { REQUIRE(std::regex_search(module_string, m, condition)); REQUIRE(std::regex_search(module_string, m, cond_br)); - // In the body block, `node_id` and voltage `v` are initialised with the data from the - // struct. Check for variable allocations and correct loads from the struct with GEPs. - std::regex initialisation( - "for\\.body:.*\n" - " %node_id = alloca i32,.*\n" - " %v = alloca double,.*"); + // Check for correct loads from the struct with GEPs. std::regex load_from_struct( " %.* = load %.*__instance_var__type\\*, %.*__instance_var__type\\*\\* %.*\n" " %.* = getelementptr inbounds %.*__instance_var__type, " @@ -880,7 +879,6 @@ SCENARIO("Scalar state kernel", "[visitor][llvm]") { " %.* = load (i32|double)\\*, (i32|double)\\*\\* %.*\n" " %.* = getelementptr inbounds (i32|double), (i32|double)\\* %.*, i64 %.*\n" " %.* = load (i32|double), (i32|double)\\* %.*"); - REQUIRE(std::regex_search(module_string, m, initialisation)); REQUIRE(std::regex_search(module_string, m, load_from_struct)); // Check induction variable is incremented in increment block. 
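A note on the `rfind` check introduced in `visit_codegen_var_list_statement` above: `std::string::rfind(prefix, 0)` anchors the search at position 0, so it returns `0` exactly when the name starts with `prefix`; any other result (including `npos`, which is non-zero and therefore truthy) means the name has no `epilogue` prefix and the variable may be vectorised. A minimal standalone illustration (variable names are hypothetical):

```
#include <cassert>
#include <string>

int main() {
    const std::string a = "epilogue_node_id";
    const std::string b = "node_id";
    assert(a.rfind("epilogue", 0) == 0);                 // has the prefix -> kept scalar
    assert(b.rfind("epilogue", 0) == std::string::npos); // no prefix -> may be vectorised
    return 0;
}
```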
@@ -987,8 +985,6 @@ SCENARIO("Scalar derivative block", "[visitor][llvm][derivative]") { std::string expected_loop = R"( for(id = 0; idnode_count; id = id+1) { - INTEGER node_id - DOUBLE v node_id = mech->node_index[id] v = mech->voltage[node_id] mech->m[id] = (mech->minf[id]-mech->m[id])/mech->mtau[id] @@ -1033,16 +1029,12 @@ SCENARIO("Vectorised derivative block", "[visitor][llvm][derivative]") { std::string expected_main_loop = R"( for(id = 0; idnode_count-7; id = id+8) { - INTEGER node_id - DOUBLE v node_id = mech->node_index[id] v = mech->voltage[node_id] mech->m[id] = (mech->minf[id]-mech->m[id])/mech->mtau[id] })"; std::string expected_epilogue_loop = R"( for(; idnode_count; id = id+1) { - INTEGER epilogue_node_id - DOUBLE epilogue_v epilogue_node_id = mech->node_index[id] epilogue_v = mech->voltage[epilogue_node_id] mech->m[id] = (mech->minf[id]-mech->m[id])/mech->mtau[id] From 0c2cfc3321f9daacc4305e29bf2c22b1ce9a9469 Mon Sep 17 00:00:00 2001 From: George Mitenkov Date: Sat, 17 Apr 2021 09:51:19 +0300 Subject: [PATCH 147/331] Benchmarking LLVM code generation (#583) Introduced the benchmarking for LLVM code generation pipeline. For that, new options have been added: ``` benchmark LLVM benchmark option Options: --run Run LLVM benchmark (false) --instance-size INT Instance struct size (10000) --repeat INT Number of experiments for benchmarking (100) --backend TEXT:{avx2, default, sse2} Target's backend (default) ``` The JIT runner has also been modified to extract the target information correctly, and disable available CPU features for benchmarking a specific backend. Example: ``` $ nmodl hh.mod llvm --ir --vector-width 1 benchmark --run --instance-size 100 --repeat 2 --backend default Created LLVM IR module from NMODL AST in 0.006765817 Benchmarking kernel 'nrn_state_hh' Experiment 0: compute time = 0.013977749 Experiment 1: compute time = 0.004847989 Average compute time = 0.0058550929 ``` Co-authored-by: Pramod Kumbhar --- src/CMakeLists.txt | 2 +- src/codegen/llvm/CMakeLists.txt | 5 +- src/codegen/llvm/codegen_llvm_visitor.cpp | 83 ++++++---- src/codegen/llvm/codegen_llvm_visitor.hpp | 11 +- src/codegen/llvm/jit_driver.cpp | 59 +++---- src/codegen/llvm/jit_driver.hpp | 11 +- src/codegen/llvm/llvm_benchmark.cpp | 157 +++++++++++++++++++ src/codegen/llvm/llvm_benchmark.hpp | 85 ++++++++++ src/main.cpp | 43 ++++- test/unit/CMakeLists.txt | 5 + test/unit/codegen/codegen_llvm_execution.cpp | 4 +- 11 files changed, 390 insertions(+), 75 deletions(-) create mode 100644 src/codegen/llvm/llvm_benchmark.cpp create mode 100644 src/codegen/llvm/llvm_benchmark.hpp diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 4c8a9801a7..d27a039de7 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -36,7 +36,7 @@ target_link_libraries( ${NMODL_WRAPPER_LIBS}) if(NMODL_ENABLE_LLVM) - target_link_libraries(nmodl llvm_codegen ${LLVM_LIBS_TO_LINK}) + target_link_libraries(nmodl llvm_codegen llvm_benchmark ${LLVM_LIBS_TO_LINK}) endif() # ============================================================================= diff --git a/src/codegen/llvm/CMakeLists.txt b/src/codegen/llvm/CMakeLists.txt index bd54f4143d..8c2a295598 100644 --- a/src/codegen/llvm/CMakeLists.txt +++ b/src/codegen/llvm/CMakeLists.txt @@ -7,7 +7,9 @@ set(LLVM_CODEGEN_SOURCE_FILES ${CMAKE_CURRENT_SOURCE_DIR}/codegen_llvm_helper_visitor.cpp ${CMAKE_CURRENT_SOURCE_DIR}/codegen_llvm_helper_visitor.hpp ${CMAKE_CURRENT_SOURCE_DIR}/jit_driver.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/jit_driver.hpp) + 
${CMAKE_CURRENT_SOURCE_DIR}/jit_driver.hpp + ${CMAKE_CURRENT_SOURCE_DIR}/llvm_benchmark.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/llvm_benchmark.hpp) # ============================================================================= # LLVM codegen library and executable @@ -29,6 +31,7 @@ if(NOT NMODL_AS_SUBPROJECT) nmodl_llvm_runner llvm_codegen codegen + llvm_benchmark visitor symtab lexer diff --git a/src/codegen/llvm/codegen_llvm_visitor.cpp b/src/codegen/llvm/codegen_llvm_visitor.cpp index 3a165e465a..ea7e828035 100644 --- a/src/codegen/llvm/codegen_llvm_visitor.cpp +++ b/src/codegen/llvm/codegen_llvm_visitor.cpp @@ -976,40 +976,57 @@ void CodegenLLVMVisitor::visit_while_statement(const ast::WhileStatement& node) builder.SetInsertPoint(exit); } -void CodegenLLVMVisitor::wrap_kernel_function(const std::string& kernel_name) { - // Get the kernel function and the instance struct type. - auto kernel = module->getFunction(kernel_name); - if (!kernel) - throw std::runtime_error("Kernel " + kernel_name + " is not found!"); - - if (std::distance(kernel->args().begin(), kernel->args().end()) != 1) - throw std::runtime_error("Kernel " + kernel_name + " must have a single argument!"); - - auto instance_struct_ptr_type = llvm::dyn_cast(kernel->getArg(0)->getType()); - if (!instance_struct_ptr_type) - throw std::runtime_error("Kernel " + kernel_name + - " does not have an instance struct pointer argument!"); - - // Create a wrapper void function that takes a void pointer as a single argument. - llvm::Type* void_type = llvm::Type::getVoidTy(*context); - llvm::Type* i32_type = llvm::Type::getInt32Ty(*context); - llvm::Type* void_ptr_type = llvm::PointerType::get(void_type, /*AddressSpace=*/0); - llvm::Function* wrapper_func = llvm::Function::Create( - llvm::FunctionType::get(i32_type, {void_ptr_type}, /*isVarArg=*/false), - llvm::Function::ExternalLinkage, - "__" + kernel_name + "_wrapper", - *module); - llvm::BasicBlock* body = llvm::BasicBlock::Create(*context, /*Name=*/"", wrapper_func); - builder.SetInsertPoint(body); +void CodegenLLVMVisitor::find_kernel_names(std::vector& container) { + // By convention, only the kernel functions return void type. + const auto& functions = module->getFunctionList(); + for (const auto& func: functions) { + if (func.getReturnType()->isVoidTy()) { + container.push_back(func.getName().str()); + } + } +} - // Proceed with bitcasting the void pointer to the struct pointer type, calling the kernel and - // adding a terminator. - llvm::Value* bitcasted = builder.CreateBitCast(wrapper_func->getArg(0), - instance_struct_ptr_type); - std::vector args; - args.push_back(bitcasted); - builder.CreateCall(kernel, args); - builder.CreateRet(llvm::ConstantInt::get(i32_type, 0)); +void CodegenLLVMVisitor::wrap_kernel_functions() { + // First, identify all kernels. + std::vector kernel_names; + find_kernel_names(kernel_names); + + for (const auto& kernel_name: kernel_names) { + // Get the kernel function and the instance struct type. 
+ auto kernel = module->getFunction(kernel_name); + if (!kernel) + throw std::runtime_error("Kernel " + kernel_name + " is not found!"); + + if (std::distance(kernel->args().begin(), kernel->args().end()) != 1) + throw std::runtime_error("Kernel " + kernel_name + " must have a single argument!"); + + auto instance_struct_ptr_type = llvm::dyn_cast( + kernel->getArg(0)->getType()); + if (!instance_struct_ptr_type) + throw std::runtime_error("Kernel " + kernel_name + + " does not have an instance struct pointer argument!"); + + // Create a wrapper void function that takes a void pointer as a single argument. + llvm::Type* void_type = llvm::Type::getVoidTy(*context); + llvm::Type* i32_type = llvm::Type::getInt32Ty(*context); + llvm::Type* void_ptr_type = llvm::PointerType::get(void_type, /*AddressSpace=*/0); + llvm::Function* wrapper_func = llvm::Function::Create( + llvm::FunctionType::get(i32_type, {void_ptr_type}, /*isVarArg=*/false), + llvm::Function::ExternalLinkage, + "__" + kernel_name + "_wrapper", + *module); + llvm::BasicBlock* body = llvm::BasicBlock::Create(*context, /*Name=*/"", wrapper_func); + builder.SetInsertPoint(body); + + // Proceed with bitcasting the void pointer to the struct pointer type, calling the kernel + // and adding a terminator. + llvm::Value* bitcasted = builder.CreateBitCast(wrapper_func->getArg(0), + instance_struct_ptr_type); + std::vector args; + args.push_back(bitcasted); + builder.CreateCall(kernel, args); + builder.CreateRet(llvm::ConstantInt::get(i32_type, 0)); + } } } // namespace codegen diff --git a/src/codegen/llvm/codegen_llvm_visitor.hpp b/src/codegen/llvm/codegen_llvm_visitor.hpp index f001c2c2fe..1007258010 100644 --- a/src/codegen/llvm/codegen_llvm_visitor.hpp +++ b/src/codegen/llvm/codegen_llvm_visitor.hpp @@ -333,10 +333,15 @@ class CodegenLLVMVisitor: public visitor::ConstAstVisitor { } /** - * For the given kernel function, wraps it into another function that uses void* to pass the - * data to the kernel \param kernel_name kernel name to be wrapped + * Fills the container with the names of kernel functions from the MOD file. */ - void wrap_kernel_function(const std::string& kernel_name); + void find_kernel_names(std::vector& container); + + /** + * Wraps all kernel function calls into wrapper functions that use void* to pass the data to the + * kernel. + */ + void wrap_kernel_functions(); }; /** \} */ // end of llvm_backends diff --git a/src/codegen/llvm/jit_driver.cpp b/src/codegen/llvm/jit_driver.cpp index a7673bb2ff..842c500810 100644 --- a/src/codegen/llvm/jit_driver.cpp +++ b/src/codegen/llvm/jit_driver.cpp @@ -22,24 +22,27 @@ namespace nmodl { namespace runner { -void JITDriver::init() { +void JITDriver::init(std::string features) { llvm::InitializeNativeTarget(); llvm::InitializeNativeTargetAsmPrinter(); - set_target_triple(module.get()); - auto data_layout = module->getDataLayout(); - // Create IR compile function callback. auto compile_function_creator = [&](llvm::orc::JITTargetMachineBuilder tm_builder) -> llvm::Expected> { - auto tm = tm_builder.createTargetMachine(); - if (!tm) - return tm.takeError(); - return std::make_unique(std::move(*tm)); + // Create target machine with some features possibly turned off. + auto tm = create_target(&tm_builder, features); + + // Set the target triple and the data layout for the module. 
+ module->setDataLayout(tm->createDataLayout()); + module->setTargetTriple(tm->getTargetTriple().getTriple()); + + return std::make_unique(std::move(tm)); }; + // Set JIT instance and extract the data layout from the module. auto jit_instance = cantFail( llvm::orc::LLJITBuilder().setCompileFunctionCreator(compile_function_creator).create()); + auto data_layout = module->getDataLayout(); // Add a ThreadSafeModule to the driver. llvm::orc::ThreadSafeModule tsm(std::move(module), std::make_unique()); @@ -52,29 +55,29 @@ void JITDriver::init() { data_layout.getGlobalPrefix()))); } -void JITDriver::set_target_triple(llvm::Module* module) { - auto target_triple = llvm::sys::getDefaultTargetTriple(); - std::string error; - auto target = llvm::TargetRegistry::lookupTarget(target_triple, error); +std::unique_ptr JITDriver::create_target( + llvm::orc::JITTargetMachineBuilder* builder, + const std::string& features) { + // First, look up the target. + std::string error_msg; + auto target_triple = builder->getTargetTriple().getTriple(); + auto* target = llvm::TargetRegistry::lookupTarget(target_triple, error_msg); if (!target) - throw std::runtime_error("Error: " + error + "\n"); - - std::string cpu(llvm::sys::getHostCPUName()); - llvm::SubtargetFeatures features; - llvm::StringMap host_features; - - if (llvm::sys::getHostCPUFeatures(host_features)) { - for (auto& f: host_features) - features.AddFeature(f.first(), f.second); - } + throw std::runtime_error("Error " + error_msg + "\n"); - std::unique_ptr machine( - target->createTargetMachine(target_triple, cpu, features.getString(), {}, {})); - if (!machine) - throw std::runtime_error("Error: failed to create a target machine\n"); + // Create default target machine with provided features. + auto tm = target->createTargetMachine(target_triple, + llvm::sys::getHostCPUName().str(), + features, + builder->getOptions(), + builder->getRelocationModel(), + builder->getCodeModel(), + /*OL=*/llvm::CodeGenOpt::Default, + /*JIT=*/true); + if (!tm) + throw std::runtime_error("Error: could not create the target machine\n"); - module->setDataLayout(machine->createDataLayout()); - module->setTargetTriple(target_triple); + return std::unique_ptr(tm); } } // namespace runner diff --git a/src/codegen/llvm/jit_driver.hpp b/src/codegen/llvm/jit_driver.hpp index 23c8fca612..f994a57303 100644 --- a/src/codegen/llvm/jit_driver.hpp +++ b/src/codegen/llvm/jit_driver.hpp @@ -37,7 +37,7 @@ class JITDriver { : module(std::move(m)) {} /// Initialize the JIT. - void init(); + void init(std::string features); /// Lookup the entry-point without arguments in the JIT and execute it, returning the result. template @@ -63,8 +63,9 @@ class JITDriver { return result; } - /// Set the target triple on the module. - static void set_target_triple(llvm::Module* module); + /// A wrapper around llvm::createTargetMachine to turn on/off certain CPU features. + std::unique_ptr create_target(llvm::orc::JITTargetMachineBuilder* builder, + const std::string& features); }; /** @@ -78,9 +79,9 @@ class Runner { std::unique_ptr driver = std::make_unique(std::move(module)); public: - Runner(std::unique_ptr m) + Runner(std::unique_ptr m, std::string features = "") : module(std::move(m)) { - driver->init(); + driver->init(features); } /// Run the entry-point function without arguments. 
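To make the new `features` parameter concrete, here is a usage sketch of the extended `Runner` (the kernel name and the `<int, void*>` template arguments are illustrative, following the `"__" + kernel_name + "_wrapper"` convention used elsewhere in this series; the benchmark sources below do the real wiring):

```
#include "codegen/llvm/jit_driver.hpp"

// Sketch: JIT a module with AVX turned off in the feature string, forcing the
// target machine to fall back to SSE or scalar instructions.
void run_without_avx(std::unique_ptr<llvm::Module> module, void* instance_base_ptr) {
    nmodl::runner::Runner runner(std::move(module), /*features=*/"-avx,-avx2");
    runner.run_with_argument<int, void*>("__nrn_state_hh_wrapper", instance_base_ptr);
}
```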
diff --git a/src/codegen/llvm/llvm_benchmark.cpp b/src/codegen/llvm/llvm_benchmark.cpp new file mode 100644 index 0000000000..57e0d05c5b --- /dev/null +++ b/src/codegen/llvm/llvm_benchmark.cpp @@ -0,0 +1,157 @@ +/************************************************************************* + * Copyright (C) 2018-2021 Blue Brain Project + * + * This file is part of NMODL distributed under the terms of the GNU + * Lesser General Public License. See top-level LICENSE file for details. + *************************************************************************/ + +#include +#include + +#include "codegen/llvm/codegen_llvm_visitor.hpp" +#include "codegen/llvm/jit_driver.hpp" +#include "llvm_benchmark.hpp" +#include "llvm/Support/Host.h" + +#include "test/unit/codegen/codegen_data_helper.hpp" + + +namespace nmodl { +namespace benchmark { + + +/// Precision for the timing measurements. +static constexpr int PRECISION = 9; + + +void LLVMBenchmark::disable(const std::string& feature, std::vector& host_features) { + for (auto& host_feature: host_features) { + if (feature == host_feature.substr(1)) { + host_feature[0] = '-'; + *log_stream << host_feature << "\n"; + return; + } + } +} + +void LLVMBenchmark::benchmark(const std::shared_ptr& node) { + // First, set the output stream for the logs. + set_log_output(); + + // Then, record the time taken for building the LLVM IR module. + codegen::CodegenLLVMVisitor visitor(mod_filename, + output_dir, + llvm_build_info.opt_passes, + llvm_build_info.use_single_precision, + llvm_build_info.vector_width); + generate_llvm(visitor, node); + + // Finally, run the benchmark and log the measurements. + run_benchmark(visitor, node); +} + +void LLVMBenchmark::generate_llvm(codegen::CodegenLLVMVisitor& visitor, + const std::shared_ptr& node) { + // First, visit the AST to build the LLVM IR module and wrap the kernel function calls. + auto start = std::chrono::high_resolution_clock::now(); + visitor.visit_program(*node); + visitor.wrap_kernel_functions(); + auto end = std::chrono::high_resolution_clock::now(); + + // Log the time taken to visit the AST and build LLVM IR. + std::chrono::duration diff = end - start; + *log_stream << "Created LLVM IR module from NMODL AST in " << std::setprecision(PRECISION) + << diff.count() << "\n\n"; +} + +std::vector LLVMBenchmark::get_cpu_features() { + std::string cpu(llvm::sys::getHostCPUName()); + + llvm::SubtargetFeatures features; + llvm::StringMap host_features; + if (llvm::sys::getHostCPUFeatures(host_features)) { + for (auto& f: host_features) + features.AddFeature(f.first(), f.second); + } + return features.getFeatures(); +} + +void LLVMBenchmark::run_benchmark(codegen::CodegenLLVMVisitor& visitor, + const std::shared_ptr& node) { + // Set the codegen data helper and find the kernels. + auto codegen_data = codegen::CodegenDataHelper(node, visitor.get_instance_struct_ptr()); + std::vector kernel_names; + visitor.find_kernel_names(kernel_names); + + // Get feature's string and turn them off depending on the backend. + std::vector features = get_cpu_features(); + *log_stream << "Backend: " << backend << "\n"; + if (backend == "avx2") { + // Disable SSE. + *log_stream << "Disabling features:\n"; + disable("sse", features); + disable("sse2", features); + disable("sse3", features); + disable("sse4.1", features); + disable("sse4.2", features); + } else if (backend == "sse2") { + // Disable AVX. 
+ *log_stream << "Disabling features:\n"; + disable("avx", features); + disable("avx2", features); + } + + std::string features_str = llvm::join(features.begin(), features.end(), ","); + std::unique_ptr m = visitor.get_module(); + runner::Runner runner(std::move(m), features_str); + + // Benchmark every kernel. + for (const auto& kernel_name: kernel_names) { + *log_stream << "Benchmarking kernel '" << kernel_name << "'\n"; + + // For every kernel run the benchmark `num_experiments` times. + double time_sum = 0.0; + for (int i = 0; i < num_experiments; ++i) { + // Initialise the data. + auto instance_data = codegen_data.create_data(instance_size, /*seed=*/1); + + // Record the execution time of the kernel. + std::string wrapper_name = "__" + kernel_name + "_wrapper"; + auto start = std::chrono::high_resolution_clock::now(); + runner.run_with_argument(kernel_name, instance_data.base_ptr); + auto end = std::chrono::high_resolution_clock::now(); + std::chrono::duration diff = end - start; + + // Log the time taken for each run. + *log_stream << "Experiment " << i << ": compute time = " << std::setprecision(9) + << diff.count() << "\n"; + + time_sum += diff.count(); + } + // Log the average time taken for the kernel. + *log_stream << "Average compute time = " << std::setprecision(PRECISION) + << time_sum / num_experiments << "\n\n"; + } +} + +void LLVMBenchmark::set_log_output() { + // If the output directory is not specified, dump logs to the console. + if (output_dir == ".") { + log_stream = std::make_shared(std::cout.rdbuf()); + return; + } + + // Otherwise, dump logs to the specified file. + std::string filename = output_dir + "/" + mod_filename + ".log"; + std::ofstream ofs; + + ofs.open(filename.c_str()); + + if (ofs.fail()) + throw std::runtime_error("Error while opening a file '" + filename + "'"); + + log_stream = std::make_shared(ofs.rdbuf()); +} + +} // namespace benchmark +} // namespace nmodl diff --git a/src/codegen/llvm/llvm_benchmark.hpp b/src/codegen/llvm/llvm_benchmark.hpp new file mode 100644 index 0000000000..30ebf182e8 --- /dev/null +++ b/src/codegen/llvm/llvm_benchmark.hpp @@ -0,0 +1,85 @@ +/************************************************************************* + * Copyright (C) 2018-2021 Blue Brain Project + * + * This file is part of NMODL distributed under the terms of the GNU + * Lesser General Public License. See top-level LICENSE file for details. + *************************************************************************/ + +#pragma once + +#include + +#include "codegen/llvm/codegen_llvm_visitor.hpp" + + +namespace nmodl { +namespace benchmark { + +/// A struct to hold LLVM visitor information. +struct LLVMBuildInfo { + int vector_width; + bool opt_passes; + bool use_single_precision; +}; + +/** + * \class LLVMBenchmark + * \brief A wrapper to execute MOD file kernels via LLVM IR backend, and + * benchmark compile-time and runtime. + */ +class LLVMBenchmark { + private: + std::string mod_filename; + + std::string output_dir; + + int num_experiments; + + int instance_size; + + std::string backend; + + LLVMBuildInfo llvm_build_info; + + std::shared_ptr log_stream; + + /// Disable the specified feature. + void disable(const std::string& feature, std::vector& host_features); + + /// Visits the AST to construct the LLVM IR module. + void generate_llvm(codegen::CodegenLLVMVisitor& visitor, + const std::shared_ptr& node); + + /// Get the host CPU features in the format: + /// +feature,+feature,-feature,+feature,... + /// where `+` indicates that the feature is enabled. 
+ std::vector get_cpu_features(); + + /// Runs the main body of the benchmark, executing the compute kernels. + void run_benchmark(codegen::CodegenLLVMVisitor& visitor, + const std::shared_ptr& node); + + /// Sets the log output stream (file or console). + void set_log_output(); + + public: + LLVMBenchmark(const std::string& mod_filename, + const std::string& output_dir, + LLVMBuildInfo info, + int num_experiments, + int instance_size, + const std::string& backend) + : mod_filename(mod_filename) + , output_dir(output_dir) + , num_experiments(num_experiments) + , instance_size(instance_size) + , backend(backend) + , llvm_build_info(info) {} + + /// Runs the benchmark. + void benchmark(const std::shared_ptr& node); +}; + + +} // namespace benchmark +} // namespace nmodl diff --git a/src/main.cpp b/src/main.cpp index 62ed2b2251..34aabfc190 100644 --- a/src/main.cpp +++ b/src/main.cpp @@ -20,6 +20,7 @@ #ifdef NMODL_LLVM_BACKEND #include "codegen/llvm/codegen_llvm_visitor.hpp" +#include "codegen/llvm/llvm_benchmark.hpp" #endif #include "config/config.h" @@ -178,8 +179,20 @@ int main(int argc, const char* argv[]) { /// run llvm optimisation passes bool llvm_opt_passes(false); - /// llvm vector width; + /// llvm vector width int llvm_vec_width = 1; + + /// run llvm benchmark + bool run_benchmark(false); + + /// the size of the instance struct for the benchmark + int instance_size = 10000; + + /// the number of experiments to run for the benchmarking + int repeat = 100; + + /// specify the backend for LLVM IR to target + std::string backend = "default"; #endif app.get_formatter()->column_width(40); @@ -289,6 +302,8 @@ int main(int argc, const char* argv[]) { "Optimize copies of ion variables ({})"_format(optimize_ionvar_copies_codegen))->ignore_case(); #ifdef NMODL_LLVM_BACKEND + + // LLVM IR code generation options. auto llvm_opt = app.add_subcommand("llvm", "LLVM code generation option")->ignore_case(); llvm_opt->add_flag("--ir", llvm_ir, @@ -302,6 +317,21 @@ int main(int argc, const char* argv[]) { llvm_opt->add_option("--vector-width", llvm_vec_width, "LLVM explicit vectorisation width ({})"_format(llvm_vec_width))->ignore_case(); + + // LLVM IR benchmark options. 
+ auto benchmark_opt = app.add_subcommand("benchmark", "LLVM benchmark option")->ignore_case(); + benchmark_opt->add_flag("--run", + run_benchmark, + "Run LLVM benchmark ({})"_format(run_benchmark))->ignore_case(); + benchmark_opt->add_option("--instance-size", + instance_size, + "Instance struct size ({})"_format(instance_size))->ignore_case(); + benchmark_opt->add_option("--repeat", + repeat, + "Number of experiments for benchmarking ({})"_format(repeat))->ignore_case(); + benchmark_opt->add_option("--backend", + backend, + "Target's backend ({})"_format(backend))->ignore_case()->check(CLI::IsMember({"avx2", "default", "sse2"}));; #endif // clang-format on @@ -607,7 +637,16 @@ int main(int argc, const char* argv[]) { } #ifdef NMODL_LLVM_BACKEND - if (llvm_ir) { + + if (run_benchmark) { + logger->info("Running LLVM benchmark"); + benchmark::LLVMBuildInfo info{llvm_vec_width, llvm_opt_passes, llvm_float_type}; + benchmark::LLVMBenchmark bench( + modfile, output_dir, info, repeat, instance_size, backend); + bench.benchmark(ast); + } + + else if (llvm_ir) { logger->info("Running LLVM backend code generator"); CodegenLLVMVisitor visitor( modfile, output_dir, llvm_opt_passes, llvm_float_type, llvm_vec_width); diff --git a/test/unit/CMakeLists.txt b/test/unit/CMakeLists.txt index 5b0e93e196..1aa091c7fd 100644 --- a/test/unit/CMakeLists.txt +++ b/test/unit/CMakeLists.txt @@ -93,6 +93,7 @@ target_link_libraries( test_util printer ${NMODL_WRAPPER_LIBS}) + target_link_libraries( testcodegen codegen @@ -106,6 +107,10 @@ target_link_libraries( if(NMODL_ENABLE_LLVM) include_directories(${LLVM_INCLUDE_DIRS} codegen) + + add_library(llvm_benchmark STATIC codegen/codegen_data_helper.cpp) + add_dependencies(llvm_benchmark lexer) + add_executable(testllvm visitor/main.cpp codegen/codegen_llvm_ir.cpp codegen/codegen_data_helper.cpp codegen/codegen_llvm_instance_struct.cpp) add_executable(test_llvm_runner visitor/main.cpp codegen/codegen_data_helper.cpp diff --git a/test/unit/codegen/codegen_llvm_execution.cpp b/test/unit/codegen/codegen_llvm_execution.cpp index b191f350df..4e2717e45c 100644 --- a/test/unit/codegen/codegen_llvm_execution.cpp +++ b/test/unit/codegen/codegen_llvm_execution.cpp @@ -303,7 +303,7 @@ SCENARIO("Simple scalar kernel", "[llvm][runner]") { /*use_single_precision=*/false, /*vector_width=*/1); llvm_visitor.visit_program(*ast); - llvm_visitor.wrap_kernel_function("nrn_state_test"); + llvm_visitor.wrap_kernel_functions(); // Create the instance struct data. int num_elements = 4; @@ -384,7 +384,7 @@ SCENARIO("Simple vectorised kernel", "[llvm][runner]") { /*use_single_precision=*/false, /*vector_width=*/4); llvm_visitor.visit_program(*ast); - llvm_visitor.wrap_kernel_function("nrn_state_test"); + llvm_visitor.wrap_kernel_functions(); // Create the instance struct data. 
            int num_elements = 10;

From 72feb65a1b52b8d5a267ddfc6c93f092e64edd22 Mon Sep 17 00:00:00 2001
From: Pramod Kumbhar
Date: Sun, 18 Apr 2021 23:47:28 +0200
Subject: [PATCH 148/331] Minor benchmarking improvement (#593)

- allocate instance data only once
- store memory size with instance data
- print memory size while running benchmarking kernel
---
 src/codegen/llvm/llvm_benchmark.cpp       | 9 +++++----
 test/unit/codegen/codegen_data_helper.cpp | 2 ++
 test/unit/codegen/codegen_data_helper.hpp | 3 +++
 3 files changed, 10 insertions(+), 4 deletions(-)

diff --git a/src/codegen/llvm/llvm_benchmark.cpp b/src/codegen/llvm/llvm_benchmark.cpp
index 57e0d05c5b..6ab9ff4982 100644
--- a/src/codegen/llvm/llvm_benchmark.cpp
+++ b/src/codegen/llvm/llvm_benchmark.cpp
@@ -107,14 +107,15 @@ void LLVMBenchmark::run_benchmark(codegen::CodegenLLVMVisitor& visitor,

     // Benchmark every kernel.
     for (const auto& kernel_name: kernel_names) {
-        *log_stream << "Benchmarking kernel '" << kernel_name << "'\n";
+        // Initialise the data.
+        auto instance_data = codegen_data.create_data(instance_size, /*seed=*/1);
+
+        double size_mbs = instance_data.num_bytes / (1024.0 * 1024.0);
+        *log_stream << "Benchmarking kernel '" << kernel_name << "', with " << size_mbs << " MBs\n";

         // For every kernel run the benchmark `num_experiments` times.
         double time_sum = 0.0;
         for (int i = 0; i < num_experiments; ++i) {
-            // Initialise the data.
-            auto instance_data = codegen_data.create_data(instance_size, /*seed=*/1);
-
             // Record the execution time of the kernel.
             std::string wrapper_name = "__" + kernel_name + "_wrapper";
             auto start = std::chrono::high_resolution_clock::now();
diff --git a/test/unit/codegen/codegen_data_helper.cpp b/test/unit/codegen/codegen_data_helper.cpp
index e42cfe01f3..4bf94f583d 100644
--- a/test/unit/codegen/codegen_data_helper.cpp
+++ b/test/unit/codegen/codegen_data_helper.cpp
@@ -88,6 +88,7 @@ CodegenInstanceData CodegenDataHelper::create_data(size_t num_elements, size_t s
     // allocate instance object with memory alignment
     posix_memalign(&base, NBYTE_ALIGNMENT, member_size * variables.size());
     data.base_ptr = base;
+    data.num_bytes += member_size * variables.size();

     size_t offset = 0;
     void* ptr = base;
@@ -115,6 +116,7 @@ CodegenInstanceData CodegenDataHelper::create_data(size_t num_elements, size_t s
         void* member;
         posix_memalign(&member, NBYTE_ALIGNMENT, member_size * num_elements);
         initialize_variable(var, member, variable_index, num_elements);
+        data.num_bytes += member_size * num_elements;

         // copy address at specific location in the struct
         memcpy(ptr, &member, sizeof(double*));
diff --git a/test/unit/codegen/codegen_data_helper.hpp b/test/unit/codegen/codegen_data_helper.hpp
index 368b964147..ef8e869366 100644
--- a/test/unit/codegen/codegen_data_helper.hpp
+++ b/test/unit/codegen/codegen_data_helper.hpp
@@ -46,6 +46,9 @@ struct CodegenInstanceData {
     /// i.e. *(base_ptr + offsets[0]) will be members[0]
     std::vector<void*> members;

+    /// size in bytes
+    size_t num_bytes = 0;
+
     // cleanup all memory allocated for type and member variables
     ~CodegenInstanceData();
 };

From c372e46193f65af725fb065822f90586eea5a204 Mon Sep 17 00:00:00 2001
From: Pramod Kumbhar
Date: Mon, 19 Apr 2021 19:22:07 +0200
Subject: [PATCH 149/331] Bug fix in codegen helper: delete LOCAL statement
 (#595)

- LOCAL statement was not deleted correctly
- Instead of getting first element from statement vector, use local
  statement pointer to erase it from the node.
Related to #594 --- src/codegen/llvm/codegen_llvm_helper_visitor.cpp | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/src/codegen/llvm/codegen_llvm_helper_visitor.cpp b/src/codegen/llvm/codegen_llvm_helper_visitor.cpp index eec79370f6..8105fec848 100644 --- a/src/codegen/llvm/codegen_llvm_helper_visitor.cpp +++ b/src/codegen/llvm/codegen_llvm_helper_visitor.cpp @@ -473,12 +473,13 @@ void CodegenLLVMHelperVisitor::convert_local_statement(ast::StatementBlock& node } /// remove local list statement now - const auto& statements = node.get_statements(); - node.erase_statement(statements.begin()); + std::unordered_set to_delete({local_statement.get()}); + node.erase_statement(to_delete); /// create new codegen variable statement and insert at the beginning of the block auto type = new ast::CodegenVarType(FLOAT_TYPE); auto statement = std::make_shared(type, variables); + const auto& statements = node.get_statements(); node.insert_statement(statements.begin(), statement); } } From ebb155e30f267f9ef3625e7e80dcf727a73804ae Mon Sep 17 00:00:00 2001 From: George Mitenkov Date: Tue, 20 Apr 2021 21:36:39 +0300 Subject: [PATCH 150/331] LLVM 13 compatibility and fixing void* type (#603) * Made compatible with LLVM 13 and replaced void* with i8* --- cmake/LLVMHelper.cmake | 9 ++++++++- src/codegen/llvm/codegen_llvm_visitor.cpp | 3 +-- 2 files changed, 9 insertions(+), 3 deletions(-) diff --git a/cmake/LLVMHelper.cmake b/cmake/LLVMHelper.cmake index a731fa0151..e27ac8d553 100644 --- a/cmake/LLVMHelper.cmake +++ b/cmake/LLVMHelper.cmake @@ -5,7 +5,14 @@ find_package(LLVM REQUIRED CONFIG) # include LLVM header and core library -llvm_map_components_to_libnames(LLVM_LIBS_TO_LINK core orcjit native) +llvm_map_components_to_libnames( + LLVM_LIBS_TO_LINK + core + instcombine + native + orcjit + scalaropts + support) set(CMAKE_REQUIRED_INCLUDES ${LLVM_INCLUDE_DIRS}) set(CMAKE_REQUIRED_LIBRARIES ${LLVM_LIBS_TO_LINK}) diff --git a/src/codegen/llvm/codegen_llvm_visitor.cpp b/src/codegen/llvm/codegen_llvm_visitor.cpp index ea7e828035..cd42fffae3 100644 --- a/src/codegen/llvm/codegen_llvm_visitor.cpp +++ b/src/codegen/llvm/codegen_llvm_visitor.cpp @@ -1007,9 +1007,8 @@ void CodegenLLVMVisitor::wrap_kernel_functions() { " does not have an instance struct pointer argument!"); // Create a wrapper void function that takes a void pointer as a single argument. - llvm::Type* void_type = llvm::Type::getVoidTy(*context); llvm::Type* i32_type = llvm::Type::getInt32Ty(*context); - llvm::Type* void_ptr_type = llvm::PointerType::get(void_type, /*AddressSpace=*/0); + llvm::Type* void_ptr_type = llvm::Type::getInt8PtrTy(*context); llvm::Function* wrapper_func = llvm::Function::Create( llvm::FunctionType::get(i32_type, {void_ptr_type}, /*isVarArg=*/false), llvm::Function::ExternalLinkage, From 31b95d84a93ef95e3628bd60ef701d2998185221 Mon Sep 17 00:00:00 2001 From: Pramod Kumbhar Date: Tue, 20 Apr 2021 23:31:40 +0200 Subject: [PATCH 151/331] Allow LOCAL variable inside StatementBlock for LLVM IR generation (#599) - if LOCAL variable was declared inside DERIVATIVE block then we were getting error: "Stored value type does not match pointer operand type!" - the error was happening because scalar variable from epilogue loop was conflicting with the vector type variable in main loop - to avoid conflict between main and epilogue loop, rename all local variables in epilogue. 
- bug fix for recursive handling of LocalList statement fixes #594 --- .../llvm/codegen_llvm_helper_visitor.cpp | 57 ++++++++++++++++--- .../llvm/codegen_llvm_helper_visitor.hpp | 1 + 2 files changed, 49 insertions(+), 9 deletions(-) diff --git a/src/codegen/llvm/codegen_llvm_helper_visitor.cpp b/src/codegen/llvm/codegen_llvm_helper_visitor.cpp index 8105fec848..0df364e649 100644 --- a/src/codegen/llvm/codegen_llvm_helper_visitor.cpp +++ b/src/codegen/llvm/codegen_llvm_helper_visitor.cpp @@ -459,12 +459,13 @@ void CodegenLLVMHelperVisitor::convert_to_instance_variable(ast::Node& node, * it to CodegenVarListStatement that will represent all variables as double. */ void CodegenLLVMHelperVisitor::convert_local_statement(ast::StatementBlock& node) { - /// first process all children blocks if any - node.visit_children(*this); + /// collect all local statement block + const auto& statements = collect_nodes(node, {ast::AstNodeType::LOCAL_LIST_STATEMENT}); + + /// iterate over all statements and replace each with codegen variable + for (const auto& statement: statements) { + const auto& local_statement = std::dynamic_pointer_cast(statement); - /// check if block contains LOCAL statement - const auto& local_statement = visitor::get_local_list_statement(node); - if (local_statement) { /// create codegen variables from local variables /// clone variable to make new independent statement ast::CodegenVarVector variables; @@ -474,16 +475,51 @@ void CodegenLLVMHelperVisitor::convert_local_statement(ast::StatementBlock& node /// remove local list statement now std::unordered_set to_delete({local_statement.get()}); - node.erase_statement(to_delete); + /// local list statement is enclosed in statement block + const auto& parent_node = dynamic_cast(local_statement->get_parent()); + parent_node->erase_statement(to_delete); /// create new codegen variable statement and insert at the beginning of the block auto type = new ast::CodegenVarType(FLOAT_TYPE); - auto statement = std::make_shared(type, variables); - const auto& statements = node.get_statements(); - node.insert_statement(statements.begin(), statement); + auto new_statement = std::make_shared(type, variables); + const auto& statements = parent_node->get_statements(); + parent_node->insert_statement(statements.begin(), new_statement); } } +/** + * \brief Visit StatementBlock and rename all LOCAL variables + * @param node AST node representing Statement block + * + * Statement block in remainder loop will have same LOCAL variables from + * main loop. In order to avoid conflict during lookup, rename each local + * variable by appending unique number. The number used as suffix is just + * a counter used for Statement block. 
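+ * For example, a `LOCAL x` in the first remainder-loop block would be
+ * renamed to `x_1`, and one in the next block to `x_2` (names illustrative).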
+ */ +void CodegenLLVMHelperVisitor::rename_local_variables(ast::StatementBlock& node) { + /// local block counter just to append unique number + static int local_block_counter = 1; + + /// collect all local statement block + const auto& statements = collect_nodes(node, {ast::AstNodeType::LOCAL_LIST_STATEMENT}); + + /// iterate over each statement and rename all variables + for (const auto& statement: statements) { + const auto& local_statement = std::dynamic_pointer_cast(statement); + + /// rename local variable in entire statement block + for (auto& var: local_statement->get_variables()) { + std::string old_name = var->get_node_name(); + std::string new_name = "{}_{}"_format(old_name, local_block_counter); + visitor::RenameVisitor(old_name, new_name).visit_statement_block(node); + } + } + + /// make it unique for next statement block + local_block_counter++; +} + + void CodegenLLVMHelperVisitor::visit_procedure_block(ast::ProcedureBlock& node) { node.visit_children(*this); create_function_for_node(node); @@ -661,6 +697,9 @@ void CodegenLLVMHelperVisitor::visit_nrn_state_block(ast::NrnStateBlock& node) { loop_count_expression(INDUCTION_VAR, NODECOUNT_VAR, /*vector_width=*/1); const auto& increment = loop_increment_expression(INDUCTION_VAR, /*vector_width=*/1); + /// rename local variables to avoid conflict with main loop + rename_local_variables(*loop_block); + /// convert local statement to codegenvar statement convert_local_statement(*loop_block); diff --git a/src/codegen/llvm/codegen_llvm_helper_visitor.hpp b/src/codegen/llvm/codegen_llvm_helper_visitor.hpp index 446d5a6fd9..bbff588675 100644 --- a/src/codegen/llvm/codegen_llvm_helper_visitor.hpp +++ b/src/codegen/llvm/codegen_llvm_helper_visitor.hpp @@ -163,6 +163,7 @@ class CodegenLLVMHelperVisitor: public visitor::AstVisitor { void convert_to_instance_variable(ast::Node& node, std::string& index_var); void convert_local_statement(ast::StatementBlock& node); + void rename_local_variables(ast::StatementBlock& node); void visit_procedure_block(ast::ProcedureBlock& node) override; void visit_function_block(ast::FunctionBlock& node) override; From c71b36834b68eedb08d2c4ad868b7918775ef9be Mon Sep 17 00:00:00 2001 From: Pramod Kumbhar Date: Thu, 22 Apr 2021 17:11:09 +0200 Subject: [PATCH 152/331] Update CI with LLVM v13 (trunk) (#605) * In order to use VecLibReplace pass, we need LLVM 13 / trunk * Change ubuntu image on azure from 16.04 to 18.04 * Install llvm-13 nightly snapshot * Enable LLVM build on Ubuntu * For Mac OS use pre-built binary package from https://github.com/pramodk/llvm-nightly * We will see if we get OS X bottle from BlueBrain/homebrew-tap/pull/7 --- azure-pipelines.yml | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) diff --git a/azure-pipelines.yml b/azure-pipelines.yml index d8e6408d74..effe8c43f9 100644 --- a/azure-pipelines.yml +++ b/azure-pipelines.yml @@ -47,6 +47,10 @@ jobs: url="https://github.com/ispc/ispc/releases/download/${ispc_version}/ispc-${ispc_version}${ispc_version_suffix}-${url_os}.tar.gz"; mkdir $(pwd)/$CMAKE_PKG/ispc wget --quiet --output-document=- $url | tar -xvzf - -C $(pwd)/$CMAKE_PKG/ispc --strip 1; + # install llvm nightly (future v13) TODO: this will fail now, FIX this! + wget https://apt.llvm.org/llvm.sh + chmod +x llvm.sh + sudo ./llvm.sh 13 env: CMAKE_VER: 'v3.15.0' CMAKE_PKG: 'cmake-3.15.0-Linux-x86_64' @@ -57,7 +61,7 @@ jobs: mkdir -p $(Build.Repository.LocalPath)/build cd $(Build.Repository.LocalPath)/build cmake --version - cmake .. 
-DPYTHON_EXECUTABLE=$(which python3.7) -DCMAKE_INSTALL_PREFIX=$HOME/nmodl -DCMAKE_BUILD_TYPE=Release -DNMODL_ENABLE_LLVM=OFF + cmake .. -DPYTHON_EXECUTABLE=$(which python3.7) -DCMAKE_INSTALL_PREFIX=$HOME/nmodl -DCMAKE_BUILD_TYPE=Release -DNMODL_ENABLE_LLVM=ON -DLLVM_DIR=/usr/lib/llvm-13/share/llvm/cmake/ make -j 2 if [ $? -ne 0 ] then @@ -119,7 +123,7 @@ jobs: - job: 'osx1015' pool: vmImage: 'macOS-10.15' - displayName: 'MacOS (10.15), AppleClang 12.0' + displayName: 'MacOS (10.15), AppleClang 13.0 (trunk, May 2021)' steps: - checkout: self submodules: True @@ -128,11 +132,15 @@ jobs: python3 -m pip install --upgrade pip 'setuptools<59.7.0' python3 -m pip install --user 'Jinja2>=2.9.3' 'PyYAML>=3.13' pytest pytest-cov numpy 'sympy>=1.3,<1.9' displayName: 'Install Dependencies' + - script: | + cd $HOME + git clone https://github.com/pramodk/llvm-nightly.git + displayName: 'Setup LLVM v13' - script: | export PATH=/usr/local/opt/flex/bin:/usr/local/opt/bison/bin:$PATH; mkdir -p $(Build.Repository.LocalPath)/build cd $(Build.Repository.LocalPath)/build - cmake .. -DPYTHON_EXECUTABLE=$(which python3) -DCMAKE_INSTALL_PREFIX=$HOME/nmodl -DCMAKE_BUILD_TYPE=RelWithDebInfo -DNMODL_ENABLE_PYTHON_BINDINGS=OFF -DLLVM_DIR=`brew --prefix llvm`/lib/cmake/llvm -DNMODL_ENABLE_LLVM=ON + cmake .. -DPYTHON_EXECUTABLE=$(which python3) -DCMAKE_INSTALL_PREFIX=$HOME/nmodl -DCMAKE_BUILD_TYPE=RelWithDebInfo -DNMODL_ENABLE_PYTHON_BINDINGS=OFF -DLLVM_DIR=$HOME/llvm-nightly/0421/osx/lib/cmake/llvm -DNMODL_ENABLE_LLVM=ON make -j 2 if [ $? -ne 0 ] then From 63a662e57e471f34bce68bbc22db83fe2ac7c1b2 Mon Sep 17 00:00:00 2001 From: George Mitenkov Date: Thu, 22 Apr 2021 09:08:33 -0700 Subject: [PATCH 153/331] Integrating vector maths library into LLVM codegen (#604) Added support for replacing LLVM IR maths intrinsics with vector maths functions from Accelerate, libmvec, MASSV, and SVML. To trigger the replacement, a new `--veclib` option should be used. This is only supported on LLVM 13+. 
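Under the hood this relies on LLVM's `ReplaceWithVeclib` pass, which rewrites
vector maths intrinsics into the selected library's routines. As a sketch of
the effect with `--veclib SVML` at vector width 2 (IR fragment based on the
unit tests added in this patch; value names are illustrative):

```llvm
; before the pass: LLVM vector intrinsic
%r = call <2 x double> @llvm.exp.v2f64(<2 x double> %x)
; after the pass with --veclib SVML: vector library routine
%r = call <2 x double> @__svml_exp2(<2 x double> %x)
```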
Example: ``` $ bin/nmodl hh.mod llvm --ir --vector-width 4 --veclib SVML ``` fixes #589 Co-authored-by: Pramod Kumbhar --- CMakeLists.txt | 3 + cmake/LLVMHelper.cmake | 3 + src/codegen/llvm/codegen_llvm_visitor.cpp | 52 ++++++++++-- src/codegen/llvm/codegen_llvm_visitor.hpp | 29 ++++++- src/main.cpp | 24 ++++-- test/unit/codegen/codegen_llvm_ir.cpp | 98 ++++++++++++++++++++++- 6 files changed, 188 insertions(+), 21 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 26ff33eeb9..b2dbc4cc8c 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -160,6 +160,9 @@ if(NMODL_ENABLE_LLVM) include(LLVMHelper) include_directories(${LLVM_INCLUDE_DIRS}) add_definitions(-DNMODL_LLVM_BACKEND) + if(LLVM_VERSION VERSION_LESS_EQUAL 12) + add_definitions(-DLLVM_VERSION_LESS_THAN_13) + endif() endif() # ============================================================================= diff --git a/cmake/LLVMHelper.cmake b/cmake/LLVMHelper.cmake index e27ac8d553..f81a5a62e8 100644 --- a/cmake/LLVMHelper.cmake +++ b/cmake/LLVMHelper.cmake @@ -7,8 +7,11 @@ find_package(LLVM REQUIRED CONFIG) # include LLVM header and core library llvm_map_components_to_libnames( LLVM_LIBS_TO_LINK + analysis + codegen core instcombine + mc native orcjit scalaropts diff --git a/src/codegen/llvm/codegen_llvm_visitor.cpp b/src/codegen/llvm/codegen_llvm_visitor.cpp index cd42fffae3..1738d4139e 100644 --- a/src/codegen/llvm/codegen_llvm_visitor.cpp +++ b/src/codegen/llvm/codegen_llvm_visitor.cpp @@ -11,6 +11,7 @@ #include "visitors/rename_visitor.hpp" #include "visitors/visitor_utils.hpp" +#include "llvm/Analysis/TargetLibraryInfo.h" #include "llvm/IR/AssemblyAnnotationWriter.h" #include "llvm/IR/BasicBlock.h" #include "llvm/IR/Constants.h" @@ -19,8 +20,13 @@ #include "llvm/IR/Type.h" #include "llvm/IR/ValueSymbolTable.h" #include "llvm/Support/FileSystem.h" +#include "llvm/Support/Host.h" #include "llvm/Support/ToolOutputFile.h" +#ifndef LLVM_VERSION_LESS_THAN_13 +#include "llvm/CodeGen/ReplaceWithVeclib.h" +#endif + namespace nmodl { namespace codegen { @@ -292,21 +298,21 @@ std::shared_ptr CodegenLLVMVisitor::get_instance_struct_ptr return instance_var_helper.instance; } -void CodegenLLVMVisitor::run_llvm_opt_passes() { +void CodegenLLVMVisitor::run_ir_opt_passes() { /// run some common optimisation passes that are commonly suggested - fpm.add(llvm::createInstructionCombiningPass()); - fpm.add(llvm::createReassociatePass()); - fpm.add(llvm::createGVNPass()); - fpm.add(llvm::createCFGSimplificationPass()); + opt_pm.add(llvm::createInstructionCombiningPass()); + opt_pm.add(llvm::createReassociatePass()); + opt_pm.add(llvm::createGVNPass()); + opt_pm.add(llvm::createCFGSimplificationPass()); /// initialize pass manager - fpm.doInitialization(); + opt_pm.doInitialization(); /// iterate over all functions and run the optimisation passes auto& functions = module->getFunctionList(); for (auto& function: functions) { llvm::verifyFunction(function); - fpm.run(function); + opt_pm.run(function); } } @@ -892,7 +898,37 @@ void CodegenLLVMVisitor::visit_program(const ast::Program& node) { if (opt_passes) { logger->info("Running LLVM optimisation passes"); - run_llvm_opt_passes(); + run_ir_opt_passes(); + } + + // Optionally, replace LLVM's maths intrinsics with vector library calls. 
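+    // Note: the pass below is gated so that it only runs for vectorised code
+    // with a real library selected (vector_width > 1 and --veclib other than
+    // "none").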
+ if (vector_width > 1 && vector_library != llvm::TargetLibraryInfoImpl::NoLibrary) { +#ifdef LLVM_VERSION_LESS_THAN_13 + logger->warn( + "This version of LLVM does not support replacement of LLVM intrinsics with vector " + "library calls"); +#else + // First, get the target library information. + llvm::Triple triple(llvm::sys::getDefaultTargetTriple()); + llvm::TargetLibraryInfoImpl target_lib_info = llvm::TargetLibraryInfoImpl(triple); + + // Populate target library information with vectorisable functions. Since libmvec is + // supported for x86_64 only, have a check to catch other architectures. + if (vector_library != llvm::TargetLibraryInfoImpl::LIBMVEC_X86 || + (triple.isX86() && triple.isArch64Bit())) { + target_lib_info.addVectorizableFunctionsFromVecLib(vector_library); + } + + // Run the codegen optimisation passes that replace maths intrinsics. + codegen_pm.add(new llvm::TargetLibraryInfoWrapperPass(target_lib_info)); + codegen_pm.add(new llvm::ReplaceWithVeclibLegacy); + codegen_pm.doInitialization(); + for (auto& function: module->getFunctionList()) { + if (!function.isDeclaration()) + codegen_pm.run(function); + } + codegen_pm.doFinalization(); +#endif } // If the output directory is specified, save the IR to .ll file. diff --git a/src/codegen/llvm/codegen_llvm_visitor.hpp b/src/codegen/llvm/codegen_llvm_visitor.hpp index 1007258010..099613f8d4 100644 --- a/src/codegen/llvm/codegen_llvm_visitor.hpp +++ b/src/codegen/llvm/codegen_llvm_visitor.hpp @@ -23,6 +23,7 @@ #include "utils/logger.hpp" #include "visitors/ast_visitor.hpp" +#include "llvm/Analysis/TargetLibraryInfo.h" #include "llvm/IR/IRBuilder.h" #include "llvm/IR/LLVMContext.h" #include "llvm/IR/LegacyPassManager.h" @@ -45,6 +46,16 @@ namespace codegen { * @{ */ +/// A map to query vector library by its string value. +static const std::map veclib_map = { + {"Accelerate", llvm::TargetLibraryInfoImpl::Accelerate}, +#ifndef LLVM_VERSION_LESS_THAN_13 + {"libmvec", llvm::TargetLibraryInfoImpl::LIBMVEC_X86}, +#endif + {"MASSV", llvm::TargetLibraryInfoImpl::MASSV}, + {"SVML", llvm::TargetLibraryInfoImpl::SVML}, + {"none", llvm::TargetLibraryInfoImpl::NoLibrary}}; + /** * \class CodegenLLVMVisitor * \brief %Visitor for transforming NMODL AST to LLVM IR @@ -65,7 +76,14 @@ class CodegenLLVMVisitor: public visitor::ConstAstVisitor { llvm::IRBuilder<> builder; - llvm::legacy::FunctionPassManager fpm; + // Pass manager for optimisation passes that are used for target code generation. + llvm::legacy::FunctionPassManager codegen_pm; + + // Vector library used for maths functions. + llvm::TargetLibraryInfoImpl::VectorLibrary vector_library; + + // Pass manager for optimisation passes that are run on IR and are not related to target. + llvm::legacy::FunctionPassManager opt_pm; // Stack to hold visited values std::vector values; @@ -97,7 +115,7 @@ class CodegenLLVMVisitor: public visitor::ConstAstVisitor { * LLVM provides number of optimisation passes that can be run on the generated IR. * Here we run common optimisation LLVM passes that benefits code optimisation. 
*/ - void run_llvm_opt_passes(); + void run_ir_opt_passes(); public: /** @@ -110,14 +128,17 @@ class CodegenLLVMVisitor: public visitor::ConstAstVisitor { const std::string& output_dir, bool opt_passes, bool use_single_precision = false, - int vector_width = 1) + int vector_width = 1, + std::string vec_lib = "none") : mod_filename(mod_filename) , output_dir(output_dir) , opt_passes(opt_passes) , use_single_precision(use_single_precision) , vector_width(vector_width) + , vector_library(veclib_map.at(vec_lib)) , builder(*context) - , fpm(module.get()) {} + , codegen_pm(module.get()) + , opt_pm(module.get()) {} /** diff --git a/src/main.cpp b/src/main.cpp index 34aabfc190..2d49f76445 100644 --- a/src/main.cpp +++ b/src/main.cpp @@ -177,11 +177,14 @@ int main(int argc, const char* argv[]) { bool llvm_float_type(false); /// run llvm optimisation passes - bool llvm_opt_passes(false); + bool llvm_ir_opt_passes(false); /// llvm vector width int llvm_vec_width = 1; + /// vector library + std::string vec_lib("none"); + /// run llvm benchmark bool run_benchmark(false); @@ -309,14 +312,17 @@ int main(int argc, const char* argv[]) { llvm_ir, "Generate LLVM IR ({})"_format(llvm_ir))->ignore_case(); llvm_opt->add_flag("--opt", - llvm_opt_passes, - "Run LLVM optimisation passes ({})"_format(llvm_opt_passes))->ignore_case(); + llvm_ir_opt_passes, + "Run LLVM optimisation passes ({})"_format(llvm_ir_opt_passes))->ignore_case(); llvm_opt->add_flag("--single-precision", llvm_float_type, "Use single precision floating-point types ({})"_format(llvm_float_type))->ignore_case(); llvm_opt->add_option("--vector-width", llvm_vec_width, "LLVM explicit vectorisation width ({})"_format(llvm_vec_width))->ignore_case(); + llvm_opt->add_option("--veclib", + vec_lib, + "Vector library for maths functions ({})"_format(vec_lib))->check(CLI::IsMember({"Accelerate", "libmvec", "MASSV", "SVML", "none"})); // LLVM IR benchmark options. 
auto benchmark_opt = app.add_subcommand("benchmark", "LLVM benchmark option")->ignore_case(); @@ -331,7 +337,7 @@ int main(int argc, const char* argv[]) { "Number of experiments for benchmarking ({})"_format(repeat))->ignore_case(); benchmark_opt->add_option("--backend", backend, - "Target's backend ({})"_format(backend))->ignore_case()->check(CLI::IsMember({"avx2", "default", "sse2"}));; + "Target's backend ({})"_format(backend))->ignore_case()->check(CLI::IsMember({"avx2", "default", "sse2"})); #endif // clang-format on @@ -640,7 +646,7 @@ int main(int argc, const char* argv[]) { if (run_benchmark) { logger->info("Running LLVM benchmark"); - benchmark::LLVMBuildInfo info{llvm_vec_width, llvm_opt_passes, llvm_float_type}; + benchmark::LLVMBuildInfo info{llvm_vec_width, llvm_ir_opt_passes, llvm_float_type}; benchmark::LLVMBenchmark bench( modfile, output_dir, info, repeat, instance_size, backend); bench.benchmark(ast); @@ -648,8 +654,12 @@ int main(int argc, const char* argv[]) { else if (llvm_ir) { logger->info("Running LLVM backend code generator"); - CodegenLLVMVisitor visitor( - modfile, output_dir, llvm_opt_passes, llvm_float_type, llvm_vec_width); + CodegenLLVMVisitor visitor(modfile, + output_dir, + llvm_ir_opt_passes, + llvm_float_type, + llvm_vec_width, + vec_lib); visitor.visit_program(*ast); ast_to_nmodl(*ast, filepath("llvm", "mod")); ast_to_json(*ast, filepath("llvm", "json")); diff --git a/test/unit/codegen/codegen_llvm_ir.cpp b/test/unit/codegen/codegen_llvm_ir.cpp index 207548ee46..93fd269b8e 100644 --- a/test/unit/codegen/codegen_llvm_ir.cpp +++ b/test/unit/codegen/codegen_llvm_ir.cpp @@ -36,7 +36,8 @@ using nmodl::parser::NmodlDriver; std::string run_llvm_visitor(const std::string& text, bool opt = false, bool use_single_precision = false, - int vector_width = 1) { + int vector_width = 1, + std::string vec_lib = "none") { NmodlDriver driver; const auto& ast = driver.parse_string(text); @@ -48,7 +49,8 @@ std::string run_llvm_visitor(const std::string& text, /*output_dir=*/".", opt, use_single_precision, - vector_width); + vector_width, + vec_lib); llvm_visitor.visit_program(*ast); return llvm_visitor.dump_module(); } @@ -1056,6 +1058,98 @@ SCENARIO("Vectorised derivative block", "[visitor][llvm][derivative]") { } } +//============================================================================= +// Vector library calls. +//============================================================================= + +SCENARIO("Vector library calls", "[visitor][llvm][vector_lib]") { + GIVEN("A vector LLVM intrinsic") { + std::string nmodl_text = R"( + NEURON { + SUFFIX hh + NONSPECIFIC_CURRENT il + } + STATE { + m + } + ASSIGNED { + v (mV) + } + BREAKPOINT { + SOLVE states METHOD cnexp + il = 2 + } + DERIVATIVE states { + m = exp(m) + } + )"; + + THEN("it is replaced with an appropriate vector library call") { + std::smatch m; + + // Check exponential intrinsic is created. + std::string no_library_module_str = run_llvm_visitor(nmodl_text, + /*opt=*/false, + /*use_single_precision=*/false, + /*vector_width=*/2); + std::regex exp_decl(R"(declare <2 x double> @llvm\.exp\.v2f64\(<2 x double>\))"); + std::regex exp_call(R"(call <2 x double> @llvm\.exp\.v2f64\(<2 x double> .*\))"); + REQUIRE(std::regex_search(no_library_module_str, m, exp_decl)); + REQUIRE(std::regex_search(no_library_module_str, m, exp_call)); + +#ifndef LLVM_VERSION_LESS_THAN_13 + // Check exponential calls are replaced with calls to SVML library. 
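+            // (SVML routine names encode the operand width: @__svml_exp2 below
+            // is the <2 x double> variant of exp.)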
+ std::string svml_library_module_str = run_llvm_visitor(nmodl_text, + /*opt=*/false, + /*use_single_precision=*/false, + /*vector_width=*/2, + /*vec_lib=*/"SVML"); + std::regex svml_exp_decl(R"(declare <2 x double> @__svml_exp2\(<2 x double>\))"); + std::regex svml_exp_call(R"(call <2 x double> @__svml_exp2\(<2 x double> .*\))"); + REQUIRE(std::regex_search(svml_library_module_str, m, svml_exp_decl)); + REQUIRE(std::regex_search(svml_library_module_str, m, svml_exp_call)); + REQUIRE(!std::regex_search(svml_library_module_str, m, exp_call)); + + // Check that supported exponential calls are replaced with calls to MASSV library (i.e. + // operating on vector of width 2). + std::string massv2_library_module_str = run_llvm_visitor(nmodl_text, + /*opt=*/false, + /*use_single_precision=*/false, + /*vector_width=*/2, + /*vec_lib=*/"MASSV"); + std::regex massv2_exp_decl(R"(declare <2 x double> @__expd2_P8\(<2 x double>\))"); + std::regex massv2_exp_call(R"(call <2 x double> @__expd2_P8\(<2 x double> .*\))"); + REQUIRE(std::regex_search(massv2_library_module_str, m, massv2_exp_decl)); + REQUIRE(std::regex_search(massv2_library_module_str, m, massv2_exp_call)); + REQUIRE(!std::regex_search(massv2_library_module_str, m, exp_call)); + + // Check no replacement for MASSV happens for non-supported vector widths. + std::string massv4_library_module_str = run_llvm_visitor(nmodl_text, + /*opt=*/false, + /*use_single_precision=*/false, + /*vector_width=*/4, + /*vec_lib=*/"MASSV"); + std::regex exp4_call(R"(call <4 x double> @llvm\.exp\.v4f64\(<4 x double> .*\))"); + REQUIRE(std::regex_search(massv4_library_module_str, m, exp4_call)); + + // Check correct replacement of @llvm.exp.v4f32 into @vexpf when using Accelerate. + std::string accelerate_library_module_str = + run_llvm_visitor(nmodl_text, + /*opt=*/false, + /*use_single_precision=*/true, + /*vector_width=*/4, + /*vec_lib=*/"Accelerate"); + std::regex accelerate_exp_decl(R"(declare <4 x float> @vexpf\(<4 x float>\))"); + std::regex accelerate_exp_call(R"(call <4 x float> @vexpf\(<4 x float> .*\))"); + std::regex fexp_call(R"(call <4 x float> @llvm\.exp\.v4f32\(<4 x float> .*\))"); + REQUIRE(std::regex_search(accelerate_library_module_str, m, accelerate_exp_decl)); + REQUIRE(std::regex_search(accelerate_library_module_str, m, accelerate_exp_call)); + REQUIRE(!std::regex_search(accelerate_library_module_str, m, fexp_call)); +#endif + } + } +} + //============================================================================= // Optimization : dead code removal //============================================================================= From b22afdc34418bcc515d7aa603b14233a223de51a Mon Sep 17 00:00:00 2001 From: George Mitenkov Date: Thu, 22 Apr 2021 16:25:35 -0700 Subject: [PATCH 154/331] Using shared libraries in LLVM JIT (#609) * Integrated veclibs in benchmark and added shared libs support for JIT * Tested on BBP Ubuntu Linux box * Make sure to set LD_LIBRARY_PATH for Intel library dir --- cmake/LLVMHelper.cmake | 1 + src/codegen/llvm/jit_driver.cpp | 68 +++++++++++++++++++++++++---- src/codegen/llvm/jit_driver.hpp | 11 +++-- src/codegen/llvm/llvm_benchmark.cpp | 5 ++- src/codegen/llvm/llvm_benchmark.hpp | 5 +++ src/main.cpp | 15 +++++-- 6 files changed, 88 insertions(+), 17 deletions(-) diff --git a/cmake/LLVMHelper.cmake b/cmake/LLVMHelper.cmake index f81a5a62e8..2b7db94a85 100644 --- a/cmake/LLVMHelper.cmake +++ b/cmake/LLVMHelper.cmake @@ -10,6 +10,7 @@ llvm_map_components_to_libnames( analysis codegen core + executionengine instcombine mc 
native diff --git a/src/codegen/llvm/jit_driver.cpp b/src/codegen/llvm/jit_driver.cpp index 842c500810..ec08e8856d 100644 --- a/src/codegen/llvm/jit_driver.cpp +++ b/src/codegen/llvm/jit_driver.cpp @@ -11,9 +11,11 @@ #include "llvm/ExecutionEngine/JITEventListener.h" #include "llvm/ExecutionEngine/ObjectCache.h" #include "llvm/ExecutionEngine/Orc/CompileUtils.h" +#include "llvm/ExecutionEngine/Orc/Core.h" #include "llvm/ExecutionEngine/Orc/IRCompileLayer.h" #include "llvm/ExecutionEngine/Orc/JITTargetMachineBuilder.h" #include "llvm/ExecutionEngine/Orc/LLJIT.h" +#include "llvm/ExecutionEngine/Orc/RTDyldObjectLinkingLayer.h" #include "llvm/ExecutionEngine/SectionMemoryManager.h" #include "llvm/Support/Host.h" #include "llvm/Support/TargetRegistry.h" @@ -22,27 +24,55 @@ namespace nmodl { namespace runner { -void JITDriver::init(std::string features) { +void JITDriver::init(std::string features, std::vector& lib_paths) { llvm::InitializeNativeTarget(); llvm::InitializeNativeTargetAsmPrinter(); + // Set the target triple and the data layout for the module. + set_triple_and_data_layout(features); + auto data_layout = module->getDataLayout(); + + // Create object linking function callback. + auto object_linking_layer_creator = [&](llvm::orc::ExecutionSession& session, + const llvm::Triple& triple) { + // Create linking layer. + auto layer = std::make_unique(session, []() { + return std::make_unique(); + }); + for (const auto& lib_path: lib_paths) { + // For every library path, create a corresponding memory buffer. + auto memory_buffer = llvm::MemoryBuffer::getFile(lib_path); + if (!memory_buffer) + throw std::runtime_error("Unable to create memory buffer for " + lib_path); + + // Create a new JIT library instance for this session and resolve symbols. + auto& jd = session.createBareJITDylib(std::string(lib_path)); + auto loaded = + llvm::orc::DynamicLibrarySearchGenerator::Load(lib_path.data(), + data_layout.getGlobalPrefix()); + + if (!loaded) + throw std::runtime_error("Unable to load " + lib_path); + jd.addGenerator(std::move(*loaded)); + cantFail(layer->add(jd, std::move(*memory_buffer))); + } + + return layer; + }; + // Create IR compile function callback. auto compile_function_creator = [&](llvm::orc::JITTargetMachineBuilder tm_builder) -> llvm::Expected> { // Create target machine with some features possibly turned off. auto tm = create_target(&tm_builder, features); - - // Set the target triple and the data layout for the module. - module->setDataLayout(tm->createDataLayout()); - module->setTargetTriple(tm->getTargetTriple().getTriple()); - return std::make_unique(std::move(tm)); }; // Set JIT instance and extract the data layout from the module. - auto jit_instance = cantFail( - llvm::orc::LLJITBuilder().setCompileFunctionCreator(compile_function_creator).create()); - auto data_layout = module->getDataLayout(); + auto jit_instance = cantFail(llvm::orc::LLJITBuilder() + .setCompileFunctionCreator(compile_function_creator) + .setObjectLinkingLayerCreator(object_linking_layer_creator) + .create()); // Add a ThreadSafeModule to the driver. llvm::orc::ThreadSafeModule tsm(std::move(module), std::make_unique()); @@ -80,5 +110,25 @@ std::unique_ptr JITDriver::create_target( return std::unique_ptr(tm); } +void JITDriver::set_triple_and_data_layout(const std::string& features) { + // Get the default target triple for the host. 
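+    // (TargetRegistry::lookupTarget below relies on the native target having
+    // been registered first; init() does that via InitializeNativeTarget()
+    // before calling this helper.)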
+ auto target_triple = llvm::sys::getDefaultTargetTriple(); + std::string error_msg; + auto* target = llvm::TargetRegistry::lookupTarget(target_triple, error_msg); + if (!target) + throw std::runtime_error("Error " + error_msg + "\n"); + + // Get the CPU information and set a target machine to create the data layout. + std::string cpu(llvm::sys::getHostCPUName()); + + std::unique_ptr tm( + target->createTargetMachine(target_triple, cpu, features, {}, {})); + if (!tm) + throw std::runtime_error("Error: could not create the target machine\n"); + + // Set data layout and the target triple to the module. + module->setDataLayout(tm->createDataLayout()); + module->setTargetTriple(target_triple); +} } // namespace runner } // namespace nmodl diff --git a/src/codegen/llvm/jit_driver.hpp b/src/codegen/llvm/jit_driver.hpp index f994a57303..d46e605054 100644 --- a/src/codegen/llvm/jit_driver.hpp +++ b/src/codegen/llvm/jit_driver.hpp @@ -37,7 +37,7 @@ class JITDriver { : module(std::move(m)) {} /// Initialize the JIT. - void init(std::string features); + void init(std::string features, std::vector& lib_paths); /// Lookup the entry-point without arguments in the JIT and execute it, returning the result. template @@ -66,6 +66,9 @@ class JITDriver { /// A wrapper around llvm::createTargetMachine to turn on/off certain CPU features. std::unique_ptr create_target(llvm::orc::JITTargetMachineBuilder* builder, const std::string& features); + + /// Sets the triple and the data layout for the module. + void set_triple_and_data_layout(const std::string& features); }; /** @@ -79,9 +82,11 @@ class Runner { std::unique_ptr driver = std::make_unique(std::move(module)); public: - Runner(std::unique_ptr m, std::string features = "") + Runner(std::unique_ptr m, + std::string features = "", + std::vector lib_paths = {}) : module(std::move(m)) { - driver->init(features); + driver->init(features, lib_paths); } /// Run the entry-point function without arguments. diff --git a/src/codegen/llvm/llvm_benchmark.cpp b/src/codegen/llvm/llvm_benchmark.cpp index 6ab9ff4982..4c49ce30df 100644 --- a/src/codegen/llvm/llvm_benchmark.cpp +++ b/src/codegen/llvm/llvm_benchmark.cpp @@ -43,7 +43,8 @@ void LLVMBenchmark::benchmark(const std::shared_ptr& node) { output_dir, llvm_build_info.opt_passes, llvm_build_info.use_single_precision, - llvm_build_info.vector_width); + llvm_build_info.vector_width, + llvm_build_info.vec_lib); generate_llvm(visitor, node); // Finally, run the benchmark and log the measurements. @@ -103,7 +104,7 @@ void LLVMBenchmark::run_benchmark(codegen::CodegenLLVMVisitor& visitor, std::string features_str = llvm::join(features.begin(), features.end(), ","); std::unique_ptr m = visitor.get_module(); - runner::Runner runner(std::move(m), features_str); + runner::Runner runner(std::move(m), features_str, shared_libs); // Benchmark every kernel. 
    for (const auto& kernel_name: kernel_names) {
diff --git a/src/codegen/llvm/llvm_benchmark.hpp b/src/codegen/llvm/llvm_benchmark.hpp
index 30ebf182e8..d23567d79d 100644
--- a/src/codegen/llvm/llvm_benchmark.hpp
+++ b/src/codegen/llvm/llvm_benchmark.hpp
@@ -20,6 +20,7 @@ struct LLVMBuildInfo {
     int vector_width;
     bool opt_passes;
     bool use_single_precision;
+    std::string vec_lib;
 };
 
 /**
@@ -33,6 +34,8 @@ class LLVMBenchmark {
 
     std::string output_dir;
 
+    std::vector shared_libs;
+
     int num_experiments;
 
     int instance_size;
 
@@ -65,12 +68,14 @@ class LLVMBenchmark {
   public:
     LLVMBenchmark(const std::string& mod_filename,
                   const std::string& output_dir,
+                  std::vector shared_libs,
                   LLVMBuildInfo info,
                   int num_experiments,
                   int instance_size,
                   const std::string& backend)
         : mod_filename(mod_filename)
         , output_dir(output_dir)
+        , shared_libs(shared_libs)
         , num_experiments(num_experiments)
         , instance_size(instance_size)
         , backend(backend)
diff --git a/src/main.cpp b/src/main.cpp
index 2d49f76445..8ca759509d 100644
--- a/src/main.cpp
+++ b/src/main.cpp
@@ -182,9 +182,12 @@ int main(int argc, const char* argv[]) {
     /// llvm vector width
     int llvm_vec_width = 1;
 
-    /// vector library
+    /// vector library name
     std::string vec_lib("none");
 
+    /// list of shared libraries to link
+    std::vector libs;
+
     /// run llvm benchmark
     bool run_benchmark(false);
 
@@ -329,6 +332,9 @@ int main(int argc, const char* argv[]) {
     benchmark_opt->add_flag("--run",
         run_benchmark,
         "Run LLVM benchmark ({})"_format(run_benchmark))->ignore_case();
+    benchmark_opt->add_option("--libs", libs, "Shared libraries to link IR against")
+        ->ignore_case()
+        ->check(CLI::ExistingFile);
     benchmark_opt->add_option("--instance-size",
         instance_size,
         "Instance struct size ({})"_format(instance_size))->ignore_case();
 
@@ -646,9 +652,12 @@ int main(int argc, const char* argv[]) {
 
     if (run_benchmark) {
         logger->info("Running LLVM benchmark");
-        benchmark::LLVMBuildInfo info{llvm_vec_width, llvm_ir_opt_passes, llvm_float_type};
+        benchmark::LLVMBuildInfo info{llvm_vec_width,
+                                      llvm_ir_opt_passes,
+                                      llvm_float_type,
+                                      vec_lib};
         benchmark::LLVMBenchmark bench(
-            modfile, output_dir, info, repeat, instance_size, backend);
+            modfile, output_dir, libs, info, repeat, instance_size, backend);
         bench.benchmark(ast);
     }
 
From baf95f04d3e8ecc90c01e2266d05477e798ce6c0 Mon Sep 17 00:00:00 2001
From: Pramod Kumbhar
Date: Sat, 24 Apr 2021 21:43:05 +0200
Subject: [PATCH 155/331] Avoid local std::ofstream object causing segfault
 (#614)

- std::ofstream().rdbuf() was used, but since the stream was a local
  object it became invalid at the end of the function scope
- make the std::ofstream a member variable

---
 src/codegen/llvm/llvm_benchmark.cpp | 2 --
 src/codegen/llvm/llvm_benchmark.hpp | 2 ++
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/codegen/llvm/llvm_benchmark.cpp b/src/codegen/llvm/llvm_benchmark.cpp
index 4c49ce30df..c93b723cb0 100644
--- a/src/codegen/llvm/llvm_benchmark.cpp
+++ b/src/codegen/llvm/llvm_benchmark.cpp
@@ -145,8 +145,6 @@ void LLVMBenchmark::set_log_output() {
     // Otherwise, dump logs to the specified file.
     std::string filename = output_dir + "/" + mod_filename + ".log";
 
-    std::ofstream ofs;
-
     ofs.open(filename.c_str());
 
     if (ofs.fail())
diff --git a/src/codegen/llvm/llvm_benchmark.hpp b/src/codegen/llvm/llvm_benchmark.hpp
index d23567d79d..646912c253 100644
--- a/src/codegen/llvm/llvm_benchmark.hpp
+++ b/src/codegen/llvm/llvm_benchmark.hpp
@@ -46,6 +46,8 @@ class LLVMBenchmark {
 
     std::shared_ptr log_stream;
 
+    std::ofstream ofs;
+
     /// Disable the specified feature.
    void disable(const std::string& feature, std::vector& host_features);
 
From 272ffc5f8df43aa637d99032cfa6275a728ab604 Mon Sep 17 00:00:00 2001
From: George Mitenkov
Date: Fri, 30 Apr 2021 15:29:31 -0700
Subject: [PATCH 156/331] Refactoring of runners' infrastructure and dumping
 object files (#620)

The following is added:

1. Dumping object files in JIT.

Functionality to dump the binary generated from the LLVM IR module into
a `.o` file (enabled by default) has been added to benchmarking. Now, in
addition to logs, a `v<vector_width>_<mod_filename>.o` file is
generated. The reasons it is an object file and not assembly (and hence
not included in the logs) are the following:

- LLVM does not have library functions that turn an in-memory object
  back into assembly; only an `object -> file -> assembly` path exists.
  It also has an `llvm-objdump` tool, but it is intended as a
  command-line utility and does not have a well-defined API.
- Writing custom functions to produce readable assembly is not a
  priority. Also, mimicking `objdump` functionality would be difficult.
- Both `objdump` and `llvm-objdump` can be used to inspect the `.o`
  file manually.

2. Refactoring of the `Runner` class.

In addition to the support for dumping the binary, the `Runner` and
`JITDriver` classes were refactored to have a cleaner OOP style.

fixes #611

Co-authored-by: Pramod S Kumbhar
---
 src/codegen/llvm/jit_driver.cpp              | 11 ++-
 src/codegen/llvm/jit_driver.hpp              | 98 +++++++++++++++-----
 src/codegen/llvm/llvm_benchmark.cpp          | 6 +-
 src/codegen/llvm/main.cpp                    | 3 +-
 test/unit/codegen/codegen_llvm_execution.cpp | 12 ++-
 5 files changed, 102 insertions(+), 28 deletions(-)

diff --git a/src/codegen/llvm/jit_driver.cpp b/src/codegen/llvm/jit_driver.cpp
index ec08e8856d..7910036848 100644
--- a/src/codegen/llvm/jit_driver.cpp
+++ b/src/codegen/llvm/jit_driver.cpp
@@ -15,6 +15,7 @@
 #include "llvm/ExecutionEngine/Orc/IRCompileLayer.h"
 #include "llvm/ExecutionEngine/Orc/JITTargetMachineBuilder.h"
 #include "llvm/ExecutionEngine/Orc/LLJIT.h"
+#include "llvm/ExecutionEngine/Orc/ObjectTransformLayer.h"
 #include "llvm/ExecutionEngine/Orc/RTDyldObjectLinkingLayer.h"
 #include "llvm/ExecutionEngine/SectionMemoryManager.h"
 #include "llvm/Support/Host.h"
@@ -24,7 +25,9 @@
 namespace nmodl {
 namespace runner {
 
-void JITDriver::init(std::string features, std::vector& lib_paths) {
+void JITDriver::init(std::string features,
+                     std::vector lib_paths,
+                     ObjDumpInfo* dump_info) {
     llvm::InitializeNativeTarget();
     llvm::InitializeNativeTargetAsmPrinter();
 
@@ -83,6 +86,12 @@ void JITDriver::init(std::string features,
     llvm::orc::JITDylib& sym_tab = jit->getMainJITDylib();
     sym_tab.addGenerator(cantFail(llvm::orc::DynamicLibrarySearchGenerator::GetForCurrentProcess(
         data_layout.getGlobalPrefix())));
+
+    // Optionally, dump the binary to the object file.
+    if (dump_info) {
+        jit->getObjTransformLayer().setTransform(
+            llvm::orc::DumpObjects(dump_info->output_dir, dump_info->filename));
+    }
 }
 
 std::unique_ptr JITDriver::create_target(
diff --git a/src/codegen/llvm/jit_driver.hpp b/src/codegen/llvm/jit_driver.hpp
index d46e605054..dfd06ca7ee 100644
--- a/src/codegen/llvm/jit_driver.hpp
+++ b/src/codegen/llvm/jit_driver.hpp
@@ -20,9 +20,18 @@
 namespace nmodl {
 namespace runner {
 
+/// A struct to hold the information for dumping object file.
+struct ObjDumpInfo {
+    /// Object file name.
+    std::string filename;
+
+    /// Object file output directory.
+ std::string output_dir; +}; + /** * \class JITDriver - * \brief Driver to execute MOD file function via LLVM IR backend + * \brief Driver to execute a MOD file function via LLVM IR backend. */ class JITDriver { private: @@ -33,13 +42,15 @@ class JITDriver { std::unique_ptr module; public: - JITDriver(std::unique_ptr m) + explicit JITDriver(std::unique_ptr m) : module(std::move(m)) {} - /// Initialize the JIT. - void init(std::string features, std::vector& lib_paths); + /// Initializes the JIT. + void init(std::string features = "", + std::vector lib_paths = {}, + ObjDumpInfo* dump_info = nullptr); - /// Lookup the entry-point without arguments in the JIT and execute it, returning the result. + /// Lookups the entry-point without arguments in the JIT and executes it, returning the result. template ReturnType execute_without_arguments(const std::string& entry_point) { auto expected_symbol = jit->lookup(entry_point); @@ -51,7 +62,7 @@ class JITDriver { return result; } - /// Lookup the entry-point with an argument in the JIT and execute it, returning the result. + /// Lookups the entry-point with an argument in the JIT and executes it, returning the result. template ReturnType execute_with_arguments(const std::string& entry_point, ArgType arg) { auto expected_symbol = jit->lookup(entry_point); @@ -63,7 +74,8 @@ class JITDriver { return result; } - /// A wrapper around llvm::createTargetMachine to turn on/off certain CPU features. + private: + /// Creates llvm::TargetMachine with certain CPU features turned on/off. std::unique_ptr create_target(llvm::orc::JITTargetMachineBuilder* builder, const std::string& features); @@ -72,35 +84,79 @@ class JITDriver { }; /** - * \class Runner - * \brief A wrapper around JITDriver to execute an entry point in the LLVM IR module. + * \class BaseRunner + * \brief A base runner class that provides functionality to execute an + * entry point in the LLVM IR module. */ -class Runner { - private: - std::unique_ptr module; +class BaseRunner { + protected: + std::unique_ptr driver; - std::unique_ptr driver = std::make_unique(std::move(module)); + explicit BaseRunner(std::unique_ptr m) + : driver(std::make_unique(std::move(m))) {} public: - Runner(std::unique_ptr m, - std::string features = "", - std::vector lib_paths = {}) - : module(std::move(m)) { - driver->init(features, lib_paths); - } + /// Sets up the JIT driver. + virtual void initialize_driver() = 0; - /// Run the entry-point function without arguments. + /// Runs the entry-point function without arguments. template ReturnType run_without_arguments(const std::string& entry_point) { return driver->template execute_without_arguments(entry_point); } - /// Run the entry-point function with a pointer to the data as an argument. + /// Runs the entry-point function with a pointer to the data as an argument. template ReturnType run_with_argument(const std::string& entry_point, ArgType arg) { return driver->template execute_with_arguments(entry_point, arg); } }; +/** + * \class TestRunner + * \brief A simple runner for testing purposes. + */ +class TestRunner: public BaseRunner { + public: + explicit TestRunner(std::unique_ptr m) + : BaseRunner(std::move(m)) {} + + virtual void initialize_driver() { + driver->init(); + } +}; + +/** + * \class BenchmarkRunner + * \brief A runner with benchmarking functionality. It takes user-specified CPU + * features into account, as well as it can link against shared libraries. 
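+ *
+ * A minimal usage sketch (mirroring llvm_benchmark.cpp):
+ * \code
+ * runner::BenchmarkRunner runner(std::move(module), filename, output_dir, features, libs);
+ * runner.initialize_driver();
+ * \endcode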
+ */ +class BenchmarkRunner: public BaseRunner { + private: + /// Information on dumping object file generated from LLVM IR. + ObjDumpInfo dump_info; + + /// CPU features specified by the user. + std::string features; + + /// Shared libraries' paths to link against. + std::vector shared_lib_paths; + + public: + BenchmarkRunner(std::unique_ptr m, + std::string filename, + std::string output_dir, + std::string features = "", + std::vector lib_paths = {}) + : BaseRunner(std::move(m)) + , dump_info{filename, output_dir} + , features(features) + , shared_lib_paths(lib_paths) {} + + virtual void initialize_driver() { + driver->init(features, shared_lib_paths, &dump_info); + } +}; + } // namespace runner } // namespace nmodl diff --git a/src/codegen/llvm/llvm_benchmark.cpp b/src/codegen/llvm/llvm_benchmark.cpp index c93b723cb0..87e36ec822 100644 --- a/src/codegen/llvm/llvm_benchmark.cpp +++ b/src/codegen/llvm/llvm_benchmark.cpp @@ -104,7 +104,11 @@ void LLVMBenchmark::run_benchmark(codegen::CodegenLLVMVisitor& visitor, std::string features_str = llvm::join(features.begin(), features.end(), ","); std::unique_ptr m = visitor.get_module(); - runner::Runner runner(std::move(m), features_str, shared_libs); + + // Create the benchmark runner and intialize it. + std::string filename = "v" + std::to_string(llvm_build_info.vector_width) + "_" + mod_filename; + runner::BenchmarkRunner runner(std::move(m), filename, output_dir, features_str, shared_libs); + runner.initialize_driver(); // Benchmark every kernel. for (const auto& kernel_name: kernel_names) { diff --git a/src/codegen/llvm/main.cpp b/src/codegen/llvm/main.cpp index acbdc37f19..b700f5ad59 100644 --- a/src/codegen/llvm/main.cpp +++ b/src/codegen/llvm/main.cpp @@ -64,7 +64,8 @@ int main(int argc, const char* argv[]) { throw std::runtime_error( "Error: entry-point functions with non-double return type are not supported\n"); - Runner runner(std::move(module)); + TestRunner runner(std::move(module)); + runner.initialize_driver(); // Since only double type is supported, provide explicit double type to the running function. auto r = runner.run_without_arguments(entry_point_name); diff --git a/test/unit/codegen/codegen_llvm_execution.cpp b/test/unit/codegen/codegen_llvm_execution.cpp index 4e2717e45c..cec4e5017b 100644 --- a/test/unit/codegen/codegen_llvm_execution.cpp +++ b/test/unit/codegen/codegen_llvm_execution.cpp @@ -130,7 +130,8 @@ SCENARIO("Arithmetic expression", "[llvm][runner]") { llvm_visitor.visit_program(*ast); std::unique_ptr m = llvm_visitor.get_module(); - Runner runner(std::move(m)); + TestRunner runner(std::move(m)); + runner.initialize_driver(); THEN("functions are evaluated correctly") { auto exp_result = runner.run_without_arguments("exponential"); @@ -231,7 +232,8 @@ SCENARIO("Optimised arithmetic expression", "[llvm][runner]") { llvm_visitor.visit_program(*ast); std::unique_ptr m = llvm_visitor.get_module(); - Runner runner(std::move(m)); + TestRunner runner(std::move(m)); + runner.initialize_driver(); THEN("optimizations preserve function results") { // Check exponential is turned into a constant. @@ -325,7 +327,8 @@ SCENARIO("Simple scalar kernel", "[llvm][runner]") { // Set up the JIT runner. 
std::unique_ptr module = llvm_visitor.get_module(); - Runner runner(std::move(module)); + TestRunner runner(std::move(module)); + runner.initialize_driver(); THEN("Values in struct have changed according to the formula") { runner.run_with_argument("__nrn_state_test_wrapper", @@ -412,7 +415,8 @@ SCENARIO("Simple vectorised kernel", "[llvm][runner]") { // Set up the JIT runner. std::unique_ptr module = llvm_visitor.get_module(); - Runner runner(std::move(module)); + TestRunner runner(std::move(module)); + runner.initialize_driver(); THEN("Values in struct have changed according to the formula") { runner.run_with_argument("__nrn_state_test_wrapper", From 3a8b8ff5def0dac716e4484106e15eb165f31193 Mon Sep 17 00:00:00 2001 From: George Mitenkov Date: Fri, 7 May 2021 15:13:53 -0700 Subject: [PATCH 157/331] Optimisation levels for benchmarking (#623) This PR adds two flags to the benchmarking pipeline: * `--opt-level-ir`: This flag is used to run `-On` passes on the generated LLVM IR module. * `--opt-level-codegen`: This flag is used for setting optimisation level for machine code generation inside the JIT target machine. * As an example: ```bash $ ./nmodl file.mod \ llvm --ir --vector-width 1 \ benchmark --run --instance-size 10000000 --repeat 20 --opt-level-ir 2 --opt-level-codegen 2 ``` fixes #616 --- cmake/LLVMHelper.cmake | 3 + src/codegen/llvm/jit_driver.cpp | 197 ++++++++++++++++++++-------- src/codegen/llvm/jit_driver.hpp | 36 ++--- src/codegen/llvm/llvm_benchmark.cpp | 40 +++--- src/codegen/llvm/llvm_benchmark.hpp | 59 +++++---- src/main.cpp | 59 ++++++--- 6 files changed, 266 insertions(+), 128 deletions(-) diff --git a/cmake/LLVMHelper.cmake b/cmake/LLVMHelper.cmake index 2b7db94a85..b0c8b2a48b 100644 --- a/cmake/LLVMHelper.cmake +++ b/cmake/LLVMHelper.cmake @@ -12,9 +12,12 @@ llvm_map_components_to_libnames( core executionengine instcombine + ipo mc native orcjit + target + transformutils scalaropts support) set(CMAKE_REQUIRED_INCLUDES ${LLVM_INCLUDE_DIRS}) diff --git a/src/codegen/llvm/jit_driver.cpp b/src/codegen/llvm/jit_driver.cpp index 7910036848..1e8eb4bfd0 100644 --- a/src/codegen/llvm/jit_driver.cpp +++ b/src/codegen/llvm/jit_driver.cpp @@ -8,6 +8,7 @@ #include "jit_driver.hpp" #include "codegen/llvm/codegen_llvm_visitor.hpp" +#include "llvm/Analysis/TargetTransformInfo.h" #include "llvm/ExecutionEngine/JITEventListener.h" #include "llvm/ExecutionEngine/ObjectCache.h" #include "llvm/ExecutionEngine/Orc/CompileUtils.h" @@ -18,21 +19,139 @@ #include "llvm/ExecutionEngine/Orc/ObjectTransformLayer.h" #include "llvm/ExecutionEngine/Orc/RTDyldObjectLinkingLayer.h" #include "llvm/ExecutionEngine/SectionMemoryManager.h" +#include "llvm/IR/AssemblyAnnotationWriter.h" +#include "llvm/InitializePasses.h" #include "llvm/Support/Host.h" #include "llvm/Support/TargetRegistry.h" #include "llvm/Support/TargetSelect.h" +#include "llvm/Support/ToolOutputFile.h" +#include "llvm/Transforms/IPO/PassManagerBuilder.h" namespace nmodl { namespace runner { +/****************************************************************************************/ +/* Utilities for JIT driver */ +/****************************************************************************************/ + +/// Initialises some LLVM optimisation passes. 
+static void initialise_optimisation_passes() { + auto& registry = *llvm::PassRegistry::getPassRegistry(); + llvm::initializeCore(registry); + llvm::initializeTransformUtils(registry); + llvm::initializeScalarOpts(registry); + llvm::initializeInstCombine(registry); + llvm::initializeAnalysis(registry); +} + +/// Populates pass managers with passes for the given optimisation levels. +static void populate_pms(llvm::legacy::FunctionPassManager& func_pm, + llvm::legacy::PassManager& module_pm, + int opt_level, + int size_level, + llvm::TargetMachine* tm) { + // First, set the pass manager builder with some basic optimisation information. + llvm::PassManagerBuilder pm_builder; + pm_builder.OptLevel = opt_level; + pm_builder.SizeLevel = size_level; + pm_builder.DisableUnrollLoops = opt_level == 0; + + // If target machine is defined, then initialise the TargetTransformInfo for the target. + if (tm) { + module_pm.add(createTargetTransformInfoWrapperPass(tm->getTargetIRAnalysis())); + func_pm.add(createTargetTransformInfoWrapperPass(tm->getTargetIRAnalysis())); + } + + // Populate pass managers. + pm_builder.populateModulePassManager(module_pm); + pm_builder.populateFunctionPassManager(func_pm); +} + +/// Runs the function and module passes on the provided module. +static void run_optimisation_passes(llvm::Module& module, + llvm::legacy::FunctionPassManager& func_pm, + llvm::legacy::PassManager& module_pm) { + func_pm.doInitialization(); + auto& functions = module.getFunctionList(); + for (auto& function: functions) { + llvm::verifyFunction(function); + func_pm.run(function); + } + func_pm.doFinalization(); + module_pm.run(module); +} + +/// Optimises the given LLVM IR module. +static void optimise_module(llvm::Module& module, + int opt_level, + llvm::TargetMachine* tm = nullptr) { + llvm::legacy::FunctionPassManager func_pm(&module); + llvm::legacy::PassManager module_pm; + populate_pms(func_pm, module_pm, opt_level, /*size_level=*/0, tm); + run_optimisation_passes(module, func_pm, module_pm); +} + +/// Sets the target triple and the data layout of the module. +static void set_triple_and_data_layout(llvm::Module& module, const std::string& features) { + // Get the default target triple for the host. + auto target_triple = llvm::sys::getDefaultTargetTriple(); + std::string error_msg; + auto* target = llvm::TargetRegistry::lookupTarget(target_triple, error_msg); + if (!target) + throw std::runtime_error("Error " + error_msg + "\n"); + + // Get the CPU information and set a target machine to create the data layout. + std::string cpu(llvm::sys::getHostCPUName()); + std::unique_ptr tm( + target->createTargetMachine(target_triple, cpu, features, {}, {})); + if (!tm) + throw std::runtime_error("Error: could not create the target machine\n"); + + // Set data layout and the target triple to the module. + module.setDataLayout(tm->createDataLayout()); + module.setTargetTriple(target_triple); +} + +/// Creates llvm::TargetMachine with certain CPU features turned on/off. +static std::unique_ptr create_target( + llvm::orc::JITTargetMachineBuilder* tm_builder, + const std::string& features, + int opt_level) { + // First, look up the target. + std::string error_msg; + auto target_triple = tm_builder->getTargetTriple().getTriple(); + auto* target = llvm::TargetRegistry::lookupTarget(target_triple, error_msg); + if (!target) + throw std::runtime_error("Error " + error_msg + "\n"); + + // Create default target machine with provided features. 
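+    // (The opt_level argument sets the TargetMachine's code-generation
+    // optimisation level only; IR-level -On passes are applied separately by
+    // optimise_module().)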
+ auto tm = target->createTargetMachine(target_triple, + llvm::sys::getHostCPUName().str(), + features, + tm_builder->getOptions(), + tm_builder->getRelocationModel(), + tm_builder->getCodeModel(), + static_cast(opt_level), + /*JIT=*/true); + if (!tm) + throw std::runtime_error("Error: could not create the target machine\n"); + + return std::unique_ptr(tm); +} + +/****************************************************************************************/ +/* JIT driver */ +/****************************************************************************************/ + void JITDriver::init(std::string features, std::vector lib_paths, - ObjDumpInfo* dump_info) { + BenchmarkInfo* benchmark_info) { llvm::InitializeNativeTarget(); llvm::InitializeNativeTargetAsmPrinter(); + initialise_optimisation_passes(); // Set the target triple and the data layout for the module. - set_triple_and_data_layout(features); + set_triple_and_data_layout(*module, features); auto data_layout = module->getDataLayout(); // Create object linking function callback. @@ -67,11 +186,31 @@ void JITDriver::init(std::string features, auto compile_function_creator = [&](llvm::orc::JITTargetMachineBuilder tm_builder) -> llvm::Expected> { // Create target machine with some features possibly turned off. - auto tm = create_target(&tm_builder, features); + auto tm = create_target(&tm_builder, features, benchmark_info->opt_level_codegen); + + // Optimise the LLVM IR module. + optimise_module(*module, benchmark_info->opt_level_ir, tm.get()); + + // Save optimised module to .ll file if benchmarking. + if (benchmark_info) { + std::error_code error_code; + std::unique_ptr out = + std::make_unique(benchmark_info->output_dir + "/" + + benchmark_info->filename + "_opt.ll", + error_code, + llvm::sys::fs::OF_Text); + if (error_code) + throw std::runtime_error("Error: " + error_code.message()); + + std::unique_ptr annotator; + module->print(out->os(), annotator.get()); + out->keep(); + } + return std::make_unique(std::move(tm)); }; - // Set JIT instance and extract the data layout from the module. + // Set the JIT instance. auto jit_instance = cantFail(llvm::orc::LLJITBuilder() .setCompileFunctionCreator(compile_function_creator) .setObjectLinkingLayerCreator(object_linking_layer_creator) @@ -88,56 +227,10 @@ void JITDriver::init(std::string features, data_layout.getGlobalPrefix()))); // Optionally, dump the binary to the object file. - if (dump_info) { + if (benchmark_info) { jit->getObjTransformLayer().setTransform( - llvm::orc::DumpObjects(dump_info->output_dir, dump_info->filename)); + llvm::orc::DumpObjects(benchmark_info->output_dir, benchmark_info->filename)); } } - -std::unique_ptr JITDriver::create_target( - llvm::orc::JITTargetMachineBuilder* builder, - const std::string& features) { - // First, look up the target. - std::string error_msg; - auto target_triple = builder->getTargetTriple().getTriple(); - auto* target = llvm::TargetRegistry::lookupTarget(target_triple, error_msg); - if (!target) - throw std::runtime_error("Error " + error_msg + "\n"); - - // Create default target machine with provided features. 
- auto tm = target->createTargetMachine(target_triple, - llvm::sys::getHostCPUName().str(), - features, - builder->getOptions(), - builder->getRelocationModel(), - builder->getCodeModel(), - /*OL=*/llvm::CodeGenOpt::Default, - /*JIT=*/true); - if (!tm) - throw std::runtime_error("Error: could not create the target machine\n"); - - return std::unique_ptr(tm); -} - -void JITDriver::set_triple_and_data_layout(const std::string& features) { - // Get the default target triple for the host. - auto target_triple = llvm::sys::getDefaultTargetTriple(); - std::string error_msg; - auto* target = llvm::TargetRegistry::lookupTarget(target_triple, error_msg); - if (!target) - throw std::runtime_error("Error " + error_msg + "\n"); - - // Get the CPU information and set a target machine to create the data layout. - std::string cpu(llvm::sys::getHostCPUName()); - - std::unique_ptr tm( - target->createTargetMachine(target_triple, cpu, features, {}, {})); - if (!tm) - throw std::runtime_error("Error: could not create the target machine\n"); - - // Set data layout and the target triple to the module. - module->setDataLayout(tm->createDataLayout()); - module->setTargetTriple(target_triple); -} } // namespace runner } // namespace nmodl diff --git a/src/codegen/llvm/jit_driver.hpp b/src/codegen/llvm/jit_driver.hpp index dfd06ca7ee..151ec177d8 100644 --- a/src/codegen/llvm/jit_driver.hpp +++ b/src/codegen/llvm/jit_driver.hpp @@ -20,13 +20,19 @@ namespace nmodl { namespace runner { -/// A struct to hold the information for dumping object file. -struct ObjDumpInfo { - /// Object file name. +/// A struct to hold the information for benchmarking. +struct BenchmarkInfo { + /// Object filename to dump. std::string filename; /// Object file output directory. std::string output_dir; + + /// Optimisation level for generated IR. + int opt_level_ir; + + /// Optimisation level for machine code generation. + int opt_level_codegen; }; /** @@ -45,10 +51,10 @@ class JITDriver { explicit JITDriver(std::unique_ptr m) : module(std::move(m)) {} - /// Initializes the JIT. + /// Initializes the JIT driver. void init(std::string features = "", std::vector lib_paths = {}, - ObjDumpInfo* dump_info = nullptr); + BenchmarkInfo* benchmark_info = nullptr); /// Lookups the entry-point without arguments in the JIT and executes it, returning the result. template @@ -73,14 +79,6 @@ class JITDriver { ReturnType result = res(arg); return result; } - - private: - /// Creates llvm::TargetMachine with certain CPU features turned on/off. - std::unique_ptr create_target(llvm::orc::JITTargetMachineBuilder* builder, - const std::string& features); - - /// Sets the triple and the data layout for the module. - void set_triple_and_data_layout(const std::string& features); }; /** @@ -133,8 +131,8 @@ class TestRunner: public BaseRunner { */ class BenchmarkRunner: public BaseRunner { private: - /// Information on dumping object file generated from LLVM IR. - ObjDumpInfo dump_info; + /// Benchmarking information passed to JIT driver. + BenchmarkInfo benchmark_info; /// CPU features specified by the user. 
std::string features; @@ -147,14 +145,16 @@ class BenchmarkRunner: public BaseRunner { std::string filename, std::string output_dir, std::string features = "", - std::vector lib_paths = {}) + std::vector lib_paths = {}, + int opt_level_ir = 0, + int opt_level_codegen = 0) : BaseRunner(std::move(m)) - , dump_info{filename, output_dir} + , benchmark_info{filename, output_dir, opt_level_ir, opt_level_codegen} , features(features) , shared_lib_paths(lib_paths) {} virtual void initialize_driver() { - driver->init(features, shared_lib_paths, &dump_info); + driver->init(features, shared_lib_paths, &benchmark_info); } }; diff --git a/src/codegen/llvm/llvm_benchmark.cpp b/src/codegen/llvm/llvm_benchmark.cpp index 87e36ec822..df0c54517d 100644 --- a/src/codegen/llvm/llvm_benchmark.cpp +++ b/src/codegen/llvm/llvm_benchmark.cpp @@ -19,10 +19,24 @@ namespace nmodl { namespace benchmark { - /// Precision for the timing measurements. static constexpr int PRECISION = 9; +/// Get the host CPU features in the format: +/// +feature,+feature,-feature,+feature,... +/// where `+` indicates that the feature is enabled. +static std::vector get_cpu_features() { + std::string cpu(llvm::sys::getHostCPUName()); + + llvm::SubtargetFeatures features; + llvm::StringMap host_features; + if (llvm::sys::getHostCPUFeatures(host_features)) { + for (auto& f: host_features) + features.AddFeature(f.first(), f.second); + } + return features.getFeatures(); +} + void LLVMBenchmark::disable(const std::string& feature, std::vector& host_features) { for (auto& host_feature: host_features) { @@ -34,7 +48,7 @@ void LLVMBenchmark::disable(const std::string& feature, std::vector } } -void LLVMBenchmark::benchmark(const std::shared_ptr& node) { +void LLVMBenchmark::run(const std::shared_ptr& node) { // First, set the output stream for the logs. set_log_output(); @@ -65,18 +79,6 @@ void LLVMBenchmark::generate_llvm(codegen::CodegenLLVMVisitor& visitor, << diff.count() << "\n\n"; } -std::vector LLVMBenchmark::get_cpu_features() { - std::string cpu(llvm::sys::getHostCPUName()); - - llvm::SubtargetFeatures features; - llvm::StringMap host_features; - if (llvm::sys::getHostCPUFeatures(host_features)) { - for (auto& f: host_features) - features.AddFeature(f.first(), f.second); - } - return features.getFeatures(); -} - void LLVMBenchmark::run_benchmark(codegen::CodegenLLVMVisitor& visitor, const std::shared_ptr& node) { // Set the codegen data helper and find the kernels. @@ -105,9 +107,15 @@ void LLVMBenchmark::run_benchmark(codegen::CodegenLLVMVisitor& visitor, std::string features_str = llvm::join(features.begin(), features.end(), ","); std::unique_ptr m = visitor.get_module(); - // Create the benchmark runner and intialize it. + // Create the benchmark runner and initialize it. std::string filename = "v" + std::to_string(llvm_build_info.vector_width) + "_" + mod_filename; - runner::BenchmarkRunner runner(std::move(m), filename, output_dir, features_str, shared_libs); + runner::BenchmarkRunner runner(std::move(m), + filename, + output_dir, + features_str, + shared_libs, + opt_level_ir, + opt_level_codegen); runner.initialize_driver(); // Benchmark every kernel. diff --git a/src/codegen/llvm/llvm_benchmark.hpp b/src/codegen/llvm/llvm_benchmark.hpp index 646912c253..c2c781d7f0 100644 --- a/src/codegen/llvm/llvm_benchmark.hpp +++ b/src/codegen/llvm/llvm_benchmark.hpp @@ -30,43 +30,39 @@ struct LLVMBuildInfo { */ class LLVMBenchmark { private: + /// Source MOD file name. 
std::string mod_filename; + /// The output directory for logs and other files. std::string output_dir; + /// Paths to shared libraries. std::vector shared_libs; + /// The number of experiments to repeat. int num_experiments; + /// The size of the instance struct for benchmarking. int instance_size; + /// Benchmarking backend std::string backend; + /// Optimisation level for LLVM IR transformations. + int opt_level_ir; + + /// Optimisation level for machine code generation. + int opt_level_codegen; + + /// LLVM visitor information. LLVMBuildInfo llvm_build_info; + /// The log output stream (file or stdout). std::shared_ptr log_stream; + /// Filestream for dumping logs to the file. std::ofstream ofs; - /// Disable the specified feature. - void disable(const std::string& feature, std::vector& host_features); - - /// Visits the AST to construct the LLVM IR module. - void generate_llvm(codegen::CodegenLLVMVisitor& visitor, - const std::shared_ptr& node); - - /// Get the host CPU features in the format: - /// +feature,+feature,-feature,+feature,... - /// where `+` indicates that the feature is enabled. - std::vector get_cpu_features(); - - /// Runs the main body of the benchmark, executing the compute kernels. - void run_benchmark(codegen::CodegenLLVMVisitor& visitor, - const std::shared_ptr& node); - - /// Sets the log output stream (file or console). - void set_log_output(); - public: LLVMBenchmark(const std::string& mod_filename, const std::string& output_dir, @@ -74,17 +70,36 @@ class LLVMBenchmark { LLVMBuildInfo info, int num_experiments, int instance_size, - const std::string& backend) + const std::string& backend, + int opt_level_ir, + int opt_level_codegen) : mod_filename(mod_filename) , output_dir(output_dir) , shared_libs(shared_libs) , num_experiments(num_experiments) , instance_size(instance_size) , backend(backend) - , llvm_build_info(info) {} + , llvm_build_info(info) + , opt_level_ir(opt_level_ir) + , opt_level_codegen(opt_level_codegen) {} /// Runs the benchmark. - void benchmark(const std::shared_ptr& node); + void run(const std::shared_ptr& node); + + private: + /// Disables the specified feature in the target. + void disable(const std::string& feature, std::vector& host_features); + + /// Visits the AST to construct the LLVM IR module. + void generate_llvm(codegen::CodegenLLVMVisitor& visitor, + const std::shared_ptr& node); + + /// Runs the main body of the benchmark, executing the compute kernels. + void run_benchmark(codegen::CodegenLLVMVisitor& visitor, + const std::shared_ptr& node); + + /// Sets the log output stream (file or console). 
+ void set_log_output(); }; diff --git a/src/main.cpp b/src/main.cpp index 8ca759509d..e5243037bd 100644 --- a/src/main.cpp +++ b/src/main.cpp @@ -183,19 +183,25 @@ int main(int argc, const char* argv[]) { int llvm_vec_width = 1; /// vector library name - std::string vec_lib("none"); - - /// list of shared libraries to link - std::vector libs; + std::string vector_library("none"); /// run llvm benchmark - bool run_benchmark(false); + bool run_llvm_benchmark(false); + + /// optimisation level for IR generation + int llvm_opt_level_ir = 0; + + /// optimisation level for machine code generation + int llvm_opt_level_codegen = 0; + + /// list of shared libraries to link against in JIT + std::vector shared_lib_paths; /// the size of the instance struct for the benchmark int instance_size = 10000; - /// the number of experiments to run for the benchmarking - int repeat = 100; + /// the number of repeated experiments for the benchmarking + int num_experiments = 100; /// specify the backend for LLVM IR to target std::string backend = "default"; @@ -324,23 +330,29 @@ int main(int argc, const char* argv[]) { llvm_vec_width, "LLVM explicit vectorisation width ({})"_format(llvm_vec_width))->ignore_case(); llvm_opt->add_option("--veclib", - vec_lib, - "Vector library for maths functions ({})"_format(vec_lib))->check(CLI::IsMember({"Accelerate", "libmvec", "MASSV", "SVML", "none"})); + vector_library, + "Vector library for maths functions ({})"_format(vector_library))->check(CLI::IsMember({"Accelerate", "libmvec", "MASSV", "SVML", "none"})); // LLVM IR benchmark options. auto benchmark_opt = app.add_subcommand("benchmark", "LLVM benchmark option")->ignore_case(); benchmark_opt->add_flag("--run", - run_benchmark, - "Run LLVM benchmark ({})"_format(run_benchmark))->ignore_case(); - benchmark_opt->add_option("--libs", libs, "Shared libraries to link IR against") + run_llvm_benchmark, + "Run LLVM benchmark ({})"_format(run_llvm_benchmark))->ignore_case(); + benchmark_opt->add_option("--opt-level-ir", + llvm_opt_level_ir, + "LLVM IR optimisation level (O{})"_format(llvm_opt_level_ir))->ignore_case()->check(CLI::IsMember({"0", "1", "2", "3"})); + benchmark_opt->add_option("--opt-level-codegen", + llvm_opt_level_codegen, + "Machine code optimisation level (O{})"_format(llvm_opt_level_codegen))->ignore_case()->check(CLI::IsMember({"0", "1", "2", "3"})); + benchmark_opt->add_option("--libs", shared_lib_paths, "Shared libraries to link IR against") ->ignore_case() ->check(CLI::ExistingFile); benchmark_opt->add_option("--instance-size", instance_size, "Instance struct size ({})"_format(instance_size))->ignore_case(); benchmark_opt->add_option("--repeat", - repeat, - "Number of experiments for benchmarking ({})"_format(repeat))->ignore_case(); + num_experiments, + "Number of experiments for benchmarking ({})"_format(num_experiments))->ignore_case(); benchmark_opt->add_option("--backend", backend, "Target's backend ({})"_format(backend))->ignore_case()->check(CLI::IsMember({"avx2", "default", "sse2"})); @@ -650,15 +662,22 @@ int main(int argc, const char* argv[]) { #ifdef NMODL_LLVM_BACKEND - if (run_benchmark) { + if (run_llvm_benchmark) { logger->info("Running LLVM benchmark"); benchmark::LLVMBuildInfo info{llvm_vec_width, llvm_ir_opt_passes, llvm_float_type, - vec_lib}; - benchmark::LLVMBenchmark bench( - modfile, output_dir, libs, info, repeat, instance_size, backend); - bench.benchmark(ast); + vector_library}; + benchmark::LLVMBenchmark benchmark(modfile, + output_dir, + shared_lib_paths, + info, + 
num_experiments, + instance_size, + backend, + llvm_opt_level_ir, + llvm_opt_level_codegen); + benchmark.run(ast); } else if (llvm_ir) { @@ -668,7 +687,7 @@ int main(int argc, const char* argv[]) { llvm_ir_opt_passes, llvm_float_type, llvm_vec_width, - vec_lib); + vector_library); visitor.visit_program(*ast); ast_to_nmodl(*ast, filepath("llvm", "mod")); ast_to_json(*ast, filepath("llvm", "json")); From 8f2501c5b051e7fc06edb151d0bde7e6a10b3e94 Mon Sep 17 00:00:00 2001 From: George Mitenkov Date: Sat, 8 May 2021 02:44:01 -0700 Subject: [PATCH 158/331] Adding function debug information (#628) Added debug support to the LLVM code generation pipeline. Currently, only basic support has been added: 1. Debug information about functions (name) 2. Debug information about the module **What has been changed and added** 1. A new class `DebugBuilder` was created. It is used as a wrapper around LLVM's `DIBuilder` and holds important information such as the `LLVMContext`, debug file and compile unit. It also wraps `DIBuilder`'s functionality into a more suitable API. 2. A temporary `Location` struct has been added. It encapsulates the location of the source AST construct and reflects `ModToken` at the LLVM code generation level. It is only used if the location of the source NMODL function is known. 3. The LLVM visitor now takes an extra `add_debug_information` flag and handles debug information creation. For readability, the `llvm::IRBuilder<>` member `builder` was renamed to `ir_builder`. 4. The JIT runner is now able to listen for GDB, perf (build LLVM with `-DLLVM_USE_PERF=ON`) and VTune (build LLVM with `-DLLVM_USE_INTEL_JITEVENTS=ON`) events. 5. The necessary CMake changes were added to optionally support JIT event listeners (`-DNMODL_ENABLE_JIT_EVENT_LISTENERS`, which defines `NMODL_HAVE_JIT_EVENT_LISTENERS`). **How to generate debug information** Debug information is attached to every function, procedure or artificially created kernel (and the corresponding wrappers). Debug information is enabled by default; to turn it off, use the `--disable-debug-info` flag. For example, the given NMODL

```nmodl
1 FUNCTION func(x) {
2     func = x
3 }
4
5 PROCEDURE proc() {}
```

is transformed (running `./bin/nmodl .mod llvm --ir`) into

```llvm
define double @func(double %x1) !dbg !4 {
  ; ...
}

define i32 @proc() !dbg !6 {
  ; ...
}

!llvm.dbg.cu = !{!0}
!llvm.module.flags = !{!3}

!0 = distinct !DICompileUnit(language: DW_LANG_C, file: !1, producer: "NMODL-LLVM", isOptimized: false, runtimeVersion: 0, emissionKind: FullDebug, enums: !2)
!1 = !DIFile(filename: "foo", directory: ".")
!2 = !{}
!3 = !{i32 2, !"Debug Version", i32 3}
!4 = distinct !DISubprogram(name: "func", linkageName: "func", scope: null, file: !1, line: 1, type: !5, scopeLine: 1, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0, retainedNodes: !2)
!5 = !DISubroutineType(types: !2)
!6 = distinct !DISubprogram(name: "proc", linkageName: "proc", scope: null, file: !1, line: 5, type: !5, scopeLine: 5, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0, retainedNodes: !2)
```

fixes #592 #612 Co-authored-by: Pramod Kumbhar --- CMakeLists.txt | 2 + cmake/LLVMHelper.cmake | 38 ++-- src/codegen/llvm/CMakeLists.txt | 9 +- .../llvm/codegen_llvm_helper_visitor.cpp | 3 + src/codegen/llvm/codegen_llvm_visitor.cpp | 201 ++++++++++-------- src/codegen/llvm/codegen_llvm_visitor.hpp | 18 +- src/codegen/llvm/jit_driver.cpp | 19 ++ src/codegen/llvm/jit_driver.hpp | 11 + src/codegen/llvm/llvm_benchmark.cpp | 3 +- src/codegen/llvm/llvm_debug_builder.cpp | 63 ++++++ src/codegen/llvm/llvm_debug_builder.hpp | 70 ++++++ src/main.cpp | 9 +- 12 files changed, 337 insertions(+), 109 deletions(-) create mode 100644 src/codegen/llvm/llvm_debug_builder.cpp create mode 100644 src/codegen/llvm/llvm_debug_builder.hpp diff --git a/CMakeLists.txt b/CMakeLists.txt index b2dbc4cc8c..fb1baf78b6 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -23,6 +23,7 @@ set(CMAKE_RUNTIME_OUTPUT_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}/bin) option(NMODL_ENABLE_PYTHON_BINDINGS "Enable pybind11 based python bindings" OFF) option(NMODL_ENABLE_LEGACY_UNITS "Use original faraday, R, etc.
instead of 2019 nist constants" OFF) option(NMODL_ENABLE_LLVM "Enable LLVM based code generation" ON) +option(NMODL_ENABLE_JIT_EVENT_LISTENERS "Enable JITEventListener for Perf and Vtune" OFF) if(NMODL_ENABLE_LEGACY_UNITS) add_definitions(-DUSE_LEGACY_UNITS) @@ -266,6 +267,7 @@ if(NMODL_ENABLE_LLVM) message(STATUS " VERSION | ${LLVM_PACKAGE_VERSION}") message(STATUS " INCLUDE | ${LLVM_INCLUDE_DIRS}") message(STATUS " CMAKE | ${LLVM_CMAKE_DIR}") + message(STATUS " JIT LISTENERS | ${NMODL_ENABLE_JIT_EVENT_LISTENERS}") endif() if(NMODL_CLANG_FORMAT) message(STATUS "Clang Format | ${ClangFormat_EXECUTABLE}") diff --git a/cmake/LLVMHelper.cmake b/cmake/LLVMHelper.cmake index b0c8b2a48b..780ae29cfa 100644 --- a/cmake/LLVMHelper.cmake +++ b/cmake/LLVMHelper.cmake @@ -4,22 +4,28 @@ find_package(LLVM REQUIRED CONFIG) -# include LLVM header and core library -llvm_map_components_to_libnames( - LLVM_LIBS_TO_LINK - analysis - codegen - core - executionengine - instcombine - ipo - mc - native - orcjit - target - transformutils - scalaropts - support) +# include LLVM libraries +set(NMODL_LLVM_COMPONENTS + analysis + codegen + core + executionengine + instcombine + ipo + mc + native + orcjit + target + transformutils + scalaropts + support) + +if(NMODL_ENABLE_JIT_EVENT_LISTENERS) + list(APPEND NMODL_LLVM_COMPONENTS inteljitevents perfjitevents) +endif() + +llvm_map_components_to_libnames(LLVM_LIBS_TO_LINK ${NMODL_LLVM_COMPONENTS}) + set(CMAKE_REQUIRED_INCLUDES ${LLVM_INCLUDE_DIRS}) set(CMAKE_REQUIRED_LIBRARIES ${LLVM_LIBS_TO_LINK}) diff --git a/src/codegen/llvm/CMakeLists.txt b/src/codegen/llvm/CMakeLists.txt index 8c2a295598..7814b502a3 100644 --- a/src/codegen/llvm/CMakeLists.txt +++ b/src/codegen/llvm/CMakeLists.txt @@ -9,7 +9,9 @@ set(LLVM_CODEGEN_SOURCE_FILES ${CMAKE_CURRENT_SOURCE_DIR}/jit_driver.cpp ${CMAKE_CURRENT_SOURCE_DIR}/jit_driver.hpp ${CMAKE_CURRENT_SOURCE_DIR}/llvm_benchmark.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/llvm_benchmark.hpp) + ${CMAKE_CURRENT_SOURCE_DIR}/llvm_benchmark.hpp + ${CMAKE_CURRENT_SOURCE_DIR}/llvm_debug_builder.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/llvm_debug_builder.hpp) # ============================================================================= # LLVM codegen library and executable @@ -20,8 +22,11 @@ add_library(runner_obj OBJECT ${LLVM_CODEGEN_SOURCE_FILES}) add_dependencies(runner_obj lexer_obj) set_property(TARGET runner_obj PROPERTY POSITION_INDEPENDENT_CODE ON) -add_library(llvm_codegen STATIC $) +if(NMODL_ENABLE_JIT_EVENT_LISTENERS) + target_compile_definitions(runner_obj PUBLIC NMODL_HAVE_JIT_EVENT_LISTENERS) +endif() +add_library(llvm_codegen STATIC $) add_dependencies(llvm_codegen lexer util visitor) if(NOT NMODL_AS_SUBPROJECT) diff --git a/src/codegen/llvm/codegen_llvm_helper_visitor.cpp b/src/codegen/llvm/codegen_llvm_helper_visitor.cpp index 0df364e649..de64e16bd3 100644 --- a/src/codegen/llvm/codegen_llvm_helper_visitor.cpp +++ b/src/codegen/llvm/codegen_llvm_helper_visitor.cpp @@ -193,6 +193,9 @@ void CodegenLLVMHelperVisitor::create_function_for_node(ast::Block& node) { /// we have all information for code generation function, create a new node /// which will be inserted later into AST auto function = std::make_shared(fun_ret_type, name, arguments, block); + if (node.get_token()) { + function->set_token(*node.get_token()->clone()); + } codegen_functions.push_back(function); } /** diff --git a/src/codegen/llvm/codegen_llvm_visitor.cpp b/src/codegen/llvm/codegen_llvm_visitor.cpp index 1738d4139e..830814286e 100644 --- a/src/codegen/llvm/codegen_llvm_visitor.cpp 
+++ b/src/codegen/llvm/codegen_llvm_visitor.cpp @@ -16,7 +16,6 @@ #include "llvm/IR/BasicBlock.h" #include "llvm/IR/Constants.h" #include "llvm/IR/Function.h" -#include "llvm/IR/LLVMContext.h" #include "llvm/IR/Type.h" #include "llvm/IR/ValueSymbolTable.h" #include "llvm/Support/FileSystem.h" @@ -69,7 +68,7 @@ llvm::Value* CodegenLLVMVisitor::create_gep(const std::string& name, llvm::Value indices.push_back(llvm::ConstantInt::get(index_type, 0)); indices.push_back(index); - return builder.CreateInBoundsGEP(lookup(name), indices); + return ir_builder.CreateInBoundsGEP(lookup(name), indices); } llvm::Value* CodegenLLVMVisitor::codegen_indexed_name(const ast::IndexedName& node) { @@ -86,7 +85,7 @@ llvm::Value* CodegenLLVMVisitor::codegen_instance_var(const ast::CodegenInstance throw std::runtime_error("Error: " + member_name + " is not a member of the instance!"); // Load the instance struct given its name from the ValueSymbolTable. - llvm::Value* instance_ptr = builder.CreateLoad(lookup(instance_name)); + llvm::Value* instance_ptr = ir_builder.CreateLoad(lookup(instance_name)); // Create a GEP instruction to get a pointer to the member. int member_index = instance_var_helper.get_variable_index(member_name); @@ -95,7 +94,7 @@ llvm::Value* CodegenLLVMVisitor::codegen_instance_var(const ast::CodegenInstance std::vector indices; indices.push_back(llvm::ConstantInt::get(index_type, 0)); indices.push_back(llvm::ConstantInt::get(index_type, member_index)); - llvm::Value* member_ptr = builder.CreateInBoundsGEP(instance_ptr, indices); + llvm::Value* member_ptr = ir_builder.CreateInBoundsGEP(instance_ptr, indices); // Get the member AST node from the instance AST node, for which we proceed with the code // generation. If the member is scalar, return the pointer to it straight away. @@ -122,25 +121,25 @@ llvm::Value* CodegenLLVMVisitor::codegen_instance_var(const ast::CodegenInstance // load the member which would be indexed later. llvm::Type* type = get_codegen_var_type(*codegen_var_with_type->get_type()); llvm::Value* instance_member = - builder.CreateLoad(llvm::PointerType::get(type, /*AddressSpace=*/0), member_ptr); + ir_builder.CreateLoad(llvm::PointerType::get(type, /*AddressSpace=*/0), member_ptr); // Check if the code is vectorised and the index is indirect. std::string id = member_indexed_name->get_length()->get_node_name(); if (id != kernel_id && is_kernel_code && vector_width > 1) { // Calculate a vector of addresses via GEP instruction, and then created a gather to load // indirectly. - llvm::Value* addresses = builder.CreateInBoundsGEP(instance_member, {i64_index}); - return builder.CreateMaskedGather(addresses, llvm::Align()); + llvm::Value* addresses = ir_builder.CreateInBoundsGEP(instance_member, {i64_index}); + return ir_builder.CreateMaskedGather(addresses, llvm::Align()); } - llvm::Value* member_addr = builder.CreateInBoundsGEP(instance_member, {i64_index}); + llvm::Value* member_addr = ir_builder.CreateInBoundsGEP(instance_member, {i64_index}); // If the code is vectorised, then bitcast to a vector pointer. 
if (is_kernel_code && vector_width > 1) { llvm::Type* vector_type = llvm::PointerType::get(llvm::FixedVectorType::get(type, vector_width), /*AddressSpace=*/0); - return builder.CreateBitCast(member_addr, vector_type); + return ir_builder.CreateBitCast(member_addr, vector_type); } return member_addr; } @@ -152,7 +151,7 @@ llvm::Value* CodegenLLVMVisitor::get_array_index(const ast::IndexedName& node) { llvm::Value* index_value; if (node.get_length()->is_name()) { llvm::Value* ptr = lookup(node.get_length()->get_node_name()); - index_value = builder.CreateLoad(ptr); + index_value = ir_builder.CreateLoad(ptr); } else { node.get_length()->accept(*this); index_value = values.back(); @@ -169,15 +168,15 @@ llvm::Value* CodegenLLVMVisitor::get_array_index(const ast::IndexedName& node) { if (auto index_type = llvm::dyn_cast(index_value->getType())) { if (index_type->getBitWidth() == i64_type->getIntegerBitWidth()) return index_value; - return builder.CreateSExtOrTrunc(index_value, i64_type); + return ir_builder.CreateSExtOrTrunc(index_value, i64_type); } auto vector_type = llvm::cast(index_value->getType()); auto element_type = llvm::cast(vector_type->getElementType()); if (element_type->getBitWidth() == i64_type->getIntegerBitWidth()) return index_value; - return builder.CreateSExtOrTrunc(index_value, - llvm::FixedVectorType::get(i64_type, vector_width)); + return ir_builder.CreateSExtOrTrunc(index_value, + llvm::FixedVectorType::get(i64_type, vector_width)); } int CodegenLLVMVisitor::get_array_length(const ast::IndexedName& node) { @@ -334,11 +333,12 @@ void CodegenLLVMVisitor::create_external_method_call(const std::string& name, argument_values.push_back(value); } -#define DISPATCH(method_name, intrinsic) \ - if (name == (method_name)) { \ - llvm::Value* result = builder.CreateIntrinsic(intrinsic, argument_types, argument_values); \ - values.push_back(result); \ - return; \ +#define DISPATCH(method_name, intrinsic) \ + if (name == (method_name)) { \ + llvm::Value* result = \ + ir_builder.CreateIntrinsic(intrinsic, argument_types, argument_values); \ + values.push_back(result); \ + return; \ } DISPATCH("exp", llvm::Intrinsic::exp); @@ -360,7 +360,7 @@ void CodegenLLVMVisitor::create_function_call(llvm::Function* func, std::vector argument_values; argument_values.reserve(arguments.size()); pack_function_call_arguments(arguments, argument_values); - llvm::Value* call = builder.CreateCall(func, argument_values); + llvm::Value* call = ir_builder.CreateCall(func, argument_values); values.push_back(call); } @@ -382,7 +382,7 @@ void CodegenLLVMVisitor::create_printf_call(const ast::ExpressionVector& argumen std::vector argument_values; argument_values.reserve(arguments.size()); pack_function_call_arguments(arguments, argument_values); - builder.CreateCall(printf, argument_values); + ir_builder.CreateCall(printf, argument_values); } void CodegenLLVMVisitor::emit_procedure_or_function_declaration(const ast::CodegenFunction& node) { @@ -397,10 +397,21 @@ void CodegenLLVMVisitor::emit_procedure_or_function_declaration(const ast::Codeg llvm::Type* return_type = get_codegen_var_type(*node.get_return_type()); // Create a function that is automatically inserted into module's symbol table. 
- llvm::Function::Create(llvm::FunctionType::get(return_type, arg_types, /*isVarArg=*/false), - llvm::Function::ExternalLinkage, - name, - *module); + auto func = + llvm::Function::Create(llvm::FunctionType::get(return_type, arg_types, /*isVarArg=*/false), + llvm::Function::ExternalLinkage, + name, + *module); + + // Add function debug information, with location information if it exists. + if (add_debug_information) { + if (node.get_token()) { + Location loc{node.get_token()->start_line(), node.get_token()->start_column()}; + debug_builder.add_function_debug_info(func, &loc); + } else { + debug_builder.add_function_debug_info(func); + } + } } llvm::Value* CodegenLLVMVisitor::lookup(const std::string& name) { @@ -416,7 +427,7 @@ void CodegenLLVMVisitor::pack_function_call_arguments(const ast::ExpressionVecto if (arg->is_string()) { // If the argument is a string, create a global i8* variable with it. auto string_arg = std::dynamic_pointer_cast(arg); - llvm::Value* str = builder.CreateGlobalStringPtr(string_arg->get_value()); + llvm::Value* str = ir_builder.CreateGlobalStringPtr(string_arg->get_value()); arg_values.push_back(str); } else { arg->accept(*this); @@ -443,10 +454,10 @@ llvm::Value* CodegenLLVMVisitor::visit_arithmetic_bin_op(llvm::Value* lhs, result = llvm_fp_op(lhs, rhs); \ return result; - DISPATCH(ast::BinaryOp::BOP_ADDITION, builder.CreateFAdd, builder.CreateAdd); - DISPATCH(ast::BinaryOp::BOP_DIVISION, builder.CreateFDiv, builder.CreateSDiv); - DISPATCH(ast::BinaryOp::BOP_MULTIPLICATION, builder.CreateFMul, builder.CreateMul); - DISPATCH(ast::BinaryOp::BOP_SUBTRACTION, builder.CreateFSub, builder.CreateSub); + DISPATCH(ast::BinaryOp::BOP_ADDITION, ir_builder.CreateFAdd, ir_builder.CreateAdd); + DISPATCH(ast::BinaryOp::BOP_DIVISION, ir_builder.CreateFDiv, ir_builder.CreateSDiv); + DISPATCH(ast::BinaryOp::BOP_MULTIPLICATION, ir_builder.CreateFMul, ir_builder.CreateMul); + DISPATCH(ast::BinaryOp::BOP_SUBTRACTION, ir_builder.CreateFSub, ir_builder.CreateSub); #undef DISPATCH @@ -461,15 +472,15 @@ void CodegenLLVMVisitor::visit_assign_op(const ast::BinaryExpression& node, llvm throw std::runtime_error("Error: only VarName assignment is supported!"); llvm::Value* ptr = get_variable_ptr(*var); - builder.CreateStore(rhs, ptr); + ir_builder.CreateStore(rhs, ptr); } llvm::Value* CodegenLLVMVisitor::visit_logical_bin_op(llvm::Value* lhs, llvm::Value* rhs, unsigned op) { const auto& bin_op = static_cast(op); - return bin_op == ast::BinaryOp::BOP_AND ? builder.CreateAnd(lhs, rhs) - : builder.CreateOr(lhs, rhs); + return bin_op == ast::BinaryOp::BOP_AND ? 
ir_builder.CreateAnd(lhs, rhs) + : ir_builder.CreateOr(lhs, rhs); } llvm::Value* CodegenLLVMVisitor::visit_comparison_bin_op(llvm::Value* lhs, @@ -488,12 +499,14 @@ llvm::Value* CodegenLLVMVisitor::visit_comparison_bin_op(llvm::Value* lhs, result = i_llvm_op(lhs, rhs); \ return result; - DISPATCH(ast::BinaryOp::BOP_EXACT_EQUAL, builder.CreateICmpEQ, builder.CreateFCmpOEQ); - DISPATCH(ast::BinaryOp::BOP_GREATER, builder.CreateICmpSGT, builder.CreateFCmpOGT); - DISPATCH(ast::BinaryOp::BOP_GREATER_EQUAL, builder.CreateICmpSGE, builder.CreateFCmpOGE); - DISPATCH(ast::BinaryOp::BOP_LESS, builder.CreateICmpSLT, builder.CreateFCmpOLT); - DISPATCH(ast::BinaryOp::BOP_LESS_EQUAL, builder.CreateICmpSLE, builder.CreateFCmpOLE); - DISPATCH(ast::BinaryOp::BOP_NOT_EQUAL, builder.CreateICmpNE, builder.CreateFCmpONE); + DISPATCH(ast::BinaryOp::BOP_EXACT_EQUAL, ir_builder.CreateICmpEQ, ir_builder.CreateFCmpOEQ); + DISPATCH(ast::BinaryOp::BOP_GREATER, ir_builder.CreateICmpSGT, ir_builder.CreateFCmpOGT); + DISPATCH(ast::BinaryOp::BOP_GREATER_EQUAL, + ir_builder.CreateICmpSGE, + ir_builder.CreateFCmpOGE); + DISPATCH(ast::BinaryOp::BOP_LESS, ir_builder.CreateICmpSLT, ir_builder.CreateFCmpOLT); + DISPATCH(ast::BinaryOp::BOP_LESS_EQUAL, ir_builder.CreateICmpSLE, ir_builder.CreateFCmpOLE); + DISPATCH(ast::BinaryOp::BOP_NOT_EQUAL, ir_builder.CreateICmpNE, ir_builder.CreateFCmpONE); #undef DISPATCH @@ -602,7 +615,7 @@ void CodegenLLVMVisitor::visit_codegen_for_statement(const ast::CodegenForStatem is_kernel_code = false; // Get the current and the next blocks within the function. - llvm::BasicBlock* curr_block = builder.GetInsertBlock(); + llvm::BasicBlock* curr_block = ir_builder.GetInsertBlock(); llvm::BasicBlock* next = curr_block->getNextNode(); llvm::Function* func = curr_block->getParent(); @@ -633,31 +646,31 @@ void CodegenLLVMVisitor::visit_codegen_for_statement(const ast::CodegenForStatem } // Branch to condition basic block and insert condition code there. - builder.CreateBr(for_cond); - builder.SetInsertPoint(for_cond); + ir_builder.CreateBr(for_cond); + ir_builder.SetInsertPoint(for_cond); node.get_condition()->accept(*this); // Extract the condition to decide whether to branch to the loop body or loop exit. llvm::Value* cond = values.back(); values.pop_back(); - builder.CreateCondBr(cond, for_body, exit); + ir_builder.CreateCondBr(cond, for_body, exit); // Generate code for the loop body and create the basic block for the increment. - builder.SetInsertPoint(for_body); + ir_builder.SetInsertPoint(for_body); is_kernel_code = true; const auto& statement_block = node.get_statement_block(); statement_block->accept(*this); is_kernel_code = false; - builder.CreateBr(for_inc); + ir_builder.CreateBr(for_inc); // Process increment. - builder.SetInsertPoint(for_inc); + ir_builder.SetInsertPoint(for_inc); node.get_increment()->accept(*this); // Create a branch to condition block, then generate exit code out of the loop. Restore the // vector width. - builder.CreateBr(for_cond); - builder.SetInsertPoint(exit); + ir_builder.CreateBr(for_cond); + ir_builder.SetInsertPoint(exit); vector_width = tmp_vector_width; is_kernel_code = true; } @@ -672,7 +685,7 @@ void CodegenLLVMVisitor::visit_codegen_function(const ast::CodegenFunction& node // Create the entry basic block of the function/procedure and point the local named values table // to the symbol table. 
llvm::BasicBlock* body = llvm::BasicBlock::Create(*context, /*Name=*/"", func); - builder.SetInsertPoint(body); + ir_builder.SetInsertPoint(body); // When processing a function, it returns a value named in NMODL. Therefore, we // first run RenameVisitor to rename it into ret_. This will aid in avoiding @@ -687,9 +700,10 @@ void CodegenLLVMVisitor::visit_codegen_function(const ast::CodegenFunction& node unsigned i = 0; for (auto& arg: func->args()) { std::string arg_name = arguments[i++].get()->get_node_name(); - llvm::Value* alloca = builder.CreateAlloca(arg.getType(), /*ArraySize=*/nullptr, arg_name); + llvm::Type* arg_type = arg.getType(); + llvm::Value* alloca = ir_builder.CreateAlloca(arg_type, /*ArraySize=*/nullptr, arg_name); arg.setName(arg_name); - builder.CreateStore(&arg, alloca); + ir_builder.CreateStore(&arg, alloca); } // Process function or procedure body. If the function is a compute kernel, then set the @@ -705,7 +719,7 @@ void CodegenLLVMVisitor::visit_codegen_function(const ast::CodegenFunction& node // If function has a void return type, add a terminator not handled by CodegenReturnVar. if (has_void_ret_type) - builder.CreateRetVoid(); + ir_builder.CreateRetVoid(); // Clear local values stack and remove the pointer to the local symbol table. values.clear(); @@ -717,8 +731,8 @@ void CodegenLLVMVisitor::visit_codegen_return_statement(const ast::CodegenReturn throw std::runtime_error("Error: CodegenReturnStatement must contain a name node\n"); std::string ret = "ret_" + current_func->getName().str(); - llvm::Value* ret_value = builder.CreateLoad(lookup(ret)); - builder.CreateRet(ret_value); + llvm::Value* ret_value = ir_builder.CreateLoad(lookup(ret)); + ir_builder.CreateRet(ret_value); } void CodegenLLVMVisitor::visit_codegen_var_list_statement( @@ -750,7 +764,7 @@ void CodegenLLVMVisitor::visit_codegen_var_list_statement( } else { throw std::runtime_error("Error: Unsupported local variable type"); } - builder.CreateAlloca(var_type, /*ArraySize=*/nullptr, name); + ir_builder.CreateAlloca(var_type, /*ArraySize=*/nullptr, name); } } @@ -785,7 +799,7 @@ void CodegenLLVMVisitor::visit_function_call(const ast::FunctionCall& node) { void CodegenLLVMVisitor::visit_if_statement(const ast::IfStatement& node) { // Get the current and the next blocks within the function. - llvm::BasicBlock* curr_block = builder.GetInsertBlock(); + llvm::BasicBlock* curr_block = ir_builder.GetInsertBlock(); llvm::BasicBlock* next = curr_block->getNextNode(); llvm::Function* func = curr_block->getParent(); @@ -799,9 +813,9 @@ void CodegenLLVMVisitor::visit_if_statement(const ast::IfStatement& node) { values.pop_back(); // Process the true block. - builder.SetInsertPoint(true_block); + ir_builder.SetInsertPoint(true_block); node.get_statement_block()->accept(*this); - builder.CreateBr(merge_block); + ir_builder.CreateBr(merge_block); // Save the merge block and proceed with codegen for `else if` statements. llvm::BasicBlock* exit = merge_block; @@ -809,11 +823,11 @@ void CodegenLLVMVisitor::visit_if_statement(const ast::IfStatement& node) { // Link the current block to the true and else blocks. llvm::BasicBlock* else_block = llvm::BasicBlock::Create(*context, /*Name=*/"", func, merge_block); - builder.SetInsertPoint(curr_block); - builder.CreateCondBr(cond, true_block, else_block); + ir_builder.SetInsertPoint(curr_block); + ir_builder.CreateCondBr(cond, true_block, else_block); // Process else block. 
- builder.SetInsertPoint(else_block); + ir_builder.SetInsertPoint(else_block); else_if->get_condition()->accept(*this); cond = values.back(); values.pop_back(); @@ -823,13 +837,13 @@ void CodegenLLVMVisitor::visit_if_statement(const ast::IfStatement& node) { true_block = llvm::BasicBlock::Create(*context, /*Name=*/"", func, merge_block); llvm::BasicBlock* tmp = merge_block; merge_block = llvm::BasicBlock::Create(*context, /*Name=*/"", func, merge_block); - builder.SetInsertPoint(merge_block); - builder.CreateBr(tmp); + ir_builder.SetInsertPoint(merge_block); + ir_builder.CreateBr(tmp); // Process true block. - builder.SetInsertPoint(true_block); + ir_builder.SetInsertPoint(true_block); else_if->get_statement_block()->accept(*this); - builder.CreateBr(merge_block); + ir_builder.CreateBr(merge_block); curr_block = else_block; } @@ -838,15 +852,15 @@ void CodegenLLVMVisitor::visit_if_statement(const ast::IfStatement& node) { llvm::BasicBlock* else_block; if (elses) { else_block = llvm::BasicBlock::Create(*context, /*Name=*/"", func, merge_block); - builder.SetInsertPoint(else_block); + ir_builder.SetInsertPoint(else_block); elses->get_statement_block()->accept(*this); - builder.CreateBr(merge_block); + ir_builder.CreateBr(merge_block); } else { else_block = merge_block; } - builder.SetInsertPoint(curr_block); - builder.CreateCondBr(cond, true_block, else_block); - builder.SetInsertPoint(exit); + ir_builder.SetInsertPoint(curr_block); + ir_builder.CreateCondBr(cond, true_block, else_block); + ir_builder.SetInsertPoint(exit); } void CodegenLLVMVisitor::visit_integer(const ast::Integer& node) { @@ -867,9 +881,13 @@ void CodegenLLVMVisitor::visit_program(const ast::Program& node) { CodegenLLVMHelperVisitor v{vector_width}; const auto& functions = v.get_codegen_functions(node); instance_var_helper = v.get_instance_var_helper(); - kernel_id = v.get_kernel_id(); + // Create compile unit if adding debug information to the module. + if (add_debug_information) { + debug_builder.create_compile_unit(*module, module->getModuleIdentifier(), output_dir); + } + // For every function, generate its declaration. Thus, we can look up // `llvm::Function` in the symbol table in the module. for (const auto& func: functions) { @@ -889,6 +907,11 @@ void CodegenLLVMVisitor::visit_program(const ast::Program& node) { visit_codegen_function(*func); } + // Finalize the debug information. + if (add_debug_information) { + debug_builder.finalize(); + } + // Verify the generated LLVM IR module. std::string error; llvm::raw_string_ostream ostream(error); @@ -958,9 +981,9 @@ void CodegenLLVMVisitor::visit_unary_expression(const ast::UnaryExpression& node llvm::Value* value = values.back(); values.pop_back(); if (op == ast::UOP_NEGATION) { - values.push_back(builder.CreateFNeg(value)); + values.push_back(ir_builder.CreateFNeg(value)); } else if (op == ast::UOP_NOT) { - values.push_back(builder.CreateNot(value)); + values.push_back(ir_builder.CreateNot(value)); } else { throw std::runtime_error("Error: unsupported unary operator\n"); } @@ -971,7 +994,7 @@ void CodegenLLVMVisitor::visit_var_name(const ast::VarName& node) { // Finally, load the variable from the pointer value unless it has already been loaded (e.g. via // gather instruction). - llvm::Value* var = ptr->getType()->isPointerTy() ? builder.CreateLoad(ptr) : ptr; + llvm::Value* var = ptr->getType()->isPointerTy() ? ir_builder.CreateLoad(ptr) : ptr; // If the value should not be vectorised, or it is already a vector, add it to the stack. 
if (!is_kernel_code || vector_width <= 1 || var->getType()->isVectorTy()) { @@ -981,13 +1004,13 @@ void CodegenLLVMVisitor::visit_var_name(const ast::VarName& node) { // Otherwise, if we are generating vectorised inside the loop, replicate the value to form a // vector of `vector_width`. - llvm::Value* vector_var = builder.CreateVectorSplat(vector_width, var); + llvm::Value* vector_var = ir_builder.CreateVectorSplat(vector_width, var); values.push_back(vector_var); } void CodegenLLVMVisitor::visit_while_statement(const ast::WhileStatement& node) { // Get the current and the next blocks within the function. - llvm::BasicBlock* curr_block = builder.GetInsertBlock(); + llvm::BasicBlock* curr_block = ir_builder.GetInsertBlock(); llvm::BasicBlock* next = curr_block->getNextNode(); llvm::Function* func = curr_block->getParent(); @@ -996,20 +1019,20 @@ void CodegenLLVMVisitor::visit_while_statement(const ast::WhileStatement& node) llvm::BasicBlock* body = llvm::BasicBlock::Create(*context, /*Name=*/"", func, next); llvm::BasicBlock* exit = llvm::BasicBlock::Create(*context, /*Name=*/"", func, next); - builder.CreateBr(header); - builder.SetInsertPoint(header); + ir_builder.CreateBr(header); + ir_builder.SetInsertPoint(header); // Generate code for condition and create branch to the body block. node.get_condition()->accept(*this); llvm::Value* condition = values.back(); values.pop_back(); - builder.CreateCondBr(condition, body, exit); + ir_builder.CreateCondBr(condition, body, exit); - builder.SetInsertPoint(body); + ir_builder.SetInsertPoint(body); node.get_statement_block()->accept(*this); - builder.CreateBr(header); + ir_builder.CreateBr(header); - builder.SetInsertPoint(exit); + ir_builder.SetInsertPoint(exit); } void CodegenLLVMVisitor::find_kernel_names(std::vector& container) { @@ -1050,17 +1073,23 @@ void CodegenLLVMVisitor::wrap_kernel_functions() { llvm::Function::ExternalLinkage, "__" + kernel_name + "_wrapper", *module); + + // Optionally, add debug information for the wrapper function. + if (add_debug_information) { + debug_builder.add_function_debug_info(wrapper_func); + } + llvm::BasicBlock* body = llvm::BasicBlock::Create(*context, /*Name=*/"", wrapper_func); - builder.SetInsertPoint(body); + ir_builder.SetInsertPoint(body); // Proceed with bitcasting the void pointer to the struct pointer type, calling the kernel // and adding a terminator. 
- llvm::Value* bitcasted = builder.CreateBitCast(wrapper_func->getArg(0), - instance_struct_ptr_type); + llvm::Value* bitcasted = ir_builder.CreateBitCast(wrapper_func->getArg(0), + instance_struct_ptr_type); std::vector args; args.push_back(bitcasted); - builder.CreateCall(kernel, args); - builder.CreateRet(llvm::ConstantInt::get(i32_type, 0)); + ir_builder.CreateCall(kernel, args); + ir_builder.CreateRet(llvm::ConstantInt::get(i32_type, 0)); } } diff --git a/src/codegen/llvm/codegen_llvm_visitor.hpp b/src/codegen/llvm/codegen_llvm_visitor.hpp index 099613f8d4..450e1872a4 100644 --- a/src/codegen/llvm/codegen_llvm_visitor.hpp +++ b/src/codegen/llvm/codegen_llvm_visitor.hpp @@ -19,11 +19,13 @@ #include #include "codegen/llvm/codegen_llvm_helper_visitor.hpp" +#include "codegen/llvm/llvm_debug_builder.hpp" #include "symtab/symbol_table.hpp" #include "utils/logger.hpp" #include "visitors/ast_visitor.hpp" #include "llvm/Analysis/TargetLibraryInfo.h" +#include "llvm/IR/DIBuilder.h" #include "llvm/IR/IRBuilder.h" #include "llvm/IR/LLVMContext.h" #include "llvm/IR/LegacyPassManager.h" @@ -74,7 +76,14 @@ class CodegenLLVMVisitor: public visitor::ConstAstVisitor { std::unique_ptr module = std::make_unique(mod_filename, *context); - llvm::IRBuilder<> builder; + // LLVM IR builder. + llvm::IRBuilder<> ir_builder; + + // Debug information builder. + DebugBuilder debug_builder; + + // Add debug information to the module. + bool add_debug_information; // Pass manager for optimisation passes that are used for target code generation. llvm::legacy::FunctionPassManager codegen_pm; @@ -129,14 +138,17 @@ class CodegenLLVMVisitor: public visitor::ConstAstVisitor { bool opt_passes, bool use_single_precision = false, int vector_width = 1, - std::string vec_lib = "none") + std::string vec_lib = "none", + bool add_debug_information = false) : mod_filename(mod_filename) , output_dir(output_dir) , opt_passes(opt_passes) , use_single_precision(use_single_precision) , vector_width(vector_width) , vector_library(veclib_map.at(vec_lib)) - , builder(*context) + , add_debug_information(add_debug_information) + , ir_builder(*context) + , debug_builder(*module) , codegen_pm(module.get()) , opt_pm(module.get()) {} diff --git a/src/codegen/llvm/jit_driver.cpp b/src/codegen/llvm/jit_driver.cpp index 1e8eb4bfd0..532cd20b8f 100644 --- a/src/codegen/llvm/jit_driver.cpp +++ b/src/codegen/llvm/jit_driver.cpp @@ -154,6 +154,16 @@ void JITDriver::init(std::string features, set_triple_and_data_layout(*module, features); auto data_layout = module->getDataLayout(); + // If benchmarking, enable listeners to use GDB, perf or VTune. Note that LLVM should be built + // with listeners on (e.g. -DLLVM_USE_PERF=ON). + if (benchmark_info) { + gdb_event_listener = llvm::JITEventListener::createGDBRegistrationListener(); +#if defined(NMODL_HAVE_JIT_EVENT_LISTENERS) + perf_event_listener = llvm::JITEventListener::createPerfJITEventListener(); + intel_event_listener = llvm::JITEventListener::createIntelJITEventListener(); +#endif + } + // Create object linking function callback. auto object_linking_layer_creator = [&](llvm::orc::ExecutionSession& session, const llvm::Triple& triple) { @@ -161,6 +171,15 @@ void JITDriver::init(std::string features, auto layer = std::make_unique(session, []() { return std::make_unique(); }); + + // Register event listeners if they exist. 
+ if (gdb_event_listener) + layer->registerJITEventListener(*gdb_event_listener); + if (perf_event_listener) + layer->registerJITEventListener(*perf_event_listener); + if (intel_event_listener) + layer->registerJITEventListener(*intel_event_listener); + for (const auto& lib_path: lib_paths) { // For every library path, create a corresponding memory buffer. auto memory_buffer = llvm::MemoryBuffer::getFile(lib_path); diff --git a/src/codegen/llvm/jit_driver.hpp b/src/codegen/llvm/jit_driver.hpp index 151ec177d8..afb1317cd8 100644 --- a/src/codegen/llvm/jit_driver.hpp +++ b/src/codegen/llvm/jit_driver.hpp @@ -15,6 +15,7 @@ * \brief \copybrief nmodl::runner::JITDriver */ +#include "llvm/ExecutionEngine/JITEventListener.h" #include "llvm/ExecutionEngine/Orc/LLJIT.h" namespace nmodl { @@ -45,8 +46,18 @@ class JITDriver { std::unique_ptr jit; + /// LLVM IR module to execute. std::unique_ptr module; + /// GDB event listener. + llvm::JITEventListener* gdb_event_listener = nullptr; + + /// perf event listener. + llvm::JITEventListener* perf_event_listener = nullptr; + + /// Intel event listener. + llvm::JITEventListener* intel_event_listener = nullptr; + public: explicit JITDriver(std::unique_ptr m) : module(std::move(m)) {} diff --git a/src/codegen/llvm/llvm_benchmark.cpp b/src/codegen/llvm/llvm_benchmark.cpp index df0c54517d..adbe653f1e 100644 --- a/src/codegen/llvm/llvm_benchmark.cpp +++ b/src/codegen/llvm/llvm_benchmark.cpp @@ -58,7 +58,8 @@ void LLVMBenchmark::run(const std::shared_ptr& node) { llvm_build_info.opt_passes, llvm_build_info.use_single_precision, llvm_build_info.vector_width, - llvm_build_info.vec_lib); + llvm_build_info.vec_lib, + /*add_debug_information=*/true); generate_llvm(visitor, node); // Finally, run the benchmark and log the measurements. diff --git a/src/codegen/llvm/llvm_debug_builder.cpp b/src/codegen/llvm/llvm_debug_builder.cpp new file mode 100644 index 0000000000..5682a6e904 --- /dev/null +++ b/src/codegen/llvm/llvm_debug_builder.cpp @@ -0,0 +1,63 @@ +/************************************************************************* + * Copyright (C) 2018-2020 Blue Brain Project + * + * This file is part of NMODL distributed under the terms of the GNU + * Lesser General Public License. See top-level LICENSE file for details. + *************************************************************************/ + +#include "codegen/llvm/llvm_debug_builder.hpp" + +namespace nmodl { +namespace codegen { + + +static constexpr const char debug_version_key[] = "Debug Version"; + + +void DebugBuilder::add_function_debug_info(llvm::Function* function, Location* loc) { + // Create the function debug type (subroutine type). We are not interested in parameters and + // types, and therefore passing llvm::None as argument suffices for now. + llvm::DISubroutineType* subroutine_type = di_builder.createSubroutineType( + di_builder.getOrCreateTypeArray(llvm::None)); + llvm::DISubprogram::DISPFlags sp_flags = llvm::DISubprogram::SPFlagDefinition | + llvm::DISubprogram::SPFlagOptimized; + // If there is no location associated with the function, just use 0. + int line = loc ? 
loc->line : 0; + llvm::DISubprogram* program = di_builder.createFunction(compile_unit, + function->getName(), + function->getName(), + file, + line, + subroutine_type, + line, + llvm::DINode::FlagZero, + sp_flags); + function->setSubprogram(program); + di_builder.finalizeSubprogram(program); +} + +void DebugBuilder::create_compile_unit(llvm::Module& module, + const std::string& debug_filename, + const std::string& debug_output_dir) { + // Create the debug file and compile unit for the module. + file = di_builder.createFile(debug_filename, debug_output_dir); + compile_unit = di_builder.createCompileUnit(llvm::dwarf::DW_LANG_C, + file, + /*Producer=*/"NMODL-LLVM", + /*isOptimized=*/false, + /*Flags=*/"", + /*RV=*/0); + + // Add a flag to the module to specify that it has debug information. + if (!module.getModuleFlag(debug_version_key)) { + module.addModuleFlag(llvm::Module::Warning, + debug_version_key, + llvm::DEBUG_METADATA_VERSION); + } +} + +void DebugBuilder::finalize() { + di_builder.finalize(); +} +} // namespace codegen +} // namespace nmodl diff --git a/src/codegen/llvm/llvm_debug_builder.hpp b/src/codegen/llvm/llvm_debug_builder.hpp new file mode 100644 index 0000000000..9322cd461a --- /dev/null +++ b/src/codegen/llvm/llvm_debug_builder.hpp @@ -0,0 +1,70 @@ +/************************************************************************* + * Copyright (C) 2018-2020 Blue Brain Project + * + * This file is part of NMODL distributed under the terms of the GNU + * Lesser General Public License. See top-level LICENSE file for details. + *************************************************************************/ + +#pragma once + +#include <string> + +#include "llvm/IR/DIBuilder.h" +#include "llvm/IR/IRBuilder.h" +#include "llvm/IR/LLVMContext.h" +#include "llvm/IR/Module.h" + +namespace nmodl { +namespace codegen { + +/// A struct to store AST location information. /// \todo Currently, not all AST nodes have location information. Moreover, /// some may not have it as they were artificially introduced (e.g. /// CodegenForStatement). This simple wrapper suffices for now, but in future /// we may want to handle this properly. +struct Location { + /// Line in the file. int line; + + /// Column in the file. int column; +}; + + +/** * \class DebugBuilder * \brief A helper class to create debug information for an LLVM IR module. * \todo Only function debug information is supported. */ +class DebugBuilder { + private: + /// Debug information builder. llvm::DIBuilder di_builder; + + /// LLVM context. llvm::LLVMContext& context; + + /// Debug compile unit for the module. llvm::DICompileUnit* compile_unit = nullptr; + + /// Debug file pointer. llvm::DIFile* file = nullptr; + + public: + DebugBuilder(llvm::Module& module) + : di_builder(module) + , context(module.getContext()) {} + + /// Adds function debug information with an optional location. + void add_function_debug_info(llvm::Function* function, Location* loc = nullptr); + + /// Creates the compile unit and sets debug flags for the module. + void create_compile_unit(llvm::Module& module, + const std::string& debug_filename, + const std::string& debug_output_dir); + + /// Finalizes the debug information. 
+ void finalize(); +}; +} // namespace codegen +} // namespace nmodl diff --git a/src/main.cpp b/src/main.cpp index e5243037bd..e71325e057 100644 --- a/src/main.cpp +++ b/src/main.cpp @@ -185,6 +185,9 @@ int main(int argc, const char* argv[]) { /// vector library name std::string vector_library("none"); + /// disable debug information generation for the IR + bool disable_debug_information(false); + /// run llvm benchmark bool run_llvm_benchmark(false); @@ -320,6 +323,9 @@ int main(int argc, const char* argv[]) { llvm_opt->add_flag("--ir", llvm_ir, "Generate LLVM IR ({})"_format(llvm_ir))->ignore_case(); + llvm_opt->add_flag("--disable-debug-info", + disable_debug_information, + "Disable debug information ({})"_format(disable_debug_information))->ignore_case(); llvm_opt->add_flag("--opt", llvm_ir_opt_passes, "Run LLVM optimisation passes ({})"_format(llvm_ir_opt_passes))->ignore_case(); @@ -687,7 +693,8 @@ int main(int argc, const char* argv[]) { llvm_ir_opt_passes, llvm_float_type, llvm_vec_width, - vector_library); + vector_library, + !disable_debug_information); visitor.visit_program(*ast); ast_to_nmodl(*ast, filepath("llvm", "mod")); ast_to_json(*ast, filepath("llvm", "json")); From 3db45353076e41b4dec02cf6f1d14c04f6dfb658 Mon Sep 17 00:00:00 2001 From: George Mitenkov Date: Sat, 8 May 2021 03:22:06 -0700 Subject: [PATCH 159/331] Fixed using benchmarking_info in TestRunner (#631) --- src/codegen/llvm/jit_driver.cpp | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/src/codegen/llvm/jit_driver.cpp b/src/codegen/llvm/jit_driver.cpp index 532cd20b8f..2a6842d0fb 100644 --- a/src/codegen/llvm/jit_driver.cpp +++ b/src/codegen/llvm/jit_driver.cpp @@ -205,13 +205,13 @@ void JITDriver::init(std::string features, auto compile_function_creator = [&](llvm::orc::JITTargetMachineBuilder tm_builder) -> llvm::Expected> { // Create target machine with some features possibly turned off. - auto tm = create_target(&tm_builder, features, benchmark_info->opt_level_codegen); + int opt_level_codegen = benchmark_info ? benchmark_info->opt_level_codegen : 0; + auto tm = create_target(&tm_builder, features, opt_level_codegen); - // Optimise the LLVM IR module. - optimise_module(*module, benchmark_info->opt_level_ir, tm.get()); - - // Save optimised module to .ll file if benchmarking. + // Optimise the LLVM IR module and save it to .ll file if benchmarking. if (benchmark_info) { + optimise_module(*module, benchmark_info->opt_level_ir, tm.get()); + std::error_code error_code; std::unique_ptr out = std::make_unique(benchmark_info->output_dir + "/" + From 33631b99317311b6dfc298e24ab9f6d75f80e3c1 Mon Sep 17 00:00:00 2001 From: George Mitenkov Date: Tue, 11 May 2021 16:14:34 -0700 Subject: [PATCH 160/331] Fixed addition of SOLVE block to kernel's FOR loop (#636) * Fix `append_statements_from_block` function in LLVM helper visitor. * Before, if nonspecific current was not specified, the whole `BREAKPOINT` block would be added to the kernel body. 
* This led to cases where the `SOLVE` statement was emitted together with the actual solution of the `DERIVATIVE` block --- src/codegen/llvm/codegen_llvm_helper_visitor.cpp | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/src/codegen/llvm/codegen_llvm_helper_visitor.cpp b/src/codegen/llvm/codegen_llvm_helper_visitor.cpp index de64e16bd3..c9968df8ee 100644 --- a/src/codegen/llvm/codegen_llvm_helper_visitor.cpp +++ b/src/codegen/llvm/codegen_llvm_helper_visitor.cpp @@ -248,7 +248,12 @@ std::shared_ptr CodegenLLVMHelperVisitor::create_instance_s static void append_statements_from_block(ast::StatementVector& statements, const std::shared_ptr<ast::StatementBlock>& block) { const auto& block_statements = block->get_statements(); - statements.insert(statements.end(), block_statements.begin(), block_statements.end()); + for (const auto& statement: block_statements) { + const auto& expression_statement = std::dynamic_pointer_cast<ast::ExpressionStatement>( + statement); + if (!expression_statement->get_expression()->is_solve_block()) + statements.push_back(statement); + } } static std::shared_ptr create_atomic_statement(std::string& lhs_str, @@ -638,7 +643,6 @@ void CodegenLLVMHelperVisitor::visit_nrn_state_block(ast::NrnStateBlock& node) { /// add breakpoint block if no current if (info.currents.empty() && info.breakpoint_node != nullptr) { auto block = info.breakpoint_node->get_statement_block(); - // \todo this automatically adds `SOLVE states METHOD ...` append_statements_from_block(loop_body_statements, block); } From 05b18217495d7f867398e171b082c84f20f64d45 Mon Sep 17 00:00:00 2001 From: George Mitenkov Date: Thu, 13 May 2021 00:49:23 -0700 Subject: [PATCH 161/331] IR builder redesign for LLVM IR code generation pipeline (#634) Improves the code structure of the LLVM code generation pipeline. The following changes were added: 1. New IR builder class. Before, the LLVM visitor simply used the `llvm::IRBuilder<>` class to generate instructions. Recently, this (as well as adding functionality to the visitor on the go) had led to code duplication, and it became hard to introduce new features cleanly. Hence, a special `IRBuilder` class is now used. This class is a wrapper around `llvm::IRBuilder<>` that keeps track of certain IR-generation-specific fields (which are unrelated to the visitor) and defines an API that the visitor can use to generate LLVM IR. Also, this IR builder has been designed to be nearly fully independent of NMODL AST nodes. This makes it more generic and more extensible. 2. Visitor clean-up. The LLVM visitor has been refactored to take the new IR builder class into account. Also, the functions were reordered, refactored and renamed to better reflect their intended use and to provide encapsulation. 3. Scatter preparation. The code generation for the `CodegenInstanceVar` node has been extended with a `read_from_or_write_to_instance(...)` function. Now, an optional `value_to_store` is passed to indicate whether the code needs to be generated for reading the instance variable or writing to it; a minimal sketch of this pattern is shown below.
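To illustrate point 3, here is a minimal, standalone sketch of the read-or-write pattern (not the actual NMODL implementation; `read_or_write` and its parameters are illustrative only): the member pointer is resolved once, and the optional value selects between a load and a store.

```cpp
#include "llvm/IR/IRBuilder.h"

// Sketch only: returns the loaded value on a read, or nullptr after a write.
llvm::Value* read_or_write(llvm::IRBuilder<>& builder,
                           llvm::Value* member_ptr,
                           llvm::Value* maybe_value_to_store) {
    if (maybe_value_to_store) {
        // Write path: store the supplied value through the resolved pointer.
        builder.CreateStore(maybe_value_to_store, member_ptr);
        return nullptr;
    }
    // Read path: load the member value.
    return builder.CreateLoad(member_ptr);
}
```

Folding both directions into one routine keeps the member-pointer resolution in a single place, which also prepares the ground for gather/scatter support, where the two paths diverge into masked loads and stores.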
fixes #538 --- src/codegen/llvm/CMakeLists.txt | 4 +- src/codegen/llvm/codegen_llvm_visitor.cpp | 926 ++++++++-------------- src/codegen/llvm/codegen_llvm_visitor.hpp | 342 +++----- src/codegen/llvm/llvm_ir_builder.cpp | 427 ++++++++++ src/codegen/llvm/llvm_ir_builder.hpp | 272 +++++++ 5 files changed, 1134 insertions(+), 837 deletions(-) create mode 100644 src/codegen/llvm/llvm_ir_builder.cpp create mode 100644 src/codegen/llvm/llvm_ir_builder.hpp diff --git a/src/codegen/llvm/CMakeLists.txt b/src/codegen/llvm/CMakeLists.txt index 7814b502a3..5ebf9c7acd 100644 --- a/src/codegen/llvm/CMakeLists.txt +++ b/src/codegen/llvm/CMakeLists.txt @@ -11,7 +11,9 @@ set(LLVM_CODEGEN_SOURCE_FILES ${CMAKE_CURRENT_SOURCE_DIR}/llvm_benchmark.cpp ${CMAKE_CURRENT_SOURCE_DIR}/llvm_benchmark.hpp ${CMAKE_CURRENT_SOURCE_DIR}/llvm_debug_builder.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/llvm_debug_builder.hpp) + ${CMAKE_CURRENT_SOURCE_DIR}/llvm_debug_builder.hpp + ${CMAKE_CURRENT_SOURCE_DIR}/llvm_ir_builder.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/llvm_ir_builder.hpp) # ============================================================================= # LLVM codegen library and executable diff --git a/src/codegen/llvm/codegen_llvm_visitor.cpp b/src/codegen/llvm/codegen_llvm_visitor.cpp index 830814286e..a86a5cd8b5 100644 --- a/src/codegen/llvm/codegen_llvm_visitor.cpp +++ b/src/codegen/llvm/codegen_llvm_visitor.cpp @@ -14,10 +14,8 @@ #include "llvm/Analysis/TargetLibraryInfo.h" #include "llvm/IR/AssemblyAnnotationWriter.h" #include "llvm/IR/BasicBlock.h" -#include "llvm/IR/Constants.h" #include "llvm/IR/Function.h" #include "llvm/IR/Type.h" -#include "llvm/IR/ValueSymbolTable.h" #include "llvm/Support/FileSystem.h" #include "llvm/Support/Host.h" #include "llvm/Support/ToolOutputFile.h" @@ -34,7 +32,7 @@ static constexpr const char instance_struct_type_name[] = "__instance_var__type" /****************************************************************************************/ -/* Helper routines */ +/* Helper routines */ /****************************************************************************************/ /// A utility to check for supported Statement AST nodes. @@ -44,8 +42,8 @@ static bool is_supported_statement(const ast::Statement& statement) { statement.is_if_statement() || statement.is_while_statement(); } -/// A utility to check of the kernel body can be vectorised. -static bool can_vectorise(const ast::CodegenForStatement& statement, symtab::SymbolTable* sym_tab) { +/// A utility to check that the kernel body can be vectorised. +static bool can_vectorize(const ast::CodegenForStatement& statement, symtab::SymbolTable* sym_tab) { // Check that function calls are made to external methods only. 
const auto& function_calls = collect_nodes(statement, {ast::AstNodeType::FUNCTION_CALL}); for (const auto& call: function_calls) { @@ -62,458 +60,352 @@ static bool can_vectorise(const ast::CodegenForStatement& statement, symtab::Sym return collected.empty(); } -llvm::Value* CodegenLLVMVisitor::create_gep(const std::string& name, llvm::Value* index) { - llvm::Type* index_type = llvm::Type::getInt64Ty(*context); - std::vector indices; - indices.push_back(llvm::ConstantInt::get(index_type, 0)); - indices.push_back(index); - - return ir_builder.CreateInBoundsGEP(lookup(name), indices); -} - -llvm::Value* CodegenLLVMVisitor::codegen_indexed_name(const ast::IndexedName& node) { - llvm::Value* index = get_array_index(node); - return create_gep(node.get_node_name(), index); +llvm::Value* CodegenLLVMVisitor::accept_and_get(const std::shared_ptr& node) { + node->accept(*this); + return ir_builder.pop_last_value(); } -llvm::Value* CodegenLLVMVisitor::codegen_instance_var(const ast::CodegenInstanceVar& node) { - const auto& member_node = node.get_member_var(); - const auto& instance_name = node.get_instance_var()->get_node_name(); - const auto& member_name = member_node->get_node_name(); - - if (!instance_var_helper.is_an_instance_variable(member_name)) - throw std::runtime_error("Error: " + member_name + " is not a member of the instance!"); +void CodegenLLVMVisitor::create_external_function_call(const std::string& name, + const ast::ExpressionVector& arguments) { + if (name == "printf") { + create_printf_call(arguments); + return; + } - // Load the instance struct given its name from the ValueSymbolTable. - llvm::Value* instance_ptr = ir_builder.CreateLoad(lookup(instance_name)); + ValueVector argument_values; + TypeVector argument_types; + for (const auto& arg: arguments) { + llvm::Value* value = accept_and_get(arg); + llvm::Type* type = value->getType(); + argument_types.push_back(type); + argument_values.push_back(value); + } + ir_builder.create_intrinsic(name, argument_values, argument_types); +} - // Create a GEP instruction to get a pointer to the member. - int member_index = instance_var_helper.get_variable_index(member_name); - llvm::Type* index_type = llvm::Type::getInt32Ty(*context); +void CodegenLLVMVisitor::create_function_call(llvm::Function* func, + const std::string& name, + const ast::ExpressionVector& arguments) { + // Check that function is called with the expected number of arguments. + if (!func->isVarArg() && arguments.size() != func->arg_size()) { + throw std::runtime_error("Error: Incorrect number of arguments passed"); + } - std::vector indices; - indices.push_back(llvm::ConstantInt::get(index_type, 0)); - indices.push_back(llvm::ConstantInt::get(index_type, member_index)); - llvm::Value* member_ptr = ir_builder.CreateInBoundsGEP(instance_ptr, indices); + // Pack function call arguments to vector and create a call instruction. + ValueVector argument_values; + argument_values.reserve(arguments.size()); + create_function_call_arguments(arguments, argument_values); + ir_builder.create_function_call(func, argument_values); +} - // Get the member AST node from the instance AST node, for which we proceed with the code - // generation. If the member is scalar, return the pointer to it straight away. 
- auto codegen_var_with_type = instance_var_helper.get_variable(member_name); - if (!codegen_var_with_type->get_is_pointer()) { - return member_ptr; +void CodegenLLVMVisitor::create_function_call_arguments(const ast::ExpressionVector& arguments, + ValueVector& arg_values) { + for (const auto& arg: arguments) { + if (arg->is_string()) { + // If the argument is a string, create a global i8* variable with it. + auto string_arg = std::dynamic_pointer_cast(arg); + arg_values.push_back(ir_builder.create_global_string(*string_arg)); + } else { + llvm::Value* value = accept_and_get(arg); + arg_values.push_back(value); + } } +} - // Otherwise, the codegen variable is a pointer, and the member AST node must be an IndexedName. - auto member_var_name = std::dynamic_pointer_cast(member_node); - if (!member_var_name->get_name()->is_indexed_name()) - throw std::runtime_error("Error: " + member_name + " is not an IndexedName!"); - - // Proceed to creating a GEP instruction to get the pointer to the member's element. - auto member_indexed_name = std::dynamic_pointer_cast( - member_var_name->get_name()); +void CodegenLLVMVisitor::create_function_declaration(const ast::CodegenFunction& node) { + const auto& name = node.get_node_name(); + const auto& arguments = node.get_arguments(); - if (!member_indexed_name->get_length()->is_name()) - throw std::runtime_error("Error: " + member_name + " must be indexed with a variable!"); + // Procedure or function parameters are doubles by default. + TypeVector arg_types; + for (size_t i = 0; i < arguments.size(); ++i) + arg_types.push_back(get_codegen_var_type(*arguments[i]->get_type())); - llvm::Value* i64_index = get_array_index(*member_indexed_name); - - // The codegen variable type is always a scalar, so we need to transform it to a pointer. Then - // load the member which would be indexed later. - llvm::Type* type = get_codegen_var_type(*codegen_var_with_type->get_type()); - llvm::Value* instance_member = - ir_builder.CreateLoad(llvm::PointerType::get(type, /*AddressSpace=*/0), member_ptr); - - // Check if the code is vectorised and the index is indirect. - std::string id = member_indexed_name->get_length()->get_node_name(); - if (id != kernel_id && is_kernel_code && vector_width > 1) { - // Calculate a vector of addresses via GEP instruction, and then created a gather to load - // indirectly. - llvm::Value* addresses = ir_builder.CreateInBoundsGEP(instance_member, {i64_index}); - return ir_builder.CreateMaskedGather(addresses, llvm::Align()); - } + llvm::Type* return_type = get_codegen_var_type(*node.get_return_type()); - llvm::Value* member_addr = ir_builder.CreateInBoundsGEP(instance_member, {i64_index}); + // Create a function that is automatically inserted into module's symbol table. + auto func = + llvm::Function::Create(llvm::FunctionType::get(return_type, arg_types, /*isVarArg=*/false), + llvm::Function::ExternalLinkage, + name, + *module); - // If the code is vectorised, then bitcast to a vector pointer. - if (is_kernel_code && vector_width > 1) { - llvm::Type* vector_type = - llvm::PointerType::get(llvm::FixedVectorType::get(type, vector_width), - /*AddressSpace=*/0); - return ir_builder.CreateBitCast(member_addr, vector_type); + // Add function debug information, with location information if it exists. 
+ if (add_debug_information) { + if (node.get_token()) { + Location loc{node.get_token()->start_line(), node.get_token()->start_column()}; + debug_builder.add_function_debug_info(func, &loc); + } else { + debug_builder.add_function_debug_info(func); + } } - return member_addr; } -llvm::Value* CodegenLLVMVisitor::get_array_index(const ast::IndexedName& node) { - // Process the index expression. It can either be a Name node: - // k[id] // id is an integer - // or an integer expression. - llvm::Value* index_value; - if (node.get_length()->is_name()) { - llvm::Value* ptr = lookup(node.get_length()->get_node_name()); - index_value = ir_builder.CreateLoad(ptr); - } else { - node.get_length()->accept(*this); - index_value = values.back(); - values.pop_back(); - } +void CodegenLLVMVisitor::create_printf_call(const ast::ExpressionVector& arguments) { + // First, create printf declaration or insert it if it does not exit. + std::string name = "printf"; + llvm::Function* printf = module->getFunction(name); + if (!printf) { + llvm::FunctionType* printf_type = llvm::FunctionType::get(ir_builder.get_i32_type(), + ir_builder.get_i8_ptr_type(), + /*isVarArg=*/true); - // Check if index is a double. While it is possible to use casting from double to integer - // values, we choose not to support these cases. - if (!index_value->getType()->isIntOrIntVectorTy()) - throw std::runtime_error("Error: only integer indexing is supported!"); - - // Conventionally, in LLVM array indices are 64 bit. - llvm::Type* i64_type = llvm::Type::getInt64Ty(*context); - if (auto index_type = llvm::dyn_cast(index_value->getType())) { - if (index_type->getBitWidth() == i64_type->getIntegerBitWidth()) - return index_value; - return ir_builder.CreateSExtOrTrunc(index_value, i64_type); + printf = + llvm::Function::Create(printf_type, llvm::Function::ExternalLinkage, name, *module); } - auto vector_type = llvm::cast(index_value->getType()); - auto element_type = llvm::cast(vector_type->getElementType()); - if (element_type->getBitWidth() == i64_type->getIntegerBitWidth()) - return index_value; - return ir_builder.CreateSExtOrTrunc(index_value, - llvm::FixedVectorType::get(i64_type, vector_width)); + // Create a call instruction. + ValueVector argument_values; + argument_values.reserve(arguments.size()); + create_function_call_arguments(arguments, argument_values); + ir_builder.create_function_call(printf, argument_values, /*use_result=*/false); } -int CodegenLLVMVisitor::get_array_length(const ast::IndexedName& node) { - auto integer = std::dynamic_pointer_cast(node.get_length()); - if (!integer) - throw std::runtime_error("Error: only integer length is supported!"); - - // Check if integer value is taken from a macro. - if (!integer->get_macro()) - return integer->get_value(); - const auto& macro = sym_tab->lookup(integer->get_macro()->get_node_name()); - return static_cast(*macro->get_value()); +void CodegenLLVMVisitor::find_kernel_names(std::vector& container) { + // By convention, only kernel functions have a return type of void. 
+    const auto& functions = module->getFunctionList();
+    for (const auto& func: functions) {
+        if (func.getReturnType()->isVoidTy()) {
+            container.push_back(func.getName().str());
+        }
+    }
 }
 
 llvm::Type* CodegenLLVMVisitor::get_codegen_var_type(const ast::CodegenVarType& node) {
     switch (node.get_type()) {
     case ast::AstNodeType::BOOLEAN:
-        return llvm::Type::getInt1Ty(*context);
+        return ir_builder.get_boolean_type();
     case ast::AstNodeType::DOUBLE:
-        return get_default_fp_type();
+        return ir_builder.get_fp_type();
     case ast::AstNodeType::INSTANCE_STRUCT:
         return get_instance_struct_type();
     case ast::AstNodeType::INTEGER:
-        return llvm::Type::getInt32Ty(*context);
+        return ir_builder.get_i32_type();
     case ast::AstNodeType::VOID:
-        return llvm::Type::getVoidTy(*context);
+        return ir_builder.get_void_type();
     default:
         throw std::runtime_error("Error: expecting a type in CodegenVarType node\n");
     }
 }
 
-llvm::Value* CodegenLLVMVisitor::get_constant_int_vector(int value) {
-    llvm::Type* i32_type = llvm::Type::getInt32Ty(*context);
-    std::vector<llvm::Constant*> constants;
-    for (unsigned i = 0; i < vector_width; ++i) {
-        const auto& element = llvm::ConstantInt::get(i32_type, value);
-        constants.push_back(element);
-    }
-    return llvm::ConstantVector::get(constants);
+llvm::Value* CodegenLLVMVisitor::get_index(const ast::IndexedName& node) {
+    // In NMODL, the index is either an integer expression or a named constant, such as "id".
+    llvm::Value* index_value = node.get_length()->is_name()
+                                   ? ir_builder.create_load(node.get_length()->get_node_name())
+                                   : accept_and_get(node.get_length());
+    return ir_builder.create_index(index_value);
 }
 
-llvm::Value* CodegenLLVMVisitor::get_constant_fp_vector(const std::string& value) {
-    llvm::Type* fp_type = get_default_fp_type();
-    std::vector<llvm::Constant*> constants;
-    for (unsigned i = 0; i < vector_width; ++i) {
-        const auto& element = llvm::ConstantFP::get(fp_type, value);
-        constants.push_back(element);
+llvm::Type* CodegenLLVMVisitor::get_instance_struct_type() {
+    TypeVector member_types;
+    for (const auto& variable: instance_var_helper.instance->get_codegen_vars()) {
+        // Get the type information of the codegen variable.
+        const auto& is_pointer = variable->get_is_pointer();
+        const auto& nmodl_type = variable->get_type()->get_type();
+
+        // Create the corresponding LLVM type.
+        switch (nmodl_type) {
+        case ast::AstNodeType::DOUBLE:
+            member_types.push_back(is_pointer ? ir_builder.get_fp_ptr_type()
+                                              : ir_builder.get_fp_type());
+            break;
+        case ast::AstNodeType::INTEGER:
+            member_types.push_back(is_pointer ? ir_builder.get_i32_ptr_type()
+                                              : ir_builder.get_i32_type());
+            break;
+        default:
+            throw std::runtime_error("Error: unsupported type found in instance struct\n");
+        }
     }
-    return llvm::ConstantVector::get(constants);
-}
 
-llvm::Type* CodegenLLVMVisitor::get_default_fp_type() {
-    if (use_single_precision)
-        return llvm::Type::getFloatTy(*context);
-    return llvm::Type::getDoubleTy(*context);
+    return ir_builder.get_struct_ptr_type(mod_filename + instance_struct_type_name, member_types);
 }
 
-llvm::Type* CodegenLLVMVisitor::get_default_fp_ptr_type() {
-    if (use_single_precision)
-        return llvm::Type::getFloatPtrTy(*context);
-    return llvm::Type::getDoublePtrTy(*context);
+int CodegenLLVMVisitor::get_num_elements(const ast::IndexedName& node) {
+    // First, verify that the length is an integer value.
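+    // (Illustrative NMODL with hypothetical names: `LOCAL s[4]` declares a constant length, while
+    // with `DEFINE NANN 4` a declaration `LOCAL s[NANN]` takes its length from the macro.)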
+    const auto& integer = std::dynamic_pointer_cast<ast::Integer>(node.get_length());
+    if (!integer)
+        throw std::runtime_error("Error: only integer length is supported\n");
+
+    // Check if the length value is a constant.
+    if (!integer->get_macro())
+        return integer->get_value();
+
+    // Otherwise, the length is taken from the macro.
+    const auto& macro = sym_tab->lookup(integer->get_macro()->get_node_name());
+    return static_cast<int>(*macro->get_value());
 }
 
-llvm::Type* CodegenLLVMVisitor::get_instance_struct_type() {
-    std::vector<llvm::Type*> members;
-    for (const auto& variable: instance_var_helper.instance->get_codegen_vars()) {
-        auto is_pointer = variable->get_is_pointer();
-        auto nmodl_type = variable->get_type()->get_type();
+llvm::Value* CodegenLLVMVisitor::read_from_or_write_to_instance(const ast::CodegenInstanceVar& node,
+                                                                llvm::Value* maybe_value_to_store) {
+    const auto& instance_name = node.get_instance_var()->get_node_name();
+    const auto& member_node = node.get_member_var();
+    const auto& member_name = member_node->get_node_name();
 
-        llvm::Type* i32_type = llvm::Type::getInt32Ty(*context);
-        llvm::Type* i32ptr_type = llvm::Type::getInt32PtrTy(*context);
+    if (!instance_var_helper.is_an_instance_variable(member_name))
+        throw std::runtime_error("Error: " + member_name +
+                                 " is not a member of the instance variable\n");
 
-        switch (nmodl_type) {
-#define DISPATCH(type, llvm_ptr_type, llvm_type) \
-    case type: \
-        members.push_back(is_pointer ? (llvm_ptr_type) : (llvm_type)); \
-        break;
+    // Load the instance struct by its name.
+    llvm::Value* instance_ptr = ir_builder.create_load(instance_name);
 
-            DISPATCH(ast::AstNodeType::DOUBLE, get_default_fp_ptr_type(), get_default_fp_type());
-            DISPATCH(ast::AstNodeType::INTEGER, i32ptr_type, i32_type);
+    // Get the pointer to the specified member.
+    int member_index = instance_var_helper.get_variable_index(member_name);
+    llvm::Value* member_ptr = ir_builder.get_struct_member_ptr(instance_ptr, member_index);
 
-#undef DISPATCH
-        default:
-            throw std::runtime_error("Error: unsupported type found in instance struct");
+    // Check if the member is scalar. Load the value or store to it straight away. Otherwise, we
+    // need some extra handling.
+    auto codegen_var_with_type = instance_var_helper.get_variable(member_name);
+    if (!codegen_var_with_type->get_is_pointer()) {
+        if (maybe_value_to_store) {
+            ir_builder.create_store(member_ptr, maybe_value_to_store);
+            return nullptr;
+        } else {
+            return ir_builder.create_load(member_ptr);
         }
     }
-    llvm::StructType* llvm_struct_type =
-        llvm::StructType::create(*context, mod_filename + instance_struct_type_name);
-    llvm_struct_type->setBody(members);
-    return llvm::PointerType::get(llvm_struct_type, /*AddressSpace=*/0);
+    // Check that the member is indeed an indexed name, and that it is indexed by a named constant
+    // (e.g. "id").
+    const auto& member_var_name = std::dynamic_pointer_cast<ast::VarName>(member_node);
+    if (!member_var_name->get_name()->is_indexed_name())
+        throw std::runtime_error("Error: " + member_name + " is not an IndexedName\n");
+
+    const auto& member_indexed_name = std::dynamic_pointer_cast<ast::IndexedName>(
+        member_var_name->get_name());
+    if (!member_indexed_name->get_length()->is_name())
+        throw std::runtime_error("Error: " + member_name + " must be indexed with a variable\n");
+
+    // Get the index to the member and the id used to index it.
+    llvm::Value* i64_index = get_index(*member_indexed_name);
+    const std::string id = member_indexed_name->get_length()->get_node_name();
+
+    // Load the member of the instance struct.
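+    // (For a hypothetical pointer member such as `double* voltage`, this load yields the base
+    // address from which the element `voltage[id]` is then read or written.)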
+    llvm::Value* instance_member = ir_builder.create_load(member_ptr);
+
+    // Create a pointer to the specified element of the struct member.
+    return ir_builder.load_to_or_store_from_array(id,
+                                                  i64_index,
+                                                  instance_member,
+                                                  maybe_value_to_store);
 }
 
-llvm::Value* CodegenLLVMVisitor::get_variable_ptr(const ast::VarName& node) {
+llvm::Value* CodegenLLVMVisitor::read_variable(const ast::VarName& node) {
     const auto& identifier = node.get_name();
-    if (!identifier->is_name() && !identifier->is_indexed_name() &&
-        !identifier->is_codegen_instance_var()) {
-        throw std::runtime_error("Error: Unsupported variable type - " + node.get_node_name());
-    }
 
-    llvm::Value* ptr;
-    if (identifier->is_name())
-        ptr = lookup(node.get_node_name());
+    if (identifier->is_name()) {
+        return ir_builder.create_load(node.get_node_name());
+    }
 
     if (identifier->is_indexed_name()) {
-        auto indexed_name = std::dynamic_pointer_cast<ast::IndexedName>(identifier);
-        ptr = codegen_indexed_name(*indexed_name);
+        const auto& indexed_name = std::dynamic_pointer_cast<ast::IndexedName>(identifier);
+        llvm::Value* index = get_index(*indexed_name);
+        return ir_builder.create_load_from_array(node.get_node_name(), index);
     }
 
     if (identifier->is_codegen_instance_var()) {
-        auto instance_var = std::dynamic_pointer_cast<ast::CodegenInstanceVar>(identifier);
-        ptr = codegen_instance_var(*instance_var);
+        const auto& instance_var = std::dynamic_pointer_cast<ast::CodegenInstanceVar>(identifier);
+        return read_from_or_write_to_instance(*instance_var);
     }
-    return ptr;
-}
 
-std::shared_ptr<ast::InstanceStruct> CodegenLLVMVisitor::get_instance_struct_ptr() {
-    return instance_var_helper.instance;
+    throw std::runtime_error("Error: the type of '" + node.get_node_name() +
+                             "' is not supported\n");
 }
 
 void CodegenLLVMVisitor::run_ir_opt_passes() {
-    /// run some common optimisation passes that are commonly suggested
+    // Run a few optimisation passes that are commonly suggested.
     opt_pm.add(llvm::createInstructionCombiningPass());
     opt_pm.add(llvm::createReassociatePass());
     opt_pm.add(llvm::createGVNPass());
     opt_pm.add(llvm::createCFGSimplificationPass());
 
-    /// initialize pass manager
+    // Initialize the pass manager.
    opt_pm.doInitialization();
 
-    /// iterate over all functions and run the optimisation passes
+    // Iterate over all functions and run the optimisation passes.
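+    // Note: each function is first validated with llvm::verifyFunction below, so malformed IR is
+    // caught before the optimisation passes run on it.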
    auto& functions = module->getFunctionList();
    for (auto& function: functions) {
        llvm::verifyFunction(function);
        opt_pm.run(function);
    }
+
+    opt_pm.doFinalization();
 }
 
-void CodegenLLVMVisitor::create_external_method_call(const std::string& name,
-                                                     const ast::ExpressionVector& arguments) {
-    if (name == "printf") {
-        create_printf_call(arguments);
-        return;
+void CodegenLLVMVisitor::write_to_variable(const ast::VarName& node, llvm::Value* value) {
+    const auto& identifier = node.get_name();
+    if (!identifier->is_name() && !identifier->is_indexed_name() &&
+        !identifier->is_codegen_instance_var()) {
+        throw std::runtime_error("Error: the type of '" + node.get_node_name() +
+                                 "' is not supported\n");
     }
 
-    std::vector<llvm::Value*> argument_values;
-    std::vector<llvm::Type*> argument_types;
-    for (const auto& arg: arguments) {
-        arg->accept(*this);
-        llvm::Value* value = values.back();
-        llvm::Type* type = value->getType();
-        values.pop_back();
-        argument_types.push_back(type);
-        argument_values.push_back(value);
+    if (identifier->is_name()) {
+        ir_builder.create_store(node.get_node_name(), value);
     }
 
-#define DISPATCH(method_name, intrinsic) \
-    if (name == (method_name)) { \
-        llvm::Value* result = \
-            ir_builder.CreateIntrinsic(intrinsic, argument_types, argument_values); \
-        values.push_back(result); \
-        return; \
+    if (identifier->is_indexed_name()) {
+        const auto& indexed_name = std::dynamic_pointer_cast<ast::IndexedName>(identifier);
+        llvm::Value* index = get_index(*indexed_name);
+        ir_builder.create_store_to_array(node.get_node_name(), index, value);
     }
 
-    DISPATCH("exp", llvm::Intrinsic::exp);
-    DISPATCH("pow", llvm::Intrinsic::pow);
-#undef DISPATCH
-
-    throw std::runtime_error("Error: External method" + name + " is not currently supported");
-}
-
-void CodegenLLVMVisitor::create_function_call(llvm::Function* func,
-                                              const std::string& name,
-                                              const ast::ExpressionVector& arguments) {
-    // Check that function is called with the expected number of arguments.
-    if (!func->isVarArg() && arguments.size() != func->arg_size()) {
-        throw std::runtime_error("Error: Incorrect number of arguments passed");
+    if (identifier->is_codegen_instance_var()) {
+        const auto& instance_var = std::dynamic_pointer_cast<ast::CodegenInstanceVar>(identifier);
+        read_from_or_write_to_instance(*instance_var, value);
     }
-
-    // Pack function call arguments to vector and create a call instruction.
-    std::vector<llvm::Value*> argument_values;
-    argument_values.reserve(arguments.size());
-    pack_function_call_arguments(arguments, argument_values);
-    llvm::Value* call = ir_builder.CreateCall(func, argument_values);
-    values.push_back(call);
 }
 
-void CodegenLLVMVisitor::create_printf_call(const ast::ExpressionVector& arguments) {
-    // First, create printf declaration or insert it if it does not exit.
-    std::string name = "printf";
-    llvm::Function* printf = module->getFunction(name);
-    if (!printf) {
-        llvm::Type* ptr_type = llvm::Type::getInt8PtrTy(*context);
-        llvm::Type* i32_type = llvm::Type::getInt32Ty(*context);
-        llvm::FunctionType* printf_type =
-            llvm::FunctionType::get(i32_type, ptr_type, /*isVarArg=*/true);
-
-        printf =
-            llvm::Function::Create(printf_type, llvm::Function::ExternalLinkage, name, *module);
-    }
-
-    // Create a call instruction.
-    std::vector<llvm::Value*> argument_values;
-    argument_values.reserve(arguments.size());
-    pack_function_call_arguments(arguments, argument_values);
-    ir_builder.CreateCall(printf, argument_values);
-}
+void CodegenLLVMVisitor::wrap_kernel_functions() {
+    // First, identify all kernels.
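+    // A sketch of the wrapper emitted for each kernel found below (kernel name hypothetical):
+    //
+    //     int __nrn_state_hh_wrapper(void* data) {
+    //         nrn_state_hh((InstanceStruct*) data);
+    //         return 0;
+    //     }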
+    std::vector<std::string> kernel_names;
+    find_kernel_names(kernel_names);
 
-void CodegenLLVMVisitor::emit_procedure_or_function_declaration(const ast::CodegenFunction& node) {
-    const auto& name = node.get_node_name();
-    const auto& arguments = node.get_arguments();
+    for (const auto& kernel_name: kernel_names) {
+        // Get the kernel function and the instance struct type.
+        auto kernel = module->getFunction(kernel_name);
+        if (!kernel)
+            throw std::runtime_error("Error: kernel " + kernel_name + " is not found\n");
 
-    // Procedure or function parameters are doubles by default.
-    std::vector<llvm::Type*> arg_types;
-    for (size_t i = 0; i < arguments.size(); ++i)
-        arg_types.push_back(get_codegen_var_type(*arguments[i]->get_type()));
+        if (std::distance(kernel->args().begin(), kernel->args().end()) != 1)
+            throw std::runtime_error("Error: kernel " + kernel_name +
+                                     " must have a single argument\n");
 
-    llvm::Type* return_type = get_codegen_var_type(*node.get_return_type());
+        auto instance_struct_ptr_type = llvm::dyn_cast<llvm::PointerType>(
+            kernel->getArg(0)->getType());
+        if (!instance_struct_ptr_type)
+            throw std::runtime_error("Error: kernel " + kernel_name +
+                                     " does not have an instance struct pointer as an argument\n");
 
-    // Create a function that is automatically inserted into module's symbol table.
-    auto func =
-        llvm::Function::Create(llvm::FunctionType::get(return_type, arg_types, /*isVarArg=*/false),
-                               llvm::Function::ExternalLinkage,
-                               name,
-                               *module);
+        // Create a wrapper void function that takes a void pointer as a single argument.
+        llvm::Type* i32_type = ir_builder.get_i32_type();
+        llvm::Type* void_ptr_type = ir_builder.get_i8_ptr_type();
+        llvm::Function* wrapper_func = llvm::Function::Create(
+            llvm::FunctionType::get(i32_type, {void_ptr_type}, /*isVarArg=*/false),
+            llvm::Function::ExternalLinkage,
+            "__" + kernel_name + "_wrapper",
+            *module);
 
-    // Add function debug information, with location information if it exists.
-    if (add_debug_information) {
-        if (node.get_token()) {
-            Location loc{node.get_token()->start_line(), node.get_token()->start_column()};
-            debug_builder.add_function_debug_info(func, &loc);
-        } else {
-            debug_builder.add_function_debug_info(func);
+        // Optionally, add debug information for the wrapper function.
+        if (add_debug_information) {
+            debug_builder.add_function_debug_info(wrapper_func);
         }
-    }
-}
 
-llvm::Value* CodegenLLVMVisitor::lookup(const std::string& name) {
-    auto val = current_func->getValueSymbolTable()->lookup(name);
-    if (!val)
-        throw std::runtime_error("Error: variable " + name + " is not in scope\n");
-    return val;
-}
+        ir_builder.create_block_and_set_insertion_point(wrapper_func);
 
-void CodegenLLVMVisitor::pack_function_call_arguments(const ast::ExpressionVector& arguments,
-                                                      std::vector<llvm::Value*>& arg_values) {
-    for (const auto& arg: arguments) {
-        if (arg->is_string()) {
-            // If the argument is a string, create a global i8* variable with it.
-            auto string_arg = std::dynamic_pointer_cast<ast::String>(arg);
-            llvm::Value* str = ir_builder.CreateGlobalStringPtr(string_arg->get_value());
-            arg_values.push_back(str);
-        } else {
-            arg->accept(*this);
-            llvm::Value* value = values.back();
-            values.pop_back();
-            arg_values.push_back(value);
-        }
-    }
-}
-
-llvm::Value* CodegenLLVMVisitor::visit_arithmetic_bin_op(llvm::Value* lhs,
-                                                         llvm::Value* rhs,
-                                                         unsigned op) {
-    const auto& bin_op = static_cast<ast::BinaryOp>(op);
-    llvm::Type* lhs_type = lhs->getType();
-    llvm::Value* result;
-
-    switch (bin_op) {
-#define DISPATCH(binary_op, llvm_fp_op, llvm_int_op) \
-    case binary_op: \
-        if (lhs_type->isIntOrIntVectorTy()) \
-            result = llvm_int_op(lhs, rhs); \
-        else \
-            result = llvm_fp_op(lhs, rhs); \
-        return result;
-
-        DISPATCH(ast::BinaryOp::BOP_ADDITION, ir_builder.CreateFAdd, ir_builder.CreateAdd);
-        DISPATCH(ast::BinaryOp::BOP_DIVISION, ir_builder.CreateFDiv, ir_builder.CreateSDiv);
-        DISPATCH(ast::BinaryOp::BOP_MULTIPLICATION, ir_builder.CreateFMul, ir_builder.CreateMul);
-        DISPATCH(ast::BinaryOp::BOP_SUBTRACTION, ir_builder.CreateFSub, ir_builder.CreateSub);
-
-#undef DISPATCH
 
-    default:
-        return nullptr;
-    }
-}
-
-void CodegenLLVMVisitor::visit_assign_op(const ast::BinaryExpression& node, llvm::Value* rhs) {
-    auto var = dynamic_cast<ast::VarName*>(node.get_lhs().get());
-    if (!var)
-        throw std::runtime_error("Error: only VarName assignment is supported!");
-
-    llvm::Value* ptr = get_variable_ptr(*var);
-    ir_builder.CreateStore(rhs, ptr);
-}
-
-llvm::Value* CodegenLLVMVisitor::visit_logical_bin_op(llvm::Value* lhs,
-                                                      llvm::Value* rhs,
-                                                      unsigned op) {
-    const auto& bin_op = static_cast<ast::BinaryOp>(op);
-    return bin_op == ast::BinaryOp::BOP_AND ? 
ir_builder.CreateAnd(lhs, rhs)
-                                             : ir_builder.CreateOr(lhs, rhs);
-}
-
-llvm::Value* CodegenLLVMVisitor::visit_comparison_bin_op(llvm::Value* lhs,
-                                                         llvm::Value* rhs,
-                                                         unsigned op) {
-    const auto& bin_op = static_cast<ast::BinaryOp>(op);
-    llvm::Type* lhs_type = lhs->getType();
-    llvm::Value* result;
-
-    switch (bin_op) {
-#define DISPATCH(binary_op, i_llvm_op, f_llvm_op) \
-    case binary_op: \
-        if (lhs_type->isDoubleTy() || lhs_type->isFloatTy()) \
-            result = f_llvm_op(lhs, rhs); \
-        else \
-            result = i_llvm_op(lhs, rhs); \
-        return result;
-
-        DISPATCH(ast::BinaryOp::BOP_EXACT_EQUAL, ir_builder.CreateICmpEQ, ir_builder.CreateFCmpOEQ);
-        DISPATCH(ast::BinaryOp::BOP_GREATER, ir_builder.CreateICmpSGT, ir_builder.CreateFCmpOGT);
-        DISPATCH(ast::BinaryOp::BOP_GREATER_EQUAL,
-                 ir_builder.CreateICmpSGE,
-                 ir_builder.CreateFCmpOGE);
-        DISPATCH(ast::BinaryOp::BOP_LESS, ir_builder.CreateICmpSLT, ir_builder.CreateFCmpOLT);
-        DISPATCH(ast::BinaryOp::BOP_LESS_EQUAL, ir_builder.CreateICmpSLE, ir_builder.CreateFCmpOLE);
-        DISPATCH(ast::BinaryOp::BOP_NOT_EQUAL, ir_builder.CreateICmpNE, ir_builder.CreateFCmpONE);
-
-#undef DISPATCH
-
-    default:
-        return nullptr;
-    }
-}
 
 /****************************************************************************************/
 /*                            Overloaded visitor routines                               */
@@ -525,43 +417,18 @@ void CodegenLLVMVisitor::visit_binary_expression(const ast::BinaryExpression& no
 
     // Process rhs first, since lhs is handled differently for assignment and binary
     // operators.
-    node.get_rhs()->accept(*this);
-    llvm::Value* rhs = values.back();
-    values.pop_back();
+    llvm::Value* rhs = accept_and_get(node.get_rhs());
     if (op == ast::BinaryOp::BOP_ASSIGN) {
-        visit_assign_op(node, rhs);
-        return;
-    }
+        auto var = dynamic_cast<ast::VarName*>(node.get_lhs().get());
+        if (!var)
+            throw std::runtime_error("Error: only 'VarName' assignment is supported\n");
 
-    node.get_lhs()->accept(*this);
-    llvm::Value* lhs = values.back();
-    values.pop_back();
-
-    llvm::Value* result;
-    switch (op) {
-    case ast::BOP_ADDITION:
-    case ast::BOP_DIVISION:
-    case ast::BOP_MULTIPLICATION:
-    case ast::BOP_SUBTRACTION:
-        result = visit_arithmetic_bin_op(lhs, rhs, op);
-        break;
-    case ast::BOP_AND:
-    case ast::BOP_OR:
-        result = visit_logical_bin_op(lhs, rhs, op);
-        break;
-    case ast::BOP_EXACT_EQUAL:
-    case ast::BOP_GREATER:
-    case ast::BOP_GREATER_EQUAL:
-    case ast::BOP_LESS:
-    case ast::BOP_LESS_EQUAL:
-    case ast::BOP_NOT_EQUAL:
-        result = visit_comparison_bin_op(lhs, rhs, op);
-        break;
-    default:
-        throw std::runtime_error("Error: binary operator is not supported\n");
+        write_to_variable(*var, rhs);
+        return;
    }
 
-    values.push_back(result);
+    llvm::Value* lhs = accept_and_get(node.get_lhs());
+    ir_builder.create_binary_op(lhs, rhs, op);
 }
 
 void CodegenLLVMVisitor::visit_statement_block(const ast::StatementBlock& node) {
@@ -573,9 +440,7 @@ void CodegenLLVMVisitor::visit_statement_block(const ast::StatementBlock& node)
 }
 
 void CodegenLLVMVisitor::visit_boolean(const ast::Boolean& node) {
-    const auto& constant = llvm::ConstantInt::get(llvm::Type::getInt1Ty(*context),
-                                                  node.get_value());
-    values.push_back(constant);
+    ir_builder.create_boolean_constant(node.get_value());
 }
 
 // Generating FOR loop in LLVM IR creates the following structure:
@@ -612,10 +477,10 @@ void CodegenLLVMVisitor::visit_boolean(const ast::Boolean& node) {
 //                        +---------------------------+
 void CodegenLLVMVisitor::visit_codegen_for_statement(const ast::CodegenForStatement& node) {
     // Disable vector code generation for condition and increment blocks.
- is_kernel_code = false; + ir_builder.stop_vectorization(); // Get the current and the next blocks within the function. - llvm::BasicBlock* curr_block = ir_builder.GetInsertBlock(); + llvm::BasicBlock* curr_block = ir_builder.get_current_block(); llvm::BasicBlock* next = curr_block->getNextNode(); llvm::Function* func = curr_block->getParent(); @@ -631,10 +496,12 @@ void CodegenLLVMVisitor::visit_codegen_for_statement(const ast::CodegenForStatem int tmp_vector_width = vector_width; // Check if the kernel can be vectorised. If not, generate scalar code. - if (!can_vectorise(node, sym_tab)) { - logger->info("Cannot vectorise the for loop in '" + current_func->getName().str() + "'"); + if (!can_vectorize(node, sym_tab)) { + logger->info("Cannot vectorise the for loop in '" + ir_builder.get_current_function_name() + + "'"); logger->info("Generating scalar code..."); vector_width = 1; + ir_builder.generate_scalar_code(); } // First, initialise the loop in the same basic block. This block is optional. Also, reset @@ -643,36 +510,33 @@ void CodegenLLVMVisitor::visit_codegen_for_statement(const ast::CodegenForStatem node.get_initialization()->accept(*this); } else { vector_width = 1; + ir_builder.generate_scalar_code(); } // Branch to condition basic block and insert condition code there. - ir_builder.CreateBr(for_cond); - ir_builder.SetInsertPoint(for_cond); - node.get_condition()->accept(*this); + ir_builder.create_br_and_set_insertion_point(for_cond); // Extract the condition to decide whether to branch to the loop body or loop exit. - llvm::Value* cond = values.back(); - values.pop_back(); - ir_builder.CreateCondBr(cond, for_body, exit); + llvm::Value* cond = accept_and_get(node.get_condition()); + ir_builder.create_cond_br(cond, for_body, exit); // Generate code for the loop body and create the basic block for the increment. - ir_builder.SetInsertPoint(for_body); - is_kernel_code = true; + ir_builder.set_insertion_point(for_body); + ir_builder.start_vectorization(); const auto& statement_block = node.get_statement_block(); statement_block->accept(*this); - is_kernel_code = false; - ir_builder.CreateBr(for_inc); - + ir_builder.stop_vectorization(); + ir_builder.create_br_and_set_insertion_point(for_inc); // Process increment. - ir_builder.SetInsertPoint(for_inc); node.get_increment()->accept(*this); // Create a branch to condition block, then generate exit code out of the loop. Restore the // vector width. - ir_builder.CreateBr(for_cond); - ir_builder.SetInsertPoint(exit); + ir_builder.create_br(for_cond); + ir_builder.set_insertion_point(exit); vector_width = tmp_vector_width; - is_kernel_code = true; + ir_builder.generate_vectorized_code(); + ir_builder.start_vectorization(); } @@ -680,12 +544,11 @@ void CodegenLLVMVisitor::visit_codegen_function(const ast::CodegenFunction& node const auto& name = node.get_node_name(); const auto& arguments = node.get_arguments(); llvm::Function* func = module->getFunction(name); - current_func = func; + ir_builder.set_function(func); // Create the entry basic block of the function/procedure and point the local named values table // to the symbol table. - llvm::BasicBlock* body = llvm::BasicBlock::Create(*context, /*Name=*/"", func); - ir_builder.SetInsertPoint(body); + llvm::BasicBlock* body = ir_builder.create_block_and_set_insertion_point(func); // When processing a function, it returns a value named in NMODL. Therefore, we // first run RenameVisitor to rename it into ret_. 
This will aid in avoiding
@@ -697,84 +560,59 @@ void CodegenLLVMVisitor::visit_codegen_function(const ast::CodegenFunction& node
 
     // Allocate parameters on the stack and add them to the symbol table.
-    unsigned i = 0;
-    for (auto& arg: func->args()) {
-        std::string arg_name = arguments[i++].get()->get_node_name();
-        llvm::Type* arg_type = arg.getType();
-        llvm::Value* alloca = ir_builder.CreateAlloca(arg_type, /*ArraySize=*/nullptr, arg_name);
-        arg.setName(arg_name);
-        ir_builder.CreateStore(&arg, alloca);
-    }
+    ir_builder.allocate_function_arguments(func, arguments);
 
     // Process function or procedure body. If the function is a compute kernel, then set the
     // corresponding flags. The return statement is handled in a separate visitor.
     bool has_void_ret_type = node.get_return_type()->get_type() == ast::AstNodeType::VOID;
     if (has_void_ret_type) {
-        is_kernel_code = true;
+        ir_builder.start_vectorization();
         block->accept(*this);
-        is_kernel_code = false;
+        ir_builder.stop_vectorization();
     } else {
         block->accept(*this);
     }
 
     // If function has a void return type, add a terminator not handled by CodegenReturnVar.
     if (has_void_ret_type)
-        ir_builder.CreateRetVoid();
+        ir_builder.create_return();
 
     // Clear local values stack and remove the pointer to the local symbol table.
-    values.clear();
-    current_func = nullptr;
+    ir_builder.clear_function();
 }
 
 void CodegenLLVMVisitor::visit_codegen_return_statement(const ast::CodegenReturnStatement& node) {
     if (!node.get_statement()->is_name())
         throw std::runtime_error("Error: CodegenReturnStatement must contain a name node\n");
 
-    std::string ret = "ret_" + current_func->getName().str();
-    llvm::Value* ret_value = ir_builder.CreateLoad(lookup(ret));
-    ir_builder.CreateRet(ret_value);
+    std::string ret = "ret_" + ir_builder.get_current_function_name();
+    llvm::Value* ret_value = ir_builder.create_load(ret);
+    ir_builder.create_return(ret_value);
 }
 
 void CodegenLLVMVisitor::visit_codegen_var_list_statement(
     const ast::CodegenVarListStatement& node) {
-    llvm::Type* scalar_var_type = get_codegen_var_type(*node.get_var_type());
+    llvm::Type* scalar_type = get_codegen_var_type(*node.get_var_type());
     for (const auto& variable: node.get_variables()) {
-        std::string name = variable->get_node_name();
         const auto& identifier = variable->get_name();
+        std::string name = variable->get_node_name();
+
         // Local variable can be a scalar (Node AST class) or an array (IndexedName AST class). For
-        // each case, create memory allocations with the corresponding LLVM type.
-        llvm::Type* var_type;
+        // each case, create memory allocations.
         if (identifier->is_indexed_name()) {
-            auto indexed_name = std::dynamic_pointer_cast<ast::IndexedName>(identifier);
-            int length = get_array_length(*indexed_name);
-            var_type = llvm::ArrayType::get(scalar_var_type, length);
+            const auto& indexed_name = std::dynamic_pointer_cast<ast::IndexedName>(identifier);
+            int length = get_num_elements(*indexed_name);
+            ir_builder.create_array_alloca(name, scalar_type, length);
        } else if (identifier->is_name()) {
-            // This case corresponds to a scalar or vector local variable.
-            const auto& identifier_name = identifier->get_node_name();
-
-            // Even if generating vectorised code, some variables still need to be scalar.
-            // Particularly, the induction variable "id" and remainder loop variables (that start
-            // with "epilogue").
- if (is_kernel_code && vector_width > 1 && identifier_name != kernel_id && - identifier_name.rfind("epilogue", 0)) { - var_type = llvm::FixedVectorType::get(scalar_var_type, vector_width); - } else { - var_type = scalar_var_type; - } + ir_builder.create_scalar_or_vector_alloca(name, scalar_type); } else { - throw std::runtime_error("Error: Unsupported local variable type"); + throw std::runtime_error("Error: unsupported local variable type\n"); } - ir_builder.CreateAlloca(var_type, /*ArraySize=*/nullptr, name); } } void CodegenLLVMVisitor::visit_double(const ast::Double& node) { - if (is_kernel_code && vector_width > 1) { - values.push_back(get_constant_fp_vector(node.get_value())); - return; - } - const auto& constant = llvm::ConstantFP::get(get_default_fp_type(), node.get_value()); - values.push_back(constant); + ir_builder.create_fp_constant(node.get_value()); } void CodegenLLVMVisitor::visit_function_block(const ast::FunctionBlock& node) { @@ -783,23 +621,22 @@ void CodegenLLVMVisitor::visit_function_block(const ast::FunctionBlock& node) { void CodegenLLVMVisitor::visit_function_call(const ast::FunctionCall& node) { const auto& name = node.get_node_name(); - auto func = module->getFunction(name); + llvm::Function* func = module->getFunction(name); if (func) { create_function_call(func, name, node.get_arguments()); } else { auto symbol = sym_tab->lookup(name); if (symbol && symbol->has_any_property(symtab::syminfo::NmodlType::extern_method)) { - create_external_method_call(name, node.get_arguments()); + create_external_function_call(name, node.get_arguments()); } else { - throw std::runtime_error("Error: Unknown function name: " + name + - ". (External functions references are not supported)"); + throw std::runtime_error("Error: unknown function name: " + name + "\n"); } } } void CodegenLLVMVisitor::visit_if_statement(const ast::IfStatement& node) { // Get the current and the next blocks within the function. - llvm::BasicBlock* curr_block = ir_builder.GetInsertBlock(); + llvm::BasicBlock* curr_block = ir_builder.get_current_block(); llvm::BasicBlock* next = curr_block->getNextNode(); llvm::Function* func = curr_block->getParent(); @@ -808,14 +645,12 @@ void CodegenLLVMVisitor::visit_if_statement(const ast::IfStatement& node) { llvm::BasicBlock* merge_block = llvm::BasicBlock::Create(*context, /*Name=*/"", func, next); // Add condition to the current block. - node.get_condition()->accept(*this); - llvm::Value* cond = values.back(); - values.pop_back(); + llvm::Value* cond = accept_and_get(node.get_condition()); // Process the true block. - ir_builder.SetInsertPoint(true_block); + ir_builder.set_insertion_point(true_block); node.get_statement_block()->accept(*this); - ir_builder.CreateBr(merge_block); + ir_builder.create_br(merge_block); // Save the merge block and proceed with codegen for `else if` statements. llvm::BasicBlock* exit = merge_block; @@ -823,27 +658,25 @@ void CodegenLLVMVisitor::visit_if_statement(const ast::IfStatement& node) { // Link the current block to the true and else blocks. llvm::BasicBlock* else_block = llvm::BasicBlock::Create(*context, /*Name=*/"", func, merge_block); - ir_builder.SetInsertPoint(curr_block); - ir_builder.CreateCondBr(cond, true_block, else_block); + ir_builder.set_insertion_point(curr_block); + ir_builder.create_cond_br(cond, true_block, else_block); // Process else block. 
- ir_builder.SetInsertPoint(else_block); - else_if->get_condition()->accept(*this); - cond = values.back(); - values.pop_back(); + ir_builder.set_insertion_point(else_block); + cond = accept_and_get(else_if->get_condition()); // Reassign true and merge blocks respectively. Note that the new merge block has to be // connected to the old merge block (tmp). true_block = llvm::BasicBlock::Create(*context, /*Name=*/"", func, merge_block); llvm::BasicBlock* tmp = merge_block; merge_block = llvm::BasicBlock::Create(*context, /*Name=*/"", func, merge_block); - ir_builder.SetInsertPoint(merge_block); - ir_builder.CreateBr(tmp); + ir_builder.set_insertion_point(merge_block); + ir_builder.create_br(tmp); // Process true block. - ir_builder.SetInsertPoint(true_block); + ir_builder.set_insertion_point(true_block); else_if->get_statement_block()->accept(*this); - ir_builder.CreateBr(merge_block); + ir_builder.create_br(merge_block); curr_block = else_block; } @@ -852,25 +685,19 @@ void CodegenLLVMVisitor::visit_if_statement(const ast::IfStatement& node) { llvm::BasicBlock* else_block; if (elses) { else_block = llvm::BasicBlock::Create(*context, /*Name=*/"", func, merge_block); - ir_builder.SetInsertPoint(else_block); + ir_builder.set_insertion_point(else_block); elses->get_statement_block()->accept(*this); - ir_builder.CreateBr(merge_block); + ir_builder.create_br(merge_block); } else { else_block = merge_block; } - ir_builder.SetInsertPoint(curr_block); - ir_builder.CreateCondBr(cond, true_block, else_block); - ir_builder.SetInsertPoint(exit); + ir_builder.set_insertion_point(curr_block); + ir_builder.create_cond_br(cond, true_block, else_block); + ir_builder.set_insertion_point(exit); } void CodegenLLVMVisitor::visit_integer(const ast::Integer& node) { - if (is_kernel_code && vector_width > 1) { - values.push_back(get_constant_int_vector(node.get_value())); - return; - } - const auto& constant = llvm::ConstantInt::get(llvm::Type::getInt32Ty(*context), - node.get_value()); - values.push_back(constant); + ir_builder.create_i32_constant(node.get_value()); } void CodegenLLVMVisitor::visit_program(const ast::Program& node) { @@ -881,7 +708,11 @@ void CodegenLLVMVisitor::visit_program(const ast::Program& node) { CodegenLLVMHelperVisitor v{vector_width}; const auto& functions = v.get_codegen_functions(node); instance_var_helper = v.get_instance_var_helper(); - kernel_id = v.get_kernel_id(); + sym_tab = node.get_symbol_table(); + std::string kernel_id = v.get_kernel_id(); + + // Initialize the builder for this NMODL program. + ir_builder.initialize(*sym_tab, kernel_id); // Create compile unit if adding debug information to the module. if (add_debug_information) { @@ -891,12 +722,9 @@ void CodegenLLVMVisitor::visit_program(const ast::Program& node) { // For every function, generate its declaration. Thus, we can look up // `llvm::Function` in the symbol table in the module. for (const auto& func: functions) { - emit_procedure_or_function_declaration(*func); + create_function_declaration(*func); } - // Set the AST symbol table. - sym_tab = node.get_symbol_table(); - // Proceed with code generation. 
Right now, we do not do
    // node.visit_children(*this);
    // The reason is that the node may contain AST nodes for which the visitor functions have been
@@ -977,40 +805,18 @@ void CodegenLLVMVisitor::visit_procedure_block(const ast::ProcedureBlock& node)
 
 void CodegenLLVMVisitor::visit_unary_expression(const ast::UnaryExpression& node) {
     ast::UnaryOp op = node.get_op().get_value();
-    node.get_expression()->accept(*this);
-    llvm::Value* value = values.back();
-    values.pop_back();
-    if (op == ast::UOP_NEGATION) {
-        values.push_back(ir_builder.CreateFNeg(value));
-    } else if (op == ast::UOP_NOT) {
-        values.push_back(ir_builder.CreateNot(value));
-    } else {
-        throw std::runtime_error("Error: unsupported unary operator\n");
-    }
+    llvm::Value* value = accept_and_get(node.get_expression());
+    ir_builder.create_unary_op(value, op);
 }
 
 void CodegenLLVMVisitor::visit_var_name(const ast::VarName& node) {
-    llvm::Value* ptr = get_variable_ptr(node);
-
-    // Finally, load the variable from the pointer value unless it has already been loaded (e.g. via
-    // gather instruction).
-    llvm::Value* var = ptr->getType()->isPointerTy() ? ir_builder.CreateLoad(ptr) : ptr;
-
-    // If the value should not be vectorised, or it is already a vector, add it to the stack.
-    if (!is_kernel_code || vector_width <= 1 || var->getType()->isVectorTy()) {
-        values.push_back(var);
-        return;
-    }
-
-    // Otherwise, if we are generating vectorised inside the loop, replicate the value to form a
-    // vector of `vector_width`.
-    llvm::Value* vector_var = ir_builder.CreateVectorSplat(vector_width, var);
-    values.push_back(vector_var);
+    llvm::Value* value = read_variable(node);
+    ir_builder.maybe_replicate_value(value);
 }
 
 void CodegenLLVMVisitor::visit_while_statement(const ast::WhileStatement& node) {
     // Get the current and the next blocks within the function.
-    llvm::BasicBlock* curr_block = ir_builder.GetInsertBlock();
+    llvm::BasicBlock* curr_block = ir_builder.get_current_block();
     llvm::BasicBlock* next = curr_block->getNextNode();
     llvm::Function* func = curr_block->getParent();
 
@@ -1019,78 +825,18 @@ void CodegenLLVMVisitor::visit_while_statement(const ast::WhileStatement& node)
     llvm::BasicBlock* body = llvm::BasicBlock::Create(*context, /*Name=*/"", func, next);
     llvm::BasicBlock* exit = llvm::BasicBlock::Create(*context, /*Name=*/"", func, next);
 
-    ir_builder.CreateBr(header);
-    ir_builder.SetInsertPoint(header);
+    ir_builder.create_br_and_set_insertion_point(header);
+
     // Generate code for condition and create branch to the body block.
-    node.get_condition()->accept(*this);
-    llvm::Value* condition = values.back();
-    values.pop_back();
-    ir_builder.CreateCondBr(condition, body, exit);
+    llvm::Value* condition = accept_and_get(node.get_condition());
+    ir_builder.create_cond_br(condition, body, exit);
 
-    ir_builder.SetInsertPoint(body);
+    ir_builder.set_insertion_point(body);
     node.get_statement_block()->accept(*this);
-    ir_builder.CreateBr(header);
+    ir_builder.create_br(header);
 
-    ir_builder.SetInsertPoint(exit);
-}
-
-void CodegenLLVMVisitor::find_kernel_names(std::vector<std::string>& container) {
-    // By convention, only the kernel functions return void type.
-    const auto& functions = module->getFunctionList();
-    for (const auto& func: functions) {
-        if (func.getReturnType()->isVoidTy()) {
-            container.push_back(func.getName().str());
-        }
-    }
-}
-
-void CodegenLLVMVisitor::wrap_kernel_functions() {
-    // First, identify all kernels.
-    std::vector<std::string> kernel_names;
-    find_kernel_names(kernel_names);
-
-    for (const auto& kernel_name: kernel_names) {
-        // Get the kernel function and the instance struct type.
-        auto kernel = module->getFunction(kernel_name);
-        if (!kernel)
-            throw std::runtime_error("Kernel " + kernel_name + " is not found!");
-
-        if (std::distance(kernel->args().begin(), kernel->args().end()) != 1)
-            throw std::runtime_error("Kernel " + kernel_name + " must have a single argument!");
-
-        auto instance_struct_ptr_type = llvm::dyn_cast<llvm::PointerType>(
-            kernel->getArg(0)->getType());
-        if (!instance_struct_ptr_type)
-            throw std::runtime_error("Kernel " + kernel_name +
-                                     " does not have an instance struct pointer argument!");
-
-        // Create a wrapper void function that takes a void pointer as a single argument.
-        llvm::Type* i32_type = llvm::Type::getInt32Ty(*context);
-        llvm::Type* void_ptr_type = llvm::Type::getInt8PtrTy(*context);
-        llvm::Function* wrapper_func = llvm::Function::Create(
-            llvm::FunctionType::get(i32_type, {void_ptr_type}, /*isVarArg=*/false),
-            llvm::Function::ExternalLinkage,
-            "__" + kernel_name + "_wrapper",
-            *module);
-
-        // Optionally, add debug information for the wrapper function.
-        if (add_debug_information) {
-            debug_builder.add_function_debug_info(wrapper_func);
-        }
-
-        llvm::BasicBlock* body = llvm::BasicBlock::Create(*context, /*Name=*/"", wrapper_func);
-        ir_builder.SetInsertPoint(body);
-
-        // Proceed with bitcasting the void pointer to the struct pointer type, calling the kernel
-        // and adding a terminator.
-        llvm::Value* bitcasted = ir_builder.CreateBitCast(wrapper_func->getArg(0),
-                                                          instance_struct_ptr_type);
-        std::vector<llvm::Value*> args;
-        args.push_back(bitcasted);
-        ir_builder.CreateCall(kernel, args);
-        ir_builder.CreateRet(llvm::ConstantInt::get(i32_type, 0));
-    }
+    ir_builder.set_insertion_point(exit);
 }
 
 }  // namespace codegen
diff --git a/src/codegen/llvm/codegen_llvm_visitor.hpp b/src/codegen/llvm/codegen_llvm_visitor.hpp
index 450e1872a4..0ada7b8097 100644
--- a/src/codegen/llvm/codegen_llvm_visitor.hpp
+++ b/src/codegen/llvm/codegen_llvm_visitor.hpp
@@ -20,13 +20,13 @@
 
 #include "codegen/llvm/codegen_llvm_helper_visitor.hpp"
 #include "codegen/llvm/llvm_debug_builder.hpp"
+#include "codegen/llvm/llvm_ir_builder.hpp"
 #include "symtab/symbol_table.hpp"
 #include "utils/logger.hpp"
 #include "visitors/ast_visitor.hpp"
 
 #include "llvm/Analysis/TargetLibraryInfo.h"
 #include "llvm/IR/DIBuilder.h"
-#include "llvm/IR/IRBuilder.h"
 #include "llvm/IR/LLVMContext.h"
 #include "llvm/IR/LegacyPassManager.h"
 #include "llvm/IR/Module.h"
@@ -63,76 +63,50 @@ static const std::map<std::string, llvm::TargetLibraryInfoImpl::VectorLibrary> v
 * \brief %Visitor for transforming NMODL AST to LLVM IR
 */
class CodegenLLVMVisitor: public visitor::ConstAstVisitor {
-    // Name of mod file (without .mod suffix)
+    /// Name of mod file (without .mod suffix).
    std::string mod_filename;
 
-    // Output directory for code generation
+    /// Output directory for code generation.
    std::string output_dir;
 
  private:
-    InstanceVarHelper instance_var_helper;
-
+    /// Underlying LLVM context.
    std::unique_ptr<llvm::LLVMContext> context = std::make_unique<llvm::LLVMContext>();
 
+    /// Underlying LLVM module.
    std::unique_ptr<llvm::Module> module = std::make_unique<llvm::Module>(mod_filename, *context);
 
-    // LLVM IR builder.
-    llvm::IRBuilder<> ir_builder;
+    /// LLVM IR builder.
+    IRBuilder ir_builder;
 
-    // Debug information builder.
+    /// Debug information builder.
    DebugBuilder debug_builder;
 
-    // Add debug information to the module.
+    /// Add debug information to the module.
    bool add_debug_information;
 
-    // Pass manager for optimisation passes that are used for target code generation.
-    llvm::legacy::FunctionPassManager codegen_pm;
-
-    // Vector library used for maths functions.
-    llvm::TargetLibraryInfoImpl::VectorLibrary vector_library;
-
-    // Pass manager for optimisation passes that are run on IR and are not related to target.
-    llvm::legacy::FunctionPassManager opt_pm;
-
-    // Stack to hold visited values
-    std::vector<llvm::Value*> values;
-
-    // Pointer to the current function.
-    llvm::Function* current_func = nullptr;
-
-    // Pointer to AST symbol table.
+    /// Pointer to AST symbol table.
    symtab::SymbolTable* sym_tab;
 
-    // Run optimisation passes if true.
-    bool opt_passes;
+    /// Instance variable helper.
+    InstanceVarHelper instance_var_helper;
 
-    // Use 32-bit floating-point type if true. Otherwise, use deafult 64-bit.
-    bool use_single_precision;
+    /// Run optimisation passes if true.
+    bool opt_passes;
 
-    // Explicit vectorisation width.
-    int vector_width;
+    /// Pass manager for optimisation passes that are run on IR and are not related to target.
+    llvm::legacy::FunctionPassManager opt_pm;
 
-    // The name of induction variable used in the kernel functions.
-    std::string kernel_id;
+    /// Pass manager for optimisation passes that are used for target code generation.
+    llvm::legacy::FunctionPassManager codegen_pm;
 
-    // A flag to indicate that the code is generated for the kernel.
-    bool is_kernel_code = false;
+    /// Vector library used for maths functions.
+    llvm::TargetLibraryInfoImpl::VectorLibrary vector_library;
 
-    /**
-     *\brief Run LLVM optimisation passes on generated IR
-     *
-     * LLVM provides number of optimisation passes that can be run on the generated IR.
-     * Here we run common optimisation LLVM passes that benefits code optimisation.
-     */
-    void run_ir_opt_passes();
+    /// Explicit vectorisation width.
+    int vector_width;
 
  public:
-    /**
-     * \brief Constructs the LLVM code generator visitor
-     *
-     * This constructor instantiates an NMODL LLVM code generator. This is
-     * just template to work with initial implementation.
-     */
    CodegenLLVMVisitor(const std::string& mod_filename,
                       const std::string& output_dir,
                       bool opt_passes,
@@ -143,202 +117,44 @@ class CodegenLLVMVisitor: public visitor::ConstAstVisitor {
        : mod_filename(mod_filename)
        , output_dir(output_dir)
        , opt_passes(opt_passes)
-        , use_single_precision(use_single_precision)
        , vector_width(vector_width)
        , vector_library(veclib_map.at(vec_lib))
        , add_debug_information(add_debug_information)
-        , ir_builder(*context)
+        , ir_builder(*context, use_single_precision, vector_width)
        , debug_builder(*module)
        , codegen_pm(module.get())
        , opt_pm(module.get()) {}
 
+    /// Dumps the generated LLVM IR module to string.
+    std::string dump_module() const {
+        std::string str;
+        llvm::raw_string_ostream os(str);
+        os << *module;
+        os.flush();
+        return str;
+    }
 
-    /**
-     * Generates LLVM code for the given IndexedName
-     * \param node IndexedName NMODL AST node
-     * \return LLVM code generated for this AST node
-     */
-    llvm::Value* codegen_indexed_name(const ast::IndexedName& node);
-
-    /**
-     * Generates LLVM code for the given Instance variable
-     * \param node CodegenInstanceVar NMODL AST node
-     * \return LLVM code generated for this AST node
-     */
-    llvm::Value* codegen_instance_var(const ast::CodegenInstanceVar& node);
-
-    /**
-     * Returns GEP instruction to 1D array
-     * \param name 1D array name
-     * \param index element index
-     * \return GEP instruction value
-     */
-    llvm::Value* create_gep(const std::string& name, llvm::Value* index);
-
-    /**
-     * Returns array index from given IndexedName
-     * \param node IndexedName representing array
-     * \return array index
-     */
-    llvm::Value* get_array_index(const ast::IndexedName& node);
-
-    /**
-     * Returns array length from given IndexedName
-     * \param node IndexedName representing array
-     * \return array length
-     */
-    int get_array_length(const ast::IndexedName& node);
-
-    /**
-     * Returns LLVM type for the given CodegenVarType node
-     * \param node CodegenVarType
-     * \return LLVM type
-     */
-    llvm::Type* get_codegen_var_type(const ast::CodegenVarType& node);
+    /// Fills the container with the names of kernel functions from the MOD file.
+    void find_kernel_names(std::vector<std::string>& container);
 
-    /**
-     * Returns LLVM vector with `vector_width` int values.
-     * \param int value to replicate
-     * \return LLVM value
-     */
-    llvm::Value* get_constant_int_vector(int value);
-
-    /**
-     * Returns LLVM vector with `vector_width` double values.
-     * \param string a double value to replicate
-     * \return LLVM value
-     */
-    llvm::Value* get_constant_fp_vector(const std::string& value);
-
-    /**
-     * Returns 64-bit or 32-bit LLVM floating type
-     * \return \c LLVM floating point type according to `use_single_precision` flag
-     */
-    llvm::Type* get_default_fp_type();
-
-    /**
-     * Returns pointer to 64-bit or 32-bit LLVM floating type
-     * \return \c LLVM pointer to floating point type according to `use_single_precision` flag
-     */
-    llvm::Type* get_default_fp_ptr_type();
-
-    /**
-     * Returns a pointer to LLVM struct type
-     * \return LLVM pointer type
-     */
-    llvm::Type* get_instance_struct_type();
+    /// Returns underlying module.
+    std::unique_ptr<llvm::Module> get_module() {
+        return std::move(module);
+    }
 
-    /**
-     * Returns a LLVM value corresponding to the VarName node
-     * \return LLVM value
-     */
-    llvm::Value* get_variable_ptr(const ast::VarName& node);
-
-    /**
-     * Returns shared_ptr to generated ast::InstanceStruct
-     * \return std::shared_ptr<ast::InstanceStruct>
-     */
-    std::shared_ptr<ast::InstanceStruct> get_instance_struct_ptr();
-
-    /**
-     * Create a function call to an external method
-     * \param name external method name
-     * \param arguments expressions passed as arguments to the given external method
-     */
-    void create_external_method_call(const std::string& name,
-                                     const ast::ExpressionVector& arguments);
-
-    /**
-     * Create a function call to NMODL function or procedure in the same mod file
-     * \param func LLVM function corresponding ti this call
-     * \param name function name
-     * \param arguments expressions passed as arguments to the function call
-     */
-    void create_function_call(llvm::Function* func,
-                              const std::string& name,
-                              const ast::ExpressionVector& arguments);
-    /**
-     * Create a function call to printf function
-     * \param arguments expressions passed as arguments to the printf call
-     */
-    void create_printf_call(const ast::ExpressionVector& arguments);
+    /// Returns shared_ptr to generated ast::InstanceStruct.
+    std::shared_ptr<ast::InstanceStruct> get_instance_struct_ptr() {
+        return instance_var_helper.instance;
+    }
 
-    /**
-     * Emit function or procedure declaration in LLVM given the node
-     *
-     * \param node the AST node representing the function or procedure in NMODL
-     */
-    void emit_procedure_or_function_declaration(const ast::CodegenFunction& node);
-
-    /**
-     * Return InstanceVarHelper
-     * \return InstanceVarHelper
-     */
+    /// Returns InstanceVarHelper for the given MOD file.
    InstanceVarHelper get_instance_var_helper() {
        return instance_var_helper;
    }
 
-    /**
-     * Return module pointer
-     * \return LLVM IR module pointer
-     */
-    std::unique_ptr<llvm::Module> get_module() {
-        return std::move(module);
-    }
-
-    /**
-     * Lookup the given name in the current function's symbol table
-     * \return LLVM value
-     */
-    llvm::Value* lookup(const std::string& name);
-
-    /**
-     * Fills values vector with processed NMODL function call arguments
-     * \param arguments expression vector
-     * \param arg_values vector of LLVM IR values to fill
-     */
-    void pack_function_call_arguments(const ast::ExpressionVector& arguments,
-                                      std::vector<llvm::Value*>& arg_values);
-
-    /**
-     * Visit nmodl arithmetic binary operator
-     * \param lhs LLVM value of evaluated lhs expression
-     * \param rhs LLVM value of evaluated rhs expression
-     * \param op the AST binary operator (ADD, DIV, MUL, SUB)
-     * \return LLVM IR value result
-     */
-    llvm::Value* visit_arithmetic_bin_op(llvm::Value* lhs, llvm::Value* rhs, unsigned op);
-
-    /**
-     * Visit nmodl assignment operator (ASSIGN)
-     * \param node the AST node representing the binary expression in NMODL
-     * \param rhs LLVM value of evaluated rhs expression
-     */
-    void visit_assign_op(const ast::BinaryExpression& node, llvm::Value* rhs);
-
-    /**
-     * Visit nmodl logical binary operator
-     * \param lhs LLVM value of evaluated lhs expression
-     * \param rhs LLVM value of evaluated rhs expression
-     * \param op the AST binary operator (AND, OR)
-     * \return LLVM IR value result
-     */
-    llvm::Value* visit_logical_bin_op(llvm::Value* lhs, llvm::Value* rhs, unsigned op);
-
-    /**
-     * Visit nmodl comparison binary operator
-     * \param lhs LLVM value of evaluated lhs expression
-     * \param rhs LLVM value of evaluated rhs expression
-     * \param op the AST binary operator (EXACT_EQUAL, GREATER, GREATER_EQUAL, LESS, 
LESS_EQUAL,
-     * NOT_EQUAL) \return LLVM IR value result
-     */
-    llvm::Value* visit_comparison_bin_op(llvm::Value* lhs, llvm::Value* rhs, unsigned op);
-
-
-    // Visitors
+    // Visitors.
    void visit_binary_expression(const ast::BinaryExpression& node) override;
    void visit_boolean(const ast::Boolean& node) override;
-    void visit_statement_block(const ast::StatementBlock& node) override;
    void visit_codegen_for_statement(const ast::CodegenForStatement& node) override;
    void visit_codegen_function(const ast::CodegenFunction& node) override;
    void visit_codegen_return_statement(const ast::CodegenReturnStatement& node) override;
@@ -350,31 +166,65 @@ class CodegenLLVMVisitor: public visitor::ConstAstVisitor {
    void visit_integer(const ast::Integer& node) override;
    void visit_procedure_block(const ast::ProcedureBlock& node) override;
    void visit_program(const ast::Program& node) override;
+    void visit_statement_block(const ast::StatementBlock& node) override;
    void visit_unary_expression(const ast::UnaryExpression& node) override;
    void visit_var_name(const ast::VarName& node) override;
    void visit_while_statement(const ast::WhileStatement& node) override;
 
-    /**
-     * Dumps the generated LLVM IR module to string.
-     */
-    std::string dump_module() const {
-        std::string str;
-        llvm::raw_string_ostream os(str);
-        os << *module;
-        os.flush();
-        return str;
-    }
+    /// Wraps all kernel function calls into wrapper functions that use `void*` to pass the data to
+    /// the kernel.
+    void wrap_kernel_functions();
 
-    /**
-     * Fills the container with the names of kernel functions from the MOD file.
-     */
-    void find_kernel_names(std::vector<std::string>& container);
+  private:
+    /// Accepts the given AST node and returns the processed value.
+    llvm::Value* accept_and_get(const std::shared_ptr<ast::Node>& node);
 
-    /**
-     * Wraps all kernel function calls into wrapper functions that use void* to pass the data to the
-     * kernel.
-     */
-    void wrap_kernel_functions();
+    /// Creates a call to an external function (e.g. pow, exp, etc.)
+    void create_external_function_call(const std::string& name,
+                                       const ast::ExpressionVector& arguments);
+
+    /// Creates a call to NMODL function or procedure in the same MOD file.
+    void create_function_call(llvm::Function* func,
+                              const std::string& name,
+                              const ast::ExpressionVector& arguments);
+
+    /// Fills values vector with processed NMODL function call arguments.
+    void create_function_call_arguments(const ast::ExpressionVector& arguments,
+                                        ValueVector& arg_values);
+
+    /// Creates the function declaration for the given AST node.
+    void create_function_declaration(const ast::CodegenFunction& node);
+
+    /// Creates a call to `printf` function.
+    void create_printf_call(const ast::ExpressionVector& arguments);
+
+    /// Returns LLVM type for the given CodegenVarType AST node.
+    llvm::Type* get_codegen_var_type(const ast::CodegenVarType& node);
+
+    /// Returns the index value from the IndexedName AST node.
+    llvm::Value* get_index(const ast::IndexedName& node);
+
+    /// Returns an instance struct type.
+    llvm::Type* get_instance_struct_type();
+
+    /// Returns the number of elements in the array specified by the IndexedName AST node.
+    int get_num_elements(const ast::IndexedName& node);
+
+    /// If the value to store is specified, writes it to the instance. Otherwise, returns the
+    /// instance variable.
+    llvm::Value* read_from_or_write_to_instance(const ast::CodegenInstanceVar& node,
+                                                llvm::Value* maybe_value_to_store = nullptr);
+
+    /// Reads the given variable and returns the processed value.
+    llvm::Value* read_variable(const ast::VarName& node);
+
+    /// Run multiple LLVM optimisation passes on generated IR.
+    /// TODO: this can be moved to a dedicated file or deprecated.
+    void run_ir_opt_passes();
+
+    /// Writes the value to the given variable.
+    void write_to_variable(const ast::VarName& node, llvm::Value* value);
};
 
/** \} */  // end of llvm_backends
diff --git a/src/codegen/llvm/llvm_ir_builder.cpp b/src/codegen/llvm/llvm_ir_builder.cpp
new file mode 100644
index 0000000000..2773e6929b
--- /dev/null
+++ b/src/codegen/llvm/llvm_ir_builder.cpp
@@ -0,0 +1,427 @@
+/*************************************************************************
+ * Copyright (C) 2018-2020 Blue Brain Project
+ *
+ * This file is part of NMODL distributed under the terms of the GNU
+ * Lesser General Public License. See top-level LICENSE file for details.
+ *************************************************************************/
+
+#include "codegen/llvm/llvm_ir_builder.hpp"
+#include "ast/all.hpp"
+
+#include "llvm/ADT/StringSwitch.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/ValueSymbolTable.h"
+
+namespace nmodl {
+namespace codegen {
+
+
+/****************************************************************************************/
+/*                                 LLVM type utilities                                  */
+/****************************************************************************************/
+
+llvm::Type* IRBuilder::get_boolean_type() {
+    return llvm::Type::getInt1Ty(builder.getContext());
+}
+
+llvm::Type* IRBuilder::get_i8_ptr_type() {
+    return llvm::Type::getInt8PtrTy(builder.getContext());
+}
+
+llvm::Type* IRBuilder::get_i32_type() {
+    return llvm::Type::getInt32Ty(builder.getContext());
+}
+
+llvm::Type* IRBuilder::get_i32_ptr_type() {
+    return llvm::Type::getInt32PtrTy(builder.getContext());
+}
+
+llvm::Type* IRBuilder::get_i64_type() {
+    return llvm::Type::getInt64Ty(builder.getContext());
+}
+
+llvm::Type* IRBuilder::get_fp_type() {
+    if (fp_precision == single_precision)
+        return llvm::Type::getFloatTy(builder.getContext());
+    return llvm::Type::getDoubleTy(builder.getContext());
+}
+
+llvm::Type* IRBuilder::get_fp_ptr_type() {
+    if (fp_precision == single_precision)
+        return llvm::Type::getFloatPtrTy(builder.getContext());
+    return llvm::Type::getDoublePtrTy(builder.getContext());
+}
+
+llvm::Type* IRBuilder::get_void_type() {
+    return llvm::Type::getVoidTy(builder.getContext());
+}
+
+llvm::Type* IRBuilder::get_struct_ptr_type(const std::string& struct_type_name,
+                                           TypeVector& member_types) {
+    llvm::StructType* llvm_struct_type = llvm::StructType::create(builder.getContext(),
+                                                                  struct_type_name);
+    llvm_struct_type->setBody(member_types);
+    return llvm::PointerType::get(llvm_struct_type, /*AddressSpace=*/0);
+}
+
+
+/****************************************************************************************/
+/*                                LLVM value utilities                                  */
+/****************************************************************************************/
+
+llvm::Value* IRBuilder::lookup_value(const std::string& value_name) {
+    auto value = current_function->getValueSymbolTable()->lookup(value_name);
+    if (!value)
+        throw std::runtime_error("Error: variable " + value_name + " is not in the scope\n");
+    return value;
+}
+
+llvm::Value* IRBuilder::pop_last_value() {
+    // Check if the stack is empty.
+    if (value_stack.empty())
+        throw std::runtime_error("Error: popping a value from an empty stack\n");
+
+    // Return the last added value and delete it from the stack.
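+    // (Visitors push at most one result onto this stack and accept_and_get() pops it straight
+    // away, so it effectively acts as a small expression-evaluation stack.)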
+    llvm::Value* last = value_stack.back();
+    value_stack.pop_back();
+    return last;
+}
+
+/****************************************************************************************/
+/*                               LLVM constants utilities                               */
+/****************************************************************************************/
+
+void IRBuilder::create_boolean_constant(int value) {
+    value_stack.push_back(get_vector_constant<llvm::ConstantInt>(get_boolean_type(), value));
+}
+
+void IRBuilder::create_fp_constant(const std::string& value) {
+    if (instruction_width > 1 && vectorize) {
+        value_stack.push_back(get_vector_constant<llvm::ConstantFP>(get_fp_type(), value));
+    } else {
+        value_stack.push_back(get_scalar_constant<llvm::ConstantFP>(get_fp_type(), value));
+    }
+}
+
+llvm::Value* IRBuilder::create_global_string(const ast::String& node) {
+    return builder.CreateGlobalStringPtr(node.get_value());
+}
+
+void IRBuilder::create_i32_constant(int value) {
+    if (instruction_width > 1 && vectorize) {
+        value_stack.push_back(get_vector_constant<llvm::ConstantInt>(get_i32_type(), value));
+    } else {
+        value_stack.push_back(get_scalar_constant<llvm::ConstantInt>(get_i32_type(), value));
+    }
+}
+
+template <typename C, typename V>
+llvm::Value* IRBuilder::get_scalar_constant(llvm::Type* type, V value) {
+    return C::get(type, value);
+}
+
+template <typename C, typename V>
+llvm::Value* IRBuilder::get_vector_constant(llvm::Type* type, V value) {
+    ConstantVector constants;
+    for (unsigned i = 0; i < instruction_width; ++i) {
+        const auto& element = C::get(type, value);
+        constants.push_back(element);
+    }
+    return llvm::ConstantVector::get(constants);
+}
+
+/****************************************************************************************/
+/*                               LLVM function utilities                                */
+/****************************************************************************************/
+
+void IRBuilder::allocate_function_arguments(llvm::Function* function,
+                                            const ast::CodegenVarWithTypeVector& nmodl_arguments) {
+    unsigned i = 0;
+    for (auto& arg: function->args()) {
+        std::string arg_name = nmodl_arguments[i++].get()->get_node_name();
+        llvm::Type* arg_type = arg.getType();
+        llvm::Value* alloca = builder.CreateAlloca(arg_type, /*ArraySize=*/nullptr, arg_name);
+        arg.setName(arg_name);
+        builder.CreateStore(&arg, alloca);
+    }
+}
+
+std::string IRBuilder::get_current_function_name() {
+    return current_function->getName().str();
+}
+
+void IRBuilder::create_function_call(llvm::Function* callee,
+                                     ValueVector& arguments,
+                                     bool use_result) {
+    llvm::Value* call_instruction = builder.CreateCall(callee, arguments);
+    if (use_result)
+        value_stack.push_back(call_instruction);
+}
+
+void IRBuilder::create_intrinsic(const std::string& name,
+                                 ValueVector& argument_values,
+                                 TypeVector& argument_types) {
+    unsigned intrinsic_id = llvm::StringSwitch<unsigned>(name)
+                                .Case("exp", llvm::Intrinsic::exp)
+                                .Case("pow", llvm::Intrinsic::pow)
+                                .Default(llvm::Intrinsic::not_intrinsic);
+    if (intrinsic_id) {
+        llvm::Value* intrinsic =
+            builder.CreateIntrinsic(intrinsic_id, argument_types, argument_values);
+        value_stack.push_back(intrinsic);
+    } else {
+        throw std::runtime_error("Error: calls to " + name + " are not valid or not supported\n");
+    }
+}
+
+/****************************************************************************************/
+/*                             LLVM instruction utilities                               */
+/****************************************************************************************/
+
+void IRBuilder::create_array_alloca(const std::string& name,
+                                    llvm::Type* element_type,
+                                    int num_elements) {
+    llvm::Type* array_type = llvm::ArrayType::get(element_type, num_elements);
+    builder.CreateAlloca(array_type,
/*ArraySize=*/nullptr, name); +} + +void IRBuilder::create_binary_op(llvm::Value* lhs, llvm::Value* rhs, ast::BinaryOp op) { + // Check that both lhs and rhs have the same types. + if (lhs->getType() != rhs->getType()) + throw std::runtime_error( + "Error: lhs and rhs of the binary operator have different types\n"); + + llvm::Value* result; + switch (op) { +#define DISPATCH(binary_op, fp_instruction, integer_instruction) \ + case binary_op: \ + if (lhs->getType()->isIntOrIntVectorTy()) \ + result = integer_instruction(lhs, rhs); \ + else \ + result = fp_instruction(lhs, rhs); \ + break; + + // Arithmetic instructions. + DISPATCH(ast::BinaryOp::BOP_ADDITION, builder.CreateFAdd, builder.CreateAdd); + DISPATCH(ast::BinaryOp::BOP_DIVISION, builder.CreateFDiv, builder.CreateSDiv); + DISPATCH(ast::BinaryOp::BOP_MULTIPLICATION, builder.CreateFMul, builder.CreateMul); + DISPATCH(ast::BinaryOp::BOP_SUBTRACTION, builder.CreateFSub, builder.CreateSub); + + // Comparison instructions. + DISPATCH(ast::BinaryOp::BOP_EXACT_EQUAL, builder.CreateFCmpOEQ, builder.CreateICmpEQ); + DISPATCH(ast::BinaryOp::BOP_GREATER, builder.CreateFCmpOGT, builder.CreateICmpSGT); + DISPATCH(ast::BinaryOp::BOP_GREATER_EQUAL, builder.CreateFCmpOGE, builder.CreateICmpSGE); + DISPATCH(ast::BinaryOp::BOP_LESS, builder.CreateFCmpOLT, builder.CreateICmpSLT); + DISPATCH(ast::BinaryOp::BOP_LESS_EQUAL, builder.CreateFCmpOLE, builder.CreateICmpSLE); + DISPATCH(ast::BinaryOp::BOP_NOT_EQUAL, builder.CreateFCmpONE, builder.CreateICmpNE); + +#undef DISPATCH + + // Logical instructions. + case ast::BinaryOp::BOP_AND: + result = builder.CreateAnd(lhs, rhs); + break; + case ast::BinaryOp::BOP_OR: + result = builder.CreateOr(lhs, rhs); + break; + + default: + throw std::runtime_error("Error: unsupported binary operator\n"); + } + value_stack.push_back(result); +} + +llvm::Value* IRBuilder::create_bitcast(llvm::Value* value, llvm::Type* dst_type) { + return builder.CreateBitCast(value, dst_type); +} + +llvm::Value* IRBuilder::create_inbounds_gep(const std::string& var_name, llvm::Value* index) { + llvm::Value* variable_ptr = lookup_value(var_name); + + // Since we index through the pointer, we need an extra 0 index in the indices list for GEP. + ValueVector indices{llvm::ConstantInt::get(get_i64_type(), 0), index}; + return builder.CreateInBoundsGEP(variable_ptr, indices); +} + +llvm::Value* IRBuilder::create_inbounds_gep(llvm::Value* variable, llvm::Value* index) { + return builder.CreateInBoundsGEP(variable, {index}); +} + +llvm::Value* IRBuilder::create_index(llvm::Value* value) { + // Check if index is a double. While it is possible to use casting from double to integer + // values, we choose not to support these cases. + llvm::Type* value_type = value->getType(); + if (!value_type->isIntOrIntVectorTy()) + throw std::runtime_error("Error: only integer indexing is supported\n"); + + // Conventionally, in LLVM array indices are 64 bit. 
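+    // For example, a 32-bit integer index is sign-extended before being used in a GEP:
+    //     %index.i64 = sext i32 %index to i64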
+ llvm::Type* i64_type = get_i64_type(); + if (auto index_type = llvm::dyn_cast(value_type)) { + if (index_type->getBitWidth() == i64_type->getIntegerBitWidth()) + return value; + return builder.CreateSExtOrTrunc(value, i64_type); + } + + const auto& vector_type = llvm::cast(value_type); + const auto& element_type = llvm::cast(vector_type->getElementType()); + if (element_type->getBitWidth() == i64_type->getIntegerBitWidth()) + return value; + return builder.CreateSExtOrTrunc(value, + llvm::FixedVectorType::get(i64_type, instruction_width)); +} + +llvm::Value* IRBuilder::create_load(const std::string& name) { + llvm::Value* ptr = lookup_value(name); + llvm::Type* loaded_type = ptr->getType()->getPointerElementType(); + return builder.CreateLoad(loaded_type, ptr); +} + +llvm::Value* IRBuilder::create_load(llvm::Value* ptr) { + llvm::Type* loaded_type = ptr->getType()->getPointerElementType(); + return builder.CreateLoad(loaded_type, ptr); +} + +llvm::Value* IRBuilder::create_load_from_array(const std::string& name, llvm::Value* index) { + llvm::Value* element_ptr = create_inbounds_gep(name, index); + return create_load(element_ptr); +} + +void IRBuilder::create_store(const std::string& name, llvm::Value* value) { + llvm::Value* ptr = lookup_value(name); + builder.CreateStore(value, ptr); +} + +void IRBuilder::create_store(llvm::Value* ptr, llvm::Value* value) { + builder.CreateStore(value, ptr); +} + +void IRBuilder::create_store_to_array(const std::string& name, + llvm::Value* index, + llvm::Value* value) { + llvm::Value* element_ptr = create_inbounds_gep(name, index); + create_store(element_ptr, value); +} + +void IRBuilder::create_return(llvm::Value* return_value) { + if (return_value) + builder.CreateRet(return_value); + else + builder.CreateRetVoid(); +} + +void IRBuilder::create_scalar_or_vector_alloca(const std::string& name, + llvm::Type* element_or_scalar_type) { + // Even if generating vectorised code, some variables still need to be scalar. Particularly, the + // induction variable "id" and remainder loop variables (that start with "epilogue" prefix). + llvm::Type* type; + if (instruction_width > 1 && vectorize && name != kernel_id && name.rfind("epilogue", 0)) { + type = llvm::FixedVectorType::get(element_or_scalar_type, instruction_width); + } else { + type = element_or_scalar_type; + } + builder.CreateAlloca(type, /*ArraySize=*/nullptr, name); +} + +void IRBuilder::create_unary_op(llvm::Value* value, ast::UnaryOp op) { + if (op == ast::UOP_NEGATION) { + value_stack.push_back(builder.CreateFNeg(value)); + } else if (op == ast::UOP_NOT) { + value_stack.push_back(builder.CreateNot(value)); + } else { + throw std::runtime_error("Error: unsupported unary operator\n"); + } +} + +llvm::Value* IRBuilder::get_struct_member_ptr(llvm::Value* struct_variable, int member_index) { + ValueVector indices; + indices.push_back(llvm::ConstantInt::get(get_i32_type(), 0)); + indices.push_back(llvm::ConstantInt::get(get_i32_type(), member_index)); + return builder.CreateInBoundsGEP(struct_variable, indices); +} + +llvm::Value* IRBuilder::load_to_or_store_from_array(const std::string& id_name, + llvm::Value* id_value, + llvm::Value* array, + llvm::Value* maybe_value_to_store) { + // First, calculate the address of the element in the array. + llvm::Value* element_ptr = create_inbounds_gep(array, id_value); + + // If the vector code is generated, we need to distinguish between two cases. If the array is + // indexed indirectly (i.e. 
not by an induction variable `kernel_id`), create a gather + // instruction. + if (id_name != kernel_id && vectorize && instruction_width > 1) + return builder.CreateMaskedGather(element_ptr, llvm::Align()); + + llvm::Value* ptr; + if (vectorize && instruction_width > 1) { + // If direct indexing is used during the vectorization, we simply bitcast the scalar pointer + // to a vector pointer + llvm::Type* vector_type = llvm::PointerType::get( + llvm::FixedVectorType::get(element_ptr->getType()->getPointerElementType(), + instruction_width), + /*AddressSpace=*/0); + ptr = builder.CreateBitCast(element_ptr, vector_type); + } else { + // Otherwise, scalar code is generated and hence return the element pointer. + ptr = element_ptr; + } + + if (maybe_value_to_store) { + create_store(ptr, maybe_value_to_store); + return nullptr; + } else { + return create_load(ptr); + } +} + +void IRBuilder::maybe_replicate_value(llvm::Value* value) { + // If the value should not be vectorised, or it is already a vector, add it to the stack. + if (!vectorize || instruction_width == 1 || value->getType()->isVectorTy()) { + value_stack.push_back(value); + } else { + // Otherwise, we generate vectorized code inside the loop, so replicate the value to form a + // vector. + llvm::Value* vector_value = builder.CreateVectorSplat(instruction_width, value); + value_stack.push_back(vector_value); + } +} + + +/****************************************************************************************/ +/* LLVM block utilities */ +/****************************************************************************************/ + +llvm::BasicBlock* IRBuilder::create_block_and_set_insertion_point(llvm::Function* function, + llvm::BasicBlock* insert_before, + std::string name) { + llvm::BasicBlock* block = + llvm::BasicBlock::Create(builder.getContext(), name, function, insert_before); + builder.SetInsertPoint(block); + return block; +} + +void IRBuilder::create_br(llvm::BasicBlock* block) { + builder.CreateBr(block); +} + +void IRBuilder::create_br_and_set_insertion_point(llvm::BasicBlock* block) { + builder.CreateBr(block); + builder.SetInsertPoint(block); +} + +void IRBuilder::create_cond_br(llvm::Value* condition, + llvm::BasicBlock* true_block, + llvm::BasicBlock* false_block) { + builder.CreateCondBr(condition, true_block, false_block); +} + +llvm::BasicBlock* IRBuilder::get_current_block() { + return builder.GetInsertBlock(); +} + +void IRBuilder::set_insertion_point(llvm::BasicBlock* block) { + builder.SetInsertPoint(block); +} + +} // namespace codegen +} // namespace nmodl diff --git a/src/codegen/llvm/llvm_ir_builder.hpp b/src/codegen/llvm/llvm_ir_builder.hpp new file mode 100644 index 0000000000..b1b23ff0cf --- /dev/null +++ b/src/codegen/llvm/llvm_ir_builder.hpp @@ -0,0 +1,272 @@ +/************************************************************************* + * Copyright (C) 2018-2020 Blue Brain Project + * + * This file is part of NMODL distributed under the terms of the GNU + * Lesser General Public License. See top-level LICENSE file for details. + *************************************************************************/ + +#pragma once + +#include + +#include "codegen/llvm/codegen_llvm_helper_visitor.hpp" +#include "symtab/symbol_table.hpp" + +#include "llvm/IR/IRBuilder.h" +#include "llvm/IR/LLVMContext.h" + +namespace nmodl { +namespace codegen { + +/// Floating point bit widths. +static constexpr const unsigned single_precision = 32; +static constexpr const unsigned double_precision = 64; + +/// Some typedefs. 
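+/// (ConstantVector, TypeVector and ValueVector are vectors of llvm::Constant*,
+/// llvm::Type* and llvm::Value* pointers, used when building vector constants,
+/// struct bodies and call argument lists.)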
+using ConstantVector = std::vector; +using TypeVector = std::vector; +using ValueVector = std::vector; + +/** + * \class IRBuilder + * \brief A helper class to generate LLVM IR for NMODL AST. + */ +class IRBuilder { + private: + /// Underlying LLVM IR builder. + llvm::IRBuilder<> builder; + + /// Stack to hold visited and processed values. + ValueVector value_stack; + + /// Pointer to the current function for which the code is generated. + llvm::Function* current_function; + + /// Symbol table of the NMODL AST. + symtab::SymbolTable* symbol_table; + + /// Flag to indicate that the generated IR should be vectorized. + bool vectorize; + + /// Precision of the floating-point numbers (32 or 64 bit). + unsigned fp_precision; + + /// If 1, indicates that the scalar code is generated. Otherwise, the current vectorization + /// width. + unsigned instruction_width; + + /// The vector width used for the vectorized code. + unsigned vector_width; + + /// The name of induction variable used in kernel loops. + std::string kernel_id; + + public: + IRBuilder(llvm::LLVMContext& context, + bool use_single_precision = false, + unsigned vector_width = 1) + : builder(context) + , symbol_table(nullptr) + , current_function(nullptr) + , vectorize(false) + , fp_precision(use_single_precision ? single_precision : double_precision) + , vector_width(vector_width) + , instruction_width(vector_width) + , kernel_id("") {} + + /// Initializes the builder with the symbol table and the kernel induction variable id. + void initialize(symtab::SymbolTable& symbol_table, std::string& kernel_id) { + this->symbol_table = &symbol_table; + this->kernel_id = kernel_id; + } + + /// Explicitly sets the builder to produce scalar code (even during vectorization). + void generate_scalar_code() { + instruction_width = 1; + } + + /// Explicitly sets the builder to produce vectorized code. + void generate_vectorized_code() { + instruction_width = vector_width; + } + + /// Turns on vectorization mode. + void start_vectorization() { + vectorize = true; + } + + /// Turns off vectorization mode. + void stop_vectorization() { + vectorize = false; + } + + /// Sets the current function for which LLVM IR is generated. + void set_function(llvm::Function* function) { + current_function = function; + } + + /// Clears the stack of the values and unsets the current function. + void clear_function() { + value_stack.clear(); + current_function = nullptr; + } + + /// Generates LLVM IR to allocate the arguments of the function on the stack. + void allocate_function_arguments(llvm::Function* function, + const ast::CodegenVarWithTypeVector& nmodl_arguments); + + /// Generates IR for allocating an array. + void create_array_alloca(const std::string& name, llvm::Type* element_type, int num_elements); + + /// Generates LLVM IR for the given binary operator. + void create_binary_op(llvm::Value* lhs, llvm::Value* rhs, ast::BinaryOp op); + + /// Generates LLVM IR for the bitcast instruction. + llvm::Value* create_bitcast(llvm::Value* value, llvm::Type* dst_type); + + /// Create a basic block and set the builder's insertion point to it. + llvm::BasicBlock* create_block_and_set_insertion_point( + llvm::Function* function, + llvm::BasicBlock* insert_before = nullptr, + std::string name = ""); + + /// Generates LLVM IR for unconditional branch. + void create_br(llvm::BasicBlock* block); + + /// Generates LLVM IR for unconditional branch and sets the insertion point to this block. 
+ void create_br_and_set_insertion_point(llvm::BasicBlock* block); + + /// Generates LLVM IR for conditional branch. + void create_cond_br(llvm::Value* condition, + llvm::BasicBlock* true_block, + llvm::BasicBlock* false_block); + + /// Generates LLVM IR for the boolean constant. + void create_boolean_constant(int value); + + /// Generates LLVM IR for the floating-point constant. + void create_fp_constant(const std::string& value); + + /// Generates LLVM IR for a call to the function. + void create_function_call(llvm::Function* callee, + ValueVector& arguments, + bool use_result = true); + + /// Generates LLVM IR for the string value. + llvm::Value* create_global_string(const ast::String& node); + + /// Generates LLVM IR to transform the value into an index by possibly sign-extending it. + llvm::Value* create_index(llvm::Value* value); + + /// Generates an intrinsic that corresponds to the given name. + void create_intrinsic(const std::string& name, + ValueVector& argument_values, + TypeVector& argument_types); + + /// Generates LLVM IR for the integer constant. + void create_i32_constant(int value); + + /// Generates LLVM IR to load the value specified by its name and returns it. + llvm::Value* create_load(const std::string& name); + + /// Generates LLVM IR to load the value from the pointer and returns it. + llvm::Value* create_load(llvm::Value* ptr); + + /// Generates LLVM IR to load the element at the specified index from the given array name and + /// returns it. + llvm::Value* create_load_from_array(const std::string& name, llvm::Value* index); + + /// Generates LLVM IR to store the value to the location specified by the name. + void create_store(const std::string& name, llvm::Value* value); + + /// Generates LLVM IR to store the value to the location specified by the pointer. + void create_store(llvm::Value* ptr, llvm::Value* value); + + /// Generates LLVM IR to store the value to the array element, where array is specified by the + /// name. + void create_store_to_array(const std::string& name, llvm::Value* index, llvm::Value* value); + + /// Generates LLVM IR return instructions. + void create_return(llvm::Value* return_value = nullptr); + + /// Generates IR for allocating a scalar or vector variable. + void create_scalar_or_vector_alloca(const std::string& name, + llvm::Type* element_or_scalar_type); + + /// Generates LLVM IR for the given unary operator. + void create_unary_op(llvm::Value* value, ast::UnaryOp op); + + /// Creates a boolean (1-bit integer) type. + llvm::Type* get_boolean_type(); + + /// Returns current basic block. + llvm::BasicBlock* get_current_block(); + + /// Returns the name of the function for which LLVM IR is generated. + std::string get_current_function_name(); + + /// Creates a pointer to 8-bit integer type. + llvm::Type* get_i8_ptr_type(); + + /// Creates a 32-bit integer type. + llvm::Type* get_i32_type(); + + /// Creates a pointer to 32-bit integer type. + llvm::Type* get_i32_ptr_type(); + + /// Creates a 64-bit integer type. + llvm::Type* get_i64_type(); + + /// Creates a floating-point type. + llvm::Type* get_fp_type(); + + /// Creates a pointer to floating-point type. + llvm::Type* get_fp_ptr_type(); + + /// Creates a void type. + llvm::Type* get_void_type(); + + /// Generates LLVM IR to get the address of the struct's member at given index. Returns the + /// calculated value. + llvm::Value* get_struct_member_ptr(llvm::Value* struct_variable, int member_index); + + /// Creates a pointer to struct type with the given name and given members. 
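+    /// (In practice this builds the pointer type of the mechanism instance struct
+    /// that every compute kernel receives as its single argument.)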
+    llvm::Type* get_struct_ptr_type(const std::string& struct_type_name, TypeVector& member_types);
+
+    /// Generates IR that loads the elements of the array even during vectorization. If the value
+    /// is specified, then it is stored to the array at the given index.
+    llvm::Value* load_to_or_store_from_array(const std::string& id_name,
+                                             llvm::Value* id_value,
+                                             llvm::Value* array,
+                                             llvm::Value* maybe_value_to_store = nullptr);
+
+    /// Looks up the value by its name in the current function's symbol table.
+    llvm::Value* lookup_value(const std::string& value_name);
+
+    /// Generates IR to replicate the value if vectorizing the code.
+    void maybe_replicate_value(llvm::Value* value);
+
+    /// Sets builder's insertion point to the given block.
+    void set_insertion_point(llvm::BasicBlock* block);
+
+    /// Pops the last visited value from the value stack.
+    llvm::Value* pop_last_value();
+
+  private:
+    /// Generates an inbounds GEP instruction for the given name and returns calculated address.
+    llvm::Value* create_inbounds_gep(const std::string& variable_name, llvm::Value* index);
+
+    /// Generates an inbounds GEP instruction for the given value and returns calculated address.
+    llvm::Value* create_inbounds_gep(llvm::Value* variable, llvm::Value* index);
+
+    /// Returns a scalar constant of the provided type.
+    template <typename C, typename V>
+    llvm::Value* get_scalar_constant(llvm::Type* type, V value);
+
+    /// Returns a vector constant of the provided type.
+    template <typename C, typename V>
+    llvm::Value* get_vector_constant(llvm::Type* type, V value);
+};
+
+}  // namespace codegen
+}  // namespace nmodl

From 1e4809e7c68e212801281d6e5ea29bcbcef1f4e0 Mon Sep 17 00:00:00 2001
From: George Mitenkov
Date: Thu, 13 May 2021 05:46:12 -0700
Subject: [PATCH 162/331] Fixed initialisation of `CodegenAtomicStatement`
 (#642)

* Fixed CodegenAtomicStatement initialisation
* Removed unused variable and changed comment
---
 .../llvm/codegen_llvm_helper_visitor.cpp      | 21 ++++++++++++-------
 1 file changed, 14 insertions(+), 7 deletions(-)

diff --git a/src/codegen/llvm/codegen_llvm_helper_visitor.cpp b/src/codegen/llvm/codegen_llvm_helper_visitor.cpp
index c9968df8ee..10aee780ce 100644
--- a/src/codegen/llvm/codegen_llvm_helper_visitor.cpp
+++ b/src/codegen/llvm/codegen_llvm_helper_visitor.cpp
@@ -256,10 +256,18 @@ static void append_statements_from_block(ast::StatementVector& statements,
     }
 }
 
-static std::shared_ptr<ast::CodegenAtomicStatement> create_atomic_statement(std::string& lhs_str,
-                                                                            std::string& op_str,
-                                                                            std::string& rhs_str) {
-    auto lhs = std::make_shared<ast::VarName>(new ast::String(lhs_str));
+static std::shared_ptr<ast::CodegenAtomicStatement> create_atomic_statement(
+    std::string& ion_varname,
+    std::string& index_varname,
+    std::string& op_str,
+    std::string& rhs_str) {
+    // create lhs expression
+    auto varname = new ast::Name(new ast::String(ion_varname));
+    auto index = new ast::Name(new ast::String(index_varname));
+    auto lhs = std::make_shared<ast::VarName>(new ast::IndexedName(varname, index),
+                                              /*at=*/nullptr,
+                                              /*index=*/nullptr);
+
     auto op = ast::BinaryOperator(ast::string_to_binaryop(op_str));
     auto rhs = create_expression(rhs_str);
     return std::make_shared<ast::CodegenAtomicStatement>(lhs, op, rhs);
@@ -362,12 +370,11 @@ void CodegenLLVMHelperVisitor::ion_write_statements(BlockType type,
         std::string index_varname = "{}_id"_format(ion_varname);
         // load index
         std::string index_statement = "{} = {}_index[id]"_format(index_varname, ion_varname);
-        // ion variable to write (with index)
-        std::string ion_to_write = "{}[{}]"_format(ion_varname, index_varname);
         // push index definition, index statement and actual write statement
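         // (e.g. for a sodium current this produces "ion_ina_id = ion_ina_index[id]"
         // followed by the atomic write "ion_ina[ion_ina_id] op rhs")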
         int_variables.push_back(index_varname);
         index_statements.push_back(visitor::create_statement(index_statement));
-        body_statements.push_back(create_atomic_statement(ion_to_write, op, rhs));
+        // pass ion variable to write and its index
+        body_statements.push_back(create_atomic_statement(ion_varname, index_varname, op, rhs));
     };
 
     /// iterate over all ions and create write ion statements for given block type

From 07fe46836b94041ae4df5d3132e0f8e90e1eec86 Mon Sep 17 00:00:00 2001
From: Pramod Kumbhar
Date: Thu, 13 May 2021 15:25:24 +0200
Subject: [PATCH 163/331] Fix instance struct data generation for
 testing/benchmarking (#641)

* Instance data structure initialization had the following bug:
  - instance struct has int member variables which act as offsets to
    other vectors (e.g. node_index, na_ion_index)
  - these variables were initialized from 1 to N, where N was always
    incremented without considering the upper bound of the offset.

* With this fix:
  - index / integer variables are always initialized from 0 to N-1.
  - variables are initialised with 1e-5 precision so that we have
    reasonably bigger values.
  - tests are updated to check offsets from 0 to N-1.
---
 test/unit/codegen/codegen_data_helper.cpp     |  9 +++++++-
 test/unit/codegen/codegen_data_helper.hpp     | 23 +++++++++----------
 .../codegen/codegen_llvm_instance_struct.cpp  |  9 +++++---
 3 files changed, 25 insertions(+), 16 deletions(-)

diff --git a/test/unit/codegen/codegen_data_helper.cpp b/test/unit/codegen/codegen_data_helper.cpp
index 4bf94f583d..a0ee6ec957 100644
--- a/test/unit/codegen/codegen_data_helper.cpp
+++ b/test/unit/codegen/codegen_data_helper.cpp
@@ -115,7 +115,14 @@ CodegenInstanceData CodegenDataHelper::create_data(size_t num_elements, size_t s
         // allocate memory and setup a pointer
         void* member;
         posix_memalign(&member, NBYTE_ALIGNMENT, member_size * num_elements);
-        initialize_variable(var, member, variable_index, num_elements);
+
+        // integer values are often offsets so they must start from
+        // 0 to num_elements-1 to avoid out of bound accesses.
+        int initial_value = variable_index;
+        if (type == ast::AstNodeType::INTEGER) {
+            initial_value = 0;
+        }
+        initialize_variable(var, member, initial_value, num_elements);
         data.num_bytes += member_size * num_elements;
 
         // copy address at specific location in the struct
diff --git a/test/unit/codegen/codegen_data_helper.hpp b/test/unit/codegen/codegen_data_helper.hpp
index ef8e869366..76c4f422d9 100644
--- a/test/unit/codegen/codegen_data_helper.hpp
+++ b/test/unit/codegen/codegen_data_helper.hpp
@@ -57,11 +57,12 @@ struct CodegenInstanceData {
 /**
  * Generate vector of dummy data according to the template type specified
  *
- * For double type: generate vector starting from (initial_value + 1e-15)
- * with increments of 1e-15
- * For float type: generate vector starting from (initial_value + 1e-6)
- * with increments of 1e-6
- * For int type: generate vector starting from (initial_value + 1) with
+ * For double or float type: generate vector starting from `initial_value`
+ * with an increment of 1e-5. The increment can be any other
+ * value but 1e-5 is chosen because when we benchmark with
+ * a million elements then the values are in the range of
+ * For int type: generate vector starting from initial_value with an * increments of 1 * * \param inital_value Base value for initializing the data @@ -71,16 +72,14 @@ struct CodegenInstanceData { template std::vector generate_dummy_data(size_t initial_value, size_t num_elements) { std::vector data(num_elements); - T precision; - if (std::is_same::value) { - precision = 1e-15; - } else if (std::is_same::value) { - precision = 1e-6; + T increment; + if (std::is_same::value) { + increment = 1; } else { - precision = 1; + increment = 1e-5; } for (size_t i = 0; i < num_elements; i++) { - data[i] = initial_value + precision * (i + 1); + data[i] = initial_value + increment * i; } return data; } diff --git a/test/unit/codegen/codegen_llvm_instance_struct.cpp b/test/unit/codegen/codegen_llvm_instance_struct.cpp index 52b9bb9868..e77b6844ae 100644 --- a/test/unit/codegen/codegen_llvm_instance_struct.cpp +++ b/test/unit/codegen/codegen_llvm_instance_struct.cpp @@ -132,8 +132,12 @@ SCENARIO("Instance Struct creation", "[visitor][llvm][instance_struct]") { generate_dummy_data(ena_index, num_elements))); REQUIRE(compare(instance_data.members[ion_ena_index], generate_dummy_data(ion_ena_index, num_elements))); + // index variables are offsets, they start from 0 + REQUIRE(compare(instance_data.members[ion_ena_index_index], + generate_dummy_data(0, num_elements))); REQUIRE(compare(instance_data.members[node_index_index], - generate_dummy_data(node_index_index, num_elements))); + generate_dummy_data(0, num_elements))); + REQUIRE(*static_cast(instance_data.members[t_index]) == default_nthread_t_value); REQUIRE(*static_cast(instance_data.members[node_count_index]) == num_elements); @@ -164,8 +168,7 @@ SCENARIO("Instance Struct creation", "[visitor][llvm][instance_struct]") { REQUIRE(compare(instance->ena, generate_dummy_data(ena_index, num_elements))); REQUIRE(compare(instance->ion_ena, generate_dummy_data(ion_ena_index, num_elements))); - REQUIRE(compare(instance->node_index, - generate_dummy_data(node_index_index, num_elements))); + REQUIRE(compare(instance->node_index, generate_dummy_data(0, num_elements))); REQUIRE(instance->t == default_nthread_t_value); REQUIRE(instance->celsius == default_celsius_value); REQUIRE(instance->secondorder == default_second_order_value); From f7017a7ac06087bf78d0316904d1ca55b58065ed Mon Sep 17 00:00:00 2001 From: George Mitenkov Date: Thu, 13 May 2021 13:14:13 -0700 Subject: [PATCH 164/331] Basic scatter support (#643) Added basic support to transform indirect writes into `llvm.masked.scatter` intrinsic. Currently, the scatter functionality is limited to non-atomic writes and assignment (e.g. `+=` operator is not yet supported). Hence, a warning is logged to the console indicating all limitations. Corresponding IR and execution tests were also added. fixes #539 --- src/codegen/llvm/codegen_llvm_visitor.cpp | 42 +++++++++-- src/codegen/llvm/codegen_llvm_visitor.hpp | 1 + src/codegen/llvm/llvm_ir_builder.cpp | 7 +- test/unit/codegen/codegen_llvm_execution.cpp | 76 ++++++++++++++++++++ test/unit/codegen/codegen_llvm_ir.cpp | 47 ++++++++++++ 5 files changed, 165 insertions(+), 8 deletions(-) diff --git a/src/codegen/llvm/codegen_llvm_visitor.cpp b/src/codegen/llvm/codegen_llvm_visitor.cpp index a86a5cd8b5..39594169f4 100644 --- a/src/codegen/llvm/codegen_llvm_visitor.cpp +++ b/src/codegen/llvm/codegen_llvm_visitor.cpp @@ -37,9 +37,10 @@ static constexpr const char instance_struct_type_name[] = "__instance_var__type" /// A utility to check for supported Statement AST nodes. 
 static bool is_supported_statement(const ast::Statement& statement) {
-    return statement.is_codegen_var_list_statement() || statement.is_expression_statement() ||
-           statement.is_codegen_for_statement() || statement.is_codegen_return_statement() ||
-           statement.is_if_statement() || statement.is_while_statement();
+    return statement.is_codegen_atomic_statement() || statement.is_codegen_for_statement() ||
+           statement.is_if_statement() || statement.is_codegen_return_statement() ||
+           statement.is_codegen_var_list_statement() || statement.is_expression_statement() ||
+           statement.is_while_statement();
 }
 
 /// A utility to check that the kernel body can be vectorised.
@@ -162,10 +163,12 @@ void CodegenLLVMVisitor::create_printf_call(const ast::ExpressionVector& argumen
 }
 
 void CodegenLLVMVisitor::find_kernel_names(std::vector<std::string>& container) {
-    // By convention, only kernel functions have a return type of void.
+    // By convention, only kernel functions have a return type of void and a single argument. The
+    // number of arguments check is needed to avoid LLVM void intrinsics being considered as
+    // kernels.
     const auto& functions = module->getFunctionList();
     for (const auto& func: functions) {
-        if (func.getReturnType()->isVoidTy()) {
+        if (func.getReturnType()->isVoidTy() && llvm::hasSingleElement(func.args())) {
             container.push_back(func.getName().str());
         }
     }
@@ -366,7 +369,7 @@ void CodegenLLVMVisitor::wrap_kernel_functions() {
         if (!kernel)
             throw std::runtime_error("Error: kernel " + kernel_name + " is not found\n");
 
-        if (std::distance(kernel->args().begin(), kernel->args().end()) != 1)
+        if (!llvm::hasSingleElement(kernel->args()))
             throw std::runtime_error("Error: kernel " + kernel_name +
                                      " must have a single argument\n");
 
@@ -443,6 +446,33 @@ void CodegenLLVMVisitor::visit_boolean(const ast::Boolean& node) {
     ir_builder.create_boolean_constant(node.get_value());
 }
 
+/**
+ * Currently, this function is very similar to visiting the binary operator. However, the
+ * difference here is that the writes to the LHS variable must be atomic. This has a particular
+ * use case in synapse kernels. For simplicity, we choose not to support atomic writes at this
+ * stage and emit a warning.
+ *
+ * \todo support this properly.
+ */
+void CodegenLLVMVisitor::visit_codegen_atomic_statement(const ast::CodegenAtomicStatement& node) {
+    if (vector_width > 1)
+        logger->warn("Atomic operations are not supported");
+
+    // Support only assignment for now.
+    llvm::Value* rhs = accept_and_get(node.get_rhs());
+    if (node.get_atomic_op().get_value() != ast::BinaryOp::BOP_ASSIGN)
+        throw std::runtime_error(
+            "Error: only assignment is supported for CodegenAtomicStatement\n");
+    const auto& var = dynamic_cast<ast::VarName*>(node.get_lhs().get());
+    if (!var)
+        throw std::runtime_error("Error: only 'VarName' assignment is supported\n");
+
+    // Process the assignment as if it was non-atomic.
+    if (vector_width > 1)
+        logger->warn("Treating write as non-atomic");
+    write_to_variable(*var, rhs);
+}
+
 // Generating FOR loop in LLVM IR creates the following structure:
 //
 //   +---------------------------+
diff --git a/src/codegen/llvm/codegen_llvm_visitor.hpp b/src/codegen/llvm/codegen_llvm_visitor.hpp
index 0ada7b8097..14a608d3ca 100644
--- a/src/codegen/llvm/codegen_llvm_visitor.hpp
+++ b/src/codegen/llvm/codegen_llvm_visitor.hpp
@@ -155,6 +155,7 @@ class CodegenLLVMVisitor: public visitor::ConstAstVisitor {
     // Visitors.
void visit_binary_expression(const ast::BinaryExpression& node) override; void visit_boolean(const ast::Boolean& node) override; + void visit_codegen_atomic_statement(const ast::CodegenAtomicStatement& node) override; void visit_codegen_for_statement(const ast::CodegenForStatement& node) override; void visit_codegen_function(const ast::CodegenFunction& node) override; void visit_codegen_return_statement(const ast::CodegenReturnStatement& node) override; diff --git a/src/codegen/llvm/llvm_ir_builder.cpp b/src/codegen/llvm/llvm_ir_builder.cpp index 2773e6929b..04e36e50cd 100644 --- a/src/codegen/llvm/llvm_ir_builder.cpp +++ b/src/codegen/llvm/llvm_ir_builder.cpp @@ -349,8 +349,11 @@ llvm::Value* IRBuilder::load_to_or_store_from_array(const std::string& id_name, // If the vector code is generated, we need to distinguish between two cases. If the array is // indexed indirectly (i.e. not by an induction variable `kernel_id`), create a gather // instruction. - if (id_name != kernel_id && vectorize && instruction_width > 1) - return builder.CreateMaskedGather(element_ptr, llvm::Align()); + if (id_name != kernel_id && vectorize && instruction_width > 1) { + return maybe_value_to_store + ? builder.CreateMaskedScatter(maybe_value_to_store, element_ptr, llvm::Align()) + : builder.CreateMaskedGather(element_ptr, llvm::Align()); + } llvm::Value* ptr; if (vectorize && instruction_width > 1) { diff --git a/test/unit/codegen/codegen_llvm_execution.cpp b/test/unit/codegen/codegen_llvm_execution.cpp index cec4e5017b..296417c5f3 100644 --- a/test/unit/codegen/codegen_llvm_execution.cpp +++ b/test/unit/codegen/codegen_llvm_execution.cpp @@ -432,3 +432,79 @@ SCENARIO("Simple vectorised kernel", "[llvm][runner]") { } } } + +//============================================================================= +// Vectorised kernel with ion writes. +//============================================================================= + +SCENARIO("Vectorised kernel with scatter instruction", "[llvm][runner]") { + GIVEN("Simple MOD file with ion writes") { + std::string nmodl_text = R"( + NEURON { + SUFFIX test + USEION ca WRITE cai + } + + BREAKPOINT { + SOLVE states METHOD cnexp + } + + DERIVATIVE states { + : increment cai to test scatter + cai = cai + 1 + } + )"; + + + NmodlDriver driver; + const auto& ast = driver.parse_string(nmodl_text); + + // Run passes on the AST to generate LLVM. + SymtabVisitor().visit_program(*ast); + NeuronSolveVisitor().visit_program(*ast); + SolveBlockVisitor().visit_program(*ast); + codegen::CodegenLLVMVisitor llvm_visitor(/*mod_filename=*/"unknown", + /*output_dir=*/".", + /*opt_passes=*/false, + /*use_single_precision=*/false, + /*vector_width=*/2); + llvm_visitor.visit_program(*ast); + llvm_visitor.wrap_kernel_functions(); + + // Create the instance struct data. + int num_elements = 5; + const auto& generated_instance_struct = llvm_visitor.get_instance_struct_ptr(); + auto codegen_data = codegen::CodegenDataHelper(ast, generated_instance_struct); + auto instance_data = codegen_data.create_data(num_elements, /*seed=*/1); + + // Fill the instance struct data with some values. 
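+        // (ion_cai_index below is a permutation of 0..4, so every vector lane
+        // reads and writes a distinct ion_cai element via the gather/scatter pair.)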
+ std::vector cai = {1.0, 2.0, 3.0, 4.0, 5.0}; + std::vector ion_cai = {1.0, 2.0, 3.0, 4.0, 5.0}; + std::vector ion_cai_index = {4, 2, 3, 0, 1}; + + InstanceTestInfo instance_info{&instance_data, + llvm_visitor.get_instance_var_helper(), + num_elements}; + initialise_instance_variable(instance_info, cai, "cai"); + initialise_instance_variable(instance_info, ion_cai, "ion_cai"); + initialise_instance_variable(instance_info, ion_cai_index, "ion_cai_index"); + + // Set up the JIT runner. + std::unique_ptr module = llvm_visitor.get_module(); + TestRunner runner(std::move(module)); + runner.initialize_driver(); + + THEN("Ion values in struct have been updated correctly") { + runner.run_with_argument("__nrn_state_test_wrapper", + instance_data.base_ptr); + // cai[id] = ion_cai[ion_cai_index[id]] + // cai[id] += 1 + std::vector cai_expected = {6.0, 4.0, 5.0, 2.0, 3.0}; + REQUIRE(check_instance_variable(instance_info, cai_expected, "cai")); + + // ion_cai[ion_cai_index[id]] = cai[id] + std::vector ion_cai_expected = {2.0, 3.0, 4.0, 5.0, 6.0}; + REQUIRE(check_instance_variable(instance_info, ion_cai_expected, "ion_cai")); + } + } +} diff --git a/test/unit/codegen/codegen_llvm_ir.cpp b/test/unit/codegen/codegen_llvm_ir.cpp index 93fd269b8e..11f2faf99b 100644 --- a/test/unit/codegen/codegen_llvm_ir.cpp +++ b/test/unit/codegen/codegen_llvm_ir.cpp @@ -956,6 +956,53 @@ SCENARIO("Vectorised simple kernel", "[visitor][llvm]") { } } +//============================================================================= +// Scatter for vectorised kernel +//============================================================================= + +SCENARIO("Vectorised simple kernel with ion writes", "[visitor][llvm]") { + GIVEN("An indirect indexing of ca ion") { + std::string nmodl_text = R"( + NEURON { + SUFFIX hh + USEION ca WRITE cai + } + + BREAKPOINT { + SOLVE states METHOD cnexp + } + + DERIVATIVE states {} + )"; + + THEN("a scatter instructions is created") { + std::string module_string = run_llvm_visitor(nmodl_text, + /*opt=*/false, + /*use_single_precision=*/false, + /*vector_width=*/4); + std::smatch m; + + // Check scatter intrinsic is correctly declared. + std::regex declaration( + R"(declare void @llvm\.masked\.scatter\.v4f64\.v4p0f64\(<4 x double>, <4 x double\*>, i32 immarg, <4 x i1>\))"); + REQUIRE(std::regex_search(module_string, m, declaration)); + + // Check that the indices vector is created correctly and extended to i64. + std::regex index_load(R"(load <4 x i32>, <4 x i32>\* %ion_cai_id)"); + std::regex sext(R"(sext <4 x i32> %.* to <4 x i64>)"); + REQUIRE(std::regex_search(module_string, m, index_load)); + REQUIRE(std::regex_search(module_string, m, sext)); + + // Check that store to `ion_cai` is performed via scatter instruction. 
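+            // (the mask operand of the scatter is all-true here, since the
+            // kernel body is branch-free)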
+ // ion_cai[ion_cai_id] = cai[id] + std::regex scatter( + "call void @llvm\\.masked\\.scatter\\.v4f64\\.v4p0f64\\(<4 x double> %.*, <4 x " + "double\\*> %.*, i32 1, <4 x i1> \\)"); + REQUIRE(std::regex_search(module_string, m, scatter)); + } + } +} + //============================================================================= // Derivative block : test optimization //============================================================================= From 38c61bf782a5af6b53958e3db98ca5dd0516d34c Mon Sep 17 00:00:00 2001 From: Pramod Kumbhar Date: Sun, 16 May 2021 18:07:01 +0200 Subject: [PATCH 165/331] Benchmarking code re-organisation and minor improvements (#647) * Move benchmark + JIT related code from src/codegen/llvm to test/benchmark * Common execution of CodegenLLVMVisitor for llvm --ir and benchmark option. With this, ast transformed for LLVM code generation is dumped to file. * Previous object file is removed (if exist) so that output file name is same / deterministic * Benchmark output is always printed to stdout via common logger object * Remove unnecessary LLVMBuildInfo struct --- CMakeLists.txt | 1 + src/CMakeLists.txt | 3 +- src/codegen/llvm/CMakeLists.txt | 10 +-- src/codegen/llvm/codegen_llvm_visitor.hpp | 5 ++ src/codegen/llvm/main.cpp | 2 +- src/main.cpp | 39 +++++----- test/benchmark/CMakeLists.txt | 17 +++++ .../llvm => test/benchmark}/jit_driver.cpp | 8 ++ .../llvm => test/benchmark}/jit_driver.hpp | 0 .../benchmark}/llvm_benchmark.cpp | 73 +++++-------------- .../benchmark}/llvm_benchmark.hpp | 33 +++------ test/unit/CMakeLists.txt | 5 +- test/unit/codegen/codegen_llvm_execution.cpp | 2 +- 13 files changed, 85 insertions(+), 113 deletions(-) create mode 100644 test/benchmark/CMakeLists.txt rename {src/codegen/llvm => test/benchmark}/jit_driver.cpp (97%) rename {src/codegen/llvm => test/benchmark}/jit_driver.hpp (100%) rename {src/codegen/llvm => test/benchmark}/llvm_benchmark.cpp (61%) rename {src/codegen/llvm => test/benchmark}/llvm_benchmark.hpp (76%) diff --git a/CMakeLists.txt b/CMakeLists.txt index fb1baf78b6..a321d5e558 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -199,6 +199,7 @@ set(MEMORYCHECK_COMMAND_OPTIONS # do not enable tests if nmodl is used as submodule if(NOT NMODL_AS_SUBPROJECT) include(CTest) + add_subdirectory(test/benchmark) add_subdirectory(test/unit) add_subdirectory(test/integration) endif() diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index d27a039de7..7544da699d 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -34,9 +34,8 @@ target_link_libraries( util lexer ${NMODL_WRAPPER_LIBS}) - if(NMODL_ENABLE_LLVM) - target_link_libraries(nmodl llvm_codegen llvm_benchmark ${LLVM_LIBS_TO_LINK}) + target_link_libraries(nmodl llvm_codegen llvm_benchmark benchmark_data ${LLVM_LIBS_TO_LINK}) endif() # ============================================================================= diff --git a/src/codegen/llvm/CMakeLists.txt b/src/codegen/llvm/CMakeLists.txt index 5ebf9c7acd..b927475f15 100644 --- a/src/codegen/llvm/CMakeLists.txt +++ b/src/codegen/llvm/CMakeLists.txt @@ -6,10 +6,6 @@ set(LLVM_CODEGEN_SOURCE_FILES ${CMAKE_CURRENT_SOURCE_DIR}/codegen_llvm_visitor.hpp ${CMAKE_CURRENT_SOURCE_DIR}/codegen_llvm_helper_visitor.cpp ${CMAKE_CURRENT_SOURCE_DIR}/codegen_llvm_helper_visitor.hpp - ${CMAKE_CURRENT_SOURCE_DIR}/jit_driver.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/jit_driver.hpp - ${CMAKE_CURRENT_SOURCE_DIR}/llvm_benchmark.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/llvm_benchmark.hpp ${CMAKE_CURRENT_SOURCE_DIR}/llvm_debug_builder.cpp 
${CMAKE_CURRENT_SOURCE_DIR}/llvm_debug_builder.hpp ${CMAKE_CURRENT_SOURCE_DIR}/llvm_ir_builder.cpp @@ -24,10 +20,6 @@ add_library(runner_obj OBJECT ${LLVM_CODEGEN_SOURCE_FILES}) add_dependencies(runner_obj lexer_obj) set_property(TARGET runner_obj PROPERTY POSITION_INDEPENDENT_CODE ON) -if(NMODL_ENABLE_JIT_EVENT_LISTENERS) - target_compile_definitions(runner_obj PUBLIC NMODL_HAVE_JIT_EVENT_LISTENERS) -endif() - add_library(llvm_codegen STATIC $) add_dependencies(llvm_codegen lexer util visitor) @@ -36,9 +28,9 @@ if(NOT NMODL_AS_SUBPROJECT) target_link_libraries( nmodl_llvm_runner + llvm_benchmark llvm_codegen codegen - llvm_benchmark visitor symtab lexer diff --git a/src/codegen/llvm/codegen_llvm_visitor.hpp b/src/codegen/llvm/codegen_llvm_visitor.hpp index 14a608d3ca..990485d8e2 100644 --- a/src/codegen/llvm/codegen_llvm_visitor.hpp +++ b/src/codegen/llvm/codegen_llvm_visitor.hpp @@ -152,6 +152,11 @@ class CodegenLLVMVisitor: public visitor::ConstAstVisitor { return instance_var_helper; } + /// Returns vector width + int get_vector_width() const { + return vector_width; + } + // Visitors. void visit_binary_expression(const ast::BinaryExpression& node) override; void visit_boolean(const ast::Boolean& node) override; diff --git a/src/codegen/llvm/main.cpp b/src/codegen/llvm/main.cpp index b700f5ad59..2f4e1f653d 100644 --- a/src/codegen/llvm/main.cpp +++ b/src/codegen/llvm/main.cpp @@ -9,8 +9,8 @@ #include "ast/program.hpp" #include "codegen/llvm/codegen_llvm_visitor.hpp" -#include "jit_driver.hpp" #include "parser/nmodl_driver.hpp" +#include "test/benchmark/jit_driver.hpp" #include "utils/logger.hpp" #include "visitors/symtab_visitor.hpp" diff --git a/src/main.cpp b/src/main.cpp index e71325e057..77067b0ef4 100644 --- a/src/main.cpp +++ b/src/main.cpp @@ -20,7 +20,7 @@ #ifdef NMODL_LLVM_BACKEND #include "codegen/llvm/codegen_llvm_visitor.hpp" -#include "codegen/llvm/llvm_benchmark.hpp" +#include "test/benchmark/llvm_benchmark.hpp" #endif #include "config/config.h" @@ -328,7 +328,7 @@ int main(int argc, const char* argv[]) { "Disable debug information ({})"_format(disable_debug_information))->ignore_case(); llvm_opt->add_flag("--opt", llvm_ir_opt_passes, - "Run LLVM optimisation passes ({})"_format(llvm_ir_opt_passes))->ignore_case(); + "Run few common LLVM IR optimisation passes ({})"_format(llvm_ir_opt_passes))->ignore_case(); llvm_opt->add_flag("--single-precision", llvm_float_type, "Use single precision floating-point types ({})"_format(llvm_float_type))->ignore_case(); @@ -667,26 +667,7 @@ int main(int argc, const char* argv[]) { } #ifdef NMODL_LLVM_BACKEND - - if (run_llvm_benchmark) { - logger->info("Running LLVM benchmark"); - benchmark::LLVMBuildInfo info{llvm_vec_width, - llvm_ir_opt_passes, - llvm_float_type, - vector_library}; - benchmark::LLVMBenchmark benchmark(modfile, - output_dir, - shared_lib_paths, - info, - num_experiments, - instance_size, - backend, - llvm_opt_level_ir, - llvm_opt_level_codegen); - benchmark.run(ast); - } - - else if (llvm_ir) { + if (llvm_ir || run_llvm_benchmark) { logger->info("Running LLVM backend code generator"); CodegenLLVMVisitor visitor(modfile, output_dir, @@ -698,6 +679,20 @@ int main(int argc, const char* argv[]) { visitor.visit_program(*ast); ast_to_nmodl(*ast, filepath("llvm", "mod")); ast_to_json(*ast, filepath("llvm", "json")); + + if (run_llvm_benchmark) { + logger->info("Running LLVM benchmark"); + benchmark::LLVMBenchmark benchmark(visitor, + modfile, + output_dir, + shared_lib_paths, + num_experiments, + instance_size, + backend, + 
llvm_opt_level_ir, + llvm_opt_level_codegen); + benchmark.run(ast); + } } #endif } diff --git a/test/benchmark/CMakeLists.txt b/test/benchmark/CMakeLists.txt new file mode 100644 index 0000000000..4441d53251 --- /dev/null +++ b/test/benchmark/CMakeLists.txt @@ -0,0 +1,17 @@ +# ============================================================================= +# llvm benchmark sources +# ============================================================================= +set(LLVM_BENCHMARK_SOURCE_FILES + ${CMAKE_CURRENT_SOURCE_DIR}/jit_driver.cpp ${CMAKE_CURRENT_SOURCE_DIR}/jit_driver.hpp + ${CMAKE_CURRENT_SOURCE_DIR}/llvm_benchmark.cpp ${CMAKE_CURRENT_SOURCE_DIR}/llvm_benchmark.hpp) + +# ============================================================================= +# LLVM benchmark library +# ============================================================================= +include_directories(${LLVM_INCLUDE_DIRS}) +add_library(llvm_benchmark STATIC ${LLVM_BENCHMARK_SOURCE_FILES}) +add_dependencies(llvm_benchmark lexer util visitor) + +if(NMODL_ENABLE_JIT_EVENT_LISTENERS) + target_compile_definitions(llvm_benchmark PUBLIC NMODL_HAVE_JIT_EVENT_LISTENERS) +endif() diff --git a/src/codegen/llvm/jit_driver.cpp b/test/benchmark/jit_driver.cpp similarity index 97% rename from src/codegen/llvm/jit_driver.cpp rename to test/benchmark/jit_driver.cpp index 2a6842d0fb..a2d8df63f4 100644 --- a/src/codegen/llvm/jit_driver.cpp +++ b/test/benchmark/jit_driver.cpp @@ -7,6 +7,7 @@ #include "jit_driver.hpp" #include "codegen/llvm/codegen_llvm_visitor.hpp" +#include "utils/common_utils.hpp" #include "llvm/Analysis/TargetTransformInfo.h" #include "llvm/ExecutionEngine/JITEventListener.h" @@ -247,6 +248,13 @@ void JITDriver::init(std::string features, // Optionally, dump the binary to the object file. if (benchmark_info) { + std::string object_file = benchmark_info->filename + ".o"; + if (utils::file_exists(object_file)) { + int status = remove(object_file.c_str()); + if (status) { + throw std::runtime_error("Can not remove object file " + object_file); + } + } jit->getObjTransformLayer().setTransform( llvm::orc::DumpObjects(benchmark_info->output_dir, benchmark_info->filename)); } diff --git a/src/codegen/llvm/jit_driver.hpp b/test/benchmark/jit_driver.hpp similarity index 100% rename from src/codegen/llvm/jit_driver.hpp rename to test/benchmark/jit_driver.hpp diff --git a/src/codegen/llvm/llvm_benchmark.cpp b/test/benchmark/llvm_benchmark.cpp similarity index 61% rename from src/codegen/llvm/llvm_benchmark.cpp rename to test/benchmark/llvm_benchmark.cpp index adbe653f1e..f6811fd664 100644 --- a/src/codegen/llvm/llvm_benchmark.cpp +++ b/test/benchmark/llvm_benchmark.cpp @@ -9,8 +9,8 @@ #include #include "codegen/llvm/codegen_llvm_visitor.hpp" -#include "codegen/llvm/jit_driver.hpp" #include "llvm_benchmark.hpp" +#include "test/benchmark/jit_driver.hpp" #include "llvm/Support/Host.h" #include "test/unit/codegen/codegen_data_helper.hpp" @@ -42,57 +42,42 @@ void LLVMBenchmark::disable(const std::string& feature, std::vector for (auto& host_feature: host_features) { if (feature == host_feature.substr(1)) { host_feature[0] = '-'; - *log_stream << host_feature << "\n"; + logger->info("{}", host_feature); return; } } } void LLVMBenchmark::run(const std::shared_ptr& node) { - // First, set the output stream for the logs. - set_log_output(); - - // Then, record the time taken for building the LLVM IR module. 
- codegen::CodegenLLVMVisitor visitor(mod_filename, - output_dir, - llvm_build_info.opt_passes, - llvm_build_info.use_single_precision, - llvm_build_info.vector_width, - llvm_build_info.vec_lib, - /*add_debug_information=*/true); - generate_llvm(visitor, node); - + // create functions + generate_llvm(node); // Finally, run the benchmark and log the measurements. - run_benchmark(visitor, node); + run_benchmark(node); } -void LLVMBenchmark::generate_llvm(codegen::CodegenLLVMVisitor& visitor, - const std::shared_ptr& node) { +void LLVMBenchmark::generate_llvm(const std::shared_ptr& node) { // First, visit the AST to build the LLVM IR module and wrap the kernel function calls. auto start = std::chrono::high_resolution_clock::now(); - visitor.visit_program(*node); - visitor.wrap_kernel_functions(); + llvm_visitor.wrap_kernel_functions(); auto end = std::chrono::high_resolution_clock::now(); // Log the time taken to visit the AST and build LLVM IR. std::chrono::duration diff = end - start; - *log_stream << "Created LLVM IR module from NMODL AST in " << std::setprecision(PRECISION) - << diff.count() << "\n\n"; + logger->info("Created LLVM IR module from NMODL AST in {} sec", diff.count()); } -void LLVMBenchmark::run_benchmark(codegen::CodegenLLVMVisitor& visitor, - const std::shared_ptr& node) { +void LLVMBenchmark::run_benchmark(const std::shared_ptr& node) { // Set the codegen data helper and find the kernels. - auto codegen_data = codegen::CodegenDataHelper(node, visitor.get_instance_struct_ptr()); + auto codegen_data = codegen::CodegenDataHelper(node, llvm_visitor.get_instance_struct_ptr()); std::vector kernel_names; - visitor.find_kernel_names(kernel_names); + llvm_visitor.find_kernel_names(kernel_names); // Get feature's string and turn them off depending on the backend. std::vector features = get_cpu_features(); - *log_stream << "Backend: " << backend << "\n"; + logger->info("Backend: {}", backend); if (backend == "avx2") { // Disable SSE. - *log_stream << "Disabling features:\n"; + logger->info("Disabling features:"); disable("sse", features); disable("sse2", features); disable("sse3", features); @@ -100,16 +85,17 @@ void LLVMBenchmark::run_benchmark(codegen::CodegenLLVMVisitor& visitor, disable("sse4.2", features); } else if (backend == "sse2") { // Disable AVX. - *log_stream << "Disabling features:\n"; + logger->info("Disabling features:"); disable("avx", features); disable("avx2", features); } std::string features_str = llvm::join(features.begin(), features.end(), ","); - std::unique_ptr m = visitor.get_module(); + std::unique_ptr m = llvm_visitor.get_module(); // Create the benchmark runner and initialize it. - std::string filename = "v" + std::to_string(llvm_build_info.vector_width) + "_" + mod_filename; + std::string filename = "v" + std::to_string(llvm_visitor.get_vector_width()) + "_" + + mod_filename; runner::BenchmarkRunner runner(std::move(m), filename, output_dir, @@ -125,7 +111,7 @@ void LLVMBenchmark::run_benchmark(codegen::CodegenLLVMVisitor& visitor, auto instance_data = codegen_data.create_data(instance_size, /*seed=*/1); double size_mbs = instance_data.num_bytes / (1024.0 * 1024.0); - *log_stream << "Benchmarking kernel '" << kernel_name << ", with " << size_mbs << " MBs\n"; + logger->info("Benchmarking kernel '{}' with {} MBs dataset", kernel_name, size_mbs); // For every kernel run the benchmark `num_experiments` times. 
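         // (Each run is timed individually below; the mean over all runs is
         // reported at the end.)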
double time_sum = 0.0; @@ -138,32 +124,13 @@ void LLVMBenchmark::run_benchmark(codegen::CodegenLLVMVisitor& visitor, std::chrono::duration diff = end - start; // Log the time taken for each run. - *log_stream << "Experiment " << i << ": compute time = " << std::setprecision(9) - << diff.count() << "\n"; + logger->info("Experiment {} compute time = {:.6f} sec", i, diff.count()); time_sum += diff.count(); } // Log the average time taken for the kernel. - *log_stream << "Average compute time = " << std::setprecision(PRECISION) - << time_sum / num_experiments << "\n\n"; - } -} - -void LLVMBenchmark::set_log_output() { - // If the output directory is not specified, dump logs to the console. - if (output_dir == ".") { - log_stream = std::make_shared(std::cout.rdbuf()); - return; + logger->info("Average compute time = {:.6f} \n", time_sum / num_experiments); } - - // Otherwise, dump logs to the specified file. - std::string filename = output_dir + "/" + mod_filename + ".log"; - ofs.open(filename.c_str()); - - if (ofs.fail()) - throw std::runtime_error("Error while opening a file '" + filename + "'"); - - log_stream = std::make_shared(ofs.rdbuf()); } } // namespace benchmark diff --git a/src/codegen/llvm/llvm_benchmark.hpp b/test/benchmark/llvm_benchmark.hpp similarity index 76% rename from src/codegen/llvm/llvm_benchmark.hpp rename to test/benchmark/llvm_benchmark.hpp index c2c781d7f0..9696191172 100644 --- a/src/codegen/llvm/llvm_benchmark.hpp +++ b/test/benchmark/llvm_benchmark.hpp @@ -10,19 +10,11 @@ #include #include "codegen/llvm/codegen_llvm_visitor.hpp" - +#include "utils/logger.hpp" namespace nmodl { namespace benchmark { -/// A struct to hold LLVM visitor information. -struct LLVMBuildInfo { - int vector_width; - bool opt_passes; - bool use_single_precision; - std::string vec_lib; -}; - /** * \class LLVMBenchmark * \brief A wrapper to execute MOD file kernels via LLVM IR backend, and @@ -30,6 +22,9 @@ struct LLVMBuildInfo { */ class LLVMBenchmark { private: + /// LLVM visitor. + codegen::CodegenLLVMVisitor& llvm_visitor; + /// Source MOD file name. std::string mod_filename; @@ -54,32 +49,26 @@ class LLVMBenchmark { /// Optimisation level for machine code generation. int opt_level_codegen; - /// LLVM visitor information. - LLVMBuildInfo llvm_build_info; - - /// The log output stream (file or stdout). - std::shared_ptr log_stream; - /// Filestream for dumping logs to the file. std::ofstream ofs; public: - LLVMBenchmark(const std::string& mod_filename, + LLVMBenchmark(codegen::CodegenLLVMVisitor& llvm_visitor, + const std::string& mod_filename, const std::string& output_dir, std::vector shared_libs, - LLVMBuildInfo info, int num_experiments, int instance_size, const std::string& backend, int opt_level_ir, int opt_level_codegen) - : mod_filename(mod_filename) + : llvm_visitor(llvm_visitor) + , mod_filename(mod_filename) , output_dir(output_dir) , shared_libs(shared_libs) , num_experiments(num_experiments) , instance_size(instance_size) , backend(backend) - , llvm_build_info(info) , opt_level_ir(opt_level_ir) , opt_level_codegen(opt_level_codegen) {} @@ -91,12 +80,10 @@ class LLVMBenchmark { void disable(const std::string& feature, std::vector& host_features); /// Visits the AST to construct the LLVM IR module. - void generate_llvm(codegen::CodegenLLVMVisitor& visitor, - const std::shared_ptr& node); + void generate_llvm(const std::shared_ptr& node); /// Runs the main body of the benchmark, executing the compute kernels. 
- void run_benchmark(codegen::CodegenLLVMVisitor& visitor, - const std::shared_ptr& node); + void run_benchmark(const std::shared_ptr& node); /// Sets the log output stream (file or console). void set_log_output(); diff --git a/test/unit/CMakeLists.txt b/test/unit/CMakeLists.txt index 1aa091c7fd..903e19214f 100644 --- a/test/unit/CMakeLists.txt +++ b/test/unit/CMakeLists.txt @@ -108,8 +108,8 @@ target_link_libraries( if(NMODL_ENABLE_LLVM) include_directories(${LLVM_INCLUDE_DIRS} codegen) - add_library(llvm_benchmark STATIC codegen/codegen_data_helper.cpp) - add_dependencies(llvm_benchmark lexer) + add_library(benchmark_data STATIC codegen/codegen_data_helper.cpp) + add_dependencies(benchmark_data lexer) add_executable(testllvm visitor/main.cpp codegen/codegen_llvm_ir.cpp codegen/codegen_data_helper.cpp codegen/codegen_llvm_instance_struct.cpp) @@ -130,6 +130,7 @@ if(NMODL_ENABLE_LLVM) target_link_libraries( test_llvm_runner llvm_codegen + llvm_benchmark codegen visitor symtab diff --git a/test/unit/codegen/codegen_llvm_execution.cpp b/test/unit/codegen/codegen_llvm_execution.cpp index 296417c5f3..baa370143b 100644 --- a/test/unit/codegen/codegen_llvm_execution.cpp +++ b/test/unit/codegen/codegen_llvm_execution.cpp @@ -9,9 +9,9 @@ #include "ast/program.hpp" #include "codegen/llvm/codegen_llvm_visitor.hpp" -#include "codegen/llvm/jit_driver.hpp" #include "codegen_data_helper.hpp" #include "parser/nmodl_driver.hpp" +#include "test/benchmark/jit_driver.hpp" #include "visitors/checkparent_visitor.hpp" #include "visitors/neuron_solve_visitor.hpp" #include "visitors/solve_block_visitor.hpp" From bb3ecd009a05a3627c315972d552613ed4647980 Mon Sep 17 00:00:00 2001 From: George Mitenkov Date: Sun, 16 May 2021 22:04:31 -0700 Subject: [PATCH 166/331] Added attributes and metadata to LLVM IR compute kernels (#648) Previously, there was no metadata and attributes associated with the instance struct pointer, compute kernels or loops. This commit fixes this. - New instance struct attributes Since all pointers contained in the instance struct do not alias, we add a `noalias` (LLVM's `__restrict` alternative) attribute to it. In addition, we add `nocapture` (No capturing occurs in the function) and `readonly` (Struct pointer is not written to) attributes. This means that some load instructions can be moved out from the loop body. Example: ```llvm ; BEFORE for.body.lr.ph: ; preds = %0 %5 = getelementptr inbounds %avx__instance_var__type, %avx__instance_var__type* %mech1, i64 0, i32 1 br label %for.body for.body: ; preds = %for.body.lr.ph, %for.body %15 = load double*, double** %5, align 8 ; ... ; AFTER for.body.lr.ph: ; preds = %0 %5 = getelementptr inbounds %avx__instance_var__type, %avx__instance_var__type* %mech1, i64 0, i32 1 %6 = load double*, double** %5, align 8 br label %for.body ``` - New function attributes Now, compute kernels are marked with `nofree` and `nounwind` attributes. - Loop metadata Also, loop metadata is added to scalar kernels, specifying that no vectorization is needed. The reason for this is because we want to benchmark truly scalar kernels, and disable LLVM's vectorization if necessary. Note that for vector loop epilogue there is no metadata that disables vectorization. 
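As a sketch (value and block names are illustrative and will differ per module),
the annotated scalar loop branch and its metadata look like:

```llvm
; Loop branch annotated with the loop metadata node.
br i1 %cond, label %for.body, label %for.exit, !llvm.loop !0

; Self-referential metadata that turns LLVM's loop vectorizer off.
!0 = distinct !{!0, !1}
!1 = !{!"llvm.loop.vectorize.enable", i1 false}
```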
fixes #607 --- src/codegen/llvm/codegen_llvm_visitor.cpp | 93 +++++++++++++---------- src/codegen/llvm/codegen_llvm_visitor.hpp | 3 + src/codegen/llvm/llvm_ir_builder.cpp | 54 ++++++++++++- src/codegen/llvm/llvm_ir_builder.hpp | 13 +++- test/unit/codegen/codegen_llvm_ir.cpp | 21 ++++- 5 files changed, 134 insertions(+), 50 deletions(-) diff --git a/src/codegen/llvm/codegen_llvm_visitor.cpp b/src/codegen/llvm/codegen_llvm_visitor.cpp index 39594169f4..2124ad82c9 100644 --- a/src/codegen/llvm/codegen_llvm_visitor.cpp +++ b/src/codegen/llvm/codegen_llvm_visitor.cpp @@ -163,13 +163,11 @@ void CodegenLLVMVisitor::create_printf_call(const ast::ExpressionVector& argumen } void CodegenLLVMVisitor::find_kernel_names(std::vector& container) { - // By convention, only kernel functions have a return type of void and single argument. The - // number of arguments check is needed to avoid LLVM void intrinsics to be considered as - // kernels. - const auto& functions = module->getFunctionList(); - for (const auto& func: functions) { - if (func.getReturnType()->isVoidTy() && llvm::hasSingleElement(func.args())) { - container.push_back(func.getName().str()); + auto& functions = module->getFunctionList(); + for (auto& func: functions) { + const std::string name = func.getName().str(); + if (is_kernel_function(name)) { + container.push_back(name); } } } @@ -239,6 +237,36 @@ int CodegenLLVMVisitor::get_num_elements(const ast::IndexedName& node) { return static_cast(*macro->get_value()); } +/** + * Currently, functions are identified as compute kernels if they satisfy the following: + * 1. They have a void return type + * 2. They have a single argument + * 3. The argument is a struct type pointer + * This is not robust, and hence it would be better to find what functions are kernels on the NMODL + * AST side (e.g. via a flag, or via names list). + * + * \todo identify kernels on NMODL AST side. + */ +bool CodegenLLVMVisitor::is_kernel_function(const std::string& function_name) { + llvm::Function* function = module->getFunction(function_name); + if (!function) + throw std::runtime_error("Error: function " + function_name + " does not exist\n"); + + // By convention, only kernel functions have a return type of void and single argument. The + // number of arguments check is needed to avoid LLVM void intrinsics to be considered as + // kernels. + if (!function->getReturnType()->isVoidTy() || !llvm::hasSingleElement(function->args())) + return false; + + // Kernel's argument is a pointer to the instance struct type. + llvm::Type* arg_type = function->getArg(0)->getType(); + if (auto pointer_type = llvm::dyn_cast(arg_type)) { + if (pointer_type->getElementType()->isStructTy()) + return true; + } + return false; +} + llvm::Value* CodegenLLVMVisitor::read_from_or_write_to_instance(const ast::CodegenInstanceVar& node, llvm::Value* maybe_value_to_store) { const auto& instance_name = node.get_instance_var()->get_node_name(); @@ -364,20 +392,8 @@ void CodegenLLVMVisitor::wrap_kernel_functions() { find_kernel_names(kernel_names); for (const auto& kernel_name: kernel_names) { - // Get the kernel function and the instance struct type. + // Get the kernel function. 
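+        // (Existence and signature were already validated by `is_kernel_function`
+        // when collecting the kernel names, so no further checks are needed here.)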
auto kernel = module->getFunction(kernel_name); - if (!kernel) - throw std::runtime_error("Error: kernel " + kernel_name + " is not found\n"); - - if (!llvm::hasSingleElement(kernel->args())) - throw std::runtime_error("Error: kernel " + kernel_name + - " must have a single argument\n"); - - auto instance_struct_ptr_type = llvm::dyn_cast( - kernel->getArg(0)->getType()); - if (!instance_struct_ptr_type) - throw std::runtime_error("Error: kernel " + kernel_name + - " does not have an instance struct pointer as an argument\n"); // Create a wrapper void function that takes a void pointer as a single argument. llvm::Type* i32_type = ir_builder.get_i32_type(); @@ -398,7 +414,7 @@ void CodegenLLVMVisitor::wrap_kernel_functions() { // Proceed with bitcasting the void pointer to the struct pointer type, calling the kernel // and adding a terminator. llvm::Value* bitcasted = ir_builder.create_bitcast(wrapper_func->getArg(0), - instance_struct_ptr_type); + kernel->getArg(0)->getType()); ValueVector args; args.push_back(bitcasted); ir_builder.create_function_call(kernel, args, /*use_result=*/false); @@ -522,9 +538,6 @@ void CodegenLLVMVisitor::visit_codegen_for_statement(const ast::CodegenForStatem llvm::BasicBlock* for_inc = llvm::BasicBlock::Create(*context, /*Name=*/"for.inc", func, next); llvm::BasicBlock* exit = llvm::BasicBlock::Create(*context, /*Name=*/"for.exit", func, next); - // Save the vector width. - int tmp_vector_width = vector_width; - // Check if the kernel can be vectorised. If not, generate scalar code. if (!can_vectorize(node, sym_tab)) { logger->info("Cannot vectorise the for loop in '" + ir_builder.get_current_function_name() + @@ -534,21 +547,20 @@ void CodegenLLVMVisitor::visit_codegen_for_statement(const ast::CodegenForStatem ir_builder.generate_scalar_code(); } - // First, initialise the loop in the same basic block. This block is optional. Also, reset - // vector width to 1 if processing the remainder of the loop. - if (node.get_initialization()) { + // First, initialise the loop in the same basic block. This block is optional. Also, generate + // scalar code if processing the remainder of the loop. + if (node.get_initialization()) node.get_initialization()->accept(*this); - } else { - vector_width = 1; + else ir_builder.generate_scalar_code(); - } // Branch to condition basic block and insert condition code there. ir_builder.create_br_and_set_insertion_point(for_cond); // Extract the condition to decide whether to branch to the loop body or loop exit. llvm::Value* cond = accept_and_get(node.get_condition()); - ir_builder.create_cond_br(cond, for_body, exit); + llvm::BranchInst* loop_br = ir_builder.create_cond_br(cond, for_body, exit); + ir_builder.set_loop_metadata(loop_br); // Generate code for the loop body and create the basic block for the increment. ir_builder.set_insertion_point(for_body); @@ -560,11 +572,9 @@ void CodegenLLVMVisitor::visit_codegen_for_statement(const ast::CodegenForStatem // Process increment. node.get_increment()->accept(*this); - // Create a branch to condition block, then generate exit code out of the loop. Restore the - // vector width. + // Create a branch to condition block, then generate exit code out of the loop. 
ir_builder.create_br(for_cond); ir_builder.set_insertion_point(exit); - vector_width = tmp_vector_width; ir_builder.generate_vectorized_code(); ir_builder.start_vectorization(); } @@ -578,7 +588,7 @@ void CodegenLLVMVisitor::visit_codegen_function(const ast::CodegenFunction& node // Create the entry basic block of the function/procedure and point the local named values table // to the symbol table. - llvm::BasicBlock* body = ir_builder.create_block_and_set_insertion_point(func); + ir_builder.create_block_and_set_insertion_point(func); // When processing a function, it returns a value named in NMODL. Therefore, we // first run RenameVisitor to rename it into ret_. This will aid in avoiding @@ -588,14 +598,12 @@ void CodegenLLVMVisitor::visit_codegen_function(const ast::CodegenFunction& node visitor::RenameVisitor v(name, return_var_name); block->accept(v); - // Allocate parameters on the stack and add them to the symbol table. ir_builder.allocate_function_arguments(func, arguments); // Process function or procedure body. If the function is a compute kernel, then set the - // corresponding flags. The return statement is handled in a separate visitor. - bool has_void_ret_type = node.get_return_type()->get_type() == ast::AstNodeType::VOID; - if (has_void_ret_type) { + // corresponding flags. If so, the return statement is handled in a separate visitor. + if (is_kernel_function(name)) { ir_builder.start_vectorization(); block->accept(*this); ir_builder.stop_vectorization(); @@ -603,9 +611,12 @@ void CodegenLLVMVisitor::visit_codegen_function(const ast::CodegenFunction& node block->accept(*this); } - // If function has a void return type, add a terminator not handled by CodegenReturnVar. - if (has_void_ret_type) + // If function is a compute kernel, add a void terminator explicitly, since there is no + // `CodegenReturnVar` node. Also, set the necessary attributes. + if (is_kernel_function(name)) { + ir_builder.set_kernel_attributes(); ir_builder.create_return(); + } // Clear local values stack and remove the pointer to the local symbol table. ir_builder.clear_function(); diff --git a/src/codegen/llvm/codegen_llvm_visitor.hpp b/src/codegen/llvm/codegen_llvm_visitor.hpp index 990485d8e2..22505a304c 100644 --- a/src/codegen/llvm/codegen_llvm_visitor.hpp +++ b/src/codegen/llvm/codegen_llvm_visitor.hpp @@ -216,6 +216,9 @@ class CodegenLLVMVisitor: public visitor::ConstAstVisitor { /// Returns the number of elements in the array specified by the IndexedName AST node. int get_num_elements(const ast::IndexedName& node); + /// Returns whether the function is an NMODL compute kernel. + bool is_kernel_function(const std::string& function_name); + /// If the value to store is specified, writes it to the instance. Otherwise, returns the /// instance variable. llvm::Value* read_from_or_write_to_instance(const ast::CodegenInstanceVar& node, diff --git a/src/codegen/llvm/llvm_ir_builder.cpp b/src/codegen/llvm/llvm_ir_builder.cpp index 04e36e50cd..06ba8d00ef 100644 --- a/src/codegen/llvm/llvm_ir_builder.cpp +++ b/src/codegen/llvm/llvm_ir_builder.cpp @@ -174,6 +174,52 @@ void IRBuilder::create_intrinsic(const std::string& name, } } +void IRBuilder::set_kernel_attributes() { + // By convention, the compute kernel does not free memory and does not throw exceptions. + current_function->setDoesNotFreeMemory(); + current_function->setDoesNotThrow(); + + // We also want to specify that the pointers that instance struct holds, do not alias. In order + // to do that, we add a `noalias` attribute to the argument. 
As per Clang's specification: + // > The `noalias` attribute indicates that the only memory accesses inside function are loads + // > and stores from objects pointed to by its pointer-typed arguments, with arbitrary + // > offsets. + current_function->addParamAttr(0, llvm::Attribute::NoAlias); + + // Finally, specify that the struct pointer does not capture and is read-only. + current_function->addParamAttr(0, llvm::Attribute::NoCapture); + current_function->addParamAttr(0, llvm::Attribute::ReadOnly); +} + +/****************************************************************************************/ +/* LLVM metadata utilities */ +/****************************************************************************************/ + +void IRBuilder::set_loop_metadata(llvm::BranchInst* branch) { + llvm::LLVMContext& context = builder.getContext(); + MetadataVector loop_metadata; + + // Add nullptr to reserve the first place for loop's metadata self-reference. + loop_metadata.push_back(nullptr); + + // If `vector_width` is 1, explicitly disable vectorization for benchmarking purposes. + if (vector_width == 1) { + llvm::MDString* name = llvm::MDString::get(context, "llvm.loop.vectorize.enable"); + llvm::Value* false_value = llvm::ConstantInt::get(get_boolean_type(), 0); + llvm::ValueAsMetadata* value = llvm::ValueAsMetadata::get(false_value); + loop_metadata.push_back(llvm::MDNode::get(context, {name, value})); + } + + // No metadata to add. + if (loop_metadata.size() <= 1) + return; + + // Add loop's metadata self-reference and attach it to the branch. + llvm::MDNode* metadata = llvm::MDNode::get(context, loop_metadata); + metadata->replaceOperandWith(0, metadata); + branch->setMetadata(llvm::LLVMContext::MD_loop, metadata); +} + /****************************************************************************************/ /* LLVM instruction utilities */ /****************************************************************************************/ @@ -412,10 +458,10 @@ void IRBuilder::create_br_and_set_insertion_point(llvm::BasicBlock* block) { builder.SetInsertPoint(block); } -void IRBuilder::create_cond_br(llvm::Value* condition, - llvm::BasicBlock* true_block, - llvm::BasicBlock* false_block) { - builder.CreateCondBr(condition, true_block, false_block); +llvm::BranchInst* IRBuilder::create_cond_br(llvm::Value* condition, + llvm::BasicBlock* true_block, + llvm::BasicBlock* false_block) { + return builder.CreateCondBr(condition, true_block, false_block); } llvm::BasicBlock* IRBuilder::get_current_block() { diff --git a/src/codegen/llvm/llvm_ir_builder.hpp b/src/codegen/llvm/llvm_ir_builder.hpp index b1b23ff0cf..e0cda2cf93 100644 --- a/src/codegen/llvm/llvm_ir_builder.hpp +++ b/src/codegen/llvm/llvm_ir_builder.hpp @@ -24,6 +24,7 @@ static constexpr const unsigned double_precision = 64; /// Some typedefs. using ConstantVector = std::vector; +using MetadataVector = std::vector; using TypeVector = std::vector; using ValueVector = std::vector; @@ -137,9 +138,9 @@ class IRBuilder { void create_br_and_set_insertion_point(llvm::BasicBlock* block); /// Generates LLVM IR for conditional branch. - void create_cond_br(llvm::Value* condition, - llvm::BasicBlock* true_block, - llvm::BasicBlock* false_block); + llvm::BranchInst* create_cond_br(llvm::Value* condition, + llvm::BasicBlock* true_block, + llvm::BasicBlock* false_block); /// Generates LLVM IR for the boolean constant. void create_boolean_constant(int value); @@ -249,6 +250,12 @@ class IRBuilder { /// Sets builder's insertion point to the given block. 
void set_insertion_point(llvm::BasicBlock* block); + /// Sets the necessary attributes for the kernel and its arguments. + void set_kernel_attributes(); + + /// Sets the loop metadata for the given branch from the loop. + void set_loop_metadata(llvm::BranchInst* branch); + /// Pops the last visited value from the value stack. llvm::Value* pop_last_value(); diff --git a/test/unit/codegen/codegen_llvm_ir.cpp b/test/unit/codegen/codegen_llvm_ir.cpp index 11f2faf99b..3295411f7a 100644 --- a/test/unit/codegen/codegen_llvm_ir.cpp +++ b/test/unit/codegen/codegen_llvm_ir.cpp @@ -838,15 +838,19 @@ SCENARIO("Scalar state kernel", "[visitor][llvm]") { std::string module_string = run_llvm_visitor(nmodl_text); std::smatch m; - // Check the struct type and the kernel declaration. + // Check the struct type with correct attributes and the kernel declaration. std::regex struct_type( "%.*__instance_var__type = type \\{ double\\*, double\\*, double\\*, double\\*, " "double\\*, double\\*, double\\*, i32\\*, double, double, double, i32, i32 \\}"); std::regex kernel_declaration( - R"(define void @nrn_state_hh\(%.*__instance_var__type\* .*\))"); + R"(define void @nrn_state_hh\(%.*__instance_var__type\* noalias nocapture readonly .*\) #0)"); REQUIRE(std::regex_search(module_string, m, struct_type)); REQUIRE(std::regex_search(module_string, m, kernel_declaration)); + // Check kernel attributes. + std::regex kernel_attributes(R"(attributes #0 = \{ nofree nounwind \})"); + REQUIRE(std::regex_search(module_string, m, kernel_attributes)); + // Check for correct variables initialisation and a branch to condition block. std::regex id_initialisation(R"(%id = alloca i32)"); std::regex node_id_initialisation(R"(%node_id = alloca i32)"); @@ -871,6 +875,15 @@ SCENARIO("Scalar state kernel", "[visitor][llvm]") { REQUIRE(std::regex_search(module_string, m, condition)); REQUIRE(std::regex_search(module_string, m, cond_br)); + // Check that loop metadata is attached to the scalar kernel. + std::regex loop_metadata(R"(!llvm\.loop !0)"); + std::regex loop_metadata_self_reference(R"(!0 = distinct !\{!0, !1\})"); + std::regex loop_metadata_disable_vectorization( + R"(!1 = !\{!\"llvm\.loop\.vectorize\.enable\", i1 false\})"); + REQUIRE(std::regex_search(module_string, m, loop_metadata)); + REQUIRE(std::regex_search(module_string, m, loop_metadata_self_reference)); + REQUIRE(std::regex_search(module_string, m, loop_metadata_disable_vectorization)); + // Check for correct loads from the struct with GEPs. std::regex load_from_struct( " %.* = load %.*__instance_var__type\\*, %.*__instance_var__type\\*\\* %.*\n" @@ -934,6 +947,10 @@ SCENARIO("Vectorised simple kernel", "[visitor][llvm]") { /*vector_width=*/4); std::smatch m; + // Check that no loop metadata is attached. + std::regex loop_metadata(R"(!llvm\.loop !.*)"); + REQUIRE(!std::regex_search(module_string, m, loop_metadata)); + // Check gather intrinsic is correctly declared. 
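            // (A gather is expected here because the load is indexed indirectly,
            // i.e. not by the loop induction variable.)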
            std::regex declaration(
                R"(declare <4 x double> @llvm\.masked\.gather\.v4f64\.v4p0f64\(<4 x double\*>, i32 immarg, <4 x i1>, <4 x double>\) )");

From 167c23c36f6b77b24037f20b0c919753b7f43bc4 Mon Sep 17 00:00:00 2001
From: George Mitenkov
Date: Tue, 18 May 2021 03:14:08 -0700
Subject: [PATCH 167/331] Added loaded value to the stack (#655)

- fixes the case where the loaded value was taken from the stack but was
  never actually put there
---
 src/codegen/llvm/llvm_ir_builder.cpp | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

diff --git a/src/codegen/llvm/llvm_ir_builder.cpp b/src/codegen/llvm/llvm_ir_builder.cpp
index 06ba8d00ef..8828aa83c5 100644
--- a/src/codegen/llvm/llvm_ir_builder.cpp
+++ b/src/codegen/llvm/llvm_ir_builder.cpp
@@ -319,12 +319,16 @@ llvm::Value* IRBuilder::create_index(llvm::Value* value) {
 llvm::Value* IRBuilder::create_load(const std::string& name) {
     llvm::Value* ptr = lookup_value(name);
     llvm::Type* loaded_type = ptr->getType()->getPointerElementType();
-    return builder.CreateLoad(loaded_type, ptr);
+    llvm::Value* loaded = builder.CreateLoad(loaded_type, ptr);
+    value_stack.push_back(loaded);
+    return loaded;
 }

 llvm::Value* IRBuilder::create_load(llvm::Value* ptr) {
     llvm::Type* loaded_type = ptr->getType()->getPointerElementType();
-    return builder.CreateLoad(loaded_type, ptr);
+    llvm::Value* loaded = builder.CreateLoad(loaded_type, ptr);
+    value_stack.push_back(loaded);
+    return loaded;
 }

 llvm::Value* IRBuilder::create_load_from_array(const std::string& name, llvm::Value* index) {

From a02a8d082da7e357d2efe3b3de64b5403d7320a9 Mon Sep 17 00:00:00 2001
From: George Mitenkov
Date: Thu, 20 May 2021 00:32:40 -0700
Subject: [PATCH 168/331] Basic predication support for LLVM backend (#652)

Added support for vector predication. Currently, we support a very basic
predication pattern (that will be extended in the future):

```c++
IF (/*condition*/) {
  // code here, no nested conditionals
} ELSE {
  // code here, no nested conditionals
}
```

**What has been changed and added**

1. Removed vectorization check

Before, in the `FOR` statement visitor we checked whether the code could be
vectorized. After refactoring `llvm::IRBuilder<>` into a separate class,
there is no interface to reset the builder's vector width. Hence, this check
would set the visitor's vector width to 1 while the builder kept its original
vector width, leaving the two out of sync.

```c++
if (!can_vectorize(node, sym_tab)) {
  vector_width = 1;
  ir_builder.generate_scalar_code();
}
```

In order to avoid any issues, this check is simply removed and will be added
in a separate PR.

2. Predication support

- `can_vectorize` has been changed to support a single `IF` or `IF/ELSE` pair.
- A special vectorized `IF` AST node visitor has been added.
- If generating code within an `IF` AST node, instructions are masked.

3.
Added execution and IR tests fixes #539 --- .../llvm/codegen_llvm_helper_visitor.cpp | 2 +- src/codegen/llvm/codegen_llvm_visitor.cpp | 90 ++++++++++------ src/codegen/llvm/codegen_llvm_visitor.hpp | 3 + src/codegen/llvm/llvm_ir_builder.cpp | 83 ++++++++++---- src/codegen/llvm/llvm_ir_builder.hpp | 56 ++++++---- test/unit/codegen/codegen_llvm_execution.cpp | 101 ++++++++++++++++++ test/unit/codegen/codegen_llvm_ir.cpp | 69 ++++++++++++ 7 files changed, 326 insertions(+), 78 deletions(-) diff --git a/src/codegen/llvm/codegen_llvm_helper_visitor.cpp b/src/codegen/llvm/codegen_llvm_helper_visitor.cpp index 10aee780ce..5974edc623 100644 --- a/src/codegen/llvm/codegen_llvm_helper_visitor.cpp +++ b/src/codegen/llvm/codegen_llvm_helper_visitor.cpp @@ -251,7 +251,7 @@ static void append_statements_from_block(ast::StatementVector& statements, for (const auto& statement: block_statements) { const auto& expression_statement = std::dynamic_pointer_cast( statement); - if (!expression_statement->get_expression()->is_solve_block()) + if (!expression_statement || !expression_statement->get_expression()->is_solve_block()) statements.push_back(statement); } } diff --git a/src/codegen/llvm/codegen_llvm_visitor.cpp b/src/codegen/llvm/codegen_llvm_visitor.cpp index 2124ad82c9..ec41008da0 100644 --- a/src/codegen/llvm/codegen_llvm_visitor.cpp +++ b/src/codegen/llvm/codegen_llvm_visitor.cpp @@ -54,11 +54,15 @@ static bool can_vectorize(const ast::CodegenForStatement& statement, symtab::Sym return false; } - // Check there is no control flow in the kernel. - const std::vector unsupported_nodes = {ast::AstNodeType::IF_STATEMENT}; - const auto& collected = collect_nodes(statement, unsupported_nodes); + // Check for simple supported control flow in the kernel (single if/else statement). + const std::vector supported_control_flow = {ast::AstNodeType::IF_STATEMENT}; + const auto& supported = collect_nodes(statement, supported_control_flow); - return collected.empty(); + // Check for unsupported control flow statements. + const std::vector unsupported_nodes = {ast::AstNodeType::ELSE_IF_STATEMENT}; + const auto& unsupported = collect_nodes(statement, unsupported_nodes); + + return unsupported.empty() && supported.size() <= 1; } llvm::Value* CodegenLLVMVisitor::accept_and_get(const std::shared_ptr& node) { @@ -162,6 +166,27 @@ void CodegenLLVMVisitor::create_printf_call(const ast::ExpressionVector& argumen ir_builder.create_function_call(printf, argument_values, /*use_result=*/false); } +void CodegenLLVMVisitor::create_vectorized_control_flow_block(const ast::IfStatement& node) { + // Get the true mask from the condition statement. + llvm::Value* true_mask = accept_and_get(node.get_condition()); + + // Process the true block. + ir_builder.set_mask(true_mask); + node.get_statement_block()->accept(*this); + + // Note: by default, we do not support kernels with complicated control flow. This is checked + // prior to visiting 'CodegenForStatement`. + const auto& elses = node.get_elses(); + if (elses) { + // If `else` statement exists, invert the mask and proceed with code generation. + ir_builder.invert_mask(); + elses->get_statement_block()->accept(*this); + } + + // Clear the mask value. 
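+    // Statements generated after the IF/ELSE block are unmasked again.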
+ ir_builder.clear_mask(); +} + void CodegenLLVMVisitor::find_kernel_names(std::vector& container) { auto& functions = module->getFunctionList(); for (auto& func: functions) { @@ -325,7 +350,8 @@ llvm::Value* CodegenLLVMVisitor::read_variable(const ast::VarName& node) { const auto& identifier = node.get_name(); if (identifier->is_name()) { - return ir_builder.create_load(node.get_node_name()); + return ir_builder.create_load(node.get_node_name(), + /*masked=*/ir_builder.generates_predicated_ir()); } if (identifier->is_indexed_name()) { @@ -522,8 +548,8 @@ void CodegenLLVMVisitor::visit_codegen_atomic_statement(const ast::CodegenAtomic // | | // +---------------------------+ void CodegenLLVMVisitor::visit_codegen_for_statement(const ast::CodegenForStatement& node) { - // Disable vector code generation for condition and increment blocks. - ir_builder.stop_vectorization(); + // Condition and increment blocks must be scalar. + ir_builder.generate_scalar_ir(); // Get the current and the next blocks within the function. llvm::BasicBlock* curr_block = ir_builder.get_current_block(); @@ -538,21 +564,11 @@ void CodegenLLVMVisitor::visit_codegen_for_statement(const ast::CodegenForStatem llvm::BasicBlock* for_inc = llvm::BasicBlock::Create(*context, /*Name=*/"for.inc", func, next); llvm::BasicBlock* exit = llvm::BasicBlock::Create(*context, /*Name=*/"for.exit", func, next); - // Check if the kernel can be vectorised. If not, generate scalar code. - if (!can_vectorize(node, sym_tab)) { - logger->info("Cannot vectorise the for loop in '" + ir_builder.get_current_function_name() + - "'"); - logger->info("Generating scalar code..."); - vector_width = 1; - ir_builder.generate_scalar_code(); - } - - // First, initialise the loop in the same basic block. This block is optional. Also, generate - // scalar code if processing the remainder of the loop. - if (node.get_initialization()) - node.get_initialization()->accept(*this); - else - ir_builder.generate_scalar_code(); + // First, initialize the loop in the same basic block. If processing the remainder of the loop, + // no initialization happens. + const auto& main_loop_initialization = node.get_initialization(); + if (main_loop_initialization) + main_loop_initialization->accept(*this); // Branch to condition basic block and insert condition code there. ir_builder.create_br_and_set_insertion_point(for_cond); @@ -561,22 +577,24 @@ void CodegenLLVMVisitor::visit_codegen_for_statement(const ast::CodegenForStatem llvm::Value* cond = accept_and_get(node.get_condition()); llvm::BranchInst* loop_br = ir_builder.create_cond_br(cond, for_body, exit); ir_builder.set_loop_metadata(loop_br); + ir_builder.set_insertion_point(for_body); + + // If not processing remainder of the loop, start vectorization. + if (vector_width > 1 && main_loop_initialization) + ir_builder.generate_vector_ir(); // Generate code for the loop body and create the basic block for the increment. - ir_builder.set_insertion_point(for_body); - ir_builder.start_vectorization(); const auto& statement_block = node.get_statement_block(); statement_block->accept(*this); - ir_builder.stop_vectorization(); + ir_builder.generate_scalar_ir(); ir_builder.create_br_and_set_insertion_point(for_inc); - // Process increment. + + // Process the increment. node.get_increment()->accept(*this); // Create a branch to condition block, then generate exit code out of the loop. 
ir_builder.create_br(for_cond); ir_builder.set_insertion_point(exit); - ir_builder.generate_vectorized_code(); - ir_builder.start_vectorization(); } @@ -601,12 +619,12 @@ void CodegenLLVMVisitor::visit_codegen_function(const ast::CodegenFunction& node // Allocate parameters on the stack and add them to the symbol table. ir_builder.allocate_function_arguments(func, arguments); - // Process function or procedure body. If the function is a compute kernel, then set the - // corresponding flags. If so, the return statement is handled in a separate visitor. - if (is_kernel_function(name)) { - ir_builder.start_vectorization(); + // Process function or procedure body. If the function is a compute kernel, enable + // vectorization. If so, the return statement is handled in a separate visitor. + if (vector_width > 1 && is_kernel_function(name)) { + ir_builder.generate_vector_ir(); block->accept(*this); - ir_builder.stop_vectorization(); + ir_builder.generate_scalar_ir(); } else { block->accept(*this); } @@ -676,6 +694,12 @@ void CodegenLLVMVisitor::visit_function_call(const ast::FunctionCall& node) { } void CodegenLLVMVisitor::visit_if_statement(const ast::IfStatement& node) { + // If vectorizing the compute kernel with control flow, process it separately. + if (vector_width > 1 && ir_builder.vectorizing()) { + create_vectorized_control_flow_block(node); + return; + } + // Get the current and the next blocks within the function. llvm::BasicBlock* curr_block = ir_builder.get_current_block(); llvm::BasicBlock* next = curr_block->getNextNode(); diff --git a/src/codegen/llvm/codegen_llvm_visitor.hpp b/src/codegen/llvm/codegen_llvm_visitor.hpp index 22505a304c..384c20c2c7 100644 --- a/src/codegen/llvm/codegen_llvm_visitor.hpp +++ b/src/codegen/llvm/codegen_llvm_visitor.hpp @@ -204,6 +204,9 @@ class CodegenLLVMVisitor: public visitor::ConstAstVisitor { /// Creates a call to `printf` function. void create_printf_call(const ast::ExpressionVector& arguments); + /// Creates a vectorized version of the LLVM IR for the simple control flow statement. + void create_vectorized_control_flow_block(const ast::IfStatement& node); + /// Returns LLVM type for the given CodegenVarType AST node. 
llvm::Type* get_codegen_var_type(const ast::CodegenVarType& node); diff --git a/src/codegen/llvm/llvm_ir_builder.cpp b/src/codegen/llvm/llvm_ir_builder.cpp index 8828aa83c5..90e7456e33 100644 --- a/src/codegen/llvm/llvm_ir_builder.cpp +++ b/src/codegen/llvm/llvm_ir_builder.cpp @@ -92,11 +92,15 @@ llvm::Value* IRBuilder::pop_last_value() { /****************************************************************************************/ void IRBuilder::create_boolean_constant(int value) { - value_stack.push_back(get_vector_constant(get_boolean_type(), value)); + if (vector_width > 1 && vectorize) { + value_stack.push_back(get_vector_constant(get_boolean_type(), value)); + } else { + value_stack.push_back(get_scalar_constant(get_boolean_type(), value)); + } } void IRBuilder::create_fp_constant(const std::string& value) { - if (instruction_width > 1 && vectorize) { + if (vector_width > 1 && vectorize) { value_stack.push_back(get_vector_constant(get_fp_type(), value)); } else { value_stack.push_back(get_scalar_constant(get_fp_type(), value)); @@ -108,7 +112,7 @@ llvm::Value* IRBuilder::create_global_string(const ast::String& node) { } void IRBuilder::create_i32_constant(int value) { - if (instruction_width > 1 && vectorize) { + if (vector_width > 1 && vectorize) { value_stack.push_back(get_vector_constant(get_i32_type(), value)); } else { value_stack.push_back(get_scalar_constant(get_i32_type(), value)); @@ -123,7 +127,7 @@ llvm::Value* IRBuilder::get_scalar_constant(llvm::Type* type, V value) { template llvm::Value* IRBuilder::get_vector_constant(llvm::Type* type, V value) { ConstantVector constants; - for (unsigned i = 0; i < instruction_width; ++i) { + for (unsigned i = 0; i < vector_width; ++i) { const auto& element = C::get(type, value); constants.push_back(element); } @@ -312,19 +316,27 @@ llvm::Value* IRBuilder::create_index(llvm::Value* value) { const auto& element_type = llvm::cast(vector_type->getElementType()); if (element_type->getBitWidth() == i64_type->getIntegerBitWidth()) return value; - return builder.CreateSExtOrTrunc(value, - llvm::FixedVectorType::get(i64_type, instruction_width)); + return builder.CreateSExtOrTrunc(value, llvm::FixedVectorType::get(i64_type, vector_width)); } -llvm::Value* IRBuilder::create_load(const std::string& name) { +llvm::Value* IRBuilder::create_load(const std::string& name, bool masked) { llvm::Value* ptr = lookup_value(name); + + // Check if the generated IR is vectorized and masked. + if (masked) { + return builder.CreateMaskedLoad(ptr, llvm::Align(), mask); + } llvm::Type* loaded_type = ptr->getType()->getPointerElementType(); llvm::Value* loaded = builder.CreateLoad(loaded_type, ptr); value_stack.push_back(loaded); return loaded; } -llvm::Value* IRBuilder::create_load(llvm::Value* ptr) { +llvm::Value* IRBuilder::create_load(llvm::Value* ptr, bool masked) { + // Check if the generated IR is vectorized and masked. + if (masked) { + return builder.CreateMaskedLoad(ptr, llvm::Align(), mask); + } llvm::Type* loaded_type = ptr->getType()->getPointerElementType(); llvm::Value* loaded = builder.CreateLoad(loaded_type, ptr); value_stack.push_back(loaded); @@ -336,12 +348,23 @@ llvm::Value* IRBuilder::create_load_from_array(const std::string& name, llvm::Va return create_load(element_ptr); } -void IRBuilder::create_store(const std::string& name, llvm::Value* value) { +void IRBuilder::create_store(const std::string& name, llvm::Value* value, bool masked) { llvm::Value* ptr = lookup_value(name); + + // Check if the generated IR is vectorized and masked. 
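+    // A masked load only reads the lanes enabled by the mask; since no pass-through
+    // value is supplied, the disabled lanes are undef.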
+ if (masked) { + builder.CreateMaskedStore(value, ptr, llvm::Align(), mask); + return; + } builder.CreateStore(value, ptr); } -void IRBuilder::create_store(llvm::Value* ptr, llvm::Value* value) { +void IRBuilder::create_store(llvm::Value* ptr, llvm::Value* value, bool masked) { + // Check if the generated IR is vectorized and masked. + if (masked) { + builder.CreateMaskedStore(value, ptr, llvm::Align(), mask); + return; + } builder.CreateStore(value, ptr); } @@ -364,8 +387,8 @@ void IRBuilder::create_scalar_or_vector_alloca(const std::string& name, // Even if generating vectorised code, some variables still need to be scalar. Particularly, the // induction variable "id" and remainder loop variables (that start with "epilogue" prefix). llvm::Type* type; - if (instruction_width > 1 && vectorize && name != kernel_id && name.rfind("epilogue", 0)) { - type = llvm::FixedVectorType::get(element_or_scalar_type, instruction_width); + if (vector_width > 1 && vectorize && name != kernel_id && name.rfind("epilogue", 0)) { + type = llvm::FixedVectorType::get(element_or_scalar_type, vector_width); } else { type = element_or_scalar_type; } @@ -389,6 +412,17 @@ llvm::Value* IRBuilder::get_struct_member_ptr(llvm::Value* struct_variable, int return builder.CreateInBoundsGEP(struct_variable, indices); } +void IRBuilder::invert_mask() { + if (!mask) + throw std::runtime_error("Error: mask is not set\n"); + + // Create the vector with all `true` values. + create_boolean_constant(1); + llvm::Value* one = pop_last_value(); + + mask = builder.CreateXor(mask, one); +} + llvm::Value* IRBuilder::load_to_or_store_from_array(const std::string& id_name, llvm::Value* id_value, llvm::Value* array, @@ -396,22 +430,27 @@ llvm::Value* IRBuilder::load_to_or_store_from_array(const std::string& id_name, // First, calculate the address of the element in the array. llvm::Value* element_ptr = create_inbounds_gep(array, id_value); + // Find out if the vector code is generated. + bool generating_vector_ir = vector_width > 1 && vectorize; + // If the vector code is generated, we need to distinguish between two cases. If the array is // indexed indirectly (i.e. not by an induction variable `kernel_id`), create a gather // instruction. - if (id_name != kernel_id && vectorize && instruction_width > 1) { - return maybe_value_to_store - ? builder.CreateMaskedScatter(maybe_value_to_store, element_ptr, llvm::Align()) - : builder.CreateMaskedGather(element_ptr, llvm::Align()); + if (id_name != kernel_id && generating_vector_ir) { + return maybe_value_to_store ? 
builder.CreateMaskedScatter(maybe_value_to_store, + element_ptr, + llvm::Align(), + mask) + : builder.CreateMaskedGather(element_ptr, llvm::Align(), mask); } llvm::Value* ptr; - if (vectorize && instruction_width > 1) { + if (generating_vector_ir) { // If direct indexing is used during the vectorization, we simply bitcast the scalar pointer // to a vector pointer llvm::Type* vector_type = llvm::PointerType::get( llvm::FixedVectorType::get(element_ptr->getType()->getPointerElementType(), - instruction_width), + vector_width), /*AddressSpace=*/0); ptr = builder.CreateBitCast(element_ptr, vector_type); } else { @@ -420,21 +459,21 @@ llvm::Value* IRBuilder::load_to_or_store_from_array(const std::string& id_name, } if (maybe_value_to_store) { - create_store(ptr, maybe_value_to_store); + create_store(ptr, maybe_value_to_store, /*masked=*/mask && generating_vector_ir); return nullptr; } else { - return create_load(ptr); + return create_load(ptr, /*masked=*/mask && generating_vector_ir); } } void IRBuilder::maybe_replicate_value(llvm::Value* value) { // If the value should not be vectorised, or it is already a vector, add it to the stack. - if (!vectorize || instruction_width == 1 || value->getType()->isVectorTy()) { + if (!vectorize || vector_width == 1 || value->getType()->isVectorTy()) { value_stack.push_back(value); } else { // Otherwise, we generate vectorized code inside the loop, so replicate the value to form a // vector. - llvm::Value* vector_value = builder.CreateVectorSplat(instruction_width, value); + llvm::Value* vector_value = builder.CreateVectorSplat(vector_width, value); value_stack.push_back(vector_value); } } diff --git a/src/codegen/llvm/llvm_ir_builder.hpp b/src/codegen/llvm/llvm_ir_builder.hpp index e0cda2cf93..ba3800fc66 100644 --- a/src/codegen/llvm/llvm_ir_builder.hpp +++ b/src/codegen/llvm/llvm_ir_builder.hpp @@ -52,13 +52,12 @@ class IRBuilder { /// Precision of the floating-point numbers (32 or 64 bit). unsigned fp_precision; - /// If 1, indicates that the scalar code is generated. Otherwise, the current vectorization - /// width. - unsigned instruction_width; - /// The vector width used for the vectorized code. unsigned vector_width; + /// Masked value used to predicate vector instructions. + llvm::Value* mask; + /// The name of induction variable used in kernel loops. std::string kernel_id; @@ -72,7 +71,7 @@ class IRBuilder { , vectorize(false) , fp_precision(use_single_precision ? single_precision : double_precision) , vector_width(vector_width) - , instruction_width(vector_width) + , mask(nullptr) , kernel_id("") {} /// Initializes the builder with the symbol table and the kernel induction variable id. @@ -81,26 +80,21 @@ class IRBuilder { this->kernel_id = kernel_id; } - /// Explicitly sets the builder to produce scalar code (even during vectorization). - void generate_scalar_code() { - instruction_width = 1; + /// Explicitly sets the builder to produce scalar IR. + void generate_scalar_ir() { + vectorize = false; } - /// Explicitly sets the builder to produce vectorized code. - void generate_vectorized_code() { - instruction_width = vector_width; + /// Indicates whether the builder generates vectorized IR. + bool vectorizing() { + return vectorize; } - /// Turns on vectorization mode. - void start_vectorization() { + /// Explicitly sets the builder to produce vectorized IR. + void generate_vector_ir() { vectorize = true; } - /// Turns off vectorization mode. 
- void stop_vectorization() { - vectorize = false; - } - /// Sets the current function for which LLVM IR is generated. void set_function(llvm::Function* function) { current_function = function; @@ -112,6 +106,21 @@ class IRBuilder { current_function = nullptr; } + /// Sets the value to be the mask for vector code generation. + void set_mask(llvm::Value* value) { + mask = value; + } + + /// Clears the mask for vector code generation. + void clear_mask() { + mask = nullptr; + } + + /// Indicates whether the vectorized IR is predicated. + bool generates_predicated_ir() { + return vectorize && mask; + } + /// Generates LLVM IR to allocate the arguments of the function on the stack. void allocate_function_arguments(llvm::Function* function, const ast::CodegenVarWithTypeVector& nmodl_arguments); @@ -168,20 +177,20 @@ class IRBuilder { void create_i32_constant(int value); /// Generates LLVM IR to load the value specified by its name and returns it. - llvm::Value* create_load(const std::string& name); + llvm::Value* create_load(const std::string& name, bool masked = false); /// Generates LLVM IR to load the value from the pointer and returns it. - llvm::Value* create_load(llvm::Value* ptr); + llvm::Value* create_load(llvm::Value* ptr, bool masked = false); /// Generates LLVM IR to load the element at the specified index from the given array name and /// returns it. llvm::Value* create_load_from_array(const std::string& name, llvm::Value* index); /// Generates LLVM IR to store the value to the location specified by the name. - void create_store(const std::string& name, llvm::Value* value); + void create_store(const std::string& name, llvm::Value* value, bool masked = false); /// Generates LLVM IR to store the value to the location specified by the pointer. - void create_store(llvm::Value* ptr, llvm::Value* value); + void create_store(llvm::Value* ptr, llvm::Value* value, bool masked = false); /// Generates LLVM IR to store the value to the array element, where array is specified by the /// name. @@ -234,6 +243,9 @@ class IRBuilder { /// Creates a pointer to struct type with the given name and given members. llvm::Type* get_struct_ptr_type(const std::string& struct_type_name, TypeVector& member_types); + /// Inverts the mask for vector code generation by xoring it. + void invert_mask(); + /// Generates IR that loads the elements of the array even during vectorization. If the value is /// specified, then it is stored to the array at the given index. llvm::Value* load_to_or_store_from_array(const std::string& id_name, diff --git a/test/unit/codegen/codegen_llvm_execution.cpp b/test/unit/codegen/codegen_llvm_execution.cpp index baa370143b..aa77a4e493 100644 --- a/test/unit/codegen/codegen_llvm_execution.cpp +++ b/test/unit/codegen/codegen_llvm_execution.cpp @@ -508,3 +508,104 @@ SCENARIO("Vectorised kernel with scatter instruction", "[llvm][runner]") { } } } + +//============================================================================= +// Vectorised kernel with control flow. 
+//============================================================================= + +SCENARIO("Vectorised kernel with simple control flow", "[llvm][runner]") { + GIVEN("Simple MOD file with if statement") { + std::string nmodl_text = R"( + NEURON { + SUFFIX test + } + + STATE { + w x y z + } + + BREAKPOINT { + SOLVE states METHOD cnexp + } + + DERIVATIVE states { + IF (v > 0) { + w = v * w + } + + IF (x < 0) { + x = 7 + } + + IF (0 <= y && y < 10 || z == 0) { + y = 2 * y + } ELSE { + z = z - y + } + + } + )"; + + + NmodlDriver driver; + const auto& ast = driver.parse_string(nmodl_text); + + // Run passes on the AST to generate LLVM. + SymtabVisitor().visit_program(*ast); + NeuronSolveVisitor().visit_program(*ast); + SolveBlockVisitor().visit_program(*ast); + codegen::CodegenLLVMVisitor llvm_visitor(/*mod_filename=*/"unknown", + /*output_dir=*/".", + /*opt_passes=*/false, + /*use_single_precision=*/false, + /*vector_width=*/2); + llvm_visitor.visit_program(*ast); + llvm_visitor.wrap_kernel_functions(); + + // Create the instance struct data. + int num_elements = 5; + const auto& generated_instance_struct = llvm_visitor.get_instance_struct_ptr(); + auto codegen_data = codegen::CodegenDataHelper(ast, generated_instance_struct); + auto instance_data = codegen_data.create_data(num_elements, /*seed=*/1); + + // Fill the instance struct data with some values. + std::vector x = {-1.0, 2.0, -3.0, 4.0, -5.0}; + std::vector y = {11.0, 2.0, -3.0, 4.0, 100.0}; + std::vector z = {0.0, 1.0, 20.0, 0.0, 40.0}; + + std::vector w = {10.0, 20.0, 30.0, 40.0, 50.0}; + std::vector voltage = {-1.0, 2.0, -1.0, 2.0, -1.0}; + std::vector node_index = {1, 2, 3, 4, 0}; + + InstanceTestInfo instance_info{&instance_data, + llvm_visitor.get_instance_var_helper(), + num_elements}; + initialise_instance_variable(instance_info, w, "w"); + initialise_instance_variable(instance_info, voltage, "voltage"); + initialise_instance_variable(instance_info, node_index, "node_index"); + + initialise_instance_variable(instance_info, x, "x"); + initialise_instance_variable(instance_info, y, "y"); + initialise_instance_variable(instance_info, z, "z"); + + // Set up the JIT runner. 
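+        // The runner takes ownership of the module and JIT-compiles it, so that the
+        // kernel wrapper can be invoked directly on the instance data below.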
+ std::unique_ptr module = llvm_visitor.get_module(); + TestRunner runner(std::move(module)); + runner.initialize_driver(); + + THEN("Masked instructions are generated") { + runner.run_with_argument("__nrn_state_test_wrapper", + instance_data.base_ptr); + std::vector w_expected = {20.0, 20.0, 60.0, 40.0, 50.0}; + REQUIRE(check_instance_variable(instance_info, w_expected, "w")); + + std::vector x_expected = {7.0, 2.0, 7.0, 4.0, 7.0}; + REQUIRE(check_instance_variable(instance_info, x_expected, "x")); + + std::vector y_expected = {22.0, 4.0, -3.0, 8.0, 100.0}; + std::vector z_expected = {0.0, 1.0, 23.0, 0.0, -60.0}; + REQUIRE(check_instance_variable(instance_info, y_expected, "y")); + REQUIRE(check_instance_variable(instance_info, z_expected, "z")); + } + } +} diff --git a/test/unit/codegen/codegen_llvm_ir.cpp b/test/unit/codegen/codegen_llvm_ir.cpp index 3295411f7a..4920a26c4c 100644 --- a/test/unit/codegen/codegen_llvm_ir.cpp +++ b/test/unit/codegen/codegen_llvm_ir.cpp @@ -1020,6 +1020,75 @@ SCENARIO("Vectorised simple kernel with ion writes", "[visitor][llvm]") { } } +//============================================================================= +// Vectorised kernel with simple control flow +//============================================================================= + +SCENARIO("Vectorised simple kernel with control flow", "[visitor][llvm]") { + GIVEN("A single if/else statement") { + std::string nmodl_text = R"( + NEURON { + SUFFIX test + } + + STATE { + y + } + + BREAKPOINT { + SOLVE states METHOD cnexp + } + + DERIVATIVE states { + IF (y < 0) { + y = y + 7 + } ELSE { + y = v + } + } + )"; + + THEN("masked load and stores are created") { + std::string module_string = run_llvm_visitor(nmodl_text, + /*opt=*/false, + /*use_single_precision=*/true, + /*vector_width=*/8); + std::smatch m; + + // Check masked load/store intrinsics are correctly declared. + std::regex masked_load( + R"(declare <8 x float> @llvm\.masked\.load\.v8f32\.p0v8f32\(<8 x float>\*, i32 immarg, <8 x i1>, <8 x float>\))"); + std::regex masked_store( + R"(declare void @llvm.masked\.store\.v8f32\.p0v8f32\(<8 x float>, <8 x float>\*, i32 immarg, <8 x i1>\))"); + REQUIRE(std::regex_search(module_string, m, masked_load)); + REQUIRE(std::regex_search(module_string, m, masked_store)); + + // Check true direction instructions are predicated with mask. + // IF (mech->y[id] < 0) { + // mech->y[id] = mech->y[id] + 7 + std::regex mask(R"(%30 = fcmp olt <8 x float> %.*, zeroinitializer)"); + std::regex true_load( + R"(call <8 x float> @llvm\.masked\.load\.v8f32\.p0v8f32\(<8 x float>\* %.*, i32 1, <8 x i1> %30, <8 x float> undef\))"); + std::regex true_store( + R"(call void @llvm\.masked\.store\.v8f32\.p0v8f32\(<8 x float> %.*, <8 x float>\* %.*, i32 1, <8 x i1> %30\))"); + REQUIRE(std::regex_search(module_string, m, mask)); + REQUIRE(std::regex_search(module_string, m, true_load)); + REQUIRE(std::regex_search(module_string, m, true_store)); + + // Check false direction instructions are predicated with inverted mask. 
+ // } ELSE { + // mech->y[id] = v + // } + std::regex inverted_mask( + R"(%47 = xor <8 x i1> %30, )"); + std::regex false_load( + R"(call <8 x float> @llvm\.masked\.load\.v8f32\.p0v8f32\(<8 x float>\* %v, i32 1, <8 x i1> %47, <8 x float> undef\))"); + std::regex false_store( + R"(call void @llvm\.masked\.store\.v8f32\.p0v8f32\(<8 x float> %.*, <8 x float>\* %.*, i32 1, <8 x i1> %47\))"); + } + } +} + //============================================================================= // Derivative block : test optimization //============================================================================= From 999bf36d88154d35c59333af06cfdbd541ce1173 Mon Sep 17 00:00:00 2001 From: George Mitenkov Date: Thu, 20 May 2021 07:32:50 -0700 Subject: [PATCH 169/331] Improvements for LLVM code generation and benchmarking (#661) * Improved cmake versioning of LLVM * Added ^ support * Added more math functions intrinsics with tests * Added compute time variance and min/max times in benchmarking output --- CMakeLists.txt | 3 - src/codegen/llvm/codegen_llvm_visitor.cpp | 4 +- src/codegen/llvm/codegen_llvm_visitor.hpp | 2 +- src/codegen/llvm/llvm_ir_builder.cpp | 24 ++++- test/benchmark/llvm_benchmark.cpp | 29 ++++-- test/unit/codegen/codegen_llvm_ir.cpp | 117 ++++++++++++++++++++-- 6 files changed, 154 insertions(+), 25 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index a321d5e558..cacf9443ff 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -161,9 +161,6 @@ if(NMODL_ENABLE_LLVM) include(LLVMHelper) include_directories(${LLVM_INCLUDE_DIRS}) add_definitions(-DNMODL_LLVM_BACKEND) - if(LLVM_VERSION VERSION_LESS_EQUAL 12) - add_definitions(-DLLVM_VERSION_LESS_THAN_13) - endif() endif() # ============================================================================= diff --git a/src/codegen/llvm/codegen_llvm_visitor.cpp b/src/codegen/llvm/codegen_llvm_visitor.cpp index ec41008da0..ba28361e09 100644 --- a/src/codegen/llvm/codegen_llvm_visitor.cpp +++ b/src/codegen/llvm/codegen_llvm_visitor.cpp @@ -20,7 +20,7 @@ #include "llvm/Support/Host.h" #include "llvm/Support/ToolOutputFile.h" -#ifndef LLVM_VERSION_LESS_THAN_13 +#if LLVM_VERSION_MAJOR >= 13 #include "llvm/CodeGen/ReplaceWithVeclib.h" #endif @@ -819,7 +819,7 @@ void CodegenLLVMVisitor::visit_program(const ast::Program& node) { // Optionally, replace LLVM's maths intrinsics with vector library calls. if (vector_width > 1 && vector_library != llvm::TargetLibraryInfoImpl::NoLibrary) { -#ifdef LLVM_VERSION_LESS_THAN_13 +#if LLVM_VERSION_MAJOR < 13 logger->warn( "This version of LLVM does not support replacement of LLVM intrinsics with vector " "library calls"); diff --git a/src/codegen/llvm/codegen_llvm_visitor.hpp b/src/codegen/llvm/codegen_llvm_visitor.hpp index 384c20c2c7..a97e73030a 100644 --- a/src/codegen/llvm/codegen_llvm_visitor.hpp +++ b/src/codegen/llvm/codegen_llvm_visitor.hpp @@ -51,7 +51,7 @@ namespace codegen { /// A map to query vector library by its string value. 
static const std::map veclib_map = { {"Accelerate", llvm::TargetLibraryInfoImpl::Accelerate}, -#ifndef LLVM_VERSION_LESS_THAN_13 +#if LLVM_VERSION_MAJOR >= 13 {"libmvec", llvm::TargetLibraryInfoImpl::LIBMVEC_X86}, #endif {"MASSV", llvm::TargetLibraryInfoImpl::MASSV}, diff --git a/src/codegen/llvm/llvm_ir_builder.cpp b/src/codegen/llvm/llvm_ir_builder.cpp index 90e7456e33..c67941df3e 100644 --- a/src/codegen/llvm/llvm_ir_builder.cpp +++ b/src/codegen/llvm/llvm_ir_builder.cpp @@ -165,9 +165,26 @@ void IRBuilder::create_function_call(llvm::Function* callee, void IRBuilder::create_intrinsic(const std::string& name, ValueVector& argument_values, TypeVector& argument_types) { + // Process 'pow' call separately. + if (name == "pow") { + llvm::Value* pow_intrinsic = builder.CreateIntrinsic(llvm::Intrinsic::pow, + {argument_types.front()}, + argument_values); + value_stack.push_back(pow_intrinsic); + return; + } + + // Create other intrinsics. unsigned intrinsic_id = llvm::StringSwitch(name) + .Case("ceil", llvm::Intrinsic::ceil) + .Case("cos", llvm::Intrinsic::cos) .Case("exp", llvm::Intrinsic::exp) - .Case("pow", llvm::Intrinsic::pow) + .Case("fabs", llvm::Intrinsic::fabs) + .Case("floor", llvm::Intrinsic::floor) + .Case("log", llvm::Intrinsic::log) + .Case("log10", llvm::Intrinsic::log10) + .Case("sin", llvm::Intrinsic::sin) + .Case("sqrt", llvm::Intrinsic::sqrt) .Default(llvm::Intrinsic::not_intrinsic); if (intrinsic_id) { llvm::Value* intrinsic = @@ -267,6 +284,11 @@ void IRBuilder::create_binary_op(llvm::Value* lhs, llvm::Value* rhs, ast::Binary #undef DISPATCH + // Separately replace ^ with the `pow` intrinsic. + case ast::BinaryOp::BOP_POWER: + result = builder.CreateIntrinsic(llvm::Intrinsic::pow, {lhs->getType()}, {lhs, rhs}); + break; + // Logical instructions. case ast::BinaryOp::BOP_AND: result = builder.CreateAnd(lhs, rhs); diff --git a/test/benchmark/llvm_benchmark.cpp b/test/benchmark/llvm_benchmark.cpp index f6811fd664..b9f2fdeced 100644 --- a/test/benchmark/llvm_benchmark.cpp +++ b/test/benchmark/llvm_benchmark.cpp @@ -107,15 +107,21 @@ void LLVMBenchmark::run_benchmark(const std::shared_ptr& node) { // Benchmark every kernel. for (const auto& kernel_name: kernel_names) { - // Initialise the data. - auto instance_data = codegen_data.create_data(instance_size, /*seed=*/1); - - double size_mbs = instance_data.num_bytes / (1024.0 * 1024.0); - logger->info("Benchmarking kernel '{}' with {} MBs dataset", kernel_name, size_mbs); - // For every kernel run the benchmark `num_experiments` times. + double time_min = std::numeric_limits::max(); + double time_max = 0.0; double time_sum = 0.0; + double time_squared_sum = 0.0; for (int i = 0; i < num_experiments; ++i) { + // Initialise the data. + auto instance_data = codegen_data.create_data(instance_size, /*seed=*/1); + + // Log instance size once. + if (i == 0) { + double size_mbs = instance_data.num_bytes / (1024.0 * 1024.0); + logger->info("Benchmarking kernel '{}' with {} MBs dataset", kernel_name, size_mbs); + } + // Record the execution time of the kernel. std::string wrapper_name = "__" + kernel_name + "_wrapper"; auto start = std::chrono::high_resolution_clock::now(); @@ -126,10 +132,19 @@ void LLVMBenchmark::run_benchmark(const std::shared_ptr& node) { // Log the time taken for each run. logger->info("Experiment {} compute time = {:.6f} sec", i, diff.count()); + // Update statistics. 
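+            // The running sum and squared sum feed the mean and variance computed
+            // after the loop; min/max track the fastest and slowest runs.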
            time_sum += diff.count();
+            time_squared_sum += diff.count() * diff.count();
+            time_min = std::min(time_min, diff.count());
+            time_max = std::max(time_max, diff.count());
        }
        // Log the average time taken for the kernel.
-        logger->info("Average compute time = {:.6f} \n", time_sum / num_experiments);
+        double time_mean = time_sum / num_experiments;
+        logger->info("Average compute time = {:.6f}", time_mean);
+        logger->info("Compute time variance = {:g}",
+                     time_squared_sum / num_experiments - time_mean * time_mean);
+        logger->info("Minimum compute time = {:.6f}", time_min);
+        logger->info("Maximum compute time = {:.6f}\n", time_max);
    }
}

diff --git a/test/unit/codegen/codegen_llvm_ir.cpp b/test/unit/codegen/codegen_llvm_ir.cpp
index 4920a26c4c..0a3facf6fc 100644
--- a/test/unit/codegen/codegen_llvm_ir.cpp
+++ b/test/unit/codegen/codegen_llvm_ir.cpp
@@ -97,7 +97,7 @@ SCENARIO("Binary expression", "[visitor][llvm]") {
            std::regex lhs(R"(%2 = load float, float\* %a)");
            std::regex res(R"(%3 = fadd float %2, %1)");

-            // Check the float values are loaded correctly and added
+            // Check the float values are loaded correctly and added.
            REQUIRE(std::regex_search(module_string, m, rhs));
            REQUIRE(std::regex_search(module_string, m, lhs));
            REQUIRE(std::regex_search(module_string, m, res));
        }
    }
@@ -116,7 +116,7 @@ SCENARIO("Binary expression", "[visitor][llvm]") {
            std::string module_string = run_llvm_visitor(nmodl_text);
            std::smatch m;

-            // Check rhs
+            // Check rhs.
            std::regex rr(R"(%1 = load double, double\* %b)");
            std::regex rl(R"(%2 = load double, double\* %a)");
            std::regex x(R"(%3 = fadd double %2, %1)");
            REQUIRE(std::regex_search(module_string, m, rr));
            REQUIRE(std::regex_search(module_string, m, rl));
            REQUIRE(std::regex_search(module_string, m, x));

-            // Check lhs
+            // Check lhs.
            std::regex lr(R"(%4 = load double, double\* %b)");
            std::regex ll(R"(%5 = load double, double\* %a)");
            std::regex y(R"(%6 = fsub double %5, %4)");
            REQUIRE(std::regex_search(module_string, m, lr));
            REQUIRE(std::regex_search(module_string, m, ll));
            REQUIRE(std::regex_search(module_string, m, y));

-            // Check result
+            // Check result.
            std::regex res(R"(%7 = fdiv double %6, %3)");
            REQUIRE(std::regex_search(module_string, m, res));
        }
@@ -150,13 +150,36 @@ SCENARIO("Binary expression", "[visitor][llvm]") {
            std::string module_string = run_llvm_visitor(nmodl_text);
            std::smatch m;

-            // Check store immediate is created
+            // Check store immediate is created.
            std::regex allocation(R"(%i = alloca double)");
            std::regex assignment(R"(store double 2.0*e\+00, double\* %i)");
            REQUIRE(std::regex_search(module_string, m, allocation));
            REQUIRE(std::regex_search(module_string, m, assignment));
        }
    }
+
+    GIVEN("Function with power operator") {
+        std::string nmodl_text = R"(
+            FUNCTION power() {
+                LOCAL i, j
+                i = 2
+                j = 4
+                power = i ^ j
+            }
+        )";
+
+        THEN("'pow' intrinsic is created") {
+            std::string module_string =
+                run_llvm_visitor(nmodl_text, /*opt=*/false, /*use_single_precision=*/true);
+            std::smatch m;
+
+            // Check 'pow' intrinsic.
+ std::regex declaration(R"(declare float @llvm\.pow\.f32\(float, float\))"); + std::regex pow(R"(call float @llvm\.pow\.f32\(float %.*, float %.*\))"); + REQUIRE(std::regex_search(module_string, m, declaration)); + REQUIRE(std::regex_search(module_string, m, pow)); + } + } } //============================================================================= @@ -492,8 +515,44 @@ SCENARIO("Function call", "[visitor][llvm]") { GIVEN("A call to external method") { std::string nmodl_text = R"( - FUNCTION bar(i) { - bar = exp(i) + FUNCTION nmodl_ceil(x) { + nmodl_ceil = ceil(x) + } + + FUNCTION nmodl_cos(x) { + nmodl_cos = cos(x) + } + + FUNCTION nmodl_exp(x) { + nmodl_exp = exp(x) + } + + FUNCTION nmodl_fabs(x) { + nmodl_fabs = fabs(x) + } + + FUNCTION nmodl_floor(x) { + nmodl_floor = floor(x) + } + + FUNCTION nmodl_log(x) { + nmodl_log = log(x) + } + + FUNCTION nmodl_log10(x) { + nmodl_log10 = log10(x) + } + + FUNCTION nmodl_pow(x, y) { + nmodl_pow = pow(x, y) + } + + FUNCTION nmodl_sin(x) { + nmodl_sin = sin(x) + } + + FUNCTION nmodl_sqrt(x) { + nmodl_sqrt = sqrt(x) } )"; @@ -501,13 +560,49 @@ SCENARIO("Function call", "[visitor][llvm]") { std::string module_string = run_llvm_visitor(nmodl_text); std::smatch m; - // Check for intrinsic declaration. + // Check for intrinsic declarations. + std::regex ceil(R"(declare double @llvm\.ceil\.f64\(double\))"); + std::regex cos(R"(declare double @llvm\.cos\.f64\(double\))"); std::regex exp(R"(declare double @llvm\.exp\.f64\(double\))"); + std::regex fabs(R"(declare double @llvm\.fabs\.f64\(double\))"); + std::regex floor(R"(declare double @llvm\.floor\.f64\(double\))"); + std::regex log(R"(declare double @llvm\.log\.f64\(double\))"); + std::regex log10(R"(declare double @llvm\.log10\.f64\(double\))"); + std::regex pow(R"(declare double @llvm\.pow\.f64\(double, double\))"); + std::regex sin(R"(declare double @llvm\.sin\.f64\(double\))"); + std::regex sqrt(R"(declare double @llvm\.sqrt\.f64\(double\))"); + REQUIRE(std::regex_search(module_string, m, ceil)); + REQUIRE(std::regex_search(module_string, m, cos)); REQUIRE(std::regex_search(module_string, m, exp)); + REQUIRE(std::regex_search(module_string, m, fabs)); + REQUIRE(std::regex_search(module_string, m, floor)); + REQUIRE(std::regex_search(module_string, m, log)); + REQUIRE(std::regex_search(module_string, m, log10)); + REQUIRE(std::regex_search(module_string, m, pow)); + REQUIRE(std::regex_search(module_string, m, sin)); + REQUIRE(std::regex_search(module_string, m, sqrt)); // Check the correct call is made. 
- std::regex call(R"(call double @llvm\.exp\.f64\(double %[0-9]+\))");
- REQUIRE(std::regex_search(module_string, m, call));
+ std::regex ceil_call(R"(call double @llvm\.ceil\.f64\(double %[0-9]+\))");
+ std::regex cos_call(R"(call double @llvm\.cos\.f64\(double %[0-9]+\))");
+ std::regex exp_call(R"(call double @llvm\.exp\.f64\(double %[0-9]+\))");
+ std::regex fabs_call(R"(call double @llvm\.fabs\.f64\(double %[0-9]+\))");
+ std::regex floor_call(R"(call double @llvm\.floor\.f64\(double %[0-9]+\))");
+ std::regex log_call(R"(call double @llvm\.log\.f64\(double %[0-9]+\))");
+ std::regex log10_call(R"(call double @llvm\.log10\.f64\(double %[0-9]+\))");
+ std::regex pow_call(R"(call double @llvm\.pow\.f64\(double %[0-9]+, double %[0-9]+\))");
+ std::regex sin_call(R"(call double @llvm\.sin\.f64\(double %[0-9]+\))");
+ std::regex sqrt_call(R"(call double @llvm\.sqrt\.f64\(double %[0-9]+\))");
+ REQUIRE(std::regex_search(module_string, m, ceil_call));
+ REQUIRE(std::regex_search(module_string, m, cos_call));
+ REQUIRE(std::regex_search(module_string, m, exp_call));
+ REQUIRE(std::regex_search(module_string, m, fabs_call));
+ REQUIRE(std::regex_search(module_string, m, floor_call));
+ REQUIRE(std::regex_search(module_string, m, log_call));
+ REQUIRE(std::regex_search(module_string, m, log10_call));
+ REQUIRE(std::regex_search(module_string, m, pow_call));
+ REQUIRE(std::regex_search(module_string, m, sin_call));
+ REQUIRE(std::regex_search(module_string, m, sqrt_call));
        }
    }
@@ -1230,7 +1325,7 @@ SCENARIO("Vector library calls", "[visitor][llvm][vector_lib]") {
        REQUIRE(std::regex_search(no_library_module_str, m, exp_decl));
        REQUIRE(std::regex_search(no_library_module_str, m, exp_call));
-#ifndef LLVM_VERSION_LESS_THAN_13
+#if LLVM_VERSION_MAJOR >= 13
        // Check exponential calls are replaced with calls to SVML library.
        std::string svml_library_module_str =
            run_llvm_visitor(nmodl_text,
                             /*opt=*/false,
From 2142e2d66eba20fa6c791212e5baf766b86ac446 Mon Sep 17 00:00:00 2001
From: George Mitenkov
Date: Thu, 20 May 2021 14:19:41 -0700
Subject: [PATCH 170/331] Fixed `alloca`s insertion point for LLVM backend
 (#663)

* With this PR, alloca instructions are always inserted at the beginning of
the function entry block. This is done to avoid them inside while or for
loops, where per-iteration allocations can cause a stack overflow (if the IR
is not optimized).

* Insertion point for allocas is the entry block now

See #653
---
 src/codegen/llvm/codegen_llvm_visitor.cpp | 4 +--
 src/codegen/llvm/llvm_ir_builder.cpp | 38 +++++++++++++++++++++--
 src/codegen/llvm/llvm_ir_builder.hpp | 7 +++++
 3 files changed, 44 insertions(+), 5 deletions(-)

diff --git a/src/codegen/llvm/codegen_llvm_visitor.cpp b/src/codegen/llvm/codegen_llvm_visitor.cpp
index ba28361e09..6df5820d42 100644
--- a/src/codegen/llvm/codegen_llvm_visitor.cpp
+++ b/src/codegen/llvm/codegen_llvm_visitor.cpp
@@ -601,12 +601,12 @@ void CodegenLLVMVisitor::visit_codegen_for_statement(const ast::CodegenForStatem
 void CodegenLLVMVisitor::visit_codegen_function(const ast::CodegenFunction& node) {
 const auto& name = node.get_node_name();
 const auto& arguments = node.get_arguments();
- llvm::Function* func = module->getFunction(name);
- ir_builder.set_function(func);

 // Create the entry basic block of the function/procedure and point the local named values table
 // to the symbol table.
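 // (create_alloca() finds the entry block through the builder's insertion
 // point, so the block is created before set_function() and any allocas)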
+ llvm::Function* func = module->getFunction(name); ir_builder.create_block_and_set_insertion_point(func); + ir_builder.set_function(func); // When processing a function, it returns a value named in NMODL. Therefore, we // first run RenameVisitor to rename it into ret_. This will aid in avoiding diff --git a/src/codegen/llvm/llvm_ir_builder.cpp b/src/codegen/llvm/llvm_ir_builder.cpp index c67941df3e..004f28d857 100644 --- a/src/codegen/llvm/llvm_ir_builder.cpp +++ b/src/codegen/llvm/llvm_ir_builder.cpp @@ -144,7 +144,7 @@ void IRBuilder::allocate_function_arguments(llvm::Function* function, for (auto& arg: function->args()) { std::string arg_name = nmodl_arguments[i++].get()->get_node_name(); llvm::Type* arg_type = arg.getType(); - llvm::Value* alloca = builder.CreateAlloca(arg_type, /*ArraySize=*/nullptr, arg_name); + llvm::Value* alloca = create_alloca(arg_name, arg_type); arg.setName(arg_name); builder.CreateStore(&arg, alloca); } @@ -245,11 +245,43 @@ void IRBuilder::set_loop_metadata(llvm::BranchInst* branch) { /* LLVM instruction utilities */ /****************************************************************************************/ +llvm::Value* IRBuilder::create_alloca(const std::string& name, llvm::Type* type) { + // If insertion point for `alloca` instructions is not set, then create the instruction in the + // entry block and set it to be the insertion point. + if (!alloca_ip) { + // Get the entry block and insert the `alloca` instruction there. + llvm::BasicBlock* current_block = builder.GetInsertBlock(); + llvm::BasicBlock& entry_block = current_block->getParent()->getEntryBlock(); + builder.SetInsertPoint(&entry_block); + llvm::Value* alloca = builder.CreateAlloca(type, /*ArraySize=*/nullptr, name); + + // Set the `alloca` instruction insertion point and restore the insertion point for the next + // set of instructions. + alloca_ip = llvm::cast(alloca); + builder.SetInsertPoint(current_block); + return alloca; + } + + // Create `alloca` instruction. + llvm::BasicBlock* alloca_block = alloca_ip->getParent(); + const auto& data_layout = alloca_block->getModule()->getDataLayout(); + auto* alloca = new llvm::AllocaInst(type, + data_layout.getAllocaAddrSpace(), + /*ArraySize=*/nullptr, + data_layout.getPrefTypeAlign(type), + name); + + // Insert `alloca` at the specified insertion point and reset it for the next instructions. + alloca_block->getInstList().insertAfter(alloca_ip->getIterator(), alloca); + alloca_ip = alloca; + return alloca; +} + void IRBuilder::create_array_alloca(const std::string& name, llvm::Type* element_type, int num_elements) { llvm::Type* array_type = llvm::ArrayType::get(element_type, num_elements); - builder.CreateAlloca(array_type, /*ArraySize=*/nullptr, name); + create_alloca(name, array_type); } void IRBuilder::create_binary_op(llvm::Value* lhs, llvm::Value* rhs, ast::BinaryOp op) { @@ -414,7 +446,7 @@ void IRBuilder::create_scalar_or_vector_alloca(const std::string& name, } else { type = element_or_scalar_type; } - builder.CreateAlloca(type, /*ArraySize=*/nullptr, name); + create_alloca(name, type); } void IRBuilder::create_unary_op(llvm::Value* value, ast::UnaryOp op) { diff --git a/src/codegen/llvm/llvm_ir_builder.hpp b/src/codegen/llvm/llvm_ir_builder.hpp index ba3800fc66..744b737392 100644 --- a/src/codegen/llvm/llvm_ir_builder.hpp +++ b/src/codegen/llvm/llvm_ir_builder.hpp @@ -46,6 +46,9 @@ class IRBuilder { /// Symbol table of the NMODL AST. symtab::SymbolTable* symbol_table; + /// Insertion point for `alloca` instructions. 
+ llvm::Instruction* alloca_ip; + /// Flag to indicate that the generated IR should be vectorized. bool vectorize; @@ -69,6 +72,7 @@ class IRBuilder { , symbol_table(nullptr) , current_function(nullptr) , vectorize(false) + , alloca_ip(nullptr) , fp_precision(use_single_precision ? single_precision : double_precision) , vector_width(vector_width) , mask(nullptr) @@ -104,6 +108,7 @@ class IRBuilder { void clear_function() { value_stack.clear(); current_function = nullptr; + alloca_ip = nullptr; } /// Sets the value to be the mask for vector code generation. @@ -125,6 +130,8 @@ class IRBuilder { void allocate_function_arguments(llvm::Function* function, const ast::CodegenVarWithTypeVector& nmodl_arguments); + llvm::Value* create_alloca(const std::string& name, llvm::Type* type); + /// Generates IR for allocating an array. void create_array_alloca(const std::string& name, llvm::Type* element_type, int num_elements); From 6d4743b7d6f6f560d27420b216143bf587764218 Mon Sep 17 00:00:00 2001 From: George Mitenkov Date: Thu, 20 May 2021 23:53:04 -0700 Subject: [PATCH 171/331] Fast math flags for LLVM backend (#662) Added support for fast math flags in LLVM backend. Currently, the user can specify them via command-line (this approach was chosen for easier benchmarking). The specified flags are named exactly the same as in LLVM. This feature is useful to enable previously unsafe FP-math optimizations. For example, fused-multiply-add instructions can now be generated when lowering LLVM IR to assembly or executing via JIT. Example: ```c++ // fma.mod FUNCTION fma(a, b, c) { fma = (a * b) + c } ``` ```bash $ ./nmodl fma.mod --verbose debug llvm --ir --fmf nnan contract afn --opt ``` ```llvm define double @fma(double %a, double %b, double %c) { %1 = fmul nnan contract afn double %a, %b %2 = fadd nnan contract afn double %1, %c ret double %2 } ``` --- src/codegen/llvm/codegen_llvm_visitor.hpp | 5 +-- src/codegen/llvm/llvm_ir_builder.hpp | 28 +++++++++++++++-- src/main.cpp | 9 +++++- test/unit/codegen/codegen_llvm_ir.cpp | 38 +++++++++++++++++++++-- 4 files changed, 73 insertions(+), 7 deletions(-) diff --git a/src/codegen/llvm/codegen_llvm_visitor.hpp b/src/codegen/llvm/codegen_llvm_visitor.hpp index a97e73030a..c3beb53640 100644 --- a/src/codegen/llvm/codegen_llvm_visitor.hpp +++ b/src/codegen/llvm/codegen_llvm_visitor.hpp @@ -113,14 +113,15 @@ class CodegenLLVMVisitor: public visitor::ConstAstVisitor { bool use_single_precision = false, int vector_width = 1, std::string vec_lib = "none", - bool add_debug_information = false) + bool add_debug_information = false, + std::vector fast_math_flags = {}) : mod_filename(mod_filename) , output_dir(output_dir) , opt_passes(opt_passes) , vector_width(vector_width) , vector_library(veclib_map.at(vec_lib)) , add_debug_information(add_debug_information) - , ir_builder(*context, use_single_precision, vector_width) + , ir_builder(*context, use_single_precision, vector_width, fast_math_flags) , debug_builder(*module) , codegen_pm(module.get()) , opt_pm(module.get()) {} diff --git a/src/codegen/llvm/llvm_ir_builder.hpp b/src/codegen/llvm/llvm_ir_builder.hpp index 744b737392..b9736e2846 100644 --- a/src/codegen/llvm/llvm_ir_builder.hpp +++ b/src/codegen/llvm/llvm_ir_builder.hpp @@ -64,10 +64,14 @@ class IRBuilder { /// The name of induction variable used in kernel loops. std::string kernel_id; + /// Fast math flags for floating-point IR instructions. 
+ std::vector fast_math_flags; + public: IRBuilder(llvm::LLVMContext& context, bool use_single_precision = false, - unsigned vector_width = 1) + unsigned vector_width = 1, + std::vector fast_math_flags = {}) : builder(context) , symbol_table(nullptr) , current_function(nullptr) @@ -76,10 +80,30 @@ class IRBuilder { , fp_precision(use_single_precision ? single_precision : double_precision) , vector_width(vector_width) , mask(nullptr) - , kernel_id("") {} + , kernel_id("") + , fast_math_flags(fast_math_flags) {} + + /// Transforms the fast math flags provided to the builder into LLVM's representation. + llvm::FastMathFlags transform_to_fmf(std::vector& flags) { + static const std::map set_flag = { + {"nnan", &llvm::FastMathFlags::setNoNaNs}, + {"ninf", &llvm::FastMathFlags::setNoInfs}, + {"nsz", &llvm::FastMathFlags::setNoSignedZeros}, + {"contract", &llvm::FastMathFlags::setAllowContract}, + {"afn", &llvm::FastMathFlags::setApproxFunc}, + {"reassoc", &llvm::FastMathFlags::setAllowReassoc}, + {"fast", &llvm::FastMathFlags::setFast}}; + llvm::FastMathFlags fmf; + for (const auto& flag: flags) { + (fmf.*(set_flag.at(flag)))(true); + } + return fmf; + } /// Initializes the builder with the symbol table and the kernel induction variable id. void initialize(symtab::SymbolTable& symbol_table, std::string& kernel_id) { + if (!fast_math_flags.empty()) + builder.setFastMathFlags(transform_to_fmf(fast_math_flags)); this->symbol_table = &symbol_table; this->kernel_id = kernel_id; } diff --git a/src/main.cpp b/src/main.cpp index 77067b0ef4..a1132d587e 100644 --- a/src/main.cpp +++ b/src/main.cpp @@ -188,6 +188,9 @@ int main(int argc, const char* argv[]) { /// disable debug information generation for the IR bool disable_debug_information(false); + /// fast math flags for LLVM backend + std::vector llvm_fast_math_flags; + /// run llvm benchmark bool run_llvm_benchmark(false); @@ -338,6 +341,9 @@ int main(int argc, const char* argv[]) { llvm_opt->add_option("--veclib", vector_library, "Vector library for maths functions ({})"_format(vector_library))->check(CLI::IsMember({"Accelerate", "libmvec", "MASSV", "SVML", "none"})); + llvm_opt->add_option("--fmf", + llvm_fast_math_flags, + "Fast math flags for floating-point optimizations (none)")->check(CLI::IsMember({"afn", "arcp", "contract", "ninf", "nnan", "nsz", "reassoc", "fast"})); // LLVM IR benchmark options. 
auto benchmark_opt = app.add_subcommand("benchmark", "LLVM benchmark option")->ignore_case(); @@ -675,7 +681,8 @@ int main(int argc, const char* argv[]) { llvm_float_type, llvm_vec_width, vector_library, - !disable_debug_information); + !disable_debug_information, + llvm_fast_math_flags); visitor.visit_program(*ast); ast_to_nmodl(*ast, filepath("llvm", "mod")); ast_to_json(*ast, filepath("llvm", "json")); diff --git a/test/unit/codegen/codegen_llvm_ir.cpp b/test/unit/codegen/codegen_llvm_ir.cpp index 0a3facf6fc..f338e13234 100644 --- a/test/unit/codegen/codegen_llvm_ir.cpp +++ b/test/unit/codegen/codegen_llvm_ir.cpp @@ -37,7 +37,8 @@ std::string run_llvm_visitor(const std::string& text, bool opt = false, bool use_single_precision = false, int vector_width = 1, - std::string vec_lib = "none") { + std::string vec_lib = "none", + std::vector fast_math_flags = {}) { NmodlDriver driver; const auto& ast = driver.parse_string(text); @@ -50,7 +51,9 @@ std::string run_llvm_visitor(const std::string& text, opt, use_single_precision, vector_width, - vec_lib); + vec_lib, + /*add_debug_information=*/false, + fast_math_flags); llvm_visitor.visit_program(*ast); return llvm_visitor.dump_module(); } @@ -1378,6 +1381,37 @@ SCENARIO("Vector library calls", "[visitor][llvm][vector_lib]") { } } +//============================================================================= +// Fast math flags +//============================================================================= + +SCENARIO("Fast math flags", "[visitor][llvm]") { + GIVEN("A function to produce fma and specified math flags") { + std::string nmodl_text = R"( + FUNCTION foo(a, b, c) { + foo = (a * b) + c + } + )"; + + THEN("instructions are generated with the flags set") { + std::string module_string = + run_llvm_visitor(nmodl_text, + /*opt=*/true, + /*use_single_precision=*/false, + /*vector_width=*/1, + /*vec_lib=*/"none", + /*fast_math_flags=*/{"nnan", "contract", "afn"}); + std::smatch m; + + // Check flags for produced 'fmul' and 'fadd' instructions. 
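+            // LLVM's printer emits fast-math flags in a fixed canonical order
+            // (nnan before contract before afn), so one regex per instruction suffices.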
+ std::regex fmul(R"(fmul nnan contract afn double %.*, %.*)"); + std::regex fadd(R"(fadd nnan contract afn double %.*, %.*)"); + REQUIRE(std::regex_search(module_string, m, fmul)); + REQUIRE(std::regex_search(module_string, m, fadd)); + } + } +} + //============================================================================= // Optimization : dead code removal //============================================================================= From e752afd4a7c7f50114249dab2bf5d085e402e655 Mon Sep 17 00:00:00 2001 From: Ioannis Magkanaris Date: Fri, 21 May 2021 14:04:39 +0200 Subject: [PATCH 172/331] Avoid generating LLVM IR for Functions and Procedures if inlined (#664) --- .../llvm/codegen_llvm_helper_visitor.cpp | 18 +++++ .../llvm/codegen_llvm_helper_visitor.hpp | 10 ++- src/codegen/llvm/codegen_llvm_visitor.cpp | 2 +- src/codegen/llvm/codegen_llvm_visitor.hpp | 7 +- src/main.cpp | 3 +- test/unit/codegen/codegen_llvm_ir.cpp | 66 ++++++++++++++++++- 6 files changed, 97 insertions(+), 9 deletions(-) diff --git a/src/codegen/llvm/codegen_llvm_helper_visitor.cpp b/src/codegen/llvm/codegen_llvm_helper_visitor.cpp index 5974edc623..ee9387be94 100644 --- a/src/codegen/llvm/codegen_llvm_helper_visitor.cpp +++ b/src/codegen/llvm/codegen_llvm_helper_visitor.cpp @@ -198,6 +198,7 @@ void CodegenLLVMHelperVisitor::create_function_for_node(ast::Block& node) { } codegen_functions.push_back(function); } + /** * \note : Order of variables is not important but we assume all pointers * are added first and then scalar variables like t, dt, second_order etc. @@ -536,11 +537,17 @@ void CodegenLLVMHelperVisitor::rename_local_variables(ast::StatementBlock& node) void CodegenLLVMHelperVisitor::visit_procedure_block(ast::ProcedureBlock& node) { + // if the Procedure block is already inlined, there is no reason to generate the LLVM IR code + if (nmodl_inline) + return; node.visit_children(*this); create_function_for_node(node); } void CodegenLLVMHelperVisitor::visit_function_block(ast::FunctionBlock& node) { + // if the Function block is already inlined, there is no reason to generate the LLVM IR code + if (nmodl_inline) + return; node.visit_children(*this); create_function_for_node(node); } @@ -786,6 +793,17 @@ void CodegenLLVMHelperVisitor::visit_program(ast::Program& node) { for (auto& fun: codegen_functions) { node.emplace_back_node(fun); } + // Remove Function and Procedure blocks from the Program since they are already inlined + if (nmodl_inline) { + const auto& func_proc_nodes = + collect_nodes(node, + {ast::AstNodeType::FUNCTION_BLOCK, ast::AstNodeType::PROCEDURE_BLOCK}); + std::unordered_set nodes_to_erase; + for (const auto& ast_node: func_proc_nodes) { + nodes_to_erase.insert(static_cast(ast_node.get())); + } + node.erase_node(nodes_to_erase); + } } diff --git a/src/codegen/llvm/codegen_llvm_helper_visitor.hpp b/src/codegen/llvm/codegen_llvm_helper_visitor.hpp index bbff588675..3619cbc32e 100644 --- a/src/codegen/llvm/codegen_llvm_helper_visitor.hpp +++ b/src/codegen/llvm/codegen_llvm_helper_visitor.hpp @@ -100,9 +100,12 @@ struct InstanceVarHelper { * these will be common across all backends. 
*/ class CodegenLLVMHelperVisitor: public visitor::AstVisitor { - // explicit vectorisation width + /// explicit vectorisation width int vector_width; + /// variable to check whether Function and Procedures blocks are inline by NMODL passes + bool nmodl_inline; + /// newly generated code generation specific functions CodegenFunctionVector codegen_functions; @@ -134,8 +137,9 @@ class CodegenLLVMHelperVisitor: public visitor::AstVisitor { static const std::string VOLTAGE_VAR; static const std::string NODE_INDEX_VAR; - CodegenLLVMHelperVisitor(int vector_width) - : vector_width(vector_width){}; + CodegenLLVMHelperVisitor(int vector_width, bool nmodl_inline) + : vector_width(vector_width) + , nmodl_inline(nmodl_inline) {} const InstanceVarHelper& get_instance_var_helper() { return instance_var_helper; diff --git a/src/codegen/llvm/codegen_llvm_visitor.cpp b/src/codegen/llvm/codegen_llvm_visitor.cpp index 6df5820d42..515949e329 100644 --- a/src/codegen/llvm/codegen_llvm_visitor.cpp +++ b/src/codegen/llvm/codegen_llvm_visitor.cpp @@ -770,7 +770,7 @@ void CodegenLLVMVisitor::visit_program(const ast::Program& node) { // - convert function and procedure blocks into CodegenFunctions // - gather information about AST. For now, information about functions // and procedures is used only. - CodegenLLVMHelperVisitor v{vector_width}; + CodegenLLVMHelperVisitor v{vector_width, nmodl_inline}; const auto& functions = v.get_codegen_functions(node); instance_var_helper = v.get_instance_var_helper(); sym_tab = node.get_symbol_table(); diff --git a/src/codegen/llvm/codegen_llvm_visitor.hpp b/src/codegen/llvm/codegen_llvm_visitor.hpp index c3beb53640..cbc0f9b949 100644 --- a/src/codegen/llvm/codegen_llvm_visitor.hpp +++ b/src/codegen/llvm/codegen_llvm_visitor.hpp @@ -69,6 +69,9 @@ class CodegenLLVMVisitor: public visitor::ConstAstVisitor { /// Output directory for code generation. std::string output_dir; + /// Variable to check if Functions and Procedures are inlined by NMODL passes + bool nmodl_inline; + private: /// Underlying LLVM context. 
std::unique_ptr context = std::make_unique(); @@ -114,9 +117,11 @@ class CodegenLLVMVisitor: public visitor::ConstAstVisitor { int vector_width = 1, std::string vec_lib = "none", bool add_debug_information = false, - std::vector fast_math_flags = {}) + std::vector fast_math_flags = {}, + bool nmodl_inline = false) : mod_filename(mod_filename) , output_dir(output_dir) + , nmodl_inline(nmodl_inline) , opt_passes(opt_passes) , vector_width(vector_width) , vector_library(veclib_map.at(vec_lib)) diff --git a/src/main.cpp b/src/main.cpp index a1132d587e..b008ed62a6 100644 --- a/src/main.cpp +++ b/src/main.cpp @@ -682,7 +682,8 @@ int main(int argc, const char* argv[]) { llvm_vec_width, vector_library, !disable_debug_information, - llvm_fast_math_flags); + llvm_fast_math_flags, + nmodl_inline); visitor.visit_program(*ast); ast_to_nmodl(*ast, filepath("llvm", "mod")); ast_to_json(*ast, filepath("llvm", "json")); diff --git a/test/unit/codegen/codegen_llvm_ir.cpp b/test/unit/codegen/codegen_llvm_ir.cpp index f338e13234..0953034c99 100644 --- a/test/unit/codegen/codegen_llvm_ir.cpp +++ b/test/unit/codegen/codegen_llvm_ir.cpp @@ -16,6 +16,7 @@ #include "codegen/llvm/codegen_llvm_visitor.hpp" #include "parser/nmodl_driver.hpp" #include "visitors/checkparent_visitor.hpp" +#include "visitors/inline_visitor.hpp" #include "visitors/neuron_solve_visitor.hpp" #include "visitors/solve_block_visitor.hpp" #include "visitors/symtab_visitor.hpp" @@ -38,11 +39,15 @@ std::string run_llvm_visitor(const std::string& text, bool use_single_precision = false, int vector_width = 1, std::string vec_lib = "none", - std::vector fast_math_flags = {}) { + std::vector fast_math_flags = {}, + bool nmodl_inline = false) { NmodlDriver driver; const auto& ast = driver.parse_string(text); SymtabVisitor().visit_program(*ast); + if (nmodl_inline) { + InlineVisitor().visit_program(*ast); + } NeuronSolveVisitor().visit_program(*ast); SolveBlockVisitor().visit_program(*ast); @@ -53,7 +58,9 @@ std::string run_llvm_visitor(const std::string& text, vector_width, vec_lib, /*add_debug_information=*/false, - fast_math_flags); + fast_math_flags, + nmodl_inline); + llvm_visitor.visit_program(*ast); return llvm_visitor.dump_module(); } @@ -71,7 +78,7 @@ std::vector> run_llvm_visitor_helper( SymtabVisitor().visit_program(*ast); SolveBlockVisitor().visit_program(*ast); - CodegenLLVMHelperVisitor(vector_width).visit_program(*ast); + CodegenLLVMHelperVisitor(vector_width, /*nmodl_inline=*/false).visit_program(*ast); const auto& nodes = collect_nodes(*ast, nodes_to_collect); @@ -1436,3 +1443,56 @@ SCENARIO("Dead code removal", "[visitor][llvm][opt]") { } } } + +//============================================================================= +// Inlining: remove inline code blocks +//============================================================================= + +SCENARIO("Removal of inlined functions and procedures", "[visitor][llvm][inline]") { + GIVEN("Simple breakpoint block calling a function and a procedure") { + std::string nmodl_text = R"( + NEURON { + SUFFIX test_inline + RANGE a, b, s + } + ASSIGNED { + a + b + s + } + PROCEDURE test_add(a, b) { + LOCAL i + i = a + b + } + FUNCTION test_sub(a, b) { + test_sub = a - b + } + BREAKPOINT { + SOLVE states METHOD cnexp + } + DERIVATIVE states { + a = 1 + b = 2 + test_add(a, b) + s = test_sub(a, b) + } + )"; + + THEN("when the code is inlined the procedure and function blocks are removed") { + std::string module_string = run_llvm_visitor(nmodl_text, + /*opt=*/false, + /*use_single_precision=*/false, 
+ /*vector_width=*/1, + /*vec_lib=*/"none", + /*fast_math_flags=*/{}, + /*nmodl_inline=*/true); + std::smatch m; + + // Check if the procedure and function declarations are removed + std::regex add_proc(R"(define i32 @test_add\(double %a[0-9].*, double %b[0-9].*\))"); + REQUIRE(!std::regex_search(module_string, m, add_proc)); + std::regex sub_func(R"(define double @test_sub\(double %a[0-9].*, double %b[0-9].*\))"); + REQUIRE(!std::regex_search(module_string, m, sub_func)); + } + } +} From 9ece2e91ac77ab6c3ae1df4490a804c4c4fa3447 Mon Sep 17 00:00:00 2001 From: George Mitenkov Date: Fri, 21 May 2021 08:21:07 -0700 Subject: [PATCH 173/331] Fixed typo in benchmarking metrics (#665) --- test/benchmark/llvm_benchmark.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/benchmark/llvm_benchmark.cpp b/test/benchmark/llvm_benchmark.cpp index b9f2fdeced..e48df0d457 100644 --- a/test/benchmark/llvm_benchmark.cpp +++ b/test/benchmark/llvm_benchmark.cpp @@ -144,7 +144,7 @@ void LLVMBenchmark::run_benchmark(const std::shared_ptr& node) { logger->info("Compute time variance = {:g}", time_squared_sum / num_experiments - time_mean * time_mean); logger->info("Minimum compute time = {:.6f}", time_min); - logger->info("Minimum compute time = {:.6f}\n", time_max); + logger->info("Maximum compute time = {:.6f}\n", time_max); } } From 468b3d19f725b5385af026fe62facd4343175c28 Mon Sep 17 00:00:00 2001 From: Ioannis Magkanaris Date: Fri, 21 May 2021 23:04:19 +0200 Subject: [PATCH 174/331] Remove only inlined blocks from AST based on symtab properties (#668) --- .../llvm/codegen_llvm_helper_visitor.cpp | 36 ++++++++++--------- .../llvm/codegen_llvm_helper_visitor.hpp | 11 +++--- src/codegen/llvm/codegen_llvm_visitor.cpp | 2 +- src/codegen/llvm/codegen_llvm_visitor.hpp | 7 +--- src/main.cpp | 3 +- test/unit/codegen/codegen_llvm_ir.cpp | 5 ++- 6 files changed, 29 insertions(+), 35 deletions(-) diff --git a/src/codegen/llvm/codegen_llvm_helper_visitor.cpp b/src/codegen/llvm/codegen_llvm_helper_visitor.cpp index ee9387be94..654afd8ef5 100644 --- a/src/codegen/llvm/codegen_llvm_helper_visitor.cpp +++ b/src/codegen/llvm/codegen_llvm_helper_visitor.cpp @@ -10,6 +10,7 @@ #include "ast/all.hpp" #include "codegen/codegen_helper_visitor.hpp" +#include "symtab/symbol_table.hpp" #include "utils/logger.hpp" #include "visitors/rename_visitor.hpp" #include "visitors/visitor_utils.hpp" @@ -19,6 +20,8 @@ namespace codegen { using namespace fmt::literals; +using symtab::syminfo::Status; + /// initialize static member variables const ast::AstNodeType CodegenLLVMHelperVisitor::INTEGER_TYPE = ast::AstNodeType::INTEGER; const ast::AstNodeType CodegenLLVMHelperVisitor::FLOAT_TYPE = ast::AstNodeType::DOUBLE; @@ -537,17 +540,11 @@ void CodegenLLVMHelperVisitor::rename_local_variables(ast::StatementBlock& node) void CodegenLLVMHelperVisitor::visit_procedure_block(ast::ProcedureBlock& node) { - // if the Procedure block is already inlined, there is no reason to generate the LLVM IR code - if (nmodl_inline) - return; node.visit_children(*this); create_function_for_node(node); } void CodegenLLVMHelperVisitor::visit_function_block(ast::FunctionBlock& node) { - // if the Function block is already inlined, there is no reason to generate the LLVM IR code - if (nmodl_inline) - return; node.visit_children(*this); create_function_for_node(node); } @@ -780,6 +777,21 @@ void CodegenLLVMHelperVisitor::visit_nrn_state_block(ast::NrnStateBlock& node) { std::cout << nmodl::to_nmodl(function) << std::endl; } +void 
CodegenLLVMHelperVisitor::remove_inlined_nodes(ast::Program& node) { + auto program_symtab = node.get_model_symbol_table(); + const auto& func_proc_nodes = + collect_nodes(node, {ast::AstNodeType::FUNCTION_BLOCK, ast::AstNodeType::PROCEDURE_BLOCK}); + std::unordered_set nodes_to_erase; + for (const auto& ast_node: func_proc_nodes) { + if (program_symtab->lookup(ast_node->get_node_name()) + .get() + ->has_all_status(Status::inlined)) { + nodes_to_erase.insert(static_cast(ast_node.get())); + } + } + node.erase_node(nodes_to_erase); +} + void CodegenLLVMHelperVisitor::visit_program(ast::Program& node) { /// run codegen helper visitor to collect information CodegenHelperVisitor v; @@ -789,21 +801,11 @@ void CodegenLLVMHelperVisitor::visit_program(ast::Program& node) { node.emplace_back_node(instance_var_helper.instance); logger->info("Running CodegenLLVMHelperVisitor"); + remove_inlined_nodes(node); node.visit_children(*this); for (auto& fun: codegen_functions) { node.emplace_back_node(fun); } - // Remove Function and Procedure blocks from the Program since they are already inlined - if (nmodl_inline) { - const auto& func_proc_nodes = - collect_nodes(node, - {ast::AstNodeType::FUNCTION_BLOCK, ast::AstNodeType::PROCEDURE_BLOCK}); - std::unordered_set nodes_to_erase; - for (const auto& ast_node: func_proc_nodes) { - nodes_to_erase.insert(static_cast(ast_node.get())); - } - node.erase_node(nodes_to_erase); - } } diff --git a/src/codegen/llvm/codegen_llvm_helper_visitor.hpp b/src/codegen/llvm/codegen_llvm_helper_visitor.hpp index 3619cbc32e..9d79e24803 100644 --- a/src/codegen/llvm/codegen_llvm_helper_visitor.hpp +++ b/src/codegen/llvm/codegen_llvm_helper_visitor.hpp @@ -103,9 +103,6 @@ class CodegenLLVMHelperVisitor: public visitor::AstVisitor { /// explicit vectorisation width int vector_width; - /// variable to check whether Function and Procedures blocks are inline by NMODL passes - bool nmodl_inline; - /// newly generated code generation specific functions CodegenFunctionVector codegen_functions; @@ -137,9 +134,8 @@ class CodegenLLVMHelperVisitor: public visitor::AstVisitor { static const std::string VOLTAGE_VAR; static const std::string NODE_INDEX_VAR; - CodegenLLVMHelperVisitor(int vector_width, bool nmodl_inline) - : vector_width(vector_width) - , nmodl_inline(nmodl_inline) {} + CodegenLLVMHelperVisitor(int vector_width) + : vector_width(vector_width) {} const InstanceVarHelper& get_instance_var_helper() { return instance_var_helper; @@ -169,6 +165,9 @@ class CodegenLLVMHelperVisitor: public visitor::AstVisitor { void convert_local_statement(ast::StatementBlock& node); void rename_local_variables(ast::StatementBlock& node); + /// Remove Function and Procedure blocks from the node since they are already inlined + void remove_inlined_nodes(ast::Program& node); + void visit_procedure_block(ast::ProcedureBlock& node) override; void visit_function_block(ast::FunctionBlock& node) override; void visit_nrn_state_block(ast::NrnStateBlock& node) override; diff --git a/src/codegen/llvm/codegen_llvm_visitor.cpp b/src/codegen/llvm/codegen_llvm_visitor.cpp index 515949e329..6df5820d42 100644 --- a/src/codegen/llvm/codegen_llvm_visitor.cpp +++ b/src/codegen/llvm/codegen_llvm_visitor.cpp @@ -770,7 +770,7 @@ void CodegenLLVMVisitor::visit_program(const ast::Program& node) { // - convert function and procedure blocks into CodegenFunctions // - gather information about AST. For now, information about functions // and procedures is used only. 
- CodegenLLVMHelperVisitor v{vector_width, nmodl_inline}; + CodegenLLVMHelperVisitor v{vector_width}; const auto& functions = v.get_codegen_functions(node); instance_var_helper = v.get_instance_var_helper(); sym_tab = node.get_symbol_table(); diff --git a/src/codegen/llvm/codegen_llvm_visitor.hpp b/src/codegen/llvm/codegen_llvm_visitor.hpp index cbc0f9b949..c3beb53640 100644 --- a/src/codegen/llvm/codegen_llvm_visitor.hpp +++ b/src/codegen/llvm/codegen_llvm_visitor.hpp @@ -69,9 +69,6 @@ class CodegenLLVMVisitor: public visitor::ConstAstVisitor { /// Output directory for code generation. std::string output_dir; - /// Variable to check if Functions and Procedures are inlined by NMODL passes - bool nmodl_inline; - private: /// Underlying LLVM context. std::unique_ptr context = std::make_unique(); @@ -117,11 +114,9 @@ class CodegenLLVMVisitor: public visitor::ConstAstVisitor { int vector_width = 1, std::string vec_lib = "none", bool add_debug_information = false, - std::vector fast_math_flags = {}, - bool nmodl_inline = false) + std::vector fast_math_flags = {}) : mod_filename(mod_filename) , output_dir(output_dir) - , nmodl_inline(nmodl_inline) , opt_passes(opt_passes) , vector_width(vector_width) , vector_library(veclib_map.at(vec_lib)) diff --git a/src/main.cpp b/src/main.cpp index b008ed62a6..a1132d587e 100644 --- a/src/main.cpp +++ b/src/main.cpp @@ -682,8 +682,7 @@ int main(int argc, const char* argv[]) { llvm_vec_width, vector_library, !disable_debug_information, - llvm_fast_math_flags, - nmodl_inline); + llvm_fast_math_flags); visitor.visit_program(*ast); ast_to_nmodl(*ast, filepath("llvm", "mod")); ast_to_json(*ast, filepath("llvm", "json")); diff --git a/test/unit/codegen/codegen_llvm_ir.cpp b/test/unit/codegen/codegen_llvm_ir.cpp index 0953034c99..a0a4af297c 100644 --- a/test/unit/codegen/codegen_llvm_ir.cpp +++ b/test/unit/codegen/codegen_llvm_ir.cpp @@ -58,8 +58,7 @@ std::string run_llvm_visitor(const std::string& text, vector_width, vec_lib, /*add_debug_information=*/false, - fast_math_flags, - nmodl_inline); + fast_math_flags); llvm_visitor.visit_program(*ast); return llvm_visitor.dump_module(); @@ -78,7 +77,7 @@ std::vector> run_llvm_visitor_helper( SymtabVisitor().visit_program(*ast); SolveBlockVisitor().visit_program(*ast); - CodegenLLVMHelperVisitor(vector_width, /*nmodl_inline=*/false).visit_program(*ast); + CodegenLLVMHelperVisitor(vector_width).visit_program(*ast); const auto& nodes = collect_nodes(*ast, nodes_to_collect); From 47e3c4a66ed1e2e272c3631795b0368d76c4a101 Mon Sep 17 00:00:00 2001 From: Pramod Kumbhar Date: Tue, 25 May 2021 12:22:14 +0200 Subject: [PATCH 175/331] Use VarName on the RHS of assignment expression (#669) - NMODL parser uses VarName on the LHS of assignment expression - Inline visitor was using Name on the LHS of assignment expression Related to #667 --- src/visitors/inline_visitor.cpp | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/src/visitors/inline_visitor.cpp b/src/visitors/inline_visitor.cpp index 7d82ce57bc..da52b4d6b7 100644 --- a/src/visitors/inline_visitor.cpp +++ b/src/visitors/inline_visitor.cpp @@ -297,6 +297,8 @@ void InlineVisitor::visit_statement_block(StatementBlock& node) { /** Visit all wrapped expressions which can contain function calls. * If a function call is replaced then the wrapped expression is * also replaced with new variable node from the inlining result. + * Note that we use `VarName` so that LHS of assignment expression + * is `VarName`, similar to parser. 
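+ * For example, after inlining, the call in `x = f(a)` is replaced by a
+ * temporary (such as `f_in_0`), and that temporary is wrapped in a VarName
+ * node, just as the parser would have produced.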
*/ void InlineVisitor::visit_wrapped_expression(WrappedExpression& node) { node.visit_children(*this); @@ -305,7 +307,9 @@ void InlineVisitor::visit_wrapped_expression(WrappedExpression& node) { auto expression = dynamic_cast(e.get()); if (replaced_fun_calls.find(expression) != replaced_fun_calls.end()) { auto var = replaced_fun_calls[expression]; - node.set_expression(std::make_shared(new String(var))); + node.set_expression(std::make_shared(new Name(new String(var)), + /*at=*/nullptr, + /*index=*/nullptr)); } } } From af7540cffb3b50ae0173e3dd5d6aa240fb26dc09 Mon Sep 17 00:00:00 2001 From: George Mitenkov Date: Sun, 30 May 2021 07:07:19 -0700 Subject: [PATCH 176/331] [LLVM] SLEEF and libsystem_m vector libraries support (#674) * Added support for `libsystem_m` and `SLEEF` vector libraries. The first one is supported by LLVM internally, so it comes for free with LLVM 13. For `SLEEF`, basic support was added for AArch64 and x86 architectures. Currently, we support - `exp` - `pow` * Added corresponding IR checks for `libsystem_m` and `SLEEF` (both AArch64 and x86). * Updated LLVM binaries for MAC OS CI, as well as for latest LLVM 13 (trunk) to fix link errors for Darwin vector library. Co-authored-by: Pramod Kumbhar --- azure-pipelines.yml | 4 +- src/codegen/llvm/codegen_llvm_visitor.cpp | 79 +++++++++++++++++++---- src/codegen/llvm/codegen_llvm_visitor.hpp | 21 +++--- src/main.cpp | 2 +- test/unit/codegen/codegen_llvm_ir.cpp | 30 +++++++++ 5 files changed, 110 insertions(+), 26 deletions(-) diff --git a/azure-pipelines.yml b/azure-pipelines.yml index effe8c43f9..0462864088 100644 --- a/azure-pipelines.yml +++ b/azure-pipelines.yml @@ -134,13 +134,13 @@ jobs: displayName: 'Install Dependencies' - script: | cd $HOME - git clone https://github.com/pramodk/llvm-nightly.git + git clone --depth 1 https://github.com/pramodk/llvm-nightly.git displayName: 'Setup LLVM v13' - script: | export PATH=/usr/local/opt/flex/bin:/usr/local/opt/bison/bin:$PATH; mkdir -p $(Build.Repository.LocalPath)/build cd $(Build.Repository.LocalPath)/build - cmake .. -DPYTHON_EXECUTABLE=$(which python3) -DCMAKE_INSTALL_PREFIX=$HOME/nmodl -DCMAKE_BUILD_TYPE=RelWithDebInfo -DNMODL_ENABLE_PYTHON_BINDINGS=OFF -DLLVM_DIR=$HOME/llvm-nightly/0421/osx/lib/cmake/llvm -DNMODL_ENABLE_LLVM=ON + cmake .. -DPYTHON_EXECUTABLE=$(which python3) -DCMAKE_INSTALL_PREFIX=$HOME/nmodl -DCMAKE_BUILD_TYPE=RelWithDebInfo -DNMODL_ENABLE_PYTHON_BINDINGS=OFF -DLLVM_DIR=$HOME/llvm-nightly/0621/osx/lib/cmake/llvm -DNMODL_ENABLE_LLVM=ON make -j 2 if [ $? -ne 0 ] then diff --git a/src/codegen/llvm/codegen_llvm_visitor.cpp b/src/codegen/llvm/codegen_llvm_visitor.cpp index 6df5820d42..1e5ca89c6d 100644 --- a/src/codegen/llvm/codegen_llvm_visitor.cpp +++ b/src/codegen/llvm/codegen_llvm_visitor.cpp @@ -65,6 +65,68 @@ static bool can_vectorize(const ast::CodegenForStatement& statement, symtab::Sym return unsupported.empty() && supported.size() <= 1; } +#if LLVM_VERSION_MAJOR >= 13 +void CodegenLLVMVisitor::add_vectorizable_functions_from_vec_lib(llvm::TargetLibraryInfoImpl& tli, + llvm::Triple& triple) { + // Since LLVM does not support SLEEF as a vector library yet, process it separately. 
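+    // The SLEEF symbols below follow the vector function ABI mangling
+    // _ZGV<isa><mask><lanes><params>_<name>: 'n' is AArch64 AdvSIMD, while
+    // 'b', 'd' and 'e' map to SSE, AVX2 and AVX-512; 'N' means unmasked.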
+ if (vector_library == "SLEEF") { + // Populate function definitions of only exp and pow (for now) +#define FIXED(w) llvm::ElementCount::getFixed(w) +#define DISPATCH(func, vec_func, width) {func, vec_func, width}, + const llvm::VecDesc aarch64_functions[] = { + // clang-format off + DISPATCH("llvm.exp.f32", "_ZGVnN4v_expf", FIXED(4)) + DISPATCH("llvm.exp.f64", "_ZGVnN2v_exp", FIXED(2)) + DISPATCH("llvm.pow.f32", "_ZGVnN4vv_powf", FIXED(4)) + DISPATCH("llvm.pow.f64", "_ZGVnN2vv_pow", FIXED(2)) + // clang-format on + }; + const llvm::VecDesc x86_functions[] = { + // clang-format off + DISPATCH("llvm.exp.f64", "_ZGVbN2v_exp", FIXED(2)) + DISPATCH("llvm.exp.f64", "_ZGVdN4v_exp", FIXED(4)) + DISPATCH("llvm.exp.f64", "_ZGVeN8v_exp", FIXED(8)) + DISPATCH("llvm.pow.f64", "_ZGVbN2vv_pow", FIXED(2)) + DISPATCH("llvm.pow.f64", "_ZGVdN4vv_pow", FIXED(4)) + DISPATCH("llvm.pow.f64", "_ZGVeN8vv_pow", FIXED(8)) + // clang-format on + }; +#undef DISPATCH + + if (triple.isAArch64()) { + tli.addVectorizableFunctions(aarch64_functions); + } + if (triple.isX86() && triple.isArch64Bit()) { + tli.addVectorizableFunctions(x86_functions); + } + + } else { + // A map to query vector library by its string value. + using VecLib = llvm::TargetLibraryInfoImpl::VectorLibrary; + static const std::map llvm_supported_vector_libraries = { + {"Accelerate", VecLib::Accelerate}, + {"libmvec", VecLib::LIBMVEC_X86}, + {"libsystem_m", VecLib ::DarwinLibSystemM}, + {"MASSV", VecLib::MASSV}, + {"none", VecLib::NoLibrary}, + {"SVML", VecLib::SVML}}; + const auto& library = llvm_supported_vector_libraries.find(vector_library); + if (library == llvm_supported_vector_libraries.end()) + throw std::runtime_error("Error: unknown vector library - " + vector_library + "\n"); + + // Add vectorizable functions to the target library info. + switch (library->second) { + case VecLib::LIBMVEC_X86: + if (!triple.isX86() || !triple.isArch64Bit()) + break; + default: + tli.addVectorizableFunctionsFromVecLib(library->second); + break; + } + } +} +#endif + llvm::Value* CodegenLLVMVisitor::accept_and_get(const std::shared_ptr& node) { node->accept(*this); return ir_builder.pop_last_value(); @@ -817,25 +879,20 @@ void CodegenLLVMVisitor::visit_program(const ast::Program& node) { run_ir_opt_passes(); } - // Optionally, replace LLVM's maths intrinsics with vector library calls. - if (vector_width > 1 && vector_library != llvm::TargetLibraryInfoImpl::NoLibrary) { + // Optionally, replace LLVM math intrinsics with vector library calls. + if (vector_width > 1) { #if LLVM_VERSION_MAJOR < 13 logger->warn( "This version of LLVM does not support replacement of LLVM intrinsics with vector " "library calls"); #else - // First, get the target library information. + // First, get the target library information and add vectorizable functions for the + // specified vector library. llvm::Triple triple(llvm::sys::getDefaultTargetTriple()); llvm::TargetLibraryInfoImpl target_lib_info = llvm::TargetLibraryInfoImpl(triple); + add_vectorizable_functions_from_vec_lib(target_lib_info, triple); - // Populate target library information with vectorisable functions. Since libmvec is - // supported for x86_64 only, have a check to catch other architectures. - if (vector_library != llvm::TargetLibraryInfoImpl::LIBMVEC_X86 || - (triple.isX86() && triple.isArch64Bit())) { - target_lib_info.addVectorizableFunctionsFromVecLib(vector_library); - } - - // Run the codegen optimisation passes that replace maths intrinsics. + // Run passes that replace math intrinsics. 
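+        // (ReplaceWithVeclib rewrites calls to vector intrinsics such as
+        // @llvm.exp.v2f64 into the library routines registered in the TLI above)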
codegen_pm.add(new llvm::TargetLibraryInfoWrapperPass(target_lib_info)); codegen_pm.add(new llvm::ReplaceWithVeclibLegacy); codegen_pm.doInitialization(); diff --git a/src/codegen/llvm/codegen_llvm_visitor.hpp b/src/codegen/llvm/codegen_llvm_visitor.hpp index c3beb53640..49285f9941 100644 --- a/src/codegen/llvm/codegen_llvm_visitor.hpp +++ b/src/codegen/llvm/codegen_llvm_visitor.hpp @@ -48,15 +48,6 @@ namespace codegen { * @{ */ -/// A map to query vector library by its string value. -static const std::map veclib_map = { - {"Accelerate", llvm::TargetLibraryInfoImpl::Accelerate}, -#if LLVM_VERSION_MAJOR >= 13 - {"libmvec", llvm::TargetLibraryInfoImpl::LIBMVEC_X86}, -#endif - {"MASSV", llvm::TargetLibraryInfoImpl::MASSV}, - {"SVML", llvm::TargetLibraryInfoImpl::SVML}, - {"none", llvm::TargetLibraryInfoImpl::NoLibrary}}; /** * \class CodegenLLVMVisitor @@ -100,8 +91,8 @@ class CodegenLLVMVisitor: public visitor::ConstAstVisitor { /// Pass manager for optimisation passes that are used for target code generation. llvm::legacy::FunctionPassManager codegen_pm; - /// Vector library used for maths functions. - llvm::TargetLibraryInfoImpl::VectorLibrary vector_library; + /// Vector library used for math functions. + std::string vector_library; /// Explicit vectorisation width. int vector_width; @@ -119,7 +110,7 @@ class CodegenLLVMVisitor: public visitor::ConstAstVisitor { , output_dir(output_dir) , opt_passes(opt_passes) , vector_width(vector_width) - , vector_library(veclib_map.at(vec_lib)) + , vector_library(vec_lib) , add_debug_information(add_debug_information) , ir_builder(*context, use_single_precision, vector_width, fast_math_flags) , debug_builder(*module) @@ -183,6 +174,12 @@ class CodegenLLVMVisitor: public visitor::ConstAstVisitor { void wrap_kernel_functions(); private: +#if LLVM_VERSION_MAJOR >= 13 + /// Populates target library info with the vector library definitions. + void add_vectorizable_functions_from_vec_lib(llvm::TargetLibraryInfoImpl& tli, + llvm::Triple& triple); +#endif + /// Accepts the given AST node and returns the processed value. 
llvm::Value* accept_and_get(const std::shared_ptr& node); diff --git a/src/main.cpp b/src/main.cpp index a1132d587e..3583bd00f3 100644 --- a/src/main.cpp +++ b/src/main.cpp @@ -340,7 +340,7 @@ int main(int argc, const char* argv[]) { "LLVM explicit vectorisation width ({})"_format(llvm_vec_width))->ignore_case(); llvm_opt->add_option("--veclib", vector_library, - "Vector library for maths functions ({})"_format(vector_library))->check(CLI::IsMember({"Accelerate", "libmvec", "MASSV", "SVML", "none"})); + "Vector library for maths functions ({})"_format(vector_library))->check(CLI::IsMember({"Accelerate", "libsystem_m", "libmvec", "MASSV", "SLEEF", "SVML", "none"})); llvm_opt->add_option("--fmf", llvm_fast_math_flags, "Fast math flags for floating-point optimizations (none)")->check(CLI::IsMember({"afn", "arcp", "contract", "ninf", "nnan", "nsz", "reassoc", "fast"})); diff --git a/test/unit/codegen/codegen_llvm_ir.cpp b/test/unit/codegen/codegen_llvm_ir.cpp index a0a4af297c..fa0a649f2d 100644 --- a/test/unit/codegen/codegen_llvm_ir.cpp +++ b/test/unit/codegen/codegen_llvm_ir.cpp @@ -1382,6 +1382,36 @@ SCENARIO("Vector library calls", "[visitor][llvm][vector_lib]") { REQUIRE(std::regex_search(accelerate_library_module_str, m, accelerate_exp_decl)); REQUIRE(std::regex_search(accelerate_library_module_str, m, accelerate_exp_call)); REQUIRE(!std::regex_search(accelerate_library_module_str, m, fexp_call)); + + // Check correct replacement of @llvm.exp.v2f64 into @_ZGV?N?v_exp when using SLEEF. + std::string sleef_library_module_str = run_llvm_visitor(nmodl_text, + /*opt=*/false, + /*use_single_precision=*/false, + /*vector_width=*/2, + /*vec_lib=*/"SLEEF"); +#if defined(__arm64__) || defined(__aarch64__) + std::regex sleef_exp_decl(R"(declare <2 x double> @_ZGVnN2v_exp\(<2 x double>\))"); + std::regex sleef_exp_call(R"(call <2 x double> @_ZGVnN2v_exp\(<2 x double> .*\))"); +#else + std::regex sleef_exp_decl(R"(declare <2 x double> @_ZGVbN2v_exp\(<2 x double>\))"); + std::regex sleef_exp_call(R"(call <2 x double> @_ZGVbN2v_exp\(<2 x double> .*\))"); +#endif + REQUIRE(std::regex_search(sleef_library_module_str, m, sleef_exp_decl)); + REQUIRE(std::regex_search(sleef_library_module_str, m, sleef_exp_call)); + REQUIRE(!std::regex_search(sleef_library_module_str, m, fexp_call)); + + // Check the replacements when using Darwin's libsystem_m. + std::string libsystem_m_library_module_str = + run_llvm_visitor(nmodl_text, + /*opt=*/false, + /*use_single_precision=*/true, + /*vector_width=*/4, + /*vec_lib=*/"libsystem_m"); + std::regex libsystem_m_exp_decl(R"(declare <4 x float> @_simd_exp_f4\(<4 x float>\))"); + std::regex libsystem_m_exp_call(R"(call <4 x float> @_simd_exp_f4\(<4 x float> .*\))"); + REQUIRE(std::regex_search(libsystem_m_library_module_str, m, libsystem_m_exp_decl)); + REQUIRE(std::regex_search(libsystem_m_library_module_str, m, libsystem_m_exp_call)); + REQUIRE(!std::regex_search(libsystem_m_library_module_str, m, fexp_call)); #endif } } From 19ad02a1d6f008d57fd8009a04eab9c3b85c5934 Mon Sep 17 00:00:00 2001 From: George Mitenkov Date: Thu, 3 Jun 2021 00:01:58 -0700 Subject: [PATCH 177/331] [LLVM] Enhancements for optimization pipeline (#683) Added several improvements to the way optimizations are run for the LLVM code generation pipeline and benchmarking. 1. Created `llvm_utils` files that currently have logic/implementations for optimizing the IR. In future, things like dumping IR to file will also go there. 
This lets benchmarking and the LLVM visitor share the optimization
infrastructure.
2. Replaced `--opt` with `--opt-level-ir` for the LLVM visitor. The `--opt` option was
duplicated by `--opt-level-ir` in the benchmarking infrastructure. With the new
`llvm_utils` package, we can simply reuse the optimization routines and use
optimization levels instead.
3. Added IPO and AggressiveInstCombine passes.

Importantly, when running the benchmark, the IR is still optimized after the
`targetMachine` is created, to benefit from target-specific optimizations.

Example:
```bash
bin/nmodl test.mod llvm --ir --single-precision --vector-width 4 --opt-level-ir 3 \
          benchmark --run --opt-level-codegen 3
```

Co-authored-by: Pramod Kumbhar
---
 cmake/LLVMHelper.cmake | 1 +
 src/codegen/llvm/CMakeLists.txt | 4 +-
 src/codegen/llvm/codegen_llvm_visitor.cpp | 37 +++------
 src/codegen/llvm/codegen_llvm_visitor.hpp | 27 ++-----
 src/codegen/llvm/llvm_utils.cpp | 79 +++++++++++++++++++
 src/codegen/llvm/llvm_utils.hpp | 23 ++++++
 src/codegen/llvm/main.cpp | 2 +-
 src/main.cpp | 18 ++---
 test/benchmark/jit_driver.cpp | 65 +--------------
 test/benchmark/jit_driver.hpp | 2 +-
 test/benchmark/llvm_benchmark.hpp | 2 +-
 test/unit/codegen/codegen_llvm_execution.cpp | 12 +--
 .../codegen/codegen_llvm_instance_struct.cpp | 6 +-
 test/unit/codegen/codegen_llvm_ir.cpp | 38 ++++-----
 14 files changed, 164 insertions(+), 152 deletions(-)
 create mode 100644 src/codegen/llvm/llvm_utils.cpp
 create mode 100644 src/codegen/llvm/llvm_utils.hpp

diff --git a/cmake/LLVMHelper.cmake b/cmake/LLVMHelper.cmake
index 780ae29cfa..9e4af5d503 100644
--- a/cmake/LLVMHelper.cmake
+++ b/cmake/LLVMHelper.cmake
@@ -6,6 +6,7 @@ find_package(LLVM REQUIRED CONFIG)

 # include LLVM libraries
 set(NMODL_LLVM_COMPONENTS
+ aggressiveinstcombine
 analysis
 codegen
 core
diff --git a/src/codegen/llvm/CMakeLists.txt b/src/codegen/llvm/CMakeLists.txt
index b927475f15..5c7eadc91c 100644
--- a/src/codegen/llvm/CMakeLists.txt
+++ b/src/codegen/llvm/CMakeLists.txt
@@ -9,7 +9,9 @@ set(LLVM_CODEGEN_SOURCE_FILES
 ${CMAKE_CURRENT_SOURCE_DIR}/llvm_debug_builder.cpp
 ${CMAKE_CURRENT_SOURCE_DIR}/llvm_debug_builder.hpp
 ${CMAKE_CURRENT_SOURCE_DIR}/llvm_ir_builder.cpp
- ${CMAKE_CURRENT_SOURCE_DIR}/llvm_ir_builder.hpp)
+ ${CMAKE_CURRENT_SOURCE_DIR}/llvm_ir_builder.hpp
+ ${CMAKE_CURRENT_SOURCE_DIR}/llvm_utils.cpp
+ ${CMAKE_CURRENT_SOURCE_DIR}/llvm_utils.hpp)

 # =============================================================================
 # LLVM codegen library and executable
diff --git a/src/codegen/llvm/codegen_llvm_visitor.cpp b/src/codegen/llvm/codegen_llvm_visitor.cpp
index 1e5ca89c6d..ffbedbb063 100644
--- a/src/codegen/llvm/codegen_llvm_visitor.cpp
+++ b/src/codegen/llvm/codegen_llvm_visitor.cpp
@@ -6,6 +6,7 @@
 *************************************************************************/

 #include "codegen/llvm/codegen_llvm_visitor.hpp"
+#include "codegen/llvm/llvm_utils.hpp"

 #include "ast/all.hpp"
 #include "visitors/rename_visitor.hpp"
@@ -15,6 +16,7 @@
 #include "llvm/IR/AssemblyAnnotationWriter.h"
 #include "llvm/IR/BasicBlock.h"
 #include "llvm/IR/Function.h"
+#include "llvm/IR/LegacyPassManager.h"
 #include "llvm/IR/Type.h"
 #include "llvm/Support/FileSystem.h"
 #include "llvm/Support/Host.h"
@@ -431,25 +433,6 @@ llvm::Value* CodegenLLVMVisitor::read_variable(const ast::VarName& node) {
 "' is not supported\n");
 }

-void CodegenLLVMVisitor::run_ir_opt_passes() {
- // Run some common optimisation passes that are commonly suggested.
- opt_pm.add(llvm::createInstructionCombiningPass()); - opt_pm.add(llvm::createReassociatePass()); - opt_pm.add(llvm::createGVNPass()); - opt_pm.add(llvm::createCFGSimplificationPass()); - - // Initialize pass manager. - opt_pm.doInitialization(); - - // Iterate over all functions and run the optimisation passes. - auto& functions = module->getFunctionList(); - for (auto& function: functions) { - llvm::verifyFunction(function); - opt_pm.run(function); - } - opt_pm.doFinalization(); -} - void CodegenLLVMVisitor::write_to_variable(const ast::VarName& node, llvm::Value* value) { const auto& identifier = node.get_name(); if (!identifier->is_name() && !identifier->is_indexed_name() && @@ -874,9 +857,10 @@ void CodegenLLVMVisitor::visit_program(const ast::Program& node) { throw std::runtime_error("Error: incorrect IR has been generated!\n" + ostream.str()); } - if (opt_passes) { + if (opt_level_ir) { logger->info("Running LLVM optimisation passes"); - run_ir_opt_passes(); + utils::initialise_optimisation_passes(); + utils::optimise_module(*module, opt_level_ir); } // Optionally, replace LLVM math intrinsics with vector library calls. @@ -893,14 +877,15 @@ void CodegenLLVMVisitor::visit_program(const ast::Program& node) { add_vectorizable_functions_from_vec_lib(target_lib_info, triple); // Run passes that replace math intrinsics. - codegen_pm.add(new llvm::TargetLibraryInfoWrapperPass(target_lib_info)); - codegen_pm.add(new llvm::ReplaceWithVeclibLegacy); - codegen_pm.doInitialization(); + llvm::legacy::FunctionPassManager fpm(module.get()); + fpm.add(new llvm::TargetLibraryInfoWrapperPass(target_lib_info)); + fpm.add(new llvm::ReplaceWithVeclibLegacy); + fpm.doInitialization(); for (auto& function: module->getFunctionList()) { if (!function.isDeclaration()) - codegen_pm.run(function); + fpm.run(function); } - codegen_pm.doFinalization(); + fpm.doFinalization(); #endif } diff --git a/src/codegen/llvm/codegen_llvm_visitor.hpp b/src/codegen/llvm/codegen_llvm_visitor.hpp index 49285f9941..5dd8eda15c 100644 --- a/src/codegen/llvm/codegen_llvm_visitor.hpp +++ b/src/codegen/llvm/codegen_llvm_visitor.hpp @@ -28,12 +28,8 @@ #include "llvm/Analysis/TargetLibraryInfo.h" #include "llvm/IR/DIBuilder.h" #include "llvm/IR/LLVMContext.h" -#include "llvm/IR/LegacyPassManager.h" #include "llvm/IR/Module.h" #include "llvm/IR/Verifier.h" -#include "llvm/Transforms/InstCombine/InstCombine.h" -#include "llvm/Transforms/Scalar.h" -#include "llvm/Transforms/Scalar/GVN.h" namespace nmodl { namespace codegen { @@ -82,14 +78,8 @@ class CodegenLLVMVisitor: public visitor::ConstAstVisitor { /// Instance variable helper. InstanceVarHelper instance_var_helper; - /// Run optimisation passes if true. - bool opt_passes; - - /// Pass manager for optimisation passes that are run on IR and are not related to target. - llvm::legacy::FunctionPassManager opt_pm; - - /// Pass manager for optimisation passes that are used for target code generation. - llvm::legacy::FunctionPassManager codegen_pm; + /// Optimisation level for LLVM IR transformations. + int opt_level_ir; /// Vector library used for math functions. 
std::string vector_library; @@ -100,7 +90,7 @@ class CodegenLLVMVisitor: public visitor::ConstAstVisitor { public: CodegenLLVMVisitor(const std::string& mod_filename, const std::string& output_dir, - bool opt_passes, + int opt_level_ir, bool use_single_precision = false, int vector_width = 1, std::string vec_lib = "none", @@ -108,14 +98,12 @@ class CodegenLLVMVisitor: public visitor::ConstAstVisitor { std::vector fast_math_flags = {}) : mod_filename(mod_filename) , output_dir(output_dir) - , opt_passes(opt_passes) + , opt_level_ir(opt_level_ir) , vector_width(vector_width) , vector_library(vec_lib) , add_debug_information(add_debug_information) , ir_builder(*context, use_single_precision, vector_width, fast_math_flags) - , debug_builder(*module) - , codegen_pm(module.get()) - , opt_pm(module.get()) {} + , debug_builder(*module) {} /// Dumps the generated LLVM IR module to string. std::string dump_module() const { @@ -228,11 +216,6 @@ class CodegenLLVMVisitor: public visitor::ConstAstVisitor { /// Reads the given variable and returns the processed value. llvm::Value* read_variable(const ast::VarName& node); - - /// Run multiple LLVM optimisation passes on generated IR. - /// TODO: this can be moved to a dedicated file or deprecated. - void run_ir_opt_passes(); - //// Writes the value to the given variable. void write_to_variable(const ast::VarName& node, llvm::Value* value); }; diff --git a/src/codegen/llvm/llvm_utils.cpp b/src/codegen/llvm/llvm_utils.cpp new file mode 100644 index 0000000000..684f962b76 --- /dev/null +++ b/src/codegen/llvm/llvm_utils.cpp @@ -0,0 +1,79 @@ +/************************************************************************* + * Copyright (C) 2018-2020 Blue Brain Project + * + * This file is part of NMODL distributed under the terms of the GNU + * Lesser General Public License. See top-level LICENSE file for details. + *************************************************************************/ + +#include "codegen/llvm/llvm_utils.hpp" + +#include "llvm/Analysis/TargetTransformInfo.h" +#include "llvm/IR/LegacyPassManager.h" +#include "llvm/IR/Verifier.h" +#include "llvm/InitializePasses.h" +#include "llvm/Target/TargetMachine.h" +#include "llvm/Transforms/IPO/PassManagerBuilder.h" + +namespace nmodl { +namespace utils { + +/// Populates pass managers with passes for the given optimisation levels. +static void populate_pms(llvm::legacy::FunctionPassManager& func_pm, + llvm::legacy::PassManager& module_pm, + int opt_level, + int size_level, + llvm::TargetMachine* tm) { + // First, set the pass manager builder with some basic optimisation information. + llvm::PassManagerBuilder pm_builder; + pm_builder.OptLevel = opt_level; + pm_builder.SizeLevel = size_level; + pm_builder.DisableUnrollLoops = opt_level == 0; + + // If target machine is defined, then initialise the TargetTransformInfo for the target. + if (tm) { + module_pm.add(createTargetTransformInfoWrapperPass(tm->getTargetIRAnalysis())); + func_pm.add(createTargetTransformInfoWrapperPass(tm->getTargetIRAnalysis())); + } + + // Populate pass managers. + pm_builder.populateModulePassManager(module_pm); + pm_builder.populateFunctionPassManager(func_pm); +} + +/// Runs the function and module passes on the provided module. 
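+/// Each function is verified and run through the function passes first;
+/// module-level passes (e.g. IPO) then run once over the whole module.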
+static void run_optimisation_passes(llvm::Module& module, + llvm::legacy::FunctionPassManager& func_pm, + llvm::legacy::PassManager& module_pm) { + func_pm.doInitialization(); + auto& functions = module.getFunctionList(); + for (auto& function: functions) { + llvm::verifyFunction(function); + func_pm.run(function); + } + func_pm.doFinalization(); + module_pm.run(module); +} + +/****************************************************************************************/ +/* Optimisation utils */ +/****************************************************************************************/ + +void initialise_optimisation_passes() { + auto& registry = *llvm::PassRegistry::getPassRegistry(); + llvm::initializeCore(registry); + llvm::initializeTransformUtils(registry); + llvm::initializeScalarOpts(registry); + llvm::initializeIPO(registry); + llvm::initializeInstCombine(registry); + llvm::initializeAggressiveInstCombine(registry); + llvm::initializeAnalysis(registry); +} + +void optimise_module(llvm::Module& module, int opt_level, llvm::TargetMachine* tm) { + llvm::legacy::FunctionPassManager func_pm(&module); + llvm::legacy::PassManager module_pm; + populate_pms(func_pm, module_pm, opt_level, /*size_level=*/0, tm); + run_optimisation_passes(module, func_pm, module_pm); +} +} // namespace utils +} // namespace nmodl diff --git a/src/codegen/llvm/llvm_utils.hpp b/src/codegen/llvm/llvm_utils.hpp new file mode 100644 index 0000000000..81dc30d97f --- /dev/null +++ b/src/codegen/llvm/llvm_utils.hpp @@ -0,0 +1,23 @@ +/************************************************************************* + * Copyright (C) 2018-2020 Blue Brain Project + * + * This file is part of NMODL distributed under the terms of the GNU + * Lesser General Public License. See top-level LICENSE file for details. + *************************************************************************/ + +#pragma once + +#include "llvm/IR/Module.h" +#include "llvm/Support/TargetRegistry.h" + +namespace nmodl { +namespace utils { + +/// Initialises some LLVM optimisation passes. +void initialise_optimisation_passes(); + +/// Optimises the given LLVM IR module. 
+void optimise_module(llvm::Module& module, int opt_level, llvm::TargetMachine* tm = nullptr); + +} // namespace utils +} // namespace nmodl diff --git a/src/codegen/llvm/main.cpp b/src/codegen/llvm/main.cpp index 2f4e1f653d..6d374999c3 100644 --- a/src/codegen/llvm/main.cpp +++ b/src/codegen/llvm/main.cpp @@ -48,7 +48,7 @@ int main(int argc, const char* argv[]) { visitor::SymtabVisitor().visit_program(*ast); logger->info("Running LLVM Visitor"); - codegen::CodegenLLVMVisitor llvm_visitor(filename, /*output_dir=*/".", /*opt_passes=*/false); + codegen::CodegenLLVMVisitor llvm_visitor(filename, /*output_dir=*/".", /*opt_level_ir=*/0); llvm_visitor.visit_program(*ast); std::unique_ptr module = llvm_visitor.get_module(); diff --git a/src/main.cpp b/src/main.cpp index 3583bd00f3..362ccb4ddc 100644 --- a/src/main.cpp +++ b/src/main.cpp @@ -176,9 +176,6 @@ int main(int argc, const char* argv[]) { /// use single precision floating-point types bool llvm_float_type(false); - /// run llvm optimisation passes - bool llvm_ir_opt_passes(false); - /// llvm vector width int llvm_vec_width = 1; @@ -329,9 +326,9 @@ int main(int argc, const char* argv[]) { llvm_opt->add_flag("--disable-debug-info", disable_debug_information, "Disable debug information ({})"_format(disable_debug_information))->ignore_case(); - llvm_opt->add_flag("--opt", - llvm_ir_opt_passes, - "Run few common LLVM IR optimisation passes ({})"_format(llvm_ir_opt_passes))->ignore_case(); + llvm_opt->add_option("--opt-level-ir", + llvm_opt_level_ir, + "LLVM IR optimisation level (O{})"_format(llvm_opt_level_ir))->ignore_case()->check(CLI::IsMember({"0", "1", "2", "3"})); llvm_opt->add_flag("--single-precision", llvm_float_type, "Use single precision floating-point types ({})"_format(llvm_float_type))->ignore_case(); @@ -350,9 +347,6 @@ int main(int argc, const char* argv[]) { benchmark_opt->add_flag("--run", run_llvm_benchmark, "Run LLVM benchmark ({})"_format(run_llvm_benchmark))->ignore_case(); - benchmark_opt->add_option("--opt-level-ir", - llvm_opt_level_ir, - "LLVM IR optimisation level (O{})"_format(llvm_opt_level_ir))->ignore_case()->check(CLI::IsMember({"0", "1", "2", "3"})); benchmark_opt->add_option("--opt-level-codegen", llvm_opt_level_codegen, "Machine code optimisation level (O{})"_format(llvm_opt_level_codegen))->ignore_case()->check(CLI::IsMember({"0", "1", "2", "3"})); @@ -674,10 +668,14 @@ int main(int argc, const char* argv[]) { #ifdef NMODL_LLVM_BACKEND if (llvm_ir || run_llvm_benchmark) { + // If benchmarking, we want to optimize the IR with target information and not in + // LLVM visitor. + int llvm_opt_level = run_llvm_benchmark ? 
0 : llvm_opt_level_ir; + logger->info("Running LLVM backend code generator"); CodegenLLVMVisitor visitor(modfile, output_dir, - llvm_ir_opt_passes, + llvm_opt_level, llvm_float_type, llvm_vec_width, vector_library, diff --git a/test/benchmark/jit_driver.cpp b/test/benchmark/jit_driver.cpp index a2d8df63f4..e5a7cd8928 100644 --- a/test/benchmark/jit_driver.cpp +++ b/test/benchmark/jit_driver.cpp @@ -7,9 +7,9 @@ #include "jit_driver.hpp" #include "codegen/llvm/codegen_llvm_visitor.hpp" +#include "codegen/llvm/llvm_utils.hpp" #include "utils/common_utils.hpp" -#include "llvm/Analysis/TargetTransformInfo.h" #include "llvm/ExecutionEngine/JITEventListener.h" #include "llvm/ExecutionEngine/ObjectCache.h" #include "llvm/ExecutionEngine/Orc/CompileUtils.h" @@ -21,12 +21,10 @@ #include "llvm/ExecutionEngine/Orc/RTDyldObjectLinkingLayer.h" #include "llvm/ExecutionEngine/SectionMemoryManager.h" #include "llvm/IR/AssemblyAnnotationWriter.h" -#include "llvm/InitializePasses.h" #include "llvm/Support/Host.h" #include "llvm/Support/TargetRegistry.h" #include "llvm/Support/TargetSelect.h" #include "llvm/Support/ToolOutputFile.h" -#include "llvm/Transforms/IPO/PassManagerBuilder.h" namespace nmodl { namespace runner { @@ -35,63 +33,6 @@ namespace runner { /* Utilities for JIT driver */ /****************************************************************************************/ -/// Initialises some LLVM optimisation passes. -static void initialise_optimisation_passes() { - auto& registry = *llvm::PassRegistry::getPassRegistry(); - llvm::initializeCore(registry); - llvm::initializeTransformUtils(registry); - llvm::initializeScalarOpts(registry); - llvm::initializeInstCombine(registry); - llvm::initializeAnalysis(registry); -} - -/// Populates pass managers with passes for the given optimisation levels. -static void populate_pms(llvm::legacy::FunctionPassManager& func_pm, - llvm::legacy::PassManager& module_pm, - int opt_level, - int size_level, - llvm::TargetMachine* tm) { - // First, set the pass manager builder with some basic optimisation information. - llvm::PassManagerBuilder pm_builder; - pm_builder.OptLevel = opt_level; - pm_builder.SizeLevel = size_level; - pm_builder.DisableUnrollLoops = opt_level == 0; - - // If target machine is defined, then initialise the TargetTransformInfo for the target. - if (tm) { - module_pm.add(createTargetTransformInfoWrapperPass(tm->getTargetIRAnalysis())); - func_pm.add(createTargetTransformInfoWrapperPass(tm->getTargetIRAnalysis())); - } - - // Populate pass managers. - pm_builder.populateModulePassManager(module_pm); - pm_builder.populateFunctionPassManager(func_pm); -} - -/// Runs the function and module passes on the provided module. -static void run_optimisation_passes(llvm::Module& module, - llvm::legacy::FunctionPassManager& func_pm, - llvm::legacy::PassManager& module_pm) { - func_pm.doInitialization(); - auto& functions = module.getFunctionList(); - for (auto& function: functions) { - llvm::verifyFunction(function); - func_pm.run(function); - } - func_pm.doFinalization(); - module_pm.run(module); -} - -/// Optimises the given LLVM IR module. -static void optimise_module(llvm::Module& module, - int opt_level, - llvm::TargetMachine* tm = nullptr) { - llvm::legacy::FunctionPassManager func_pm(&module); - llvm::legacy::PassManager module_pm; - populate_pms(func_pm, module_pm, opt_level, /*size_level=*/0, tm); - run_optimisation_passes(module, func_pm, module_pm); -} - /// Sets the target triple and the data layout of the module. 
 static void set_triple_and_data_layout(llvm::Module& module, const std::string& features) {
     // Get the default target triple for the host.
@@ -149,7 +90,7 @@ void JITDriver::init(std::string features,
                      BenchmarkInfo* benchmark_info) {
     llvm::InitializeNativeTarget();
     llvm::InitializeNativeTargetAsmPrinter();
-    initialise_optimisation_passes();
+    utils::initialise_optimisation_passes();
 
     // Set the target triple and the data layout for the module.
     set_triple_and_data_layout(*module, features);
@@ -211,7 +152,7 @@ void JITDriver::init(std::string features,
     // Optimise the LLVM IR module and save it to .ll file if benchmarking.
     if (benchmark_info) {
-        optimise_module(*module, benchmark_info->opt_level_ir, tm.get());
+        utils::optimise_module(*module, benchmark_info->opt_level_ir, tm.get());
 
         std::error_code error_code;
         std::unique_ptr<llvm::ToolOutputFile> out =
diff --git a/test/benchmark/jit_driver.hpp b/test/benchmark/jit_driver.hpp
index afb1317cd8..d8e1127417 100644
--- a/test/benchmark/jit_driver.hpp
+++ b/test/benchmark/jit_driver.hpp
@@ -29,7 +29,7 @@ struct BenchmarkInfo {
     /// Object file output directory.
     std::string output_dir;
 
-    /// Optimisation level for generated IR.
+    /// Optimisation level for IR.
     int opt_level_ir;
 
     /// Optimisation level for machine code generation.
diff --git a/test/benchmark/llvm_benchmark.hpp b/test/benchmark/llvm_benchmark.hpp
index 9696191172..4a66de52fc 100644
--- a/test/benchmark/llvm_benchmark.hpp
+++ b/test/benchmark/llvm_benchmark.hpp
@@ -43,7 +43,7 @@ class LLVMBenchmark {
     /// Benchmarking backend
     std::string backend;
 
-    /// Optimisation level for LLVM IR transformations.
+    /// Optimisation level for IR generation.
     int opt_level_ir;
 
     /// Optimisation level for machine code generation.
diff --git a/test/unit/codegen/codegen_llvm_execution.cpp b/test/unit/codegen/codegen_llvm_execution.cpp
index aa77a4e493..41605ecbd3 100644
--- a/test/unit/codegen/codegen_llvm_execution.cpp
+++ b/test/unit/codegen/codegen_llvm_execution.cpp
@@ -126,7 +126,7 @@ SCENARIO("Arithmetic expression", "[llvm][runner]") {
         SymtabVisitor().visit_program(*ast);
         codegen::CodegenLLVMVisitor llvm_visitor(/*mod_filename=*/"unknown",
                                                  /*output_dir=*/".",
-                                                 /*opt_passes=*/false);
+                                                 /*opt_level_ir=*/0);
         llvm_visitor.visit_program(*ast);
 
         std::unique_ptr<llvm::Module> m = llvm_visitor.get_module();
@@ -228,7 +228,7 @@ SCENARIO("Optimised arithmetic expression", "[llvm][runner]") {
         SymtabVisitor().visit_program(*ast);
         codegen::CodegenLLVMVisitor llvm_visitor(/*mod_filename=*/"unknown",
                                                  /*output_dir=*/".",
-                                                 /*opt_passes=*/true);
+                                                 /*opt_level_ir=*/3);
         llvm_visitor.visit_program(*ast);
 
         std::unique_ptr<llvm::Module> m = llvm_visitor.get_module();
@@ -301,7 +301,7 @@ SCENARIO("Simple scalar kernel", "[llvm][runner]") {
         SolveBlockVisitor().visit_program(*ast);
         codegen::CodegenLLVMVisitor llvm_visitor(/*mod_filename=*/"unknown",
                                                  /*output_dir=*/".",
-                                                 /*opt_passes=*/false,
+                                                 /*opt_level_ir=*/0,
                                                  /*use_single_precision=*/false,
                                                  /*vector_width=*/1);
         llvm_visitor.visit_program(*ast);
@@ -383,7 +383,7 @@ SCENARIO("Simple vectorised kernel", "[llvm][runner]") {
         SolveBlockVisitor().visit_program(*ast);
         codegen::CodegenLLVMVisitor llvm_visitor(/*mod_filename=*/"unknown",
                                                  /*output_dir=*/".",
-                                                 /*opt_passes=*/true,
+                                                 /*opt_level_ir=*/3,
                                                  /*use_single_precision=*/false,
                                                  /*vector_width=*/4);
         llvm_visitor.visit_program(*ast);
@@ -465,7 +465,7 @@ SCENARIO("Vectorised kernel with scatter instruction", "[llvm][runner]") {
         SolveBlockVisitor().visit_program(*ast);
         codegen::CodegenLLVMVisitor llvm_visitor(/*mod_filename=*/"unknown",
                                                  /*output_dir=*/".",
-                                                 /*opt_passes=*/false,
+                                                 /*opt_level_ir=*/0,
                                                  /*use_single_precision=*/false,
                                                  /*vector_width=*/2);
         llvm_visitor.visit_program(*ast);
@@ -556,7 +556,7 @@ SCENARIO("Vectorised kernel with simple control flow", "[llvm][runner]") {
         SolveBlockVisitor().visit_program(*ast);
         codegen::CodegenLLVMVisitor llvm_visitor(/*mod_filename=*/"unknown",
                                                  /*output_dir=*/".",
-                                                 /*opt_passes=*/false,
+                                                 /*opt_level_ir=*/0,
                                                  /*use_single_precision=*/false,
                                                  /*vector_width=*/2);
         llvm_visitor.visit_program(*ast);
diff --git a/test/unit/codegen/codegen_llvm_instance_struct.cpp b/test/unit/codegen/codegen_llvm_instance_struct.cpp
index e77b6844ae..6042aecfc8 100644
--- a/test/unit/codegen/codegen_llvm_instance_struct.cpp
+++ b/test/unit/codegen/codegen_llvm_instance_struct.cpp
@@ -27,7 +27,7 @@ using nmodl::parser::NmodlDriver;
 //=============================================================================
 codegen::CodegenInstanceData generate_instance_data(const std::string& text,
-                                                    bool opt = false,
+                                                    int opt_level = 0,
                                                     bool use_single_precision = false,
                                                     int vector_width = 1,
                                                     size_t num_elements = 100,
@@ -41,7 +41,7 @@ codegen::CodegenInstanceData generate_instance_data(const std::string& text,
     codegen::CodegenLLVMVisitor llvm_visitor(/*mod_filename=*/"test",
                                              /*output_dir=*/".",
-                                             opt,
+                                             opt_level,
                                              use_single_precision,
                                              vector_width);
     llvm_visitor.visit_program(*ast);
@@ -104,7 +104,7 @@ SCENARIO("Instance Struct creation", "[visitor][llvm][instance_struct]") {
         const size_t num_elements = 10;
         constexpr static double seed = 42;
         auto instance_data = generate_instance_data(nmodl_text,
-                                                    /*opt=*/false,
+                                                    /*opt_level=*/0,
                                                     /*use_single_precision=*/true,
                                                     /*vector_width*/ 1,
                                                     num_elements,
diff --git a/test/unit/codegen/codegen_llvm_ir.cpp b/test/unit/codegen/codegen_llvm_ir.cpp
index fa0a649f2d..d43d99282d 100644
--- a/test/unit/codegen/codegen_llvm_ir.cpp
+++ b/test/unit/codegen/codegen_llvm_ir.cpp
@@ -35,7 +35,7 @@ using nmodl::parser::NmodlDriver;
 //=============================================================================
 std::string run_llvm_visitor(const std::string& text,
-                             bool opt = false,
+                             int opt_level = 0,
                              bool use_single_precision = false,
                              int vector_width = 1,
                              std::string vec_lib = "none",
@@ -53,7 +53,7 @@ std::string run_llvm_visitor(const std::string& text,
     codegen::CodegenLLVMVisitor llvm_visitor(/*mod_filename=*/"unknown",
                                              /*output_dir=*/".",
-                                             opt,
+                                             opt_level,
                                              use_single_precision,
                                              vector_width,
                                              vec_lib,
@@ -99,7 +99,7 @@ SCENARIO("Binary expression", "[visitor][llvm]") {
         THEN("variables are loaded and add instruction is created") {
             std::string module_string =
-                run_llvm_visitor(nmodl_text, /*opt=*/false, /*use_single_precision=*/true);
+                run_llvm_visitor(nmodl_text, /*opt_level=*/0, /*use_single_precision=*/true);
             std::smatch m;
 
             std::regex rhs(R"(%1 = load float, float\* %b)");
@@ -179,7 +179,7 @@ SCENARIO("Binary expression", "[visitor][llvm]") {
         THEN("'pow' intrinsic is created") {
             std::string module_string =
-                run_llvm_visitor(nmodl_text, /*opt=*/false, /*use_single_precision=*/true);
+                run_llvm_visitor(nmodl_text, /*opt_level=*/0, /*use_single_precision=*/true);
             std::smatch m;
 
             // Check 'pow' intrinsic.
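Note on the testing pattern exercised throughout this file: lower the NMODL text to an LLVM IR string, then assert properties of the IR with regular expressions. A minimal sketch of such a check (the helper name and the exact regex are illustrative, not the project's code; it assumes `module_string` holds the IR text produced by a helper equivalent to `run_llvm_visitor` above):

```cpp
#include <cassert>
#include <regex>
#include <string>

// Sketch: verify that a scalar single-precision 'pow' intrinsic call shows up
// in the generated IR text.
void require_pow_intrinsic(const std::string& module_string) {
    std::smatch m;
    std::regex pow_call(
        R"(call float @llvm\.pow\.f32\(float %[0-9a-zA-Z_.]+, float %[0-9a-zA-Z_.]+\))");
    assert(std::regex_search(module_string, m, pow_call));
}
```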
@@ -1046,7 +1046,7 @@ SCENARIO("Vectorised simple kernel", "[visitor][llvm]") { THEN("a gather instructions is created") { std::string module_string = run_llvm_visitor(nmodl_text, - /*opt=*/false, + /*opt_level=*/0, /*use_single_precision=*/false, /*vector_width=*/4); std::smatch m; @@ -1098,7 +1098,7 @@ SCENARIO("Vectorised simple kernel with ion writes", "[visitor][llvm]") { THEN("a scatter instructions is created") { std::string module_string = run_llvm_visitor(nmodl_text, - /*opt=*/false, + /*opt_level=*/0, /*use_single_precision=*/false, /*vector_width=*/4); std::smatch m; @@ -1154,7 +1154,7 @@ SCENARIO("Vectorised simple kernel with control flow", "[visitor][llvm]") { THEN("masked load and stores are created") { std::string module_string = run_llvm_visitor(nmodl_text, - /*opt=*/false, + /*opt_level=*/0, /*use_single_precision=*/true, /*vector_width=*/8); std::smatch m; @@ -1326,7 +1326,7 @@ SCENARIO("Vector library calls", "[visitor][llvm][vector_lib]") { // Check exponential intrinsic is created. std::string no_library_module_str = run_llvm_visitor(nmodl_text, - /*opt=*/false, + /*opt_level=*/0, /*use_single_precision=*/false, /*vector_width=*/2); std::regex exp_decl(R"(declare <2 x double> @llvm\.exp\.v2f64\(<2 x double>\))"); @@ -1337,7 +1337,7 @@ SCENARIO("Vector library calls", "[visitor][llvm][vector_lib]") { #if LLVM_VERSION_MAJOR >= 13 // Check exponential calls are replaced with calls to SVML library. std::string svml_library_module_str = run_llvm_visitor(nmodl_text, - /*opt=*/false, + /*opt_level=*/0, /*use_single_precision=*/false, /*vector_width=*/2, /*vec_lib=*/"SVML"); @@ -1350,7 +1350,7 @@ SCENARIO("Vector library calls", "[visitor][llvm][vector_lib]") { // Check that supported exponential calls are replaced with calls to MASSV library (i.e. // operating on vector of width 2). std::string massv2_library_module_str = run_llvm_visitor(nmodl_text, - /*opt=*/false, + /*opt_level=*/0, /*use_single_precision=*/false, /*vector_width=*/2, /*vec_lib=*/"MASSV"); @@ -1362,7 +1362,7 @@ SCENARIO("Vector library calls", "[visitor][llvm][vector_lib]") { // Check no replacement for MASSV happens for non-supported vector widths. std::string massv4_library_module_str = run_llvm_visitor(nmodl_text, - /*opt=*/false, + /*opt_level=*/0, /*use_single_precision=*/false, /*vector_width=*/4, /*vec_lib=*/"MASSV"); @@ -1372,7 +1372,7 @@ SCENARIO("Vector library calls", "[visitor][llvm][vector_lib]") { // Check correct replacement of @llvm.exp.v4f32 into @vexpf when using Accelerate. std::string accelerate_library_module_str = run_llvm_visitor(nmodl_text, - /*opt=*/false, + /*opt_level=*/0, /*use_single_precision=*/true, /*vector_width=*/4, /*vec_lib=*/"Accelerate"); @@ -1385,7 +1385,7 @@ SCENARIO("Vector library calls", "[visitor][llvm][vector_lib]") { // Check correct replacement of @llvm.exp.v2f64 into @_ZGV?N?v_exp when using SLEEF. std::string sleef_library_module_str = run_llvm_visitor(nmodl_text, - /*opt=*/false, + /*opt_level=*/0, /*use_single_precision=*/false, /*vector_width=*/2, /*vec_lib=*/"SLEEF"); @@ -1403,7 +1403,7 @@ SCENARIO("Vector library calls", "[visitor][llvm][vector_lib]") { // Check the replacements when using Darwin's libsystem_m. 
std::string libsystem_m_library_module_str = run_llvm_visitor(nmodl_text, - /*opt=*/false, + /*opt_level=*/0, /*use_single_precision=*/true, /*vector_width=*/4, /*vec_lib=*/"libsystem_m"); @@ -1432,7 +1432,7 @@ SCENARIO("Fast math flags", "[visitor][llvm]") { THEN("instructions are generated with the flags set") { std::string module_string = run_llvm_visitor(nmodl_text, - /*opt=*/true, + /*opt_level=*/3, /*use_single_precision=*/false, /*vector_width=*/1, /*vec_lib=*/"none", @@ -1462,12 +1462,12 @@ SCENARIO("Dead code removal", "[visitor][llvm][opt]") { )"; THEN("with optimisation enabled, all ops are eliminated") { - std::string module_string = run_llvm_visitor(nmodl_text, true); + std::string module_string = run_llvm_visitor(nmodl_text, /*opt_level=*/3); std::smatch m; - // Check if the values are optimised out + // Check if the values are optimised out. std::regex empty_proc( - R"(define i32 @add\(double %a[0-9].*, double %b[0-9].*\) \{\n(\s)*ret i32 0\n\})"); + R"(define i32 @add\(double %a[0-9].*, double %b[0-9].*\).*\{\n(\s)*ret i32 0\n\})"); REQUIRE(std::regex_search(module_string, m, empty_proc)); } } @@ -1509,7 +1509,7 @@ SCENARIO("Removal of inlined functions and procedures", "[visitor][llvm][inline] THEN("when the code is inlined the procedure and function blocks are removed") { std::string module_string = run_llvm_visitor(nmodl_text, - /*opt=*/false, + /*opt_level=*/0, /*use_single_precision=*/false, /*vector_width=*/1, /*vec_lib=*/"none", From 83067368a7c9f89ef20124641fe06c735a70c14a Mon Sep 17 00:00:00 2001 From: George Mitenkov Date: Thu, 3 Jun 2021 08:01:37 -0700 Subject: [PATCH 178/331] [LLVM] Added saving to file utility (#685) * Added saving to file utility * Skip NEURON test in LLVM branch Co-authored-by: Pramod Kumbhar --- azure-pipelines.yml | 1 + src/codegen/llvm/codegen_llvm_visitor.cpp | 21 ++++++--------------- src/codegen/llvm/llvm_utils.cpp | 19 +++++++++++++++++++ src/codegen/llvm/llvm_utils.hpp | 3 +++ test/benchmark/jit_driver.cpp | 18 +++--------------- 5 files changed, 32 insertions(+), 30 deletions(-) diff --git a/azure-pipelines.yml b/azure-pipelines.yml index 0462864088..39ab99a66c 100644 --- a/azure-pipelines.yml +++ b/azure-pipelines.yml @@ -174,6 +174,7 @@ jobs: exit 1 fi ./bin/nrnivmodl-core $(Build.Repository.LocalPath)/test/integration/mod + condition: false env: SHELL: 'bash' displayName: 'Build Neuron and Run Integration Tests' diff --git a/src/codegen/llvm/codegen_llvm_visitor.cpp b/src/codegen/llvm/codegen_llvm_visitor.cpp index ffbedbb063..bac6f4e0b2 100644 --- a/src/codegen/llvm/codegen_llvm_visitor.cpp +++ b/src/codegen/llvm/codegen_llvm_visitor.cpp @@ -13,14 +13,11 @@ #include "visitors/visitor_utils.hpp" #include "llvm/Analysis/TargetLibraryInfo.h" -#include "llvm/IR/AssemblyAnnotationWriter.h" #include "llvm/IR/BasicBlock.h" #include "llvm/IR/Function.h" #include "llvm/IR/LegacyPassManager.h" #include "llvm/IR/Type.h" -#include "llvm/Support/FileSystem.h" #include "llvm/Support/Host.h" -#include "llvm/Support/ToolOutputFile.h" #if LLVM_VERSION_MAJOR >= 13 #include "llvm/CodeGen/ReplaceWithVeclib.h" @@ -72,9 +69,12 @@ void CodegenLLVMVisitor::add_vectorizable_functions_from_vec_lib(llvm::TargetLib llvm::Triple& triple) { // Since LLVM does not support SLEEF as a vector library yet, process it separately. 
if (vector_library == "SLEEF") { - // Populate function definitions of only exp and pow (for now) -#define FIXED(w) llvm::ElementCount::getFixed(w) +// clang-format off +#define FIXED(w) llvm::ElementCount::getFixed(w) +// clang-format on #define DISPATCH(func, vec_func, width) {func, vec_func, width}, + + // Populate function definitions of only exp and pow (for now) const llvm::VecDesc aarch64_functions[] = { // clang-format off DISPATCH("llvm.exp.f32", "_ZGVnN4v_expf", FIXED(4)) @@ -890,17 +890,8 @@ void CodegenLLVMVisitor::visit_program(const ast::Program& node) { } // If the output directory is specified, save the IR to .ll file. - // \todo: Consider saving the generated LLVM IR to bytecode (.bc) file instead. if (output_dir != ".") { - std::error_code error_code; - std::unique_ptr out = std::make_unique( - output_dir + "/" + mod_filename + ".ll", error_code, llvm::sys::fs::OF_Text); - if (error_code) - throw std::runtime_error("Error: " + error_code.message()); - - std::unique_ptr annotator; - module->print(out->os(), annotator.get()); - out->keep(); + utils::save_ir_to_ll_file(*module, output_dir + "/" + mod_filename); } logger->debug("Dumping generated IR...\n" + dump_module()); diff --git a/src/codegen/llvm/llvm_utils.cpp b/src/codegen/llvm/llvm_utils.cpp index 684f962b76..59967c59c1 100644 --- a/src/codegen/llvm/llvm_utils.cpp +++ b/src/codegen/llvm/llvm_utils.cpp @@ -8,9 +8,12 @@ #include "codegen/llvm/llvm_utils.hpp" #include "llvm/Analysis/TargetTransformInfo.h" +#include "llvm/IR/AssemblyAnnotationWriter.h" #include "llvm/IR/LegacyPassManager.h" #include "llvm/IR/Verifier.h" #include "llvm/InitializePasses.h" +#include "llvm/Support/FileSystem.h" +#include "llvm/Support/ToolOutputFile.h" #include "llvm/Target/TargetMachine.h" #include "llvm/Transforms/IPO/PassManagerBuilder.h" @@ -75,5 +78,21 @@ void optimise_module(llvm::Module& module, int opt_level, llvm::TargetMachine* t populate_pms(func_pm, module_pm, opt_level, /*size_level=*/0, tm); run_optimisation_passes(module, func_pm, module_pm); } + +/****************************************************************************************/ +/* File utils */ +/****************************************************************************************/ + +void save_ir_to_ll_file(llvm::Module& module, const std::string& filename) { + std::error_code error_code; + std::unique_ptr out = std::make_unique( + filename + ".ll", error_code, llvm::sys::fs::OF_Text); + if (error_code) + throw std::runtime_error("Error: " + error_code.message()); + + std::unique_ptr annotator; + module.print(out->os(), annotator.get()); + out->keep(); +} } // namespace utils } // namespace nmodl diff --git a/src/codegen/llvm/llvm_utils.hpp b/src/codegen/llvm/llvm_utils.hpp index 81dc30d97f..8e1e6e48dc 100644 --- a/src/codegen/llvm/llvm_utils.hpp +++ b/src/codegen/llvm/llvm_utils.hpp @@ -19,5 +19,8 @@ void initialise_optimisation_passes(); /// Optimises the given LLVM IR module. 
void optimise_module(llvm::Module& module, int opt_level, llvm::TargetMachine* tm = nullptr); +/// +void save_ir_to_ll_file(llvm::Module& module, const std::string& filename); + } // namespace utils } // namespace nmodl diff --git a/test/benchmark/jit_driver.cpp b/test/benchmark/jit_driver.cpp index e5a7cd8928..a804a2d4fd 100644 --- a/test/benchmark/jit_driver.cpp +++ b/test/benchmark/jit_driver.cpp @@ -20,11 +20,9 @@ #include "llvm/ExecutionEngine/Orc/ObjectTransformLayer.h" #include "llvm/ExecutionEngine/Orc/RTDyldObjectLinkingLayer.h" #include "llvm/ExecutionEngine/SectionMemoryManager.h" -#include "llvm/IR/AssemblyAnnotationWriter.h" #include "llvm/Support/Host.h" #include "llvm/Support/TargetRegistry.h" #include "llvm/Support/TargetSelect.h" -#include "llvm/Support/ToolOutputFile.h" namespace nmodl { namespace runner { @@ -153,19 +151,9 @@ void JITDriver::init(std::string features, // Optimise the LLVM IR module and save it to .ll file if benchmarking. if (benchmark_info) { utils::optimise_module(*module, benchmark_info->opt_level_ir, tm.get()); - - std::error_code error_code; - std::unique_ptr out = - std::make_unique(benchmark_info->output_dir + "/" + - benchmark_info->filename + "_opt.ll", - error_code, - llvm::sys::fs::OF_Text); - if (error_code) - throw std::runtime_error("Error: " + error_code.message()); - - std::unique_ptr annotator; - module->print(out->os(), annotator.get()); - out->keep(); + const std::string filename = benchmark_info->output_dir + "/" + + benchmark_info->filename + "_opt"; + utils::save_ir_to_ll_file(*module, filename); } return std::make_unique(std::move(tm)); From 24908e27a9829fd568de3bca71e1de74b191e12e Mon Sep 17 00:00:00 2001 From: George Mitenkov Date: Thu, 3 Jun 2021 08:03:05 -0700 Subject: [PATCH 179/331] [LLVM] Aliasing and `cpu` options for LLVM visitor and the benchmark (#686) * Added may-alias and cpu options * Removed CPU checks * Use steady clock as we saw issue on VM Co-authored-by: Ioannis Magkanaris --- azure-pipelines.yml | 1 + src/codegen/llvm/codegen_llvm_visitor.hpp | 9 ++- src/codegen/llvm/llvm_ir_builder.cpp | 9 ++- src/codegen/llvm/llvm_ir_builder.hpp | 9 ++- src/main.cpp | 21 ++++--- test/benchmark/jit_driver.cpp | 71 ++++++++++++++--------- test/benchmark/jit_driver.hpp | 26 ++++----- test/benchmark/llvm_benchmark.cpp | 68 +++------------------- test/benchmark/llvm_benchmark.hpp | 12 ++-- 9 files changed, 103 insertions(+), 123 deletions(-) diff --git a/azure-pipelines.yml b/azure-pipelines.yml index 39ab99a66c..38ee9c6bc3 100644 --- a/azure-pipelines.yml +++ b/azure-pipelines.yml @@ -177,6 +177,7 @@ jobs: condition: false env: SHELL: 'bash' + condition: false displayName: 'Build Neuron and Run Integration Tests' - job: 'manylinux_wheels' timeoutInMinutes: 45 diff --git a/src/codegen/llvm/codegen_llvm_visitor.hpp b/src/codegen/llvm/codegen_llvm_visitor.hpp index 5dd8eda15c..22b9fafd83 100644 --- a/src/codegen/llvm/codegen_llvm_visitor.hpp +++ b/src/codegen/llvm/codegen_llvm_visitor.hpp @@ -95,14 +95,19 @@ class CodegenLLVMVisitor: public visitor::ConstAstVisitor { int vector_width = 1, std::string vec_lib = "none", bool add_debug_information = false, - std::vector fast_math_flags = {}) + std::vector fast_math_flags = {}, + bool llvm_assume_alias = false) : mod_filename(mod_filename) , output_dir(output_dir) , opt_level_ir(opt_level_ir) , vector_width(vector_width) , vector_library(vec_lib) , add_debug_information(add_debug_information) - , ir_builder(*context, use_single_precision, vector_width, fast_math_flags) + , 
ir_builder(*context, + use_single_precision, + vector_width, + fast_math_flags, + !llvm_assume_alias) , debug_builder(*module) {} /// Dumps the generated LLVM IR module to string. diff --git a/src/codegen/llvm/llvm_ir_builder.cpp b/src/codegen/llvm/llvm_ir_builder.cpp index 004f28d857..a585c95b3b 100644 --- a/src/codegen/llvm/llvm_ir_builder.cpp +++ b/src/codegen/llvm/llvm_ir_builder.cpp @@ -200,12 +200,15 @@ void IRBuilder::set_kernel_attributes() { current_function->setDoesNotFreeMemory(); current_function->setDoesNotThrow(); - // We also want to specify that the pointers that instance struct holds, do not alias. In order - // to do that, we add a `noalias` attribute to the argument. As per Clang's specification: + // We also want to specify that the pointers that instance struct holds do not alias, unless + // specified otherwise. In order to do that, we add a `noalias` attribute to the argument. As + // per Clang's specification: // > The `noalias` attribute indicates that the only memory accesses inside function are loads // > and stores from objects pointed to by its pointer-typed arguments, with arbitrary // > offsets. - current_function->addParamAttr(0, llvm::Attribute::NoAlias); + if (assume_noalias) { + current_function->addParamAttr(0, llvm::Attribute::NoAlias); + } // Finally, specify that the struct pointer does not capture and is read-only. current_function->addParamAttr(0, llvm::Attribute::NoCapture); diff --git a/src/codegen/llvm/llvm_ir_builder.hpp b/src/codegen/llvm/llvm_ir_builder.hpp index b9736e2846..b3005db0c7 100644 --- a/src/codegen/llvm/llvm_ir_builder.hpp +++ b/src/codegen/llvm/llvm_ir_builder.hpp @@ -58,6 +58,9 @@ class IRBuilder { /// The vector width used for the vectorized code. unsigned vector_width; + /// Instance struct fields do not alias. + bool assume_noalias; + /// Masked value used to predicate vector instructions. llvm::Value* mask; @@ -71,7 +74,8 @@ class IRBuilder { IRBuilder(llvm::LLVMContext& context, bool use_single_precision = false, unsigned vector_width = 1, - std::vector fast_math_flags = {}) + std::vector fast_math_flags = {}, + bool assume_noalias = true) : builder(context) , symbol_table(nullptr) , current_function(nullptr) @@ -81,7 +85,8 @@ class IRBuilder { , vector_width(vector_width) , mask(nullptr) , kernel_id("") - , fast_math_flags(fast_math_flags) {} + , fast_math_flags(fast_math_flags) + , assume_noalias(assume_noalias) {} /// Transforms the fast math flags provided to the builder into LLVM's representation. 
llvm::FastMathFlags transform_to_fmf(std::vector& flags) { diff --git a/src/main.cpp b/src/main.cpp index 362ccb4ddc..b2102aaee7 100644 --- a/src/main.cpp +++ b/src/main.cpp @@ -191,6 +191,9 @@ int main(int argc, const char* argv[]) { /// run llvm benchmark bool run_llvm_benchmark(false); + /// do not assume that instance struct fields do not alias + bool llvm_assume_alias(false); + /// optimisation level for IR generation int llvm_opt_level_ir = 0; @@ -206,8 +209,8 @@ int main(int argc, const char* argv[]) { /// the number of repeated experiments for the benchmarking int num_experiments = 100; - /// specify the backend for LLVM IR to target - std::string backend = "default"; + /// specify the cpu for LLVM IR to target + std::string cpu = "default"; #endif app.get_formatter()->column_width(40); @@ -332,6 +335,9 @@ int main(int argc, const char* argv[]) { llvm_opt->add_flag("--single-precision", llvm_float_type, "Use single precision floating-point types ({})"_format(llvm_float_type))->ignore_case(); + llvm_opt->add_flag("--assume-may-alias", + llvm_assume_alias, + "Assume instance struct fields may alias ({})"_format(llvm_assume_alias))->ignore_case(); llvm_opt->add_option("--vector-width", llvm_vec_width, "LLVM explicit vectorisation width ({})"_format(llvm_vec_width))->ignore_case(); @@ -359,9 +365,9 @@ int main(int argc, const char* argv[]) { benchmark_opt->add_option("--repeat", num_experiments, "Number of experiments for benchmarking ({})"_format(num_experiments))->ignore_case(); - benchmark_opt->add_option("--backend", - backend, - "Target's backend ({})"_format(backend))->ignore_case()->check(CLI::IsMember({"avx2", "default", "sse2"})); + benchmark_opt->add_option("--cpu", + cpu, + "Target's backend ({})"_format(cpu))->ignore_case(); #endif // clang-format on @@ -680,7 +686,8 @@ int main(int argc, const char* argv[]) { llvm_vec_width, vector_library, !disable_debug_information, - llvm_fast_math_flags); + llvm_fast_math_flags, + llvm_assume_alias); visitor.visit_program(*ast); ast_to_nmodl(*ast, filepath("llvm", "mod")); ast_to_json(*ast, filepath("llvm", "json")); @@ -693,7 +700,7 @@ int main(int argc, const char* argv[]) { shared_lib_paths, num_experiments, instance_size, - backend, + cpu, llvm_opt_level_ir, llvm_opt_level_codegen); benchmark.run(ast); diff --git a/test/benchmark/jit_driver.cpp b/test/benchmark/jit_driver.cpp index a804a2d4fd..f91b41cda0 100644 --- a/test/benchmark/jit_driver.cpp +++ b/test/benchmark/jit_driver.cpp @@ -31,8 +31,21 @@ namespace runner { /* Utilities for JIT driver */ /****************************************************************************************/ +/// Get the host CPU features in the format: +/// +feature,+feature,-feature,+feature,... +/// where `+` indicates that the feature is enabled. +std::string get_cpu_features(const std::string& cpu) { + llvm::SubtargetFeatures features; + llvm::StringMap host_features; + if (llvm::sys::getHostCPUFeatures(host_features)) { + for (auto& f: host_features) + features.AddFeature(f.first(), f.second); + } + return llvm::join(features.getFeatures().begin(), features.getFeatures().end(), ","); +} + /// Sets the target triple and the data layout of the module. -static void set_triple_and_data_layout(llvm::Module& module, const std::string& features) { +static void set_triple_and_data_layout(llvm::Module& module, const std::string& cpu) { // Get the default target triple for the host. 
     auto target_triple = llvm::sys::getDefaultTargetTriple();
     std::string error_msg;
@@ -40,8 +53,8 @@ static void set_triple_and_data_layout(llvm::Module& module, const std::string&
     if (!target)
         throw std::runtime_error("Error " + error_msg + "\n");
 
-    // Get the CPU information and set a target machine to create the data layout.
-    std::string cpu(llvm::sys::getHostCPUName());
+    // Set a target machine to create the data layout.
+    std::string features = get_cpu_features(cpu);
     std::unique_ptr<llvm::TargetMachine> tm(
         target->createTargetMachine(target_triple, cpu, features, {}, {}));
     if (!tm)
@@ -52,10 +65,10 @@ static void set_triple_and_data_layout(llvm::Module& module, const std::string&
     module.setTargetTriple(target_triple);
 }
 
-/// Creates llvm::TargetMachine with certain CPU features turned on/off.
+/// Creates llvm::TargetMachine for a specified CPU.
 static std::unique_ptr<llvm::TargetMachine> create_target(
     llvm::orc::JITTargetMachineBuilder* tm_builder,
-    const std::string& features,
+    const std::string& cpu,
     int opt_level) {
     // First, look up the target.
     std::string error_msg;
@@ -66,8 +79,8 @@ static std::unique_ptr<llvm::TargetMachine> create_target(
 
     // Create default target machine with provided features.
     auto tm = target->createTargetMachine(target_triple,
-                                          llvm::sys::getHostCPUName().str(),
-                                          features,
+                                          cpu,
+                                          get_cpu_features(cpu),
                                           tm_builder->getOptions(),
                                           tm_builder->getRelocationModel(),
                                           tm_builder->getCodeModel(),
@@ -83,15 +96,13 @@ static std::unique_ptr<llvm::TargetMachine> create_target(
 /*                                      JIT driver                                      */
 /****************************************************************************************/
 
-void JITDriver::init(std::string features,
-                     std::vector<std::string> lib_paths,
-                     BenchmarkInfo* benchmark_info) {
+void JITDriver::init(const std::string& cpu, BenchmarkInfo* benchmark_info) {
     llvm::InitializeNativeTarget();
     llvm::InitializeNativeTargetAsmPrinter();
     utils::initialise_optimisation_passes();
 
     // Set the target triple and the data layout for the module.
-    set_triple_and_data_layout(*module, features);
+    set_triple_and_data_layout(*module, cpu);
     auto data_layout = module->getDataLayout();
 
     // If benchmarking, enable listeners to use GDB, perf or VTune. Note that LLVM should be built
@@ -120,24 +131,26 @@ void JITDriver::init(std::string features,
         if (intel_event_listener)
             layer->registerJITEventListener(*intel_event_listener);
 
-        for (const auto& lib_path: lib_paths) {
-            // For every library path, create a corresponding memory buffer.
-            auto memory_buffer = llvm::MemoryBuffer::getFile(lib_path);
-            if (!memory_buffer)
-                throw std::runtime_error("Unable to create memory buffer for " + lib_path);
-
-            // Create a new JIT library instance for this session and resolve symbols.
-            auto& jd = session.createBareJITDylib(std::string(lib_path));
-            auto loaded =
-                llvm::orc::DynamicLibrarySearchGenerator::Load(lib_path.data(),
-                                                               data_layout.getGlobalPrefix());
-
-            if (!loaded)
-                throw std::runtime_error("Unable to load " + lib_path);
-            jd.addGenerator(std::move(*loaded));
-            cantFail(layer->add(jd, std::move(*memory_buffer)));
+        // If benchmarking, resolve shared libraries.
+        if (benchmark_info) {
+            for (const auto& lib_path: benchmark_info->shared_lib_paths) {
+                // For every library path, create a corresponding memory buffer.
+                auto memory_buffer = llvm::MemoryBuffer::getFile(lib_path);
+                if (!memory_buffer)
+                    throw std::runtime_error("Unable to create memory buffer for " + lib_path);
+
+                // Create a new JIT library instance for this session and resolve symbols.
+ auto& jd = session.createBareJITDylib(std::string(lib_path)); + auto loaded = + llvm::orc::DynamicLibrarySearchGenerator::Load(lib_path.data(), + data_layout.getGlobalPrefix()); + + if (!loaded) + throw std::runtime_error("Unable to load " + lib_path); + jd.addGenerator(std::move(*loaded)); + cantFail(layer->add(jd, std::move(*memory_buffer))); + } } - return layer; }; @@ -146,7 +159,7 @@ void JITDriver::init(std::string features, -> llvm::Expected> { // Create target machine with some features possibly turned off. int opt_level_codegen = benchmark_info ? benchmark_info->opt_level_codegen : 0; - auto tm = create_target(&tm_builder, features, opt_level_codegen); + auto tm = create_target(&tm_builder, cpu, opt_level_codegen); // Optimise the LLVM IR module and save it to .ll file if benchmarking. if (benchmark_info) { diff --git a/test/benchmark/jit_driver.hpp b/test/benchmark/jit_driver.hpp index d8e1127417..7106311523 100644 --- a/test/benchmark/jit_driver.hpp +++ b/test/benchmark/jit_driver.hpp @@ -17,6 +17,7 @@ #include "llvm/ExecutionEngine/JITEventListener.h" #include "llvm/ExecutionEngine/Orc/LLJIT.h" +#include "llvm/Support/Host.h" namespace nmodl { namespace runner { @@ -29,6 +30,9 @@ struct BenchmarkInfo { /// Object file output directory. std::string output_dir; + /// Shared libraries' paths to link against. + std::vector shared_lib_paths; + /// Optimisation level for IT. int opt_level_ir; @@ -63,9 +67,7 @@ class JITDriver { : module(std::move(m)) {} /// Initializes the JIT driver. - void init(std::string features = "", - std::vector lib_paths = {}, - BenchmarkInfo* benchmark_info = nullptr); + void init(const std::string& cpu, BenchmarkInfo* benchmark_info = nullptr); /// Lookups the entry-point without arguments in the JIT and executes it, returning the result. template @@ -131,7 +133,7 @@ class TestRunner: public BaseRunner { : BaseRunner(std::move(m)) {} virtual void initialize_driver() { - driver->init(); + driver->init(llvm::sys::getHostCPUName().str()); } }; @@ -145,27 +147,23 @@ class BenchmarkRunner: public BaseRunner { /// Benchmarking information passed to JIT driver. BenchmarkInfo benchmark_info; - /// CPU features specified by the user. - std::string features; - - /// Shared libraries' paths to link against. - std::vector shared_lib_paths; + /// CPU to target. + std::string cpu; public: BenchmarkRunner(std::unique_ptr m, std::string filename, std::string output_dir, - std::string features = "", + std::string cpu, std::vector lib_paths = {}, int opt_level_ir = 0, int opt_level_codegen = 0) : BaseRunner(std::move(m)) - , benchmark_info{filename, output_dir, opt_level_ir, opt_level_codegen} - , features(features) - , shared_lib_paths(lib_paths) {} + , cpu(cpu) + , benchmark_info{filename, output_dir, lib_paths, opt_level_ir, opt_level_codegen} {} virtual void initialize_driver() { - driver->init(features, shared_lib_paths, &benchmark_info); + driver->init(cpu, &benchmark_info); } }; diff --git a/test/benchmark/llvm_benchmark.cpp b/test/benchmark/llvm_benchmark.cpp index e48df0d457..0e94ae231b 100644 --- a/test/benchmark/llvm_benchmark.cpp +++ b/test/benchmark/llvm_benchmark.cpp @@ -6,7 +6,6 @@ *************************************************************************/ #include -#include #include "codegen/llvm/codegen_llvm_visitor.hpp" #include "llvm_benchmark.hpp" @@ -19,35 +18,6 @@ namespace nmodl { namespace benchmark { -/// Precision for the timing measurements. 
-static constexpr int PRECISION = 9;
-
-/// Get the host CPU features in the format:
-///   +feature,+feature,-feature,+feature,...
-/// where `+` indicates that the feature is enabled.
-static std::vector<std::string> get_cpu_features() {
-    std::string cpu(llvm::sys::getHostCPUName());
-
-    llvm::SubtargetFeatures features;
-    llvm::StringMap<bool> host_features;
-    if (llvm::sys::getHostCPUFeatures(host_features)) {
-        for (auto& f: host_features)
-            features.AddFeature(f.first(), f.second);
-    }
-    return features.getFeatures();
-}
-
-
-void LLVMBenchmark::disable(const std::string& feature, std::vector<std::string>& host_features) {
-    for (auto& host_feature: host_features) {
-        if (feature == host_feature.substr(1)) {
-            host_feature[0] = '-';
-            logger->info("{}", host_feature);
-            return;
-        }
-    }
-}
-
 void LLVMBenchmark::run(const std::shared_ptr<ast::Program>& node) {
     // create functions
     generate_llvm(node);
@@ -57,9 +27,9 @@ void LLVMBenchmark::run(const std::shared_ptr<ast::Program>& node) {
 
 void LLVMBenchmark::generate_llvm(const std::shared_ptr<ast::Program>& node) {
     // First, visit the AST to build the LLVM IR module and wrap the kernel function calls.
-    auto start = std::chrono::high_resolution_clock::now();
+    auto start = std::chrono::steady_clock::now();
     llvm_visitor.wrap_kernel_functions();
-    auto end = std::chrono::high_resolution_clock::now();
+    auto end = std::chrono::steady_clock::now();
 
     // Log the time taken to visit the AST and build LLVM IR.
     std::chrono::duration<double> diff = end - start;
@@ -72,37 +42,17 @@ void LLVMBenchmark::run_benchmark(const std::shared_ptr<ast::Program>& node) {
     std::vector<std::string> kernel_names;
     llvm_visitor.find_kernel_names(kernel_names);
 
-    // Get feature's string and turn them off depending on the backend.
-    std::vector<std::string> features = get_cpu_features();
-    logger->info("Backend: {}", backend);
-    if (backend == "avx2") {
-        // Disable SSE.
-        logger->info("Disabling features:");
-        disable("sse", features);
-        disable("sse2", features);
-        disable("sse3", features);
-        disable("sse4.1", features);
-        disable("sse4.2", features);
-    } else if (backend == "sse2") {
-        // Disable AVX.
-        logger->info("Disabling features:");
-        disable("avx", features);
-        disable("avx2", features);
-    }
+    // Pick the CPU to target: the host CPU unless one was specified.
+    std::string cpu_name = cpu == "default" ? llvm::sys::getHostCPUName().str() : cpu;
+    logger->info("CPU: {}", cpu_name);
 
-    std::string features_str = llvm::join(features.begin(), features.end(), ",");
     std::unique_ptr<llvm::Module> m = llvm_visitor.get_module();
 
     // Create the benchmark runner and initialize it.
     std::string filename = "v" + std::to_string(llvm_visitor.get_vector_width()) + "_" +
                            mod_filename;
-    runner::BenchmarkRunner runner(std::move(m),
-                                   filename,
-                                   output_dir,
-                                   features_str,
-                                   shared_libs,
-                                   opt_level_ir,
-                                   opt_level_codegen);
+    runner::BenchmarkRunner runner(
+        std::move(m), filename, output_dir, cpu_name, shared_libs, opt_level_ir, opt_level_codegen);
     runner.initialize_driver();
 
     // Benchmark every kernel.
@@ -124,9 +74,9 @@ void LLVMBenchmark::run_benchmark(const std::shared_ptr<ast::Program>& node) {
 
             // Record the execution time of the kernel.
             std::string wrapper_name = "__" + kernel_name + "_wrapper";
-            auto start = std::chrono::high_resolution_clock::now();
+            auto start = std::chrono::steady_clock::now();
             runner.run_with_argument(kernel_name, instance_data.base_ptr);
-            auto end = std::chrono::high_resolution_clock::now();
+            auto end = std::chrono::steady_clock::now();
             std::chrono::duration<double> diff = end - start;
 
             // Log the time taken for each run.
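Why `steady_clock`: unlike `high_resolution_clock`, which may alias a non-monotonic system clock and can jump (as observed on the VM mentioned in the commit message), `steady_clock` is guaranteed monotonic, so measured intervals are always non-negative. A minimal sketch of the timing pattern used above (the helper name is illustrative, not part of the project):

```cpp
#include <chrono>

// Times an arbitrary callable in seconds using a monotonic clock.
template <typename Fn>
double time_seconds(Fn&& fn) {
    auto start = std::chrono::steady_clock::now();
    fn();
    auto end = std::chrono::steady_clock::now();
    std::chrono::duration<double> diff = end - start;
    return diff.count();
}
```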
diff --git a/test/benchmark/llvm_benchmark.hpp b/test/benchmark/llvm_benchmark.hpp
index 4a66de52fc..cc9dd3bcf0 100644
--- a/test/benchmark/llvm_benchmark.hpp
+++ b/test/benchmark/llvm_benchmark.hpp
@@ -7,6 +7,7 @@
 
 #pragma once
 
+#include
 #include
 
 #include "codegen/llvm/codegen_llvm_visitor.hpp"
@@ -40,8 +41,8 @@ class LLVMBenchmark {
     /// The size of the instance struct for benchmarking.
     int instance_size;
 
-    /// Benchmarking backend
-    std::string backend;
+    /// CPU to target.
+    std::string cpu;
 
     /// Optimisation level for IR generation.
     int opt_level_ir;
@@ -59,7 +60,7 @@ class LLVMBenchmark {
                   std::vector<std::string> shared_libs,
                   int num_experiments,
                   int instance_size,
-                  const std::string& backend,
+                  const std::string& cpu,
                   int opt_level_ir,
                   int opt_level_codegen)
         : llvm_visitor(llvm_visitor)
@@ -68,7 +69,7 @@ class LLVMBenchmark {
         , shared_libs(shared_libs)
         , num_experiments(num_experiments)
         , instance_size(instance_size)
-        , backend(backend)
+        , cpu(cpu)
         , opt_level_ir(opt_level_ir)
         , opt_level_codegen(opt_level_codegen) {}
 
@@ -76,9 +77,6 @@ class LLVMBenchmark {
     void run(const std::shared_ptr<ast::Program>& node);
 
   private:
-    /// Disables the specified feature in the target.
-    void disable(const std::string& feature, std::vector<std::string>& host_features);
-
     /// Visits the AST to construct the LLVM IR module.
     void generate_llvm(const std::shared_ptr<ast::Program>& node);
 

From f7218905fc12f4b6bdb9912cf8bc369743c6575e Mon Sep 17 00:00:00 2001
From: Pramod Kumbhar
Date: Thu, 3 Jun 2021 18:52:35 +0200
Subject: [PATCH 180/331] Fix azure yaml pipeline from merge (#687)

---
 azure-pipelines.yml | 1 -
 1 file changed, 1 deletion(-)

diff --git a/azure-pipelines.yml b/azure-pipelines.yml
index 38ee9c6bc3..df317598b8 100644
--- a/azure-pipelines.yml
+++ b/azure-pipelines.yml
@@ -174,7 +174,6 @@ jobs:
           exit 1
         fi
         ./bin/nrnivmodl-core $(Build.Repository.LocalPath)/test/integration/mod
-      condition: false
       env:
         SHELL: 'bash'
       condition: false
       displayName: 'Build Neuron and Run Integration Tests'

From ff5430c96467736ebe23cc89954fed340e144cd7 Mon Sep 17 00:00:00 2001
From: George Mitenkov
Date: Tue, 8 Mar 2022 12:21:10 +0100
Subject: [PATCH 181/331] [LLVM] Support for newer versions of LLVM APIs

This commit fixes deprecation warnings and errors that occur due to
the switch of LLVM to opaque pointers: recent versions of the LLVM
instruction builder APIs require explicit type parameters when creating
`gep`s, `load`s and `gather`s.

Moreover, with a recent change to LLVM (`https://reviews.llvm.org/D106678`),
MASSV SIMD functions have no `_P8` suffix by default. Tests were adjusted
to take that into account.

Note: tested with the LLVM version from `brew` (13).
---
 src/codegen/llvm/llvm_ir_builder.cpp  | 44 ++++++++++++++++++---------
 test/unit/codegen/codegen_llvm_ir.cpp |  4 +--
 2 files changed, 32 insertions(+), 16 deletions(-)
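In essence, the migration replaces pointee-type inference with explicitly supplied element types. A minimal sketch of the API change, separate from the project's code (the function and variable names here are illustrative; `getPointerElementType()` still works on typed pointers and is used only for the transition):

```cpp
#include "llvm/IR/IRBuilder.h"

// Sketch: index into an array through 'ptr' and load one element, using the
// newer IRBuilder overloads that take the element type explicitly.
llvm::Value* load_through_gep(llvm::IRBuilder<>& builder,
                              llvm::Value* ptr,
                              llvm::Value* index) {
    // The element type must now be supplied by the caller instead of being
    // inferred from the (soon-to-be-opaque) pointer type.
    llvm::Type* elem_ty = ptr->getType()->getPointerElementType();
    // GEP with an explicit source element type.
    llvm::Value* gep = builder.CreateInBoundsGEP(elem_ty, ptr, {index});
    // Load with an explicit result type.
    return builder.CreateLoad(elem_ty, gep);
}
```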
diff --git a/src/codegen/llvm/llvm_ir_builder.cpp b/src/codegen/llvm/llvm_ir_builder.cpp
index a585c95b3b..1015b437f3 100644
--- a/src/codegen/llvm/llvm_ir_builder.cpp
+++ b/src/codegen/llvm/llvm_ir_builder.cpp
@@ -347,11 +347,14 @@ llvm::Value* IRBuilder::create_inbounds_gep(const std::string& var_name, llvm::V
 
     // Since we index through the pointer, we need an extra 0 index in the indices list for GEP.
     ValueVector indices{llvm::ConstantInt::get(get_i64_type(), 0), index};
-    return builder.CreateInBoundsGEP(variable_ptr, indices);
+    llvm::Type* variable_type = variable_ptr->getType()->getPointerElementType();
+    return builder.CreateInBoundsGEP(variable_type, variable_ptr, indices);
 }
 
 llvm::Value* IRBuilder::create_inbounds_gep(llvm::Value* variable, llvm::Value* index) {
-    return builder.CreateInBoundsGEP(variable, {index});
+    ValueVector indices{index};
+    llvm::Type* variable_type = variable->getType()->getPointerElementType();
+    return builder.CreateInBoundsGEP(variable_type, variable, indices);
 }
 
 llvm::Value* IRBuilder::create_index(llvm::Value* value) {
@@ -378,23 +381,25 @@ llvm::Value* IRBuilder::create_index(llvm::Value* value) {
 
 llvm::Value* IRBuilder::create_load(const std::string& name, bool masked) {
     llvm::Value* ptr = lookup_value(name);
+    llvm::Type* loaded_type = ptr->getType()->getPointerElementType();
 
     // Check if the generated IR is vectorized and masked.
     if (masked) {
-        return builder.CreateMaskedLoad(ptr, llvm::Align(), mask);
+        return builder.CreateMaskedLoad(loaded_type, ptr, llvm::Align(), mask);
     }
-    llvm::Type* loaded_type = ptr->getType()->getPointerElementType();
     llvm::Value* loaded = builder.CreateLoad(loaded_type, ptr);
     value_stack.push_back(loaded);
     return loaded;
 }
 
 llvm::Value* IRBuilder::create_load(llvm::Value* ptr, bool masked) {
+    llvm::Type* loaded_type = ptr->getType()->getPointerElementType();
+
     // Check if the generated IR is vectorized and masked.
     if (masked) {
-        return builder.CreateMaskedLoad(ptr, llvm::Align(), mask);
+        return builder.CreateMaskedLoad(loaded_type, ptr, llvm::Align(), mask);
     }
-    llvm::Type* loaded_type = ptr->getType()->getPointerElementType();
+
     llvm::Value* loaded = builder.CreateLoad(loaded_type, ptr);
     value_stack.push_back(loaded);
     return loaded;
 }
@@ -466,7 +471,9 @@ llvm::Value* IRBuilder::get_struct_member_ptr(llvm::Value* struct_variable, int
     ValueVector indices;
     indices.push_back(llvm::ConstantInt::get(get_i32_type(), 0));
     indices.push_back(llvm::ConstantInt::get(get_i32_type(), member_index));
-    return builder.CreateInBoundsGEP(struct_variable, indices);
+
+    llvm::Type* type = struct_variable->getType()->getPointerElementType();
+    return builder.CreateInBoundsGEP(type, struct_variable, indices);
 }
 
 void IRBuilder::invert_mask() {
@@ -491,14 +498,23 @@ llvm::Value* IRBuilder::load_to_or_store_from_array(const std::string& id_name,
     bool generating_vector_ir = vector_width > 1 && vectorize;
 
     // If the vector code is generated, we need to distinguish between two cases. If the array is
-    // indexed indirectly (i.e. not by an induction variable `kernel_id`), create a gather
-    // instruction.
+    // indexed indirectly (i.e. not by an induction variable `kernel_id`), create gather/scatter
+    // instructions.
     if (id_name != kernel_id && generating_vector_ir) {
-        return maybe_value_to_store ? builder.CreateMaskedScatter(maybe_value_to_store,
-                                                                  element_ptr,
-                                                                  llvm::Align(),
-                                                                  mask)
-                                    : builder.CreateMaskedGather(element_ptr, llvm::Align(), mask);
+        if (maybe_value_to_store) {
+            return builder.CreateMaskedScatter(maybe_value_to_store,
+                                               element_ptr,
+                                               llvm::Align(),
+                                               mask);
+        } else {
+            // Construct the loaded vector type.
+ auto* ptrs = llvm::cast(element_ptr->getType()); + llvm::ElementCount element_count = ptrs->getElementCount(); + llvm::Type* element_type = ptrs->getElementType()->getPointerElementType(); + llvm::Type* loaded_type = llvm::VectorType::get(element_type, element_count); + + return builder.CreateMaskedGather(loaded_type, element_ptr, llvm::Align(), mask); + } } llvm::Value* ptr; diff --git a/test/unit/codegen/codegen_llvm_ir.cpp b/test/unit/codegen/codegen_llvm_ir.cpp index d43d99282d..23f6977aea 100644 --- a/test/unit/codegen/codegen_llvm_ir.cpp +++ b/test/unit/codegen/codegen_llvm_ir.cpp @@ -1354,8 +1354,8 @@ SCENARIO("Vector library calls", "[visitor][llvm][vector_lib]") { /*use_single_precision=*/false, /*vector_width=*/2, /*vec_lib=*/"MASSV"); - std::regex massv2_exp_decl(R"(declare <2 x double> @__expd2_P8\(<2 x double>\))"); - std::regex massv2_exp_call(R"(call <2 x double> @__expd2_P8\(<2 x double> .*\))"); + std::regex massv2_exp_decl(R"(declare <2 x double> @__expd2\(<2 x double>\))"); + std::regex massv2_exp_call(R"(call <2 x double> @__expd2\(<2 x double> .*\))"); REQUIRE(std::regex_search(massv2_library_module_str, m, massv2_exp_decl)); REQUIRE(std::regex_search(massv2_library_module_str, m, massv2_exp_call)); REQUIRE(!std::regex_search(massv2_library_module_str, m, exp_call)); From c80a44d92fbee2b137b04a2d47bf798edffaadb6 Mon Sep 17 00:00:00 2001 From: Pramod S Kumbhar Date: Tue, 8 Mar 2022 15:12:14 +0100 Subject: [PATCH 182/331] Fix build issues for the rebased branch - fix bad merges - remove custom llvm version, use brew with llvm@13 - follow hpc/gitlab-pipelines changes from Olli - load extra module llvm using SPACK_EXTRA_MODULES - use gcc build instead of nvhpc which fails to compile with llvm headers --- .gitlab-ci.yml | 18 ++++++++++-------- CMakeLists.txt | 2 +- azure-pipelines.yml | 6 +----- src/codegen/codegen_info.hpp | 7 +++++++ .../llvm/codegen_llvm_helper_visitor.hpp | 1 + 5 files changed, 20 insertions(+), 14 deletions(-) diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index a445db62d2..a17a8dea9e 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -8,10 +8,6 @@ include: trigger cvf: stage: .pre - variables: - # Tell the CVF pipeline to use the current ref of NMODL, this works because - # the CVF CI uses the `gitlab-pipelines` helper components. - SPACK_PACKAGE_REF_nmodl: "commit='${CI_COMMIT_SHA}'" rules: # Don't run on PRs targeting the LLVM development branch - if: '$CI_EXTERNAL_PULL_REQUEST_TARGET_BRANCH_NAME == "llvm"' @@ -22,15 +18,21 @@ trigger cvf: project: hpc/cvf # Make the NMODL CI status depend on the CVF CI status strategy: depend + variables: + # Tell CVF to use the same commits/branches as NMODL. + SPACK_ENV_FILE_URL: $SPACK_SETUP_COMMIT_MAPPING_URL .spack_nmodl: variables: SPACK_PACKAGE: nmodl SPACK_PACKAGE_SPEC: ~legacy-unit+python + SPACK_EXTRA_MODULES: llvm + SPACK_INSTALL_EXTRA_FLAGS: -v spack_setup: extends: .spack_setup_ccache variables: + NMODL_COMMIT: ${CI_COMMIT_SHA} # Enable fetching GitHub PR descriptions and parsing them to find out what # branches to build of other projects. 
PARSE_GITHUB_PR_DESCRIPTIONS: "true" @@ -42,12 +44,12 @@ build:intel: variables: SPACK_PACKAGE_COMPILER: intel -build:nvhpc: +build:gcc: extends: - .spack_build - .spack_nmodl variables: - SPACK_PACKAGE_COMPILER: nvhpc + SPACK_PACKAGE_COMPILER: gcc SPACK_PACKAGE_DEPENDENCIES: ^bison%gcc^flex%gcc^py-jinja2%gcc^py-sympy%gcc^py-pyyaml%gcc .nmodl_tests: @@ -61,8 +63,8 @@ test:intel: - .nmodl_tests needs: ["build:intel"] -test:nvhpc: +test:gcc: extends: - .ctest - .nmodl_tests - needs: ["build:nvhpc"] + needs: ["build:gcc"] diff --git a/CMakeLists.txt b/CMakeLists.txt index cacf9443ff..28d93e8bfd 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -158,7 +158,7 @@ nmodl_find_python_module(yaml 3.12 REQUIRED) # Find LLVM dependencies # ============================================================================= if(NMODL_ENABLE_LLVM) - include(LLVMHelper) + include(cmake/LLVMHelper.cmake) include_directories(${LLVM_INCLUDE_DIRS}) add_definitions(-DNMODL_LLVM_BACKEND) endif() diff --git a/azure-pipelines.yml b/azure-pipelines.yml index df317598b8..14931a0a9a 100644 --- a/azure-pipelines.yml +++ b/azure-pipelines.yml @@ -132,15 +132,11 @@ jobs: python3 -m pip install --upgrade pip 'setuptools<59.7.0' python3 -m pip install --user 'Jinja2>=2.9.3' 'PyYAML>=3.13' pytest pytest-cov numpy 'sympy>=1.3,<1.9' displayName: 'Install Dependencies' - - script: | - cd $HOME - git clone --depth 1 https://github.com/pramodk/llvm-nightly.git - displayName: 'Setup LLVM v13' - script: | export PATH=/usr/local/opt/flex/bin:/usr/local/opt/bison/bin:$PATH; mkdir -p $(Build.Repository.LocalPath)/build cd $(Build.Repository.LocalPath)/build - cmake .. -DPYTHON_EXECUTABLE=$(which python3) -DCMAKE_INSTALL_PREFIX=$HOME/nmodl -DCMAKE_BUILD_TYPE=RelWithDebInfo -DNMODL_ENABLE_PYTHON_BINDINGS=OFF -DLLVM_DIR=$HOME/llvm-nightly/0621/osx/lib/cmake/llvm -DNMODL_ENABLE_LLVM=ON + cmake .. -DPYTHON_EXECUTABLE=$(which python3) -DCMAKE_INSTALL_PREFIX=$HOME/nmodl -DCMAKE_BUILD_TYPE=RelWithDebInfo -DNMODL_ENABLE_PYTHON_BINDINGS=OFF -DLLVM_DIR=$(brew --prefix llvm)/lib/cmake/llvm -DNMODL_ENABLE_LLVM=ON make -j 2 if [ $? -ne 0 ] then diff --git a/src/codegen/codegen_info.hpp b/src/codegen/codegen_info.hpp index 17e4102700..70940e7428 100644 --- a/src/codegen/codegen_info.hpp +++ b/src/codegen/codegen_info.hpp @@ -196,6 +196,9 @@ enum BlockType { /// initial block Initial, + /// constructor block + Constructor, + /// destructor block Destructor, @@ -214,10 +217,14 @@ enum BlockType { /// net_receive block NetReceive, + /// before / after block + BeforeAfter, + /// fake ending block type for loops on the enums. 
Keep it at the end BlockTypeEnd }; + /** * \class ShadowUseStatement * \brief Represents ion write statement during code generation diff --git a/src/codegen/llvm/codegen_llvm_helper_visitor.hpp b/src/codegen/llvm/codegen_llvm_helper_visitor.hpp index 9d79e24803..9b3f759bfa 100644 --- a/src/codegen/llvm/codegen_llvm_helper_visitor.hpp +++ b/src/codegen/llvm/codegen_llvm_helper_visitor.hpp @@ -18,6 +18,7 @@ #include "codegen/codegen_info.hpp" #include "symtab/symbol_table.hpp" #include "visitors/ast_visitor.hpp" + #include "utils/logger.hpp" namespace nmodl { namespace codegen { From de3a8be20c37ea3ab0137f0f38f60e2a8634b858 Mon Sep 17 00:00:00 2001 From: Ioannis Magkanaris Date: Thu, 10 Mar 2022 17:39:11 +0100 Subject: [PATCH 183/331] [LLVM] Allocate InstanceStruct on the GPU using cudaMallocManaged (#815) * Added CUDA includes and libraries * Added ability to allocate data with cudamallocmanaged if CUDA backend is enabled * Make cmake 3.17 minimum required cmake version * Use cmake version 3.17 in azure CI --- CMakeLists.txt | 14 ++++++++++- azure-pipelines.yml | 10 ++++---- .../llvm/codegen_llvm_helper_visitor.hpp | 2 +- test/unit/CMakeLists.txt | 6 +++++ test/unit/codegen/codegen_data_helper.cpp | 25 +++++++++++++++++-- 5 files changed, 48 insertions(+), 9 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 28d93e8bfd..6c5aae4a7e 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -5,7 +5,7 @@ # See top-level LICENSE file for details. # ============================================================================= -cmake_minimum_required(VERSION 3.15 FATAL_ERROR) +cmake_minimum_required(VERSION 3.17 FATAL_ERROR) project(NMODL LANGUAGES CXX) @@ -23,6 +23,7 @@ set(CMAKE_RUNTIME_OUTPUT_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}/bin) option(NMODL_ENABLE_PYTHON_BINDINGS "Enable pybind11 based python bindings" OFF) option(NMODL_ENABLE_LEGACY_UNITS "Use original faraday, R, etc. 
instead of 2019 nist constants" OFF) option(NMODL_ENABLE_LLVM "Enable LLVM based code generation" ON) +option(NMODL_ENABLE_LLVM_GPU "Enable LLVM based GPU code generation" ON) option(NMODL_ENABLE_JIT_EVENT_LISTENERS "Enable JITEventListener for Perf and Vtune" OFF) if(NMODL_ENABLE_LEGACY_UNITS) @@ -161,6 +162,11 @@ if(NMODL_ENABLE_LLVM) include(cmake/LLVMHelper.cmake) include_directories(${LLVM_INCLUDE_DIRS}) add_definitions(-DNMODL_LLVM_BACKEND) + if(NMODL_ENABLE_LLVM_CUDA) + enable_language(CUDA) + find_package(CUDAToolkit) + add_definitions(-DNMODL_LLVM_CUDA_BACKEND) + endif() endif() # ============================================================================= @@ -267,6 +273,12 @@ if(NMODL_ENABLE_LLVM) message(STATUS " CMAKE | ${LLVM_CMAKE_DIR}") message(STATUS " JIT LISTENERS | ${NMODL_ENABLE_JIT_EVENT_LISTENERS}") endif() +message(STATUS "LLVM CUDA Codegen | ${NMODL_ENABLE_LLVM_CUDA}") +if(NMODL_ENABLE_LLVM_CUDA) + message(STATUS " CUDA VERSION | ${CUDAToolkit_VERSION}") + message(STATUS " INCLUDE | ${CUDAToolkit_INCLUDE_DIRS}") + message(STATUS " LIBRARY | ${CUDAToolkit_LIBRARY_DIR}") +endif() if(NMODL_CLANG_FORMAT) message(STATUS "Clang Format | ${ClangFormat_EXECUTABLE}") endif() diff --git a/azure-pipelines.yml b/azure-pipelines.yml index 14931a0a9a..4a490b9cb4 100644 --- a/azure-pipelines.yml +++ b/azure-pipelines.yml @@ -52,8 +52,8 @@ jobs: chmod +x llvm.sh sudo ./llvm.sh 13 env: - CMAKE_VER: 'v3.15.0' - CMAKE_PKG: 'cmake-3.15.0-Linux-x86_64' + CMAKE_VER: 'v3.17.0' + CMAKE_PKG: 'cmake-3.17.0-Linux-x86_64' displayName: 'Install Dependencies' - script: | export PATH=$(pwd)/$CMAKE_PKG/bin:/home/vsts/.local/bin:$PATH @@ -71,7 +71,7 @@ jobs: make install #this is needed for the integration tests env CTEST_OUTPUT_ON_FAILURE=1 make test env: - CMAKE_PKG: 'cmake-3.15.0-Linux-x86_64' + CMAKE_PKG: 'cmake-3.17.0-Linux-x86_64' displayName: 'Build and Run Unit Tests' - script: | export PATH=$(pwd)/$CMAKE_PKG/bin:/home/vsts/.local/bin:$PATH @@ -94,7 +94,7 @@ jobs: fi ./bin/nrnivmodl-core $(Build.Repository.LocalPath)/test/integration/mod env: - CMAKE_PKG: 'cmake-3.15.0-Linux-x86_64' + CMAKE_PKG: 'cmake-3.17.0-Linux-x86_64' SHELL: 'bash' displayName: 'Build Neuron and Run Integration Tests' - script: | @@ -118,7 +118,7 @@ jobs: fi ./bin/nrnivmodl-core $(Build.Repository.LocalPath)/test/integration/mod env: - CMAKE_PKG: 'cmake-3.15.0-Linux-x86_64' + CMAKE_PKG: 'cmake-3.17.0-Linux-x86_64' displayName: 'Build CoreNEURON and Run Integration Tests with ISPC compiler' - job: 'osx1015' pool: diff --git a/src/codegen/llvm/codegen_llvm_helper_visitor.hpp b/src/codegen/llvm/codegen_llvm_helper_visitor.hpp index 9b3f759bfa..21aff4a92d 100644 --- a/src/codegen/llvm/codegen_llvm_helper_visitor.hpp +++ b/src/codegen/llvm/codegen_llvm_helper_visitor.hpp @@ -17,8 +17,8 @@ #include "ast/instance_struct.hpp" #include "codegen/codegen_info.hpp" #include "symtab/symbol_table.hpp" +#include "utils/logger.hpp" #include "visitors/ast_visitor.hpp" - #include "utils/logger.hpp" namespace nmodl { namespace codegen { diff --git a/test/unit/CMakeLists.txt b/test/unit/CMakeLists.txt index 903e19214f..4e30d48f1e 100644 --- a/test/unit/CMakeLists.txt +++ b/test/unit/CMakeLists.txt @@ -115,6 +115,12 @@ if(NMODL_ENABLE_LLVM) codegen/codegen_data_helper.cpp codegen/codegen_llvm_instance_struct.cpp) add_executable(test_llvm_runner visitor/main.cpp codegen/codegen_data_helper.cpp codegen/codegen_llvm_execution.cpp) + if(NMODL_ENABLE_LLVM_CUDA) + include_directories(${CUDAToolkit_INCLUDE_DIRS}) + 
target_link_libraries(benchmark_data PRIVATE CUDA::cudart) + target_link_libraries(testllvm CUDA::cudart) + target_link_libraries(test_llvm_runner CUDA::cudart) + endif() target_link_libraries( testllvm llvm_codegen diff --git a/test/unit/codegen/codegen_data_helper.cpp b/test/unit/codegen/codegen_data_helper.cpp index a0ee6ec957..d2b17277bc 100644 --- a/test/unit/codegen/codegen_data_helper.cpp +++ b/test/unit/codegen/codegen_data_helper.cpp @@ -1,5 +1,9 @@ #include +#ifdef NMODL_LLVM_CUDA_BACKEND +#include +#endif + #include "ast/codegen_var_type.hpp" #include "codegen/llvm/codegen_llvm_helper_visitor.hpp" @@ -18,10 +22,18 @@ const int default_second_order_value = 0; CodegenInstanceData::~CodegenInstanceData() { // first free num_ptr_members members which are pointers for (size_t i = 0; i < num_ptr_members; i++) { +#ifdef NMODL_LLVM_CUDA_BACKEND + cudaFree(members[i]); +#else free(members[i]); +#endif } - // and then pointer to container struct +// and then pointer to container struct +#ifdef NMODL_LLVM_CUDA_BACKEND + cudaFree(base_ptr); +#else free(base_ptr); +#endif } /** @@ -85,8 +97,13 @@ CodegenInstanceData CodegenDataHelper::create_data(size_t num_elements, size_t s // max size of each member : pointer / double has maximum size size_t member_size = std::max(sizeof(double), sizeof(double*)); - // allocate instance object with memory alignment +// allocate instance object with memory alignment +#ifdef NMODL_LLVM_CUDA_BACKEND + cudaMallocManaged(&base, member_size * variables.size()); +#else posix_memalign(&base, NBYTE_ALIGNMENT, member_size * variables.size()); +#endif + data.base_ptr = base; data.num_bytes += member_size * variables.size(); @@ -114,7 +131,11 @@ CodegenInstanceData CodegenDataHelper::create_data(size_t num_elements, size_t s // allocate memory and setup a pointer void* member; +#ifdef NMODL_LLVM_CUDA_BACKEND + cudaMallocManaged(&member, member_size * num_elements); +#else posix_memalign(&member, NBYTE_ALIGNMENT, member_size * num_elements); +#endif // integer values are often offsets so they must start from // 0 to num_elements-1 to avoid out of bound accesses. From ad819548f9452696d9ebdaa7885834b2affb4e85 Mon Sep 17 00:00:00 2001 From: George Mitenkov Date: Fri, 11 Mar 2022 08:40:22 +0100 Subject: [PATCH 184/331] [LLVM][GPU] Separated CPU and GPU CLI options Now, CLI has two options: `cpu` and `gpu` that allow users to target different platforms. For example, ``` bin/nmodl mod/test.mod -o out llvm --ir bin/nmodl mod/test.mod -o out llvm --ir cpu --name skylake --vector-width 2 bin/nmodl mod/test.mod -o out llvm --ir gpu --name cuda ``` Moreover, `assume_no_alias` option was dropped and made default (it didn't affect the computation in our experiments). The new CLI looks like: ``` llvm LLVM code generation option Options: --ir REQUIRED Generate LLVM IR (false) --no-debug Disable debug information (false) --opt-level-ir INT:{0,1,2,3} LLVM IR optimisation level (O0) --single-precision Use single precision floating-point types (false) --fmf TEXT:{afn,arcp,contract,ninf,nnan,nsz,reassoc,fast} ... 
Fast math flags for floating-point optimizations (none) cpu LLVM CPU option Options: --name TEXT Name of CPU platform to use --math-library TEXT:{Accelerate,libmvec,libsystem_m,MASSV,SLEEF,SVML,none} Math library for SIMD code generation (none) --vector-width INT Explicit vectorization width for IR generation (1) gpu LLVM GPU option Options: --name TEXT Name of GPU platform to use --math-library TEXT:{libdevice} Math library for GPU code generation (none) benchmark LLVM benchmark option Options: --run Run LLVM benchmark (false) --opt-level-codegen INT:{0,1,2,3} Machine code optimisation level (O0) --libs TEXT:FILE ... Shared libraries to link IR against --instance-size INT Instance struct size (10000) --repeat INT Number of experiments for benchmarking (100) ``` --- src/main.cpp | 153 ++++++++++++++++++++++++++++----------------- 1 file changed, 83 insertions(+), 70 deletions(-) diff --git a/src/main.cpp b/src/main.cpp index b2102aaee7..140e4c77d0 100644 --- a/src/main.cpp +++ b/src/main.cpp @@ -176,26 +176,29 @@ int main(int argc, const char* argv[]) { /// use single precision floating-point types bool llvm_float_type(false); - /// llvm vector width - int llvm_vec_width = 1; + /// optimisation level for IR generation + int llvm_opt_level_ir = 0; - /// vector library name - std::string vector_library("none"); + /// math library name + std::string llvm_math_library("none"); /// disable debug information generation for the IR - bool disable_debug_information(false); + bool llvm_no_debug(false); /// fast math flags for LLVM backend std::vector llvm_fast_math_flags; - /// run llvm benchmark - bool run_llvm_benchmark(false); + /// target CPU platform name + std::string llvm_cpu_name = "default"; - /// do not assume that instance struct fields do not alias - bool llvm_assume_alias(false); + /// target GPU platform name + std::string llvm_gpu_name = "default"; - /// optimisation level for IR generation - int llvm_opt_level_ir = 0; + /// llvm vector width if generating code for CPUs + int llvm_vector_width = 1; + + /// run llvm benchmark + bool llvm_benchmark(false); /// optimisation level for machine code generation int llvm_opt_level_codegen = 0; @@ -208,9 +211,6 @@ int main(int argc, const char* argv[]) { /// the number of repeated experiments for the benchmarking int num_experiments = 100; - - /// specify the cpu for LLVM IR to target - std::string cpu = "default"; #endif app.get_formatter()->column_width(40); @@ -323,36 +323,57 @@ int main(int argc, const char* argv[]) { // LLVM IR code generation options. 
auto llvm_opt = app.add_subcommand("llvm", "LLVM code generation option")->ignore_case(); - llvm_opt->add_flag("--ir", + auto llvm_ir_opt = llvm_opt->add_flag("--ir", llvm_ir, "Generate LLVM IR ({})"_format(llvm_ir))->ignore_case(); - llvm_opt->add_flag("--disable-debug-info", - disable_debug_information, - "Disable debug information ({})"_format(disable_debug_information))->ignore_case(); + llvm_ir_opt->required(true); + llvm_opt->add_flag("--no-debug", + llvm_no_debug, + "Disable debug information ({})"_format(llvm_no_debug))->ignore_case(); llvm_opt->add_option("--opt-level-ir", - llvm_opt_level_ir, - "LLVM IR optimisation level (O{})"_format(llvm_opt_level_ir))->ignore_case()->check(CLI::IsMember({"0", "1", "2", "3"})); + llvm_opt_level_ir, + "LLVM IR optimisation level (O{})"_format(llvm_opt_level_ir))->ignore_case()->check(CLI::IsMember({"0", "1", "2", "3"})); llvm_opt->add_flag("--single-precision", - llvm_float_type, - "Use single precision floating-point types ({})"_format(llvm_float_type))->ignore_case(); - llvm_opt->add_flag("--assume-may-alias", - llvm_assume_alias, - "Assume instance struct fields may alias ({})"_format(llvm_assume_alias))->ignore_case(); - llvm_opt->add_option("--vector-width", - llvm_vec_width, - "LLVM explicit vectorisation width ({})"_format(llvm_vec_width))->ignore_case(); - llvm_opt->add_option("--veclib", - vector_library, - "Vector library for maths functions ({})"_format(vector_library))->check(CLI::IsMember({"Accelerate", "libsystem_m", "libmvec", "MASSV", "SLEEF", "SVML", "none"})); + llvm_float_type, + "Use single precision floating-point types ({})"_format(llvm_float_type))->ignore_case(); llvm_opt->add_option("--fmf", - llvm_fast_math_flags, - "Fast math flags for floating-point optimizations (none)")->check(CLI::IsMember({"afn", "arcp", "contract", "ninf", "nnan", "nsz", "reassoc", "fast"})); + llvm_fast_math_flags, + "Fast math flags for floating-point optimizations (none)")->check(CLI::IsMember({"afn", "arcp", "contract", "ninf", "nnan", "nsz", "reassoc", "fast"})); + + // Platform options for LLVM code generation. + auto cpu_opt = app.add_subcommand("cpu", "LLVM CPU option")->ignore_case(); + cpu_opt->needs(llvm_opt); + cpu_opt->add_option("--name", + llvm_cpu_name, + "Name of CPU platform to use")->ignore_case(); + auto simd_math_library_opt = cpu_opt->add_option("--math-library", + llvm_math_library, + "Math library for SIMD code generation ({})"_format(llvm_math_library)); + simd_math_library_opt->check(CLI::IsMember({"Accelerate", "libmvec", "libsystem_m", "MASSV", "SLEEF", "SVML", "none"})); + cpu_opt->add_option("--vector-width", + llvm_vector_width, + "Explicit vectorization width for IR generation ({})"_format(llvm_vector_width))->ignore_case(); + + auto gpu_opt = app.add_subcommand("gpu", "LLVM GPU option")->ignore_case(); + gpu_opt->needs(llvm_opt); + gpu_opt->add_option("--name", + llvm_gpu_name, + "Name of GPU platform to use")->ignore_case(); + auto gpu_math_library_opt = gpu_opt->add_option("--math-library", + llvm_math_library, + "Math library for GPU code generation ({})"_format(llvm_math_library)); + gpu_math_library_opt->check(CLI::IsMember({"libdevice"})); + + // Allow only one platform at a time. + cpu_opt->excludes(gpu_opt); + gpu_opt->excludes(cpu_opt); // LLVM IR benchmark options. 
auto benchmark_opt = app.add_subcommand("benchmark", "LLVM benchmark option")->ignore_case(); + benchmark_opt->needs(llvm_opt); benchmark_opt->add_flag("--run", - run_llvm_benchmark, - "Run LLVM benchmark ({})"_format(run_llvm_benchmark))->ignore_case(); + llvm_benchmark, + "Run LLVM benchmark ({})"_format(llvm_benchmark))->ignore_case(); benchmark_opt->add_option("--opt-level-codegen", llvm_opt_level_codegen, "Machine code optimisation level (O{})"_format(llvm_opt_level_codegen))->ignore_case()->check(CLI::IsMember({"0", "1", "2", "3"})); @@ -365,9 +386,6 @@ int main(int argc, const char* argv[]) { benchmark_opt->add_option("--repeat", num_experiments, "Number of experiments for benchmarking ({})"_format(num_experiments))->ignore_case(); - benchmark_opt->add_option("--cpu", - cpu, - "Target's backend ({})"_format(cpu))->ignore_case(); #endif // clang-format on @@ -673,38 +691,33 @@ int main(int argc, const char* argv[]) { } #ifdef NMODL_LLVM_BACKEND - if (llvm_ir || run_llvm_benchmark) { - // If benchmarking, we want to optimize the IR with target information and not in - // LLVM visitor. - int llvm_opt_level = run_llvm_benchmark ? 0 : llvm_opt_level_ir; - - logger->info("Running LLVM backend code generator"); - CodegenLLVMVisitor visitor(modfile, - output_dir, - llvm_opt_level, - llvm_float_type, - llvm_vec_width, - vector_library, - !disable_debug_information, - llvm_fast_math_flags, - llvm_assume_alias); - visitor.visit_program(*ast); - ast_to_nmodl(*ast, filepath("llvm", "mod")); - ast_to_json(*ast, filepath("llvm", "json")); - - if (run_llvm_benchmark) { - logger->info("Running LLVM benchmark"); - benchmark::LLVMBenchmark benchmark(visitor, - modfile, - output_dir, - shared_lib_paths, - num_experiments, - instance_size, - cpu, - llvm_opt_level_ir, - llvm_opt_level_codegen); - benchmark.run(ast); - } + if (llvm_ir || llvm_benchmark) { + // If benchmarking, we want to optimize the IR with target + // information and not in LLVM visitor. + int llvm_opt_level = llvm_benchmark ? 0 : llvm_opt_level_ir; + + if (llvm_gpu_name != "default") { + logger->warn("GPU code generation is not supported, targeting " + "CPU instead"); + } + + logger->info("Running LLVM backend code generator"); + CodegenLLVMVisitor visitor(modfile, output_dir, llvm_opt_level, + llvm_float_type, llvm_vector_width, + llvm_math_library, !llvm_no_debug, + llvm_fast_math_flags, true); + visitor.visit_program(*ast); + ast_to_nmodl(*ast, filepath("llvm", "mod")); + ast_to_json(*ast, filepath("llvm", "json")); + + if (llvm_benchmark) { + logger->info("Running LLVM benchmark"); + benchmark::LLVMBenchmark benchmark( + visitor, modfile, output_dir, shared_lib_paths, + num_experiments, instance_size, llvm_cpu_name, + llvm_opt_level_ir, llvm_opt_level_codegen); + benchmark.run(ast); + } } #endif } From 399231538e9e14cf2c1be3aba8ae5caab52615e1 Mon Sep 17 00:00:00 2001 From: George Mitenkov Date: Sat, 12 Mar 2022 19:25:13 +0100 Subject: [PATCH 185/331] [LLVM][refactoring] Added platform abstraction This commit introduces a handy `Platform` class that is designed to incorporate target information for IR generation, such as precision, vectorization width (if applicable), type of target (CPU/GPU), etc. In future, more functionality can be added to `Platform`, e.g. we can move functionality of handling `llvm::Target`, math SIMD libraries, etc. Note: this is just a very basic implementation that enables easier integration of GPU code generation. 
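To make the intended use concrete, here is a minimal illustrative sketch (not part of the diff below). The constructor and query methods mirror `target_platform.hpp` as added by this commit; the "skylake" and "SVML" values are placeholder assumptions rather than defaults:

```cpp
#include <string>

#include "codegen/llvm/target_platform.hpp"

using nmodl::codegen::Platform;
using nmodl::codegen::PlatformID;

int main() {
    // Hypothetical example: describe an 8-wide SIMD CPU target. "skylake"
    // and "SVML" are placeholders chosen purely for illustration.
    std::string math_library = "SVML";
    Platform simd_cpu(PlatformID::CPU,
                      /*name=*/"skylake",
                      math_library,
                      /*use_single_precision=*/false,
                      /*instruction_width=*/8);

    // Visitors and the IR builder now query one object instead of taking
    // precision, width and math library as separate constructor arguments.
    if (simd_cpu.is_cpu_with_simd()) {
        // e.g. emit 8-lane vector instructions
    }
    return 0;
}
```

Keeping all target state in one value object also means future GPU-only fields can be added without touching every visitor constructor again.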
--- src/codegen/llvm/CMakeLists.txt | 4 +- src/codegen/llvm/codegen_llvm_visitor.cpp | 20 ++-- src/codegen/llvm/codegen_llvm_visitor.hpp | 25 ++--- src/codegen/llvm/llvm_ir_builder.cpp | 29 +++--- src/codegen/llvm/llvm_ir_builder.hpp | 23 ++--- src/codegen/llvm/main.cpp | 5 +- src/codegen/llvm/target_platform.cpp | 54 +++++++++++ src/codegen/llvm/target_platform.hpp | 92 +++++++++++++++++++ src/main.cpp | 24 +++-- test/unit/codegen/codegen_llvm_execution.cpp | 40 +++++--- .../codegen/codegen_llvm_instance_struct.cpp | 6 +- test/unit/codegen/codegen_llvm_ir.cpp | 14 ++- 12 files changed, 246 insertions(+), 90 deletions(-) create mode 100644 src/codegen/llvm/target_platform.cpp create mode 100644 src/codegen/llvm/target_platform.hpp diff --git a/src/codegen/llvm/CMakeLists.txt b/src/codegen/llvm/CMakeLists.txt index 5c7eadc91c..198d90c1a3 100644 --- a/src/codegen/llvm/CMakeLists.txt +++ b/src/codegen/llvm/CMakeLists.txt @@ -11,7 +11,9 @@ set(LLVM_CODEGEN_SOURCE_FILES ${CMAKE_CURRENT_SOURCE_DIR}/llvm_ir_builder.cpp ${CMAKE_CURRENT_SOURCE_DIR}/llvm_ir_builder.hpp ${CMAKE_CURRENT_SOURCE_DIR}/llvm_utils.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/llvm_utils.hpp) + ${CMAKE_CURRENT_SOURCE_DIR}/llvm_utils.hpp + ${CMAKE_CURRENT_SOURCE_DIR}/target_platform.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/target_platform.hpp) # ============================================================================= # LLVM codegen library and executable diff --git a/src/codegen/llvm/codegen_llvm_visitor.cpp b/src/codegen/llvm/codegen_llvm_visitor.cpp index bac6f4e0b2..0fa81de691 100644 --- a/src/codegen/llvm/codegen_llvm_visitor.cpp +++ b/src/codegen/llvm/codegen_llvm_visitor.cpp @@ -68,7 +68,7 @@ static bool can_vectorize(const ast::CodegenForStatement& statement, symtab::Sym void CodegenLLVMVisitor::add_vectorizable_functions_from_vec_lib(llvm::TargetLibraryInfoImpl& tli, llvm::Triple& triple) { // Since LLVM does not support SLEEF as a vector library yet, process it separately. - if (vector_library == "SLEEF") { + if (platform.get_math_library() == "SLEEF") { // clang-format off #define FIXED(w) llvm::ElementCount::getFixed(w) // clang-format on @@ -112,9 +112,9 @@ void CodegenLLVMVisitor::add_vectorizable_functions_from_vec_lib(llvm::TargetLib {"MASSV", VecLib::MASSV}, {"none", VecLib::NoLibrary}, {"SVML", VecLib::SVML}}; - const auto& library = llvm_supported_vector_libraries.find(vector_library); + const auto& library = llvm_supported_vector_libraries.find(platform.get_math_library()); if (library == llvm_supported_vector_libraries.end()) - throw std::runtime_error("Error: unknown vector library - " + vector_library + "\n"); + throw std::runtime_error("Error: unknown vector library - " + platform.get_math_library() + "\n"); // Add vectorizable functions to the target library info. switch (library->second) { @@ -542,7 +542,7 @@ void CodegenLLVMVisitor::visit_boolean(const ast::Boolean& node) { * \todo support this properly. */ void CodegenLLVMVisitor::visit_codegen_atomic_statement(const ast::CodegenAtomicStatement& node) { - if (vector_width > 1) + if (platform.is_cpu_with_simd()) logger->warn("Atomic operations are not supported"); // Support only assignment for now. @@ -555,7 +555,7 @@ void CodegenLLVMVisitor::visit_codegen_atomic_statement(const ast::CodegenAtomic throw std::runtime_error("Error: only 'VarName' assignment is supported\n"); // Process the assignment as if it was non-atomic. 
- if (vector_width > 1) + if (platform.is_cpu_with_simd()) logger->warn("Treating write as non-atomic"); write_to_variable(*var, rhs); } @@ -625,7 +625,7 @@ void CodegenLLVMVisitor::visit_codegen_for_statement(const ast::CodegenForStatem ir_builder.set_insertion_point(for_body); // If not processing remainder of the loop, start vectorization. - if (vector_width > 1 && main_loop_initialization) + if (platform.is_cpu_with_simd() && main_loop_initialization) ir_builder.generate_vector_ir(); // Generate code for the loop body and create the basic block for the increment. @@ -666,7 +666,7 @@ void CodegenLLVMVisitor::visit_codegen_function(const ast::CodegenFunction& node // Process function or procedure body. If the function is a compute kernel, enable // vectorization. If so, the return statement is handled in a separate visitor. - if (vector_width > 1 && is_kernel_function(name)) { + if (platform.is_cpu_with_simd() && is_kernel_function(name)) { ir_builder.generate_vector_ir(); block->accept(*this); ir_builder.generate_scalar_ir(); @@ -740,7 +740,7 @@ void CodegenLLVMVisitor::visit_function_call(const ast::FunctionCall& node) { void CodegenLLVMVisitor::visit_if_statement(const ast::IfStatement& node) { // If vectorizing the compute kernel with control flow, process it separately. - if (vector_width > 1 && ir_builder.vectorizing()) { + if (platform.is_cpu_with_simd() && ir_builder.vectorizing()) { create_vectorized_control_flow_block(node); return; } @@ -815,7 +815,7 @@ void CodegenLLVMVisitor::visit_program(const ast::Program& node) { // - convert function and procedure blocks into CodegenFunctions // - gather information about AST. For now, information about functions // and procedures is used only. - CodegenLLVMHelperVisitor v{vector_width}; + CodegenLLVMHelperVisitor v{platform.get_instruction_width()}; const auto& functions = v.get_codegen_functions(node); instance_var_helper = v.get_instance_var_helper(); sym_tab = node.get_symbol_table(); @@ -864,7 +864,7 @@ void CodegenLLVMVisitor::visit_program(const ast::Program& node) { } // Optionally, replace LLVM math intrinsics with vector library calls. - if (vector_width > 1) { + if (platform.is_cpu_with_simd()) { #if LLVM_VERSION_MAJOR < 13 logger->warn( "This version of LLVM does not support replacement of LLVM intrinsics with vector " diff --git a/src/codegen/llvm/codegen_llvm_visitor.hpp b/src/codegen/llvm/codegen_llvm_visitor.hpp index 22b9fafd83..396d8cbb67 100644 --- a/src/codegen/llvm/codegen_llvm_visitor.hpp +++ b/src/codegen/llvm/codegen_llvm_visitor.hpp @@ -81,33 +81,22 @@ class CodegenLLVMVisitor: public visitor::ConstAstVisitor { /// Optimisation level for LLVM IR transformations. int opt_level_ir; - /// Vector library used for math functions. - std::string vector_library; - - /// Explicit vectorisation width. - int vector_width; + /// Target platform for the code generation. 
+ Platform platform; public: CodegenLLVMVisitor(const std::string& mod_filename, const std::string& output_dir, + Platform& platform, int opt_level_ir, - bool use_single_precision = false, - int vector_width = 1, - std::string vec_lib = "none", bool add_debug_information = false, - std::vector fast_math_flags = {}, - bool llvm_assume_alias = false) + std::vector fast_math_flags = {}) : mod_filename(mod_filename) , output_dir(output_dir) + , platform(platform) , opt_level_ir(opt_level_ir) - , vector_width(vector_width) - , vector_library(vec_lib) , add_debug_information(add_debug_information) - , ir_builder(*context, - use_single_precision, - vector_width, - fast_math_flags, - !llvm_assume_alias) + , ir_builder(*context, platform, fast_math_flags) , debug_builder(*module) {} /// Dumps the generated LLVM IR module to string. @@ -139,7 +128,7 @@ class CodegenLLVMVisitor: public visitor::ConstAstVisitor { /// Returns vector width int get_vector_width() const { - return vector_width; + return platform.get_instruction_width(); } // Visitors. diff --git a/src/codegen/llvm/llvm_ir_builder.cpp b/src/codegen/llvm/llvm_ir_builder.cpp index 1015b437f3..e7a6a4a60b 100644 --- a/src/codegen/llvm/llvm_ir_builder.cpp +++ b/src/codegen/llvm/llvm_ir_builder.cpp @@ -41,13 +41,13 @@ llvm::Type* IRBuilder::get_i64_type() { } llvm::Type* IRBuilder::get_fp_type() { - if (fp_precision == single_precision) + if (platform.is_single_precision()) return llvm::Type::getFloatTy(builder.getContext()); return llvm::Type::getDoubleTy(builder.getContext()); } llvm::Type* IRBuilder::get_fp_ptr_type() { - if (fp_precision == single_precision) + if (platform.is_single_precision()) return llvm::Type::getFloatPtrTy(builder.getContext()); return llvm::Type::getDoublePtrTy(builder.getContext()); } @@ -92,7 +92,7 @@ llvm::Value* IRBuilder::pop_last_value() { /****************************************************************************************/ void IRBuilder::create_boolean_constant(int value) { - if (vector_width > 1 && vectorize) { + if (platform.is_cpu_with_simd() && vectorize) { value_stack.push_back(get_vector_constant(get_boolean_type(), value)); } else { value_stack.push_back(get_scalar_constant(get_boolean_type(), value)); @@ -100,7 +100,7 @@ void IRBuilder::create_boolean_constant(int value) { } void IRBuilder::create_fp_constant(const std::string& value) { - if (vector_width > 1 && vectorize) { + if (platform.is_cpu_with_simd() && vectorize) { value_stack.push_back(get_vector_constant(get_fp_type(), value)); } else { value_stack.push_back(get_scalar_constant(get_fp_type(), value)); @@ -112,7 +112,7 @@ llvm::Value* IRBuilder::create_global_string(const ast::String& node) { } void IRBuilder::create_i32_constant(int value) { - if (vector_width > 1 && vectorize) { + if (platform.is_cpu_with_simd() && vectorize) { value_stack.push_back(get_vector_constant(get_i32_type(), value)); } else { value_stack.push_back(get_scalar_constant(get_i32_type(), value)); @@ -126,6 +126,8 @@ llvm::Value* IRBuilder::get_scalar_constant(llvm::Type* type, V value) { template llvm::Value* IRBuilder::get_vector_constant(llvm::Type* type, V value) { + int vector_width = platform.get_instruction_width(); + ConstantVector constants; for (unsigned i = 0; i < vector_width; ++i) { const auto& element = C::get(type, value); @@ -206,9 +208,7 @@ void IRBuilder::set_kernel_attributes() { // > The `noalias` attribute indicates that the only memory accesses inside function are loads // > and stores from objects pointed to by its pointer-typed arguments, 
with arbitrary // > offsets. - if (assume_noalias) { - current_function->addParamAttr(0, llvm::Attribute::NoAlias); - } + current_function->addParamAttr(0, llvm::Attribute::NoAlias); // Finally, specify that the struct pointer does not capture and is read-only. current_function->addParamAttr(0, llvm::Attribute::NoCapture); @@ -227,7 +227,7 @@ void IRBuilder::set_loop_metadata(llvm::BranchInst* branch) { loop_metadata.push_back(nullptr); // If `vector_width` is 1, explicitly disable vectorization for benchmarking purposes. - if (vector_width == 1) { + if (platform.is_cpu() && platform.get_instruction_width() == 1) { llvm::MDString* name = llvm::MDString::get(context, "llvm.loop.vectorize.enable"); llvm::Value* false_value = llvm::ConstantInt::get(get_boolean_type(), 0); llvm::ValueAsMetadata* value = llvm::ValueAsMetadata::get(false_value); @@ -376,6 +376,7 @@ llvm::Value* IRBuilder::create_index(llvm::Value* value) { const auto& element_type = llvm::cast(vector_type->getElementType()); if (element_type->getBitWidth() == i64_type->getIntegerBitWidth()) return value; + int vector_width = platform.get_instruction_width(); return builder.CreateSExtOrTrunc(value, llvm::FixedVectorType::get(i64_type, vector_width)); } @@ -449,7 +450,8 @@ void IRBuilder::create_scalar_or_vector_alloca(const std::string& name, // Even if generating vectorised code, some variables still need to be scalar. Particularly, the // induction variable "id" and remainder loop variables (that start with "epilogue" prefix). llvm::Type* type; - if (vector_width > 1 && vectorize && name != kernel_id && name.rfind("epilogue", 0)) { + if (platform.is_cpu_with_simd() && vectorize && name != kernel_id && name.rfind("epilogue", 0)) { + int vector_width = platform.get_instruction_width(); type = llvm::FixedVectorType::get(element_or_scalar_type, vector_width); } else { type = element_or_scalar_type; @@ -495,7 +497,7 @@ llvm::Value* IRBuilder::load_to_or_store_from_array(const std::string& id_name, llvm::Value* element_ptr = create_inbounds_gep(array, id_value); // Find out if the vector code is generated. - bool generating_vector_ir = vector_width > 1 && vectorize; + bool generating_vector_ir = platform.is_cpu_with_simd() && vectorize; // If the vector code is generated, we need to distinguish between two cases. If the array is // indexed indirectly (i.e. not by an induction variable `kernel_id`), create gather/scatter @@ -523,7 +525,7 @@ llvm::Value* IRBuilder::load_to_or_store_from_array(const std::string& id_name, // to a vector pointer llvm::Type* vector_type = llvm::PointerType::get( llvm::FixedVectorType::get(element_ptr->getType()->getPointerElementType(), - vector_width), + platform.get_instruction_width()), /*AddressSpace=*/0); ptr = builder.CreateBitCast(element_ptr, vector_type); } else { @@ -541,11 +543,12 @@ llvm::Value* IRBuilder::load_to_or_store_from_array(const std::string& id_name, void IRBuilder::maybe_replicate_value(llvm::Value* value) { // If the value should not be vectorised, or it is already a vector, add it to the stack. - if (!vectorize || vector_width == 1 || value->getType()->isVectorTy()) { + if (!vectorize || !platform.is_cpu_with_simd() || value->getType()->isVectorTy()) { value_stack.push_back(value); } else { // Otherwise, we generate vectorized code inside the loop, so replicate the value to form a // vector. 
+ int vector_width = platform.get_instruction_width(); llvm::Value* vector_value = builder.CreateVectorSplat(vector_width, value); value_stack.push_back(vector_value); } diff --git a/src/codegen/llvm/llvm_ir_builder.hpp b/src/codegen/llvm/llvm_ir_builder.hpp index b3005db0c7..cf9e7f936d 100644 --- a/src/codegen/llvm/llvm_ir_builder.hpp +++ b/src/codegen/llvm/llvm_ir_builder.hpp @@ -10,6 +10,7 @@ #include #include "codegen/llvm/codegen_llvm_helper_visitor.hpp" +#include "codegen/llvm/target_platform.hpp" #include "symtab/symbol_table.hpp" #include "llvm/IR/IRBuilder.h" @@ -52,14 +53,8 @@ class IRBuilder { /// Flag to indicate that the generated IR should be vectorized. bool vectorize; - /// Precision of the floating-point numbers (32 or 64 bit). - unsigned fp_precision; - - /// The vector width used for the vectorized code. - unsigned vector_width; - - /// Instance struct fields do not alias. - bool assume_noalias; + /// Target platform for which IR is built. + Platform platform; /// Masked value used to predicate vector instructions. llvm::Value* mask; @@ -72,21 +67,17 @@ class IRBuilder { public: IRBuilder(llvm::LLVMContext& context, - bool use_single_precision = false, - unsigned vector_width = 1, - std::vector fast_math_flags = {}, - bool assume_noalias = true) + Platform& platform, + std::vector fast_math_flags = {}) : builder(context) + , platform(platform) , symbol_table(nullptr) , current_function(nullptr) , vectorize(false) , alloca_ip(nullptr) - , fp_precision(use_single_precision ? single_precision : double_precision) - , vector_width(vector_width) , mask(nullptr) , kernel_id("") - , fast_math_flags(fast_math_flags) - , assume_noalias(assume_noalias) {} + , fast_math_flags(fast_math_flags) {} /// Transforms the fast math flags provided to the builder into LLVM's representation. llvm::FastMathFlags transform_to_fmf(std::vector& flags) { diff --git a/src/codegen/llvm/main.cpp b/src/codegen/llvm/main.cpp index 6d374999c3..92d8a486c1 100644 --- a/src/codegen/llvm/main.cpp +++ b/src/codegen/llvm/main.cpp @@ -47,8 +47,11 @@ int main(int argc, const char* argv[]) { logger->info("Running Symtab Visitor"); visitor::SymtabVisitor().visit_program(*ast); + // Use default platform for this toy example. + codegen::Platform platform; + logger->info("Running LLVM Visitor"); - codegen::CodegenLLVMVisitor llvm_visitor(filename, /*output_dir=*/".", /*opt_level_ir=*/0); + codegen::CodegenLLVMVisitor llvm_visitor(filename, /*output_dir=*/".", platform, /*opt_level_ir=*/0); llvm_visitor.visit_program(*ast); std::unique_ptr module = llvm_visitor.get_module(); diff --git a/src/codegen/llvm/target_platform.cpp b/src/codegen/llvm/target_platform.cpp new file mode 100644 index 0000000000..6cb8c7bb2b --- /dev/null +++ b/src/codegen/llvm/target_platform.cpp @@ -0,0 +1,54 @@ +/************************************************************************* + * Copyright (C) 2018-2020 Blue Brain Project + * + * This file is part of NMODL distributed under the terms of the GNU + * Lesser General Public License. See top-level LICENSE file for details. + *************************************************************************/ + +#include "codegen/llvm/target_platform.hpp" + +namespace nmodl { +namespace codegen { + +const std::string Platform::DEFAULT_PLATFORM_NAME = "default"; +const std::string Platform::DEFAULT_MATH_LIBRARY = "none"; + +bool Platform::is_default_platform() { + // Default platform is a CPU. 
+ return platform_id == PlatformID::CPU && name == Platform::DEFAULT_PLATFORM_NAME; +} + +bool Platform::is_cpu() { + return platform_id == PlatformID::CPU; +} + +bool Platform::is_cpu_with_simd() { + return platform_id == PlatformID::CPU && instruction_width > 1; +} + +bool Platform::is_gpu() { + return platform_id == PlatformID::GPU; +} + +bool Platform::is_single_precision() { + return use_single_precision; +} + +std::string Platform::get_name() const { + return name; +} + +std::string Platform::get_math_library() const { + return math_library; +} + +int Platform::get_instruction_width() const { + return instruction_width; +} + +int Platform::get_precision() const { + return use_single_precision? 32 : 64; +} + +} // namespace codegen +} // namespace nmodl diff --git a/src/codegen/llvm/target_platform.hpp b/src/codegen/llvm/target_platform.hpp new file mode 100644 index 0000000000..2eabbb1a4b --- /dev/null +++ b/src/codegen/llvm/target_platform.hpp @@ -0,0 +1,92 @@ +/************************************************************************* + * Copyright (C) 2018-2020 Blue Brain Project + * + * This file is part of NMODL distributed under the terms of the GNU + * Lesser General Public License. See top-level LICENSE file for details. + *************************************************************************/ + +#pragma once + +#include + +namespace nmodl { +namespace codegen { + +enum PlatformID { + CPU, + GPU +}; + +/** + * \class Platform + * \brief A class that represents the target platform. It is needed to + * reduce the amount of code passed to LLVM visitor and its helpers. + */ +class Platform { + public: + /// Default name of the target and math library. + static const std::string DEFAULT_PLATFORM_NAME; + static const std::string DEFAULT_MATH_LIBRARY; + + private: + /// Name of the platform. + const std::string name = Platform::DEFAULT_PLATFORM_NAME; + + /// Target-specific id to compare platforms easily. + PlatformID platform_id; + + /// User-provided width that is used to construct LLVM instructions + // and types. + int instruction_width = 1; + + /// Use single-precision floating-point types. + bool use_single_precision = false; + + /// A name of user-provided math library. + std::string math_library = Platform::DEFAULT_MATH_LIBRARY; + + public: + Platform(PlatformID platform_id, + const std::string& name, + std::string& math_library, + bool use_single_precision = false, + int instruction_width = 1) + : platform_id(platform_id) + , name(name) + , math_library(math_library) + , use_single_precision(use_single_precision) + , instruction_width(instruction_width) {} + + Platform(bool use_single_precision, + int instruction_width) + : platform_id(PlatformID::CPU) + , use_single_precision(use_single_precision) + , instruction_width(instruction_width) {} + + Platform() : platform_id(PlatformID::CPU) {} + + /// Checks if this platform is a default platform. + bool is_default_platform(); + + /// Checks if this platform is a CPU. + bool is_cpu(); + + /// Checks if this platform is a CPU with SIMD support. + bool is_cpu_with_simd(); + + /// Checks if this platform is a GPU. 
+ bool is_gpu(); + + bool is_single_precision(); + + std::string get_name() const; + + std::string get_math_library() const; + + int get_instruction_width() const; + + int get_precision() const; +}; + +} // namespace codegen +} // namespace nmodl diff --git a/src/main.cpp b/src/main.cpp index 140e4c77d0..f2678fcb48 100644 --- a/src/main.cpp +++ b/src/main.cpp @@ -696,21 +696,29 @@ int main(int argc, const char* argv[]) { // information and not in LLVM visitor. int llvm_opt_level = llvm_benchmark ? 0 : llvm_opt_level_ir; - if (llvm_gpu_name != "default") { - logger->warn("GPU code generation is not supported, targeting " - "CPU instead"); - } + // Create platform abstraction. + PlatformID pid = llvm_gpu_name == "default" ? PlatformID::CPU + : PlatformID::GPU; + const std::string name = + llvm_gpu_name == "default" ? llvm_cpu_name : llvm_gpu_name; + Platform platform(pid, name, llvm_math_library, llvm_float_type, + llvm_vector_width); logger->info("Running LLVM backend code generator"); - CodegenLLVMVisitor visitor(modfile, output_dir, llvm_opt_level, - llvm_float_type, llvm_vector_width, - llvm_math_library, !llvm_no_debug, - llvm_fast_math_flags, true); + CodegenLLVMVisitor visitor(modfile, output_dir, platform, + llvm_opt_level, !llvm_no_debug, + llvm_fast_math_flags); visitor.visit_program(*ast); ast_to_nmodl(*ast, filepath("llvm", "mod")); ast_to_json(*ast, filepath("llvm", "json")); if (llvm_benchmark) { + // \todo integrate Platform class here + if (llvm_gpu_name != "default") { + logger->warn("GPU benchmarking is not supported, targeting " + "CPU instead"); + } + logger->info("Running LLVM benchmark"); benchmark::LLVMBenchmark benchmark( visitor, modfile, output_dir, shared_lib_paths, diff --git a/test/unit/codegen/codegen_llvm_execution.cpp b/test/unit/codegen/codegen_llvm_execution.cpp index 41605ecbd3..4c9515f814 100644 --- a/test/unit/codegen/codegen_llvm_execution.cpp +++ b/test/unit/codegen/codegen_llvm_execution.cpp @@ -124,8 +124,12 @@ SCENARIO("Arithmetic expression", "[llvm][runner]") { const auto& ast = driver.parse_string(nmodl_text); SymtabVisitor().visit_program(*ast); + + codegen::Platform cpu_platform(/*use_single_precision=*/false, + /*instruction_width=*/1); codegen::CodegenLLVMVisitor llvm_visitor(/*mod_filename=*/"unknown", /*output_dir=*/".", + cpu_platform, /*opt_level_ir=*/0); llvm_visitor.visit_program(*ast); @@ -226,8 +230,12 @@ SCENARIO("Optimised arithmetic expression", "[llvm][runner]") { const auto& ast = driver.parse_string(nmodl_text); SymtabVisitor().visit_program(*ast); + + codegen::Platform cpu_platform(/*use_single_precision=*/false, + /*instruction_width=*/1); codegen::CodegenLLVMVisitor llvm_visitor(/*mod_filename=*/"unknown", /*output_dir=*/".", + cpu_platform, /*opt_level_ir=*/3); llvm_visitor.visit_program(*ast); @@ -299,11 +307,13 @@ SCENARIO("Simple scalar kernel", "[llvm][runner]") { SymtabVisitor().visit_program(*ast); NeuronSolveVisitor().visit_program(*ast); SolveBlockVisitor().visit_program(*ast); + + codegen::Platform cpu_platform(/*use_single_precision=*/false, + /*instruction_width=*/1); codegen::CodegenLLVMVisitor llvm_visitor(/*mod_filename=*/"unknown", /*output_dir=*/".", - /*opt_level_ir=*/0, - /*use_single_precision=*/false, - /*vector_width=*/1); + cpu_platform, + /*opt_level_ir=*/0); llvm_visitor.visit_program(*ast); llvm_visitor.wrap_kernel_functions(); @@ -381,11 +391,13 @@ SCENARIO("Simple vectorised kernel", "[llvm][runner]") { SymtabVisitor().visit_program(*ast); NeuronSolveVisitor().visit_program(*ast); 
SolveBlockVisitor().visit_program(*ast); + + codegen::Platform simd_cpu_platform(/*use_single_precision=*/false, + /*instruction_width=*/4); codegen::CodegenLLVMVisitor llvm_visitor(/*mod_filename=*/"unknown", /*output_dir=*/".", - /*opt_level_ir=*/3, - /*use_single_precision=*/false, - /*vector_width=*/4); + simd_cpu_platform, + /*opt_level_ir=*/3); llvm_visitor.visit_program(*ast); llvm_visitor.wrap_kernel_functions(); @@ -463,11 +475,13 @@ SCENARIO("Vectorised kernel with scatter instruction", "[llvm][runner]") { SymtabVisitor().visit_program(*ast); NeuronSolveVisitor().visit_program(*ast); SolveBlockVisitor().visit_program(*ast); + + codegen::Platform simd_cpu_platform(/*use_single_precision=*/false, + /*instruction_width=*/2); codegen::CodegenLLVMVisitor llvm_visitor(/*mod_filename=*/"unknown", /*output_dir=*/".", - /*opt_level_ir=*/0, - /*use_single_precision=*/false, - /*vector_width=*/2); + simd_cpu_platform, + /*opt_level_ir=*/0); llvm_visitor.visit_program(*ast); llvm_visitor.wrap_kernel_functions(); @@ -554,11 +568,13 @@ SCENARIO("Vectorised kernel with simple control flow", "[llvm][runner]") { SymtabVisitor().visit_program(*ast); NeuronSolveVisitor().visit_program(*ast); SolveBlockVisitor().visit_program(*ast); + + codegen::Platform simd_cpu_platform(/*use_single_precision=*/false, + /*instruction_width=*/2); codegen::CodegenLLVMVisitor llvm_visitor(/*mod_filename=*/"unknown", /*output_dir=*/".", - /*opt_level_ir=*/0, - /*use_single_precision=*/false, - /*vector_width=*/2); + simd_cpu_platform, + /*opt_level_ir=*/0); llvm_visitor.visit_program(*ast); llvm_visitor.wrap_kernel_functions(); diff --git a/test/unit/codegen/codegen_llvm_instance_struct.cpp b/test/unit/codegen/codegen_llvm_instance_struct.cpp index 6042aecfc8..fbb07dfbcd 100644 --- a/test/unit/codegen/codegen_llvm_instance_struct.cpp +++ b/test/unit/codegen/codegen_llvm_instance_struct.cpp @@ -39,11 +39,11 @@ codegen::CodegenInstanceData generate_instance_data(const std::string& text, SymtabVisitor().visit_program(*ast); NeuronSolveVisitor().visit_program(*ast); + codegen::Platform cpu_platform(use_single_precision, vector_width); codegen::CodegenLLVMVisitor llvm_visitor(/*mod_filename=*/"test", /*output_dir=*/".", - opt_level, - use_single_precision, - vector_width); + cpu_platform, + opt_level); llvm_visitor.visit_program(*ast); llvm_visitor.dump_module(); const auto& generated_instance_struct = llvm_visitor.get_instance_struct_ptr(); diff --git a/test/unit/codegen/codegen_llvm_ir.cpp b/test/unit/codegen/codegen_llvm_ir.cpp index 23f6977aea..34fcd8b0da 100644 --- a/test/unit/codegen/codegen_llvm_ir.cpp +++ b/test/unit/codegen/codegen_llvm_ir.cpp @@ -51,14 +51,12 @@ std::string run_llvm_visitor(const std::string& text, NeuronSolveVisitor().visit_program(*ast); SolveBlockVisitor().visit_program(*ast); - codegen::CodegenLLVMVisitor llvm_visitor(/*mod_filename=*/"unknown", - /*output_dir=*/".", - opt_level, - use_single_precision, - vector_width, - vec_lib, - /*add_debug_information=*/false, - fast_math_flags); + codegen::Platform cpu_platform(codegen::PlatformID::CPU, /*name=*/"default", + vec_lib, use_single_precision, vector_width); + codegen::CodegenLLVMVisitor llvm_visitor( + /*mod_filename=*/"unknown", + /*output_dir=*/".", cpu_platform, opt_level, + /*add_debug_information=*/false, fast_math_flags); llvm_visitor.visit_program(*ast); return llvm_visitor.dump_module(); From d4e12d0ecd3ed7ee5542fd585724cfae0d7ddbd5 Mon Sep 17 00:00:00 2001 From: George Mitenkov Date: Sun, 13 Mar 2022 00:34:41 +0100 Subject: [PATCH 
186/331] [LLVM][GPU] Added GPU-specific AST transformations This commit adds a new AST node, `CodegenThreadId`, that represents the thread id used in GPU computation. Thanks to the new platform class abstraction, the code that generates the compute body of the NEURON block was readapted to support the AST transformations needed for GPU. Example of the transformation: ``` GPU_ID id INTEGER node_id DOUBLE v IF (id<node_count) { node_id = mech->node_index[id] v = mech->voltage[node_id] mech->m[id] = mech->y[id]+2 } ``` --- .../llvm/codegen_llvm_helper_visitor.cpp | 208 +++++++++--------- .../llvm/codegen_llvm_helper_visitor.hpp | 28 ++- src/codegen/llvm/codegen_llvm_visitor.cpp | 8 +- src/language/code_generator.cmake | 1 + src/language/codegen.yaml | 16 ++ test/unit/codegen/codegen_llvm_ir.cpp | 63 +++++- 6 files changed, 212 insertions(+), 112 deletions(-) diff --git a/src/codegen/llvm/codegen_llvm_helper_visitor.cpp b/src/codegen/llvm/codegen_llvm_helper_visitor.cpp index 654afd8ef5..8de61f726b 100644 --- a/src/codegen/llvm/codegen_llvm_helper_visitor.cpp +++ b/src/codegen/llvm/codegen_llvm_helper_visitor.cpp @@ -443,7 +443,7 @@ void CodegenLLVMHelperVisitor::ion_write_statements(BlockType type, * @param node Ast node under which variables to be converted to instance type */ void CodegenLLVMHelperVisitor::convert_to_instance_variable(ast::Node& node, - std::string& index_var) { + const std::string& index_var) { /// collect all variables in the node of type ast::VarName auto variables = collect_nodes(node, {ast::AstNodeType::VAR_NAME}); for (const auto& v: variables) { @@ -612,35 +612,29 @@ void CodegenLLVMHelperVisitor::visit_nrn_state_block(ast::NrnStateBlock& node) { /// statements for new function to be generated ast::StatementVector function_statements; - /// create variable definition for loop index and insert at the beginning - std::string loop_index_var = "id"; - std::vector induction_variables{"id"}; - function_statements.push_back( - create_local_variable_statement(induction_variables, INTEGER_TYPE)); - /// create vectors of local variables that would be used in compute part std::vector int_variables{"node_id"}; std::vector double_variables{"v"}; - /// create now main compute part : for loop over channel instances + /// create now main compute part - /// loop body : initialization + solve blocks - ast::StatementVector loop_def_statements; - ast::StatementVector loop_index_statements; - ast::StatementVector loop_body_statements; + /// compute body : initialization + solve blocks + ast::StatementVector def_statements; + ast::StatementVector index_statements; + ast::StatementVector body_statements; { /// access node index and corresponding voltage - loop_index_statements.push_back( + index_statements.push_back( visitor::create_statement("node_id = node_index[{}]"_format(INDUCTION_VAR))); - loop_body_statements.push_back( + body_statements.push_back( visitor::create_statement("v = {}[node_id]"_format(VOLTAGE_VAR))); /// read ion variables ion_read_statements(BlockType::State, int_variables, double_variables, - loop_index_statements, - loop_body_statements); + index_statements, + body_statements); /// main compute node : extract solution expressions from the derivative block const auto& solutions = collect_nodes(node, {ast::AstNodeType::SOLUTION_EXPRESSION}); for (const auto& statement: solutions) { const auto& solution = std::dynamic_pointer_cast(statement); const auto& block = std::dynamic_pointer_cast( solution->get_node_to_solve()); -
append_statements_from_block(loop_body_statements, block); + append_statements_from_block(body_statements, block); } /// add breakpoint block if no current if (info.currents.empty() && info.breakpoint_node != nullptr) { auto block = info.breakpoint_node->get_statement_block(); - append_statements_from_block(loop_body_statements, block); + append_statements_from_block(body_statements, block); } /// write ion statements ion_write_statements(BlockType::State, int_variables, double_variables, - loop_index_statements, - loop_body_statements); + index_statements, + body_statements); // \todo handle process_shadow_update_statement and wrote_conc_call yet } - ast::StatementVector loop_body; - loop_body.insert(loop_body.end(), loop_def_statements.begin(), loop_def_statements.end()); - loop_body.insert(loop_body.end(), loop_index_statements.begin(), loop_index_statements.end()); - loop_body.insert(loop_body.end(), loop_body_statements.begin(), loop_body_statements.end()); - - /// now construct a new code block which will become the body of the loop - auto loop_block = std::make_shared(loop_body); - - /// declare main FOR loop local variables - function_statements.push_back(create_local_variable_statement(int_variables, INTEGER_TYPE)); - function_statements.push_back(create_local_variable_statement(double_variables, FLOAT_TYPE)); - - /// main loop possibly vectorized on vector_width - { - /// loop constructs : initialization, condition and increment - const auto& initialization = int_initialization_expression(INDUCTION_VAR); - const auto& condition = loop_count_expression(INDUCTION_VAR, NODECOUNT_VAR, vector_width); - const auto& increment = loop_increment_expression(INDUCTION_VAR, vector_width); - - /// clone it - auto local_loop_block = std::shared_ptr(loop_block->clone()); - - /// convert local statement to codegenvar statement - convert_local_statement(*local_loop_block); - - auto for_loop_statement_main = std::make_shared(initialization, - condition, - increment, - local_loop_block); - - /// convert all variables inside loop body to instance variables - convert_to_instance_variable(*for_loop_statement_main, loop_index_var); - - /// loop itself becomes one of the statement in the function - function_statements.push_back(for_loop_statement_main); - } - - /// vectors containing renamed FOR loop local variables - std::vector renamed_int_variables; - std::vector renamed_double_variables; - - /// remainder loop possibly vectorized on vector_width - if (vector_width > 1) { - /// loop constructs : initialization, condition and increment - const auto& condition = - loop_count_expression(INDUCTION_VAR, NODECOUNT_VAR, /*vector_width=*/1); - const auto& increment = loop_increment_expression(INDUCTION_VAR, /*vector_width=*/1); - - /// rename local variables to avoid conflict with main loop - rename_local_variables(*loop_block); - - /// convert local statement to codegenvar statement - convert_local_statement(*loop_block); - - auto for_loop_statement_remainder = - std::make_shared(nullptr, condition, increment, loop_block); - - const auto& loop_statements = for_loop_statement_remainder->get_statement_block(); - // \todo: Change RenameVisitor to take a vector of names to which it would append a single - // prefix. 
- for (const auto& name: int_variables) { - std::string new_name = epilogue_variable_prefix + name; - renamed_int_variables.push_back(new_name); - visitor::RenameVisitor v(name, new_name); - loop_statements->accept(v); - } - for (const auto& name: double_variables) { - std::string new_name = epilogue_variable_prefix + name; - renamed_double_variables.push_back(new_name); - visitor::RenameVisitor v(name, epilogue_variable_prefix + name); - loop_statements->accept(v); - } + /// create target-specific compute body + ast::StatementVector compute_body; + compute_body.insert(compute_body.end(), def_statements.begin(), def_statements.end()); + compute_body.insert(compute_body.end(), index_statements.begin(), index_statements.end()); + compute_body.insert(compute_body.end(), body_statements.begin(), body_statements.end()); - /// declare remainder FOR loop local variables - function_statements.push_back( - create_local_variable_statement(renamed_int_variables, INTEGER_TYPE)); + if (platform.is_gpu()) { + const auto& id_statement = std::make_shared(create_varname(INDUCTION_VAR)); + function_statements.push_back(id_statement); + create_gpu_compute_body(compute_body, function_statements, int_variables, double_variables); + } else { + // Create induction variable + std::vector induction_variables{INDUCTION_VAR}; function_statements.push_back( - create_local_variable_statement(renamed_double_variables, FLOAT_TYPE)); - - /// convert all variables inside loop body to instance variables - convert_to_instance_variable(*for_loop_statement_remainder, loop_index_var); - - /// loop itself becomes one of the statement in the function - function_statements.push_back(for_loop_statement_remainder); + create_local_variable_statement(induction_variables, INTEGER_TYPE)); + create_cpu_compute_body(compute_body, function_statements, int_variables, double_variables); } /// new block for the function @@ -777,6 +703,84 @@ void CodegenLLVMHelperVisitor::visit_nrn_state_block(ast::NrnStateBlock& node) { std::cout << nmodl::to_nmodl(function) << std::endl; } +void CodegenLLVMHelperVisitor::create_gpu_compute_body(ast::StatementVector& body, + ast::StatementVector& function_statements, + std::vector& int_variables, + std::vector& double_variables) { + // Then, create condition for thread id. For now - reuse the same functionality as for + auto kernel_block = std::make_shared(body); + const auto& condition = loop_count_expression(INDUCTION_VAR, NODECOUNT_VAR, 1); + ast::ElseIfStatementVector else_ifs = {}; + auto if_statement = std::make_shared(condition, kernel_block, else_ifs, nullptr); + + convert_to_instance_variable(*if_statement, INDUCTION_VAR); + + // Push variables and the loop to the function statements vector. 
+ function_statements.push_back(create_local_variable_statement(int_variables, INTEGER_TYPE)); + function_statements.push_back(create_local_variable_statement(double_variables, FLOAT_TYPE)); + function_statements.push_back(if_statement); +} + +void CodegenLLVMHelperVisitor::create_cpu_compute_body(ast::StatementVector& body, + ast::StatementVector& function_statements, + std::vector& int_variables, + std::vector& double_variables) { + auto loop_block = std::make_shared(body); + create_compute_body_loop(loop_block, function_statements, int_variables, double_variables); + if (platform.is_cpu_with_simd()) + create_compute_body_loop(loop_block, function_statements, int_variables, double_variables, /*is_remainder_loop=*/true); +} + +void CodegenLLVMHelperVisitor::create_compute_body_loop(std::shared_ptr& block, + ast::StatementVector& function_statements, + std::vector& int_variables, + std::vector& double_variables, + bool is_remainder_loop) { + // First, check if we are creating a main or remainder loop. If it is a remainder loop, then + // no initialization is needed and instruction width is simply 1. + int width = is_remainder_loop ? 1 : platform.get_instruction_width(); + const auto& initialization = is_remainder_loop ? nullptr : int_initialization_expression(INDUCTION_VAR); + const auto& condition = loop_count_expression(INDUCTION_VAR, NODECOUNT_VAR, width); + const auto& increment = loop_increment_expression(INDUCTION_VAR, width); + + // Clone the statement block if needed since it can be used by the remainder loop. + auto loop_block = (is_remainder_loop || !platform.is_cpu_with_simd()) ? block : std::shared_ptr(block->clone()); + + // Convert local statement to use CodegenVar statements and create a FOR loop node. Also, if creating + // a remainder loop then rename variables to avoid conflicts. + if (is_remainder_loop) + rename_local_variables(*loop_block); + convert_local_statement(*loop_block); + auto for_loop = std::make_shared(initialization, + condition, + increment, + loop_block); + + // Convert all variables inside loop body to be instance variables. + convert_to_instance_variable(*for_loop, INDUCTION_VAR); + + // Rename variables if processing remainder loop. + if (is_remainder_loop) { + const auto& loop_statements = for_loop->get_statement_block(); + auto rename = [&](std::vector& vars) { + for (int i = 0; i < vars.size(); ++i) { + std::string old_name = vars[i]; + std::string new_name = epilogue_variable_prefix + vars[i]; + vars[i] = new_name; + visitor::RenameVisitor v(old_name, new_name); + loop_statements->accept(v); + } + }; + rename(int_variables); + rename(double_variables); + } + + // Push variables and the loop to the function statements vector. 
+ function_statements.push_back(create_local_variable_statement(int_variables, INTEGER_TYPE)); + function_statements.push_back(create_local_variable_statement(double_variables, FLOAT_TYPE)); + function_statements.push_back(for_loop); +} + void CodegenLLVMHelperVisitor::remove_inlined_nodes(ast::Program& node) { auto program_symtab = node.get_model_symbol_table(); const auto& func_proc_nodes = diff --git a/src/codegen/llvm/codegen_llvm_helper_visitor.hpp b/src/codegen/llvm/codegen_llvm_helper_visitor.hpp index 21aff4a92d..a40d7923cc 100644 --- a/src/codegen/llvm/codegen_llvm_helper_visitor.hpp +++ b/src/codegen/llvm/codegen_llvm_helper_visitor.hpp @@ -16,6 +16,7 @@ #include "ast/instance_struct.hpp" #include "codegen/codegen_info.hpp" +#include "codegen/llvm/target_platform.hpp" #include "symtab/symbol_table.hpp" #include "utils/logger.hpp" #include "visitors/ast_visitor.hpp" @@ -101,8 +102,8 @@ struct InstanceVarHelper { * these will be common across all backends. */ class CodegenLLVMHelperVisitor: public visitor::AstVisitor { - /// explicit vectorisation width - int vector_width; + /// target platform + Platform platform; /// newly generated code generation specific functions CodegenFunctionVector codegen_functions; @@ -135,8 +136,8 @@ class CodegenLLVMHelperVisitor: public visitor::AstVisitor { static const std::string VOLTAGE_VAR; static const std::string NODE_INDEX_VAR; - CodegenLLVMHelperVisitor(int vector_width) - : vector_width(vector_width) {} + CodegenLLVMHelperVisitor(Platform& platform) + : platform(platform) {} const InstanceVarHelper& get_instance_var_helper() { return instance_var_helper; @@ -161,7 +162,7 @@ class CodegenLLVMHelperVisitor: public visitor::AstVisitor { ast::StatementVector& index_statements, ast::StatementVector& body_statements); - void convert_to_instance_variable(ast::Node& node, std::string& index_var); + void convert_to_instance_variable(ast::Node& node, const std::string& index_var); void convert_local_statement(ast::StatementBlock& node); void rename_local_variables(ast::StatementBlock& node); @@ -173,6 +174,23 @@ class CodegenLLVMHelperVisitor: public visitor::AstVisitor { void visit_function_block(ast::FunctionBlock& node) override; void visit_nrn_state_block(ast::NrnStateBlock& node) override; void visit_program(ast::Program& node) override; + + private: + /// Methods to populate`function_statements` with necessary AST constructs to form + /// a kernel for a specific target. + void create_gpu_compute_body(ast::StatementVector& body, + ast::StatementVector& function_statements, + std::vector& int_variables, + std::vector& double_variables); + void create_cpu_compute_body(ast::StatementVector& body, + ast::StatementVector& function_statements, + std::vector& int_variables, + std::vector& double_variables); + void create_compute_body_loop(std::shared_ptr& block, + ast::StatementVector& function_statements, + std::vector& int_variables, + std::vector& double_variables, + bool is_remainder_loop = false); }; /** @} */ // end of llvm_codegen_details diff --git a/src/codegen/llvm/codegen_llvm_visitor.cpp b/src/codegen/llvm/codegen_llvm_visitor.cpp index 0fa81de691..2f677cfbec 100644 --- a/src/codegen/llvm/codegen_llvm_visitor.cpp +++ b/src/codegen/llvm/codegen_llvm_visitor.cpp @@ -815,12 +815,18 @@ void CodegenLLVMVisitor::visit_program(const ast::Program& node) { // - convert function and procedure blocks into CodegenFunctions // - gather information about AST. For now, information about functions // and procedures is used only. 
-    CodegenLLVMHelperVisitor v{platform.get_instruction_width()};
+    CodegenLLVMHelperVisitor v{platform};
     const auto& functions = v.get_codegen_functions(node);
     instance_var_helper = v.get_instance_var_helper();
     sym_tab = node.get_symbol_table();
     std::string kernel_id = v.get_kernel_id();
 
+    // \todo: implement GPU codegen functionality.
+    if (platform.is_gpu()) {
+        logger->warn("GPU code generation is not supported yet, aborting!");
+        return;
+    }
+
     // Initialize the builder for this NMODL program.
     ir_builder.initialize(*sym_tab, kernel_id);
 
diff --git a/src/language/code_generator.cmake b/src/language/code_generator.cmake
index 17123fc833..72b2754b1a 100644
--- a/src/language/code_generator.cmake
+++ b/src/language/code_generator.cmake
@@ -71,6 +71,7 @@ set(AST_GENERATED_SOURCES
     ${PROJECT_BINARY_DIR}/src/ast/codegen_instance_var.hpp
     ${PROJECT_BINARY_DIR}/src/ast/codegen_return_statement.hpp
     ${PROJECT_BINARY_DIR}/src/ast/codegen_struct.hpp
+    ${PROJECT_BINARY_DIR}/src/ast/codegen_thread_id.hpp
     ${PROJECT_BINARY_DIR}/src/ast/codegen_var.hpp
     ${PROJECT_BINARY_DIR}/src/ast/codegen_var_list_statement.hpp
     ${PROJECT_BINARY_DIR}/src/ast/codegen_var_type.hpp
diff --git a/src/language/codegen.yaml b/src/language/codegen.yaml
index 30bae4c5c5..245010f054 100644
--- a/src/language/codegen.yaml
+++ b/src/language/codegen.yaml
@@ -286,3 +286,19 @@
       - rhs:
           brief: "Expression for atomic operation"
          type: Expression
+  - CodegenThreadId:
+      brief: "Represents a generic thread id expression for GPU code generation"
+      description: |
+        For GPU code generation, we use a special AST node to encode the thread
+        id calculation. In NMODL, this expression is usually of the form:
+        \code{.cpp}
+        id = blockIdx.x * blockDim.x + threadIdx.x
+        \endcode
+        To be able to support multiple GPU backends, we choose to have a custom AST
+        node. Therefore, the code generation for this node is kept very simple,
+        mapping the expression to target-specific GPU intrinsics.
+      nmodl: "GPU_ID "
+      members:
+        - name:
+            brief: "Name of the thread id variable"
+            type: Identifier
diff --git a/test/unit/codegen/codegen_llvm_ir.cpp b/test/unit/codegen/codegen_llvm_ir.cpp
index 34fcd8b0da..e723c850a8 100644
--- a/test/unit/codegen/codegen_llvm_ir.cpp
+++ b/test/unit/codegen/codegen_llvm_ir.cpp
@@ -68,14 +68,14 @@ std::string run_llvm_visitor(const std::string& text,
 
 std::vector<std::shared_ptr<const ast::Ast>> run_llvm_visitor_helper(
     const std::string& text,
-    int vector_width,
+    codegen::Platform& platform,
     const std::vector<ast::AstNodeType>& nodes_to_collect) {
     NmodlDriver driver;
     const auto& ast = driver.parse_string(text);
 
     SymtabVisitor().visit_program(*ast);
     SolveBlockVisitor().visit_program(*ast);
-    CodegenLLVMHelperVisitor(vector_width).visit_program(*ast);
+    CodegenLLVMHelperVisitor(platform).visit_program(*ast);
 
     const auto& nodes = collect_nodes(*ast, nodes_to_collect);
 
@@ -1228,8 +1228,9 @@ SCENARIO("Scalar derivative block", "[visitor][llvm][derivative]") {
         })";
 
         THEN("a single scalar loop is constructed") {
+            codegen::Platform default_platform;
             auto result = run_llvm_visitor_helper(nmodl_text,
-                                                  /*vector_width=*/1,
+                                                  default_platform,
                                                   {ast::AstNodeType::CODEGEN_FOR_STATEMENT});
             REQUIRE(result.size() == 1);
 
@@ -1279,8 +1280,9 @@ SCENARIO("Vectorised derivative block", "[visitor][llvm][derivative]") {
 
         THEN("vector and epilogue scalar loops are constructed") {
+            codegen::Platform simd_platform(/*use_single_precision=*/false, /*instruction_width=*/8);
             auto result = run_llvm_visitor_helper(nmodl_text,
-                                                  /*vector_width=*/8,
+                                                  simd_platform,
                                                   {ast::AstNodeType::CODEGEN_FOR_STATEMENT});
             REQUIRE(result.size() == 2);
 
@@ -1523,3 +1525,56 @@ SCENARIO("Removal of inlined functions and procedures", "[visitor][llvm][inline]
         }
     }
 }
+
+//=============================================================================
+// Basic GPU kernel AST generation
+//=============================================================================
+
+SCENARIO("GPU kernel body", "[visitor][llvm][gpu]") {
+    GIVEN("For GPU platforms") {
+        std::string nmodl_text = R"(
+            NEURON {
+                SUFFIX test
+                RANGE x, y
+            }
+
+            ASSIGNED { x y }
+
+            STATE { m }
+
+            BREAKPOINT {
+                SOLVE states METHOD cnexp
+            }
+
+            DERIVATIVE states {
+                m = y + 2
+            }
+        )";
+
+
+        std::string expected_kernel = R"(
+            VOID nrn_state_test(INSTANCE_STRUCT *mech){
+                GPU_ID id
+                INTEGER node_id
+                DOUBLE v
+                IF (id<node_count) {
+                    node_id = mech->node_index[id]
+                    v = mech->voltage[node_id]
+                    mech->m[id] = mech->y[id]+2
+                }
+            })";
+
+        THEN("a kernel with thread id and if statement is created") {
+            std::string name = "default";
+            std::string math_library = "none";
+            codegen::Platform gpu_platform(codegen::PlatformID::GPU, name, math_library);
+            auto result = run_llvm_visitor_helper(nmodl_text,
+                                                  gpu_platform,
+                                                  {ast::AstNodeType::CODEGEN_FUNCTION});
+            REQUIRE(result.size() == 1);
+
+            auto kernel = reindent_text(to_nmodl(result[0]));
+            REQUIRE(kernel == reindent_text(expected_kernel));
+        }
+    }
+}
From 9940fd8287ff804ba281622ee13cf51dacc131be Mon Sep 17 00:00:00 2001
From: George Mitenkov
Date: Sun, 13 Mar 2022 00:45:11 +0100
Subject: [PATCH 187/331] fixed comments

---
 src/codegen/llvm/codegen_llvm_helper_visitor.cpp | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/codegen/llvm/codegen_llvm_helper_visitor.cpp b/src/codegen/llvm/codegen_llvm_helper_visitor.cpp
index 8de61f726b..be64784d33 100644
--- a/src/codegen/llvm/codegen_llvm_helper_visitor.cpp
+++ b/src/codegen/llvm/codegen_llvm_helper_visitor.cpp
@@ -707,7 +707,7 @@ void 
CodegenLLVMHelperVisitor::create_gpu_compute_body(ast::StatementVector& bod
                                                   ast::StatementVector& function_statements,
                                                   std::vector<std::string>& int_variables,
                                                   std::vector<std::string>& double_variables) {
-    // Then, create condition for thread id. For now - reuse the same functionality as for
+    // Then, create condition for thread id. For now reuse the functionality from `loop_count_expression`.
     auto kernel_block = std::make_shared<ast::StatementBlock>(body);
     const auto& condition = loop_count_expression(INDUCTION_VAR, NODECOUNT_VAR, 1);
     ast::ElseIfStatementVector else_ifs = {};
@@ -746,7 +746,7 @@ void CodegenLLVMHelperVisitor::create_compute_body_loop(std::shared_ptr<ast::StatementBlock>& block,
     auto loop_block = (is_remainder_loop || !platform.is_cpu_with_simd()) ? block : std::shared_ptr<ast::StatementBlock>(block->clone());
 
-    // Convert local statement to use CodegenVar statements and create a FOR loop node. Also, if creating
+    // Convert local statement to use CodegenVar statements and create a FOR loop node. Also, if creating
     // a remainder loop then rename variables to avoid conflicts.
     if (is_remainder_loop)
         rename_local_variables(*loop_block);
From 196a5a3f84e8505a0c3c9149f024a316d6622e61 Mon Sep 17 00:00:00 2001
From: George Mitenkov
Date: Sun, 13 Mar 2022 13:04:26 +0100
Subject: [PATCH 188/331] Added code generation for thread id

---
 src/codegen/llvm/codegen_llvm_visitor.cpp | 12 +++++-------
 src/codegen/llvm/codegen_llvm_visitor.hpp |  1 +
 src/codegen/llvm/llvm_ir_builder.cpp      | 22 ++++++++++++++++++++++
 src/codegen/llvm/llvm_ir_builder.hpp      |  3 +++
 4 files changed, 31 insertions(+), 7 deletions(-)

diff --git a/src/codegen/llvm/codegen_llvm_visitor.cpp b/src/codegen/llvm/codegen_llvm_visitor.cpp
index 2f677cfbec..0bd233ecbb 100644
--- a/src/codegen/llvm/codegen_llvm_visitor.cpp
+++ b/src/codegen/llvm/codegen_llvm_visitor.cpp
@@ -39,7 +39,7 @@ static bool is_supported_statement(const ast::Statement& statement) {
     return statement.is_codegen_atomic_statement() || statement.is_codegen_for_statement() ||
            statement.is_if_statement() || statement.is_codegen_return_statement() ||
            statement.is_codegen_var_list_statement() || statement.is_expression_statement() ||
-           statement.is_while_statement();
+           statement.is_while_statement() || statement.is_codegen_thread_id();
 }
 
 /// A utility to check that the kernel body can be vectorised.
@@ -694,6 +694,10 @@ void CodegenLLVMVisitor::visit_codegen_return_statement(const ast::CodegenReturn
     ir_builder.create_return(ret_value);
 }
 
+void CodegenLLVMVisitor::visit_codegen_thread_id(const ast::CodegenThreadId& node) {
+    ir_builder.create_thread_id();
+}
+
 void CodegenLLVMVisitor::visit_codegen_var_list_statement(
     const ast::CodegenVarListStatement& node) {
     llvm::Type* scalar_type = get_codegen_var_type(*node.get_var_type());
@@ -821,12 +825,6 @@ void CodegenLLVMVisitor::visit_program(const ast::Program& node) {
     sym_tab = node.get_symbol_table();
     std::string kernel_id = v.get_kernel_id();
 
-    // \todo: implement GPU codegen functionality.
-    if (platform.is_gpu()) {
-        logger->warn("GPU code generation is not supported yet, aborting!");
-        return;
-    }
-
     // Initialize the builder for this NMODL program.
ir_builder.initialize(*sym_tab, kernel_id); diff --git a/src/codegen/llvm/codegen_llvm_visitor.hpp b/src/codegen/llvm/codegen_llvm_visitor.hpp index 396d8cbb67..6ff79a0ddb 100644 --- a/src/codegen/llvm/codegen_llvm_visitor.hpp +++ b/src/codegen/llvm/codegen_llvm_visitor.hpp @@ -138,6 +138,7 @@ class CodegenLLVMVisitor: public visitor::ConstAstVisitor { void visit_codegen_for_statement(const ast::CodegenForStatement& node) override; void visit_codegen_function(const ast::CodegenFunction& node) override; void visit_codegen_return_statement(const ast::CodegenReturnStatement& node) override; + void visit_codegen_thread_id(const ast::CodegenThreadId& node) override; void visit_codegen_var_list_statement(const ast::CodegenVarListStatement& node) override; void visit_double(const ast::Double& node) override; void visit_function_block(const ast::FunctionBlock& node) override; diff --git a/src/codegen/llvm/llvm_ir_builder.cpp b/src/codegen/llvm/llvm_ir_builder.cpp index e7a6a4a60b..b88e995771 100644 --- a/src/codegen/llvm/llvm_ir_builder.cpp +++ b/src/codegen/llvm/llvm_ir_builder.cpp @@ -10,6 +10,7 @@ #include "llvm/ADT/StringSwitch.h" #include "llvm/IR/Function.h" +#include "llvm/IR/IntrinsicsNVPTX.h" #include "llvm/IR/ValueSymbolTable.h" namespace nmodl { @@ -554,6 +555,27 @@ void IRBuilder::maybe_replicate_value(llvm::Value* value) { } } +void IRBuilder::create_thread_id() { + llvm::Value* alloca_ptr = create_alloca(kernel_id, get_i32_type()); + + llvm::Module* m = builder.GetInsertBlock()->getParent()->getParent(); + auto create_call = [&](llvm::Intrinsic::ID id) { + llvm::Function* intrinsic = llvm::Intrinsic::getDeclaration(m, id); + return builder.CreateCall(intrinsic, {}); + }; + + // For now, this function only supports NVPTX backend, however it can be easily + // adjusted to generate thread id variable for any other platform. + llvm::Value* block_id = create_call(llvm::Intrinsic::nvvm_read_ptx_sreg_ctaid_x); + llvm::Value* block_dim = create_call(llvm::Intrinsic::nvvm_read_ptx_sreg_ntid_x); + llvm::Value* tmp = builder.CreateMul(block_id, block_dim); + + llvm::Value* tid = create_call(llvm::Intrinsic::nvvm_read_ptx_sreg_tid_x); + llvm::Value* id = builder.CreateAdd(tmp, tid); + + builder.CreateStore(id, alloca_ptr); +} + /****************************************************************************************/ /* LLVM block utilities */ diff --git a/src/codegen/llvm/llvm_ir_builder.hpp b/src/codegen/llvm/llvm_ir_builder.hpp index cf9e7f936d..aa9c7ab1e3 100644 --- a/src/codegen/llvm/llvm_ir_builder.hpp +++ b/src/codegen/llvm/llvm_ir_builder.hpp @@ -230,6 +230,9 @@ class IRBuilder { void create_scalar_or_vector_alloca(const std::string& name, llvm::Type* element_or_scalar_type); + /// Creates a variable of the form: id = blockIdx.x * blockDim.x + threadIdx.x + void create_thread_id(); + /// Generates LLVM IR for the given unary operator. 
void create_unary_op(llvm::Value* value, ast::UnaryOp op); From 7044204507d6a69c4ad14d1f26bcfea2788daea2 Mon Sep 17 00:00:00 2001 From: George Mitenkov Date: Sun, 13 Mar 2022 13:23:10 +0100 Subject: [PATCH 189/331] Added kernel annotation generation --- src/codegen/llvm/codegen_llvm_visitor.cpp | 28 +++++++++++++++++++---- src/codegen/llvm/codegen_llvm_visitor.hpp | 3 +++ 2 files changed, 26 insertions(+), 5 deletions(-) diff --git a/src/codegen/llvm/codegen_llvm_visitor.cpp b/src/codegen/llvm/codegen_llvm_visitor.cpp index 0bd233ecbb..86fe5b5443 100644 --- a/src/codegen/llvm/codegen_llvm_visitor.cpp +++ b/src/codegen/llvm/codegen_llvm_visitor.cpp @@ -64,6 +64,16 @@ static bool can_vectorize(const ast::CodegenForStatement& statement, symtab::Sym return unsupported.empty() && supported.size() <= 1; } +void CodegenLLVMVisitor::annotate_kernel_with_nvvm(llvm::Function* kernel) { + llvm::Metadata* metadata[] = { + llvm::ValueAsMetadata::get(kernel), + llvm::MDString::get(*context, "kernel"), + llvm::ValueAsMetadata::get( + llvm::ConstantInt::get(llvm::Type::getInt32Ty(*context), 1))}; + llvm::MDNode* node = llvm::MDNode::get(*context, metadata); + module->getOrInsertNamedMetadata("nvvm.annotations")->addOperand(node); +} + #if LLVM_VERSION_MAJOR >= 13 void CodegenLLVMVisitor::add_vectorizable_functions_from_vec_lib(llvm::TargetLibraryInfoImpl& tli, llvm::Triple& triple) { @@ -665,11 +675,19 @@ void CodegenLLVMVisitor::visit_codegen_function(const ast::CodegenFunction& node ir_builder.allocate_function_arguments(func, arguments); // Process function or procedure body. If the function is a compute kernel, enable - // vectorization. If so, the return statement is handled in a separate visitor. - if (platform.is_cpu_with_simd() && is_kernel_function(name)) { - ir_builder.generate_vector_ir(); - block->accept(*this); - ir_builder.generate_scalar_ir(); + // vectorization or add NVVM annotations. If this is the case, the return statement is + // handled in a separate visitor. + if (is_kernel_function(name)) { + if (platform.is_cpu_with_simd()) { + ir_builder.generate_vector_ir(); + block->accept(*this); + ir_builder.generate_scalar_ir(); + } else if (platform.is_gpu()) { + block->accept(*this); + annotate_kernel_with_nvvm(func); + } else { // scalar + block->accept(*this); + } } else { block->accept(*this); } diff --git a/src/codegen/llvm/codegen_llvm_visitor.hpp b/src/codegen/llvm/codegen_llvm_visitor.hpp index 6ff79a0ddb..67a3a6fab6 100644 --- a/src/codegen/llvm/codegen_llvm_visitor.hpp +++ b/src/codegen/llvm/codegen_llvm_visitor.hpp @@ -157,6 +157,9 @@ class CodegenLLVMVisitor: public visitor::ConstAstVisitor { void wrap_kernel_functions(); private: + // Annotates kernel function with NVVM metadata. + void annotate_kernel_with_nvvm(llvm::Function* kernel); + #if LLVM_VERSION_MAJOR >= 13 /// Populates target library info with the vector library definitions. 
void add_vectorizable_functions_from_vec_lib(llvm::TargetLibraryInfoImpl& tli,
From 9351e397fc934074331a4561b4ddcfb86902992b Mon Sep 17 00:00:00 2001
From: George Mitenkov
Date: Sun, 13 Mar 2022 13:54:10 +0100
Subject: [PATCH 190/331] Added tests for annotations/intrinsics

---
 test/unit/codegen/codegen_llvm_ir.cpp | 74 +++++++++++++++++++++++++++
 1 file changed, 74 insertions(+)

diff --git a/test/unit/codegen/codegen_llvm_ir.cpp b/test/unit/codegen/codegen_llvm_ir.cpp
index e723c850a8..4d71a5a276 100644
--- a/test/unit/codegen/codegen_llvm_ir.cpp
+++ b/test/unit/codegen/codegen_llvm_ir.cpp
@@ -34,6 +34,32 @@ using nmodl::parser::NmodlDriver;
 // Utility to get LLVM module as a string
 //=============================================================================
 
+std::string run_gpu_llvm_visitor(const std::string& text,
+                                 int opt_level = 0,
+                                 bool use_single_precision = false,
+                                 std::string math_library = "none",
+                                 bool nmodl_inline = false) {
+    NmodlDriver driver;
+    const auto& ast = driver.parse_string(text);
+
+    SymtabVisitor().visit_program(*ast);
+    if (nmodl_inline) {
+        InlineVisitor().visit_program(*ast);
+    }
+    NeuronSolveVisitor().visit_program(*ast);
+    SolveBlockVisitor().visit_program(*ast);
+
+    codegen::Platform gpu_platform(codegen::PlatformID::GPU, /*name=*/"nvidia",
+                                   math_library, use_single_precision, 1);
+    codegen::CodegenLLVMVisitor llvm_visitor(
+        /*mod_filename=*/"unknown",
+        /*output_dir=*/".", gpu_platform, opt_level,
+        /*add_debug_information=*/false);
+
+    llvm_visitor.visit_program(*ast);
+    return llvm_visitor.dump_module();
+}
+
 std::string run_llvm_visitor(const std::string& text,
                              int opt_level = 0,
                              bool use_single_precision = false,
@@ -1578,3 +1604,51 @@
         }
     }
 }
+
+//=============================================================================
+// Basic NVVM/LLVM IR generation for GPU platforms
+//=============================================================================
+
+SCENARIO("GPU kernel body IR generation", "[visitor][llvm][gpu]") {
+    GIVEN("For GPU platforms") {
+        std::string nmodl_text = R"(
+            NEURON {
+                SUFFIX test
+                RANGE x, y
+            }
+
+            ASSIGNED { x y }
+
+            STATE { m }
+
+            BREAKPOINT {
+                SOLVE states METHOD cnexp
+            }
+
+            DERIVATIVE states {
+                m = y + 2
+            }
+        )";
+
+        THEN("kernel annotations are added and thread id intrinsics generated") {
+            std::string module_string = run_gpu_llvm_visitor(nmodl_text,
+                                                             /*opt_level=*/0,
+                                                             /*use_single_precision=*/false);
+            std::smatch m;
+
+            // Check kernel annotations are correctly created.
+            std::regex annotations(R"(!nvvm\.annotations = !\{!0\})");
+            std::regex kernel_data(R"(!0 = !\{void \(%.*__instance_var__type\*\)\* @nrn_state_.*, !\"kernel\", i32 1\})");
+            REQUIRE(std::regex_search(module_string, m, annotations));
+            REQUIRE(std::regex_search(module_string, m, kernel_data));
+
+            // Check thread/block id/dim intrinsics are created.
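+            // (For reference: the NVVM special registers ctaid.x, ntid.x and
+            // tid.x correspond to CUDA's blockIdx.x, blockDim.x and
+            // threadIdx.x respectively.)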
+ std::regex block_id(R"(call i32 @llvm\.nvvm\.read\.ptx\.sreg\.ctaid\.x\(\))"); + std::regex block_dim(R"(call i32 @llvm\.nvvm\.read\.ptx\.sreg\.ntid\.x\(\))"); + std::regex tid(R"(call i32 @llvm\.nvvm\.read\.ptx\.sreg\.tid\.x\(\))"); + REQUIRE(std::regex_search(module_string, m, block_id)); + REQUIRE(std::regex_search(module_string, m, block_dim)); + REQUIRE(std::regex_search(module_string, m, tid)); + } + } +} From e26d9618da7077e2338455b7bc4451b499cbc3ea Mon Sep 17 00:00:00 2001 From: Ioannis Magkanaris Date: Mon, 14 Mar 2022 10:57:57 +0100 Subject: [PATCH 191/331] Removed POINTER compatibility error (#821) --- src/codegen/codegen_compatibility_visitor.cpp | 9 --------- src/codegen/codegen_compatibility_visitor.hpp | 10 ---------- 2 files changed, 19 deletions(-) diff --git a/src/codegen/codegen_compatibility_visitor.cpp b/src/codegen/codegen_compatibility_visitor.cpp index 7d10358083..33863f1a02 100644 --- a/src/codegen/codegen_compatibility_visitor.cpp +++ b/src/codegen/codegen_compatibility_visitor.cpp @@ -36,7 +36,6 @@ const std::map &CodegenCompatibilityVisitor::return_error_if_solve_method_is_unhandled}, {AstNodeType::GLOBAL_VAR, &CodegenCompatibilityVisitor::return_error_global_var}, {AstNodeType::PARAM_ASSIGN, &CodegenCompatibilityVisitor::return_error_param_var}, - {AstNodeType::POINTER_VAR, &CodegenCompatibilityVisitor::return_error_pointer}, {AstNodeType::BBCORE_POINTER_VAR, &CodegenCompatibilityVisitor::return_error_if_no_bbcore_read_write}}); @@ -86,14 +85,6 @@ std::string CodegenCompatibilityVisitor::return_error_param_var( return error_message_global_var.str(); } -std::string CodegenCompatibilityVisitor::return_error_pointer( - ast::Ast& node, - const std::shared_ptr& ast_node) { - auto pointer_var = std::dynamic_pointer_cast(ast_node); - return "\"{}\" POINTER found at [{}] should be defined as BBCOREPOINTER to use it in CoreNeuron\n"_format( - pointer_var->get_node_name(), pointer_var->get_token()->position()); -} - std::string CodegenCompatibilityVisitor::return_error_if_no_bbcore_read_write( ast::Ast& node, const std::shared_ptr& ast_node) { diff --git a/src/codegen/codegen_compatibility_visitor.hpp b/src/codegen/codegen_compatibility_visitor.hpp index a19940620c..b2f205aa4d 100644 --- a/src/codegen/codegen_compatibility_visitor.hpp +++ b/src/codegen/codegen_compatibility_visitor.hpp @@ -136,16 +136,6 @@ class CodegenCompatibilityVisitor: public visitor::AstVisitor { std::string return_error_param_var(ast::Ast& node, const std::shared_ptr& ast_node); - /// Takes as parameter an std::shared_ptr node - /// and returns a relative error with the name and the - /// location of the pointer, as well as a suggestion to - /// define it as BBCOREPOINTER - /// - /// \param node Not used by the function - /// \param ast_node Ast node which is checked - /// \return std::string error - std::string return_error_pointer(ast::Ast& node, const std::shared_ptr& ast_node); - /// Takes as parameter the ast::Ast and checks if the /// functions "bbcore_read" and "bbcore_write" are defined /// in any of the ast::Ast VERBATIM blocks. The function is From b93ce1212f5a26e551098762417eb774f13a4671 Mon Sep 17 00:00:00 2001 From: George Mitenkov Date: Mon, 14 Mar 2022 12:48:49 +0100 Subject: [PATCH 192/331] [LLVM][GPU] Separated CPU and GPU CLI options (#817) Now, CLI has two options: `cpu` and `gpu` that allow users to target different platforms. 
For example,
```
bin/nmodl mod/test.mod -o out llvm --ir
bin/nmodl mod/test.mod -o out llvm --ir cpu --name skylake --vector-width 2
bin/nmodl mod/test.mod -o out llvm --ir gpu --name cuda
```
Moreover, `assume_no_alias` option was dropped and made default (it didn't
affect the computation in our experiments). The new CLI looks like:
```
llvm
  LLVM code generation option
  Options:
    --ir REQUIRED               Generate LLVM IR (false)
    --no-debug                  Disable debug information (false)
    --opt-level-ir INT:{0,1,2,3}
                                LLVM IR optimisation level (O0)
    --single-precision          Use single precision floating-point types (false)
    --fmf TEXT:{afn,arcp,contract,ninf,nnan,nsz,reassoc,fast} ...
                                Fast math flags for floating-point optimizations (none)

cpu
  LLVM CPU option
  Options:
    --name TEXT                 Name of CPU platform to use
    --math-library TEXT:{Accelerate,libmvec,libsystem_m,MASSV,SLEEF,SVML,none}
                                Math library for SIMD code generation (none)
    --vector-width INT          Explicit vectorization width for IR generation (1)

gpu
  LLVM GPU option
  Options:
    --name TEXT                 Name of GPU platform to use
    --math-library TEXT:{libdevice}
                                Math library for GPU code generation (none)

benchmark
  LLVM benchmark option
  Options:
    --run                       Run LLVM benchmark (false)
    --opt-level-codegen INT:{0,1,2,3}
                                Machine code optimisation level (O0)
    --libs TEXT:FILE ...        Shared libraries to link IR against
    --instance-size INT         Instance struct size (10000)
    --repeat INT                Number of experiments for benchmarking (100)
```
---
 src/main.cpp | 153 ++++++++++++++++++++++++++++-----------------------
 1 file changed, 83 insertions(+), 70 deletions(-)

diff --git a/src/main.cpp b/src/main.cpp
index b2102aaee7..140e4c77d0 100644
--- a/src/main.cpp
+++ b/src/main.cpp
@@ -176,26 +176,29 @@ int main(int argc, const char* argv[]) {
     /// use single precision floating-point types
     bool llvm_float_type(false);
 
-    /// llvm vector width
-    int llvm_vec_width = 1;
+    /// optimisation level for IR generation
+    int llvm_opt_level_ir = 0;
 
-    /// vector library name
-    std::string vector_library("none");
+    /// math library name
+    std::string llvm_math_library("none");
 
     /// disable debug information generation for the IR
-    bool disable_debug_information(false);
+    bool llvm_no_debug(false);
 
     /// fast math flags for LLVM backend
     std::vector<std::string> llvm_fast_math_flags;
 
-    /// run llvm benchmark
-    bool run_llvm_benchmark(false);
+    /// target CPU platform name
+    std::string llvm_cpu_name = "default";
 
-    /// do not assume that instance struct fields do not alias
-    bool llvm_assume_alias(false);
+    /// target GPU platform name
+    std::string llvm_gpu_name = "default";
 
-    /// optimisation level for IR generation
-    int llvm_opt_level_ir = 0;
+    /// llvm vector width if generating code for CPUs
+    int llvm_vector_width = 1;
+
+    /// run llvm benchmark
+    bool llvm_benchmark(false);
 
     /// optimisation level for machine code generation
     int llvm_opt_level_codegen = 0;
@@ -208,9 +211,6 @@ int main(int argc, const char* argv[]) {
 
     /// the number of repeated experiments for the benchmarking
     int num_experiments = 100;
-
-    /// specify the cpu for LLVM IR to target
-    std::string cpu = "default";
 #endif
 
     app.get_formatter()->column_width(40);
@@ -323,36 +323,57 @@ int main(int argc, const char* argv[]) {
 
     // LLVM IR code generation options.
auto llvm_opt = app.add_subcommand("llvm", "LLVM code generation option")->ignore_case(); - llvm_opt->add_flag("--ir", + auto llvm_ir_opt = llvm_opt->add_flag("--ir", llvm_ir, "Generate LLVM IR ({})"_format(llvm_ir))->ignore_case(); - llvm_opt->add_flag("--disable-debug-info", - disable_debug_information, - "Disable debug information ({})"_format(disable_debug_information))->ignore_case(); + llvm_ir_opt->required(true); + llvm_opt->add_flag("--no-debug", + llvm_no_debug, + "Disable debug information ({})"_format(llvm_no_debug))->ignore_case(); llvm_opt->add_option("--opt-level-ir", - llvm_opt_level_ir, - "LLVM IR optimisation level (O{})"_format(llvm_opt_level_ir))->ignore_case()->check(CLI::IsMember({"0", "1", "2", "3"})); + llvm_opt_level_ir, + "LLVM IR optimisation level (O{})"_format(llvm_opt_level_ir))->ignore_case()->check(CLI::IsMember({"0", "1", "2", "3"})); llvm_opt->add_flag("--single-precision", - llvm_float_type, - "Use single precision floating-point types ({})"_format(llvm_float_type))->ignore_case(); - llvm_opt->add_flag("--assume-may-alias", - llvm_assume_alias, - "Assume instance struct fields may alias ({})"_format(llvm_assume_alias))->ignore_case(); - llvm_opt->add_option("--vector-width", - llvm_vec_width, - "LLVM explicit vectorisation width ({})"_format(llvm_vec_width))->ignore_case(); - llvm_opt->add_option("--veclib", - vector_library, - "Vector library for maths functions ({})"_format(vector_library))->check(CLI::IsMember({"Accelerate", "libsystem_m", "libmvec", "MASSV", "SLEEF", "SVML", "none"})); + llvm_float_type, + "Use single precision floating-point types ({})"_format(llvm_float_type))->ignore_case(); llvm_opt->add_option("--fmf", - llvm_fast_math_flags, - "Fast math flags for floating-point optimizations (none)")->check(CLI::IsMember({"afn", "arcp", "contract", "ninf", "nnan", "nsz", "reassoc", "fast"})); + llvm_fast_math_flags, + "Fast math flags for floating-point optimizations (none)")->check(CLI::IsMember({"afn", "arcp", "contract", "ninf", "nnan", "nsz", "reassoc", "fast"})); + + // Platform options for LLVM code generation. + auto cpu_opt = app.add_subcommand("cpu", "LLVM CPU option")->ignore_case(); + cpu_opt->needs(llvm_opt); + cpu_opt->add_option("--name", + llvm_cpu_name, + "Name of CPU platform to use")->ignore_case(); + auto simd_math_library_opt = cpu_opt->add_option("--math-library", + llvm_math_library, + "Math library for SIMD code generation ({})"_format(llvm_math_library)); + simd_math_library_opt->check(CLI::IsMember({"Accelerate", "libmvec", "libsystem_m", "MASSV", "SLEEF", "SVML", "none"})); + cpu_opt->add_option("--vector-width", + llvm_vector_width, + "Explicit vectorization width for IR generation ({})"_format(llvm_vector_width))->ignore_case(); + + auto gpu_opt = app.add_subcommand("gpu", "LLVM GPU option")->ignore_case(); + gpu_opt->needs(llvm_opt); + gpu_opt->add_option("--name", + llvm_gpu_name, + "Name of GPU platform to use")->ignore_case(); + auto gpu_math_library_opt = gpu_opt->add_option("--math-library", + llvm_math_library, + "Math library for GPU code generation ({})"_format(llvm_math_library)); + gpu_math_library_opt->check(CLI::IsMember({"libdevice"})); + + // Allow only one platform at a time. + cpu_opt->excludes(gpu_opt); + gpu_opt->excludes(cpu_opt); // LLVM IR benchmark options. 
     auto benchmark_opt = app.add_subcommand("benchmark", "LLVM benchmark option")->ignore_case();
+    benchmark_opt->needs(llvm_opt);
     benchmark_opt->add_flag("--run",
-        run_llvm_benchmark,
-        "Run LLVM benchmark ({})"_format(run_llvm_benchmark))->ignore_case();
+        llvm_benchmark,
+        "Run LLVM benchmark ({})"_format(llvm_benchmark))->ignore_case();
     benchmark_opt->add_option("--opt-level-codegen",
         llvm_opt_level_codegen,
        "Machine code optimisation level (O{})"_format(llvm_opt_level_codegen))->ignore_case()->check(CLI::IsMember({"0", "1", "2", "3"}));
@@ -365,9 +386,6 @@ int main(int argc, const char* argv[]) {
     benchmark_opt->add_option("--repeat",
         num_experiments,
         "Number of experiments for benchmarking ({})"_format(num_experiments))->ignore_case();
-    benchmark_opt->add_option("--cpu",
-        cpu,
-        "Target's backend ({})"_format(cpu))->ignore_case();
 #endif
 
     // clang-format on
@@ -673,38 +691,33 @@ int main(int argc, const char* argv[]) {
     }
 
 #ifdef NMODL_LLVM_BACKEND
-    if (llvm_ir || run_llvm_benchmark) {
-        // If benchmarking, we want to optimize the IR with target information and not in
-        // LLVM visitor.
-        int llvm_opt_level = run_llvm_benchmark ? 0 : llvm_opt_level_ir;
-
-        logger->info("Running LLVM backend code generator");
-        CodegenLLVMVisitor visitor(modfile,
-                                   output_dir,
-                                   llvm_opt_level,
-                                   llvm_float_type,
-                                   llvm_vec_width,
-                                   vector_library,
-                                   !disable_debug_information,
-                                   llvm_fast_math_flags,
-                                   llvm_assume_alias);
-        visitor.visit_program(*ast);
-        ast_to_nmodl(*ast, filepath("llvm", "mod"));
-        ast_to_json(*ast, filepath("llvm", "json"));
-
-        if (run_llvm_benchmark) {
-            logger->info("Running LLVM benchmark");
-            benchmark::LLVMBenchmark benchmark(visitor,
-                                               modfile,
-                                               output_dir,
-                                               shared_lib_paths,
-                                               num_experiments,
-                                               instance_size,
-                                               cpu,
-                                               llvm_opt_level_ir,
-                                               llvm_opt_level_codegen);
-            benchmark.run(ast);
-        }
+    if (llvm_ir || llvm_benchmark) {
+        // If benchmarking, we want to optimize the IR with target
+        // information and not in LLVM visitor.
+        int llvm_opt_level = llvm_benchmark ? 0 : llvm_opt_level_ir;
+
+        if (llvm_gpu_name != "default") {
+            logger->warn("GPU code generation is not supported, targeting "
+                         "CPU instead");
+        }
+
+        logger->info("Running LLVM backend code generator");
+        CodegenLLVMVisitor visitor(modfile, output_dir, llvm_opt_level,
+                                   llvm_float_type, llvm_vector_width,
+                                   llvm_math_library, !llvm_no_debug,
+                                   llvm_fast_math_flags, true);
+        visitor.visit_program(*ast);
+        ast_to_nmodl(*ast, filepath("llvm", "mod"));
+        ast_to_json(*ast, filepath("llvm", "json"));
+
+        if (llvm_benchmark) {
+            logger->info("Running LLVM benchmark");
+            benchmark::LLVMBenchmark benchmark(
+                visitor, modfile, output_dir, shared_lib_paths,
+                num_experiments, instance_size, llvm_cpu_name,
+                llvm_opt_level_ir, llvm_opt_level_codegen);
+            benchmark.run(ast);
+        }
     }
 #endif
 }
From acb9deafea3193d084029a0d4d7216a2ab076b5c Mon Sep 17 00:00:00 2001
From: George Mitenkov
Date: Sat, 12 Mar 2022 19:25:13 +0100
Subject: [PATCH 193/331] [LLVM][refactoring] Added platform abstraction

This commit introduces a handy `Platform` class that is designed to
incorporate target information for IR generation, such as precision,
vectorization width (if applicable), type of target (CPU/GPU), etc.
In future, more functionality can be added to `Platform`, e.g. we can
move the handling of `llvm::Target`, SIMD math libraries, etc. into it.

Note: this is just a very basic implementation that enables easier
integration of GPU code generation.
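(Editorial sketch, not part of the patch: this is roughly how the new
abstraction is meant to be queried. The platform name "skylake", the SVML
math library and width 8 are arbitrary example values; the constructor and
accessors are the ones declared in `target_platform.hpp` below.)
```
// Hypothetical usage of the Platform class introduced by this patch.
std::string math_library = "SVML";
nmodl::codegen::Platform simd_cpu(nmodl::codegen::PlatformID::CPU,
                                  /*name=*/"skylake", math_library,
                                  /*use_single_precision=*/false,
                                  /*instruction_width=*/8);
if (simd_cpu.is_cpu_with_simd()) {
    // Visitors query the platform instead of passing raw flags around.
    int width = simd_cpu.get_instruction_width();  // 8
    int precision = simd_cpu.get_precision();      // 64 (double)
}
```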
--- src/codegen/llvm/CMakeLists.txt | 4 +- src/codegen/llvm/codegen_llvm_visitor.cpp | 20 ++-- src/codegen/llvm/codegen_llvm_visitor.hpp | 25 ++--- src/codegen/llvm/llvm_ir_builder.cpp | 29 +++--- src/codegen/llvm/llvm_ir_builder.hpp | 23 ++--- src/codegen/llvm/main.cpp | 5 +- src/codegen/llvm/target_platform.cpp | 54 +++++++++++ src/codegen/llvm/target_platform.hpp | 92 +++++++++++++++++++ src/main.cpp | 24 +++-- test/unit/codegen/codegen_llvm_execution.cpp | 40 +++++--- .../codegen/codegen_llvm_instance_struct.cpp | 6 +- test/unit/codegen/codegen_llvm_ir.cpp | 14 ++- 12 files changed, 246 insertions(+), 90 deletions(-) create mode 100644 src/codegen/llvm/target_platform.cpp create mode 100644 src/codegen/llvm/target_platform.hpp diff --git a/src/codegen/llvm/CMakeLists.txt b/src/codegen/llvm/CMakeLists.txt index 5c7eadc91c..198d90c1a3 100644 --- a/src/codegen/llvm/CMakeLists.txt +++ b/src/codegen/llvm/CMakeLists.txt @@ -11,7 +11,9 @@ set(LLVM_CODEGEN_SOURCE_FILES ${CMAKE_CURRENT_SOURCE_DIR}/llvm_ir_builder.cpp ${CMAKE_CURRENT_SOURCE_DIR}/llvm_ir_builder.hpp ${CMAKE_CURRENT_SOURCE_DIR}/llvm_utils.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/llvm_utils.hpp) + ${CMAKE_CURRENT_SOURCE_DIR}/llvm_utils.hpp + ${CMAKE_CURRENT_SOURCE_DIR}/target_platform.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/target_platform.hpp) # ============================================================================= # LLVM codegen library and executable diff --git a/src/codegen/llvm/codegen_llvm_visitor.cpp b/src/codegen/llvm/codegen_llvm_visitor.cpp index bac6f4e0b2..0fa81de691 100644 --- a/src/codegen/llvm/codegen_llvm_visitor.cpp +++ b/src/codegen/llvm/codegen_llvm_visitor.cpp @@ -68,7 +68,7 @@ static bool can_vectorize(const ast::CodegenForStatement& statement, symtab::Sym void CodegenLLVMVisitor::add_vectorizable_functions_from_vec_lib(llvm::TargetLibraryInfoImpl& tli, llvm::Triple& triple) { // Since LLVM does not support SLEEF as a vector library yet, process it separately. - if (vector_library == "SLEEF") { + if (platform.get_math_library() == "SLEEF") { // clang-format off #define FIXED(w) llvm::ElementCount::getFixed(w) // clang-format on @@ -112,9 +112,9 @@ void CodegenLLVMVisitor::add_vectorizable_functions_from_vec_lib(llvm::TargetLib {"MASSV", VecLib::MASSV}, {"none", VecLib::NoLibrary}, {"SVML", VecLib::SVML}}; - const auto& library = llvm_supported_vector_libraries.find(vector_library); + const auto& library = llvm_supported_vector_libraries.find(platform.get_math_library()); if (library == llvm_supported_vector_libraries.end()) - throw std::runtime_error("Error: unknown vector library - " + vector_library + "\n"); + throw std::runtime_error("Error: unknown vector library - " + platform.get_math_library() + "\n"); // Add vectorizable functions to the target library info. switch (library->second) { @@ -542,7 +542,7 @@ void CodegenLLVMVisitor::visit_boolean(const ast::Boolean& node) { * \todo support this properly. */ void CodegenLLVMVisitor::visit_codegen_atomic_statement(const ast::CodegenAtomicStatement& node) { - if (vector_width > 1) + if (platform.is_cpu_with_simd()) logger->warn("Atomic operations are not supported"); // Support only assignment for now. @@ -555,7 +555,7 @@ void CodegenLLVMVisitor::visit_codegen_atomic_statement(const ast::CodegenAtomic throw std::runtime_error("Error: only 'VarName' assignment is supported\n"); // Process the assignment as if it was non-atomic. 
- if (vector_width > 1) + if (platform.is_cpu_with_simd()) logger->warn("Treating write as non-atomic"); write_to_variable(*var, rhs); } @@ -625,7 +625,7 @@ void CodegenLLVMVisitor::visit_codegen_for_statement(const ast::CodegenForStatem ir_builder.set_insertion_point(for_body); // If not processing remainder of the loop, start vectorization. - if (vector_width > 1 && main_loop_initialization) + if (platform.is_cpu_with_simd() && main_loop_initialization) ir_builder.generate_vector_ir(); // Generate code for the loop body and create the basic block for the increment. @@ -666,7 +666,7 @@ void CodegenLLVMVisitor::visit_codegen_function(const ast::CodegenFunction& node // Process function or procedure body. If the function is a compute kernel, enable // vectorization. If so, the return statement is handled in a separate visitor. - if (vector_width > 1 && is_kernel_function(name)) { + if (platform.is_cpu_with_simd() && is_kernel_function(name)) { ir_builder.generate_vector_ir(); block->accept(*this); ir_builder.generate_scalar_ir(); @@ -740,7 +740,7 @@ void CodegenLLVMVisitor::visit_function_call(const ast::FunctionCall& node) { void CodegenLLVMVisitor::visit_if_statement(const ast::IfStatement& node) { // If vectorizing the compute kernel with control flow, process it separately. - if (vector_width > 1 && ir_builder.vectorizing()) { + if (platform.is_cpu_with_simd() && ir_builder.vectorizing()) { create_vectorized_control_flow_block(node); return; } @@ -815,7 +815,7 @@ void CodegenLLVMVisitor::visit_program(const ast::Program& node) { // - convert function and procedure blocks into CodegenFunctions // - gather information about AST. For now, information about functions // and procedures is used only. - CodegenLLVMHelperVisitor v{vector_width}; + CodegenLLVMHelperVisitor v{platform.get_instruction_width()}; const auto& functions = v.get_codegen_functions(node); instance_var_helper = v.get_instance_var_helper(); sym_tab = node.get_symbol_table(); @@ -864,7 +864,7 @@ void CodegenLLVMVisitor::visit_program(const ast::Program& node) { } // Optionally, replace LLVM math intrinsics with vector library calls. - if (vector_width > 1) { + if (platform.is_cpu_with_simd()) { #if LLVM_VERSION_MAJOR < 13 logger->warn( "This version of LLVM does not support replacement of LLVM intrinsics with vector " diff --git a/src/codegen/llvm/codegen_llvm_visitor.hpp b/src/codegen/llvm/codegen_llvm_visitor.hpp index 22b9fafd83..396d8cbb67 100644 --- a/src/codegen/llvm/codegen_llvm_visitor.hpp +++ b/src/codegen/llvm/codegen_llvm_visitor.hpp @@ -81,33 +81,22 @@ class CodegenLLVMVisitor: public visitor::ConstAstVisitor { /// Optimisation level for LLVM IR transformations. int opt_level_ir; - /// Vector library used for math functions. - std::string vector_library; - - /// Explicit vectorisation width. - int vector_width; + /// Target platform for the code generation. 
+ Platform platform; public: CodegenLLVMVisitor(const std::string& mod_filename, const std::string& output_dir, + Platform& platform, int opt_level_ir, - bool use_single_precision = false, - int vector_width = 1, - std::string vec_lib = "none", bool add_debug_information = false, - std::vector fast_math_flags = {}, - bool llvm_assume_alias = false) + std::vector fast_math_flags = {}) : mod_filename(mod_filename) , output_dir(output_dir) + , platform(platform) , opt_level_ir(opt_level_ir) - , vector_width(vector_width) - , vector_library(vec_lib) , add_debug_information(add_debug_information) - , ir_builder(*context, - use_single_precision, - vector_width, - fast_math_flags, - !llvm_assume_alias) + , ir_builder(*context, platform, fast_math_flags) , debug_builder(*module) {} /// Dumps the generated LLVM IR module to string. @@ -139,7 +128,7 @@ class CodegenLLVMVisitor: public visitor::ConstAstVisitor { /// Returns vector width int get_vector_width() const { - return vector_width; + return platform.get_instruction_width(); } // Visitors. diff --git a/src/codegen/llvm/llvm_ir_builder.cpp b/src/codegen/llvm/llvm_ir_builder.cpp index 1015b437f3..e7a6a4a60b 100644 --- a/src/codegen/llvm/llvm_ir_builder.cpp +++ b/src/codegen/llvm/llvm_ir_builder.cpp @@ -41,13 +41,13 @@ llvm::Type* IRBuilder::get_i64_type() { } llvm::Type* IRBuilder::get_fp_type() { - if (fp_precision == single_precision) + if (platform.is_single_precision()) return llvm::Type::getFloatTy(builder.getContext()); return llvm::Type::getDoubleTy(builder.getContext()); } llvm::Type* IRBuilder::get_fp_ptr_type() { - if (fp_precision == single_precision) + if (platform.is_single_precision()) return llvm::Type::getFloatPtrTy(builder.getContext()); return llvm::Type::getDoublePtrTy(builder.getContext()); } @@ -92,7 +92,7 @@ llvm::Value* IRBuilder::pop_last_value() { /****************************************************************************************/ void IRBuilder::create_boolean_constant(int value) { - if (vector_width > 1 && vectorize) { + if (platform.is_cpu_with_simd() && vectorize) { value_stack.push_back(get_vector_constant(get_boolean_type(), value)); } else { value_stack.push_back(get_scalar_constant(get_boolean_type(), value)); @@ -100,7 +100,7 @@ void IRBuilder::create_boolean_constant(int value) { } void IRBuilder::create_fp_constant(const std::string& value) { - if (vector_width > 1 && vectorize) { + if (platform.is_cpu_with_simd() && vectorize) { value_stack.push_back(get_vector_constant(get_fp_type(), value)); } else { value_stack.push_back(get_scalar_constant(get_fp_type(), value)); @@ -112,7 +112,7 @@ llvm::Value* IRBuilder::create_global_string(const ast::String& node) { } void IRBuilder::create_i32_constant(int value) { - if (vector_width > 1 && vectorize) { + if (platform.is_cpu_with_simd() && vectorize) { value_stack.push_back(get_vector_constant(get_i32_type(), value)); } else { value_stack.push_back(get_scalar_constant(get_i32_type(), value)); @@ -126,6 +126,8 @@ llvm::Value* IRBuilder::get_scalar_constant(llvm::Type* type, V value) { template llvm::Value* IRBuilder::get_vector_constant(llvm::Type* type, V value) { + int vector_width = platform.get_instruction_width(); + ConstantVector constants; for (unsigned i = 0; i < vector_width; ++i) { const auto& element = C::get(type, value); @@ -206,9 +208,7 @@ void IRBuilder::set_kernel_attributes() { // > The `noalias` attribute indicates that the only memory accesses inside function are loads // > and stores from objects pointed to by its pointer-typed arguments, 
with arbitrary // > offsets. - if (assume_noalias) { - current_function->addParamAttr(0, llvm::Attribute::NoAlias); - } + current_function->addParamAttr(0, llvm::Attribute::NoAlias); // Finally, specify that the struct pointer does not capture and is read-only. current_function->addParamAttr(0, llvm::Attribute::NoCapture); @@ -227,7 +227,7 @@ void IRBuilder::set_loop_metadata(llvm::BranchInst* branch) { loop_metadata.push_back(nullptr); // If `vector_width` is 1, explicitly disable vectorization for benchmarking purposes. - if (vector_width == 1) { + if (platform.is_cpu() && platform.get_instruction_width() == 1) { llvm::MDString* name = llvm::MDString::get(context, "llvm.loop.vectorize.enable"); llvm::Value* false_value = llvm::ConstantInt::get(get_boolean_type(), 0); llvm::ValueAsMetadata* value = llvm::ValueAsMetadata::get(false_value); @@ -376,6 +376,7 @@ llvm::Value* IRBuilder::create_index(llvm::Value* value) { const auto& element_type = llvm::cast(vector_type->getElementType()); if (element_type->getBitWidth() == i64_type->getIntegerBitWidth()) return value; + int vector_width = platform.get_instruction_width(); return builder.CreateSExtOrTrunc(value, llvm::FixedVectorType::get(i64_type, vector_width)); } @@ -449,7 +450,8 @@ void IRBuilder::create_scalar_or_vector_alloca(const std::string& name, // Even if generating vectorised code, some variables still need to be scalar. Particularly, the // induction variable "id" and remainder loop variables (that start with "epilogue" prefix). llvm::Type* type; - if (vector_width > 1 && vectorize && name != kernel_id && name.rfind("epilogue", 0)) { + if (platform.is_cpu_with_simd() && vectorize && name != kernel_id && name.rfind("epilogue", 0)) { + int vector_width = platform.get_instruction_width(); type = llvm::FixedVectorType::get(element_or_scalar_type, vector_width); } else { type = element_or_scalar_type; @@ -495,7 +497,7 @@ llvm::Value* IRBuilder::load_to_or_store_from_array(const std::string& id_name, llvm::Value* element_ptr = create_inbounds_gep(array, id_value); // Find out if the vector code is generated. - bool generating_vector_ir = vector_width > 1 && vectorize; + bool generating_vector_ir = platform.is_cpu_with_simd() && vectorize; // If the vector code is generated, we need to distinguish between two cases. If the array is // indexed indirectly (i.e. not by an induction variable `kernel_id`), create gather/scatter @@ -523,7 +525,7 @@ llvm::Value* IRBuilder::load_to_or_store_from_array(const std::string& id_name, // to a vector pointer llvm::Type* vector_type = llvm::PointerType::get( llvm::FixedVectorType::get(element_ptr->getType()->getPointerElementType(), - vector_width), + platform.get_instruction_width()), /*AddressSpace=*/0); ptr = builder.CreateBitCast(element_ptr, vector_type); } else { @@ -541,11 +543,12 @@ llvm::Value* IRBuilder::load_to_or_store_from_array(const std::string& id_name, void IRBuilder::maybe_replicate_value(llvm::Value* value) { // If the value should not be vectorised, or it is already a vector, add it to the stack. - if (!vectorize || vector_width == 1 || value->getType()->isVectorTy()) { + if (!vectorize || !platform.is_cpu_with_simd() || value->getType()->isVectorTy()) { value_stack.push_back(value); } else { // Otherwise, we generate vectorized code inside the loop, so replicate the value to form a // vector. 
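+        // (e.g. with instruction width 4, a scalar 2.0 becomes the vector
+        // <2.0, 2.0, 2.0, 2.0> via CreateVectorSplat below)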
+ int vector_width = platform.get_instruction_width(); llvm::Value* vector_value = builder.CreateVectorSplat(vector_width, value); value_stack.push_back(vector_value); } diff --git a/src/codegen/llvm/llvm_ir_builder.hpp b/src/codegen/llvm/llvm_ir_builder.hpp index b3005db0c7..cf9e7f936d 100644 --- a/src/codegen/llvm/llvm_ir_builder.hpp +++ b/src/codegen/llvm/llvm_ir_builder.hpp @@ -10,6 +10,7 @@ #include #include "codegen/llvm/codegen_llvm_helper_visitor.hpp" +#include "codegen/llvm/target_platform.hpp" #include "symtab/symbol_table.hpp" #include "llvm/IR/IRBuilder.h" @@ -52,14 +53,8 @@ class IRBuilder { /// Flag to indicate that the generated IR should be vectorized. bool vectorize; - /// Precision of the floating-point numbers (32 or 64 bit). - unsigned fp_precision; - - /// The vector width used for the vectorized code. - unsigned vector_width; - - /// Instance struct fields do not alias. - bool assume_noalias; + /// Target platform for which IR is built. + Platform platform; /// Masked value used to predicate vector instructions. llvm::Value* mask; @@ -72,21 +67,17 @@ class IRBuilder { public: IRBuilder(llvm::LLVMContext& context, - bool use_single_precision = false, - unsigned vector_width = 1, - std::vector fast_math_flags = {}, - bool assume_noalias = true) + Platform& platform, + std::vector fast_math_flags = {}) : builder(context) + , platform(platform) , symbol_table(nullptr) , current_function(nullptr) , vectorize(false) , alloca_ip(nullptr) - , fp_precision(use_single_precision ? single_precision : double_precision) - , vector_width(vector_width) , mask(nullptr) , kernel_id("") - , fast_math_flags(fast_math_flags) - , assume_noalias(assume_noalias) {} + , fast_math_flags(fast_math_flags) {} /// Transforms the fast math flags provided to the builder into LLVM's representation. llvm::FastMathFlags transform_to_fmf(std::vector& flags) { diff --git a/src/codegen/llvm/main.cpp b/src/codegen/llvm/main.cpp index 6d374999c3..92d8a486c1 100644 --- a/src/codegen/llvm/main.cpp +++ b/src/codegen/llvm/main.cpp @@ -47,8 +47,11 @@ int main(int argc, const char* argv[]) { logger->info("Running Symtab Visitor"); visitor::SymtabVisitor().visit_program(*ast); + // Use default platform for this toy example. + codegen::Platform platform; + logger->info("Running LLVM Visitor"); - codegen::CodegenLLVMVisitor llvm_visitor(filename, /*output_dir=*/".", /*opt_level_ir=*/0); + codegen::CodegenLLVMVisitor llvm_visitor(filename, /*output_dir=*/".", platform, /*opt_level_ir=*/0); llvm_visitor.visit_program(*ast); std::unique_ptr module = llvm_visitor.get_module(); diff --git a/src/codegen/llvm/target_platform.cpp b/src/codegen/llvm/target_platform.cpp new file mode 100644 index 0000000000..6cb8c7bb2b --- /dev/null +++ b/src/codegen/llvm/target_platform.cpp @@ -0,0 +1,54 @@ +/************************************************************************* + * Copyright (C) 2018-2020 Blue Brain Project + * + * This file is part of NMODL distributed under the terms of the GNU + * Lesser General Public License. See top-level LICENSE file for details. + *************************************************************************/ + +#include "codegen/llvm/target_platform.hpp" + +namespace nmodl { +namespace codegen { + +const std::string Platform::DEFAULT_PLATFORM_NAME = "default"; +const std::string Platform::DEFAULT_MATH_LIBRARY = "none"; + +bool Platform::is_default_platform() { + // Default platform is a CPU. 
+ return platform_id == PlatformID::CPU && name == Platform::DEFAULT_PLATFORM_NAME; +} + +bool Platform::is_cpu() { + return platform_id == PlatformID::CPU; +} + +bool Platform::is_cpu_with_simd() { + return platform_id == PlatformID::CPU && instruction_width > 1; +} + +bool Platform::is_gpu() { + return platform_id == PlatformID::GPU; +} + +bool Platform::is_single_precision() { + return use_single_precision; +} + +std::string Platform::get_name() const { + return name; +} + +std::string Platform::get_math_library() const { + return math_library; +} + +int Platform::get_instruction_width() const { + return instruction_width; +} + +int Platform::get_precision() const { + return use_single_precision? 32 : 64; +} + +} // namespace codegen +} // namespace nmodl diff --git a/src/codegen/llvm/target_platform.hpp b/src/codegen/llvm/target_platform.hpp new file mode 100644 index 0000000000..2eabbb1a4b --- /dev/null +++ b/src/codegen/llvm/target_platform.hpp @@ -0,0 +1,92 @@ +/************************************************************************* + * Copyright (C) 2018-2020 Blue Brain Project + * + * This file is part of NMODL distributed under the terms of the GNU + * Lesser General Public License. See top-level LICENSE file for details. + *************************************************************************/ + +#pragma once + +#include + +namespace nmodl { +namespace codegen { + +enum PlatformID { + CPU, + GPU +}; + +/** + * \class Platform + * \brief A class that represents the target platform. It is needed to + * reduce the amount of code passed to LLVM visitor and its helpers. + */ +class Platform { + public: + /// Default name of the target and math library. + static const std::string DEFAULT_PLATFORM_NAME; + static const std::string DEFAULT_MATH_LIBRARY; + + private: + /// Name of the platform. + const std::string name = Platform::DEFAULT_PLATFORM_NAME; + + /// Target-specific id to compare platforms easily. + PlatformID platform_id; + + /// User-provided width that is used to construct LLVM instructions + // and types. + int instruction_width = 1; + + /// Use single-precision floating-point types. + bool use_single_precision = false; + + /// A name of user-provided math library. + std::string math_library = Platform::DEFAULT_MATH_LIBRARY; + + public: + Platform(PlatformID platform_id, + const std::string& name, + std::string& math_library, + bool use_single_precision = false, + int instruction_width = 1) + : platform_id(platform_id) + , name(name) + , math_library(math_library) + , use_single_precision(use_single_precision) + , instruction_width(instruction_width) {} + + Platform(bool use_single_precision, + int instruction_width) + : platform_id(PlatformID::CPU) + , use_single_precision(use_single_precision) + , instruction_width(instruction_width) {} + + Platform() : platform_id(PlatformID::CPU) {} + + /// Checks if this platform is a default platform. + bool is_default_platform(); + + /// Checks if this platform is a CPU. + bool is_cpu(); + + /// Checks if this platform is a CPU with SIMD support. + bool is_cpu_with_simd(); + + /// Checks if this platform is a GPU. 
+ bool is_gpu(); + + bool is_single_precision(); + + std::string get_name() const; + + std::string get_math_library() const; + + int get_instruction_width() const; + + int get_precision() const; +}; + +} // namespace codegen +} // namespace nmodl diff --git a/src/main.cpp b/src/main.cpp index 140e4c77d0..f2678fcb48 100644 --- a/src/main.cpp +++ b/src/main.cpp @@ -696,21 +696,29 @@ int main(int argc, const char* argv[]) { // information and not in LLVM visitor. int llvm_opt_level = llvm_benchmark ? 0 : llvm_opt_level_ir; - if (llvm_gpu_name != "default") { - logger->warn("GPU code generation is not supported, targeting " - "CPU instead"); - } + // Create platform abstraction. + PlatformID pid = llvm_gpu_name == "default" ? PlatformID::CPU + : PlatformID::GPU; + const std::string name = + llvm_gpu_name == "default" ? llvm_cpu_name : llvm_gpu_name; + Platform platform(pid, name, llvm_math_library, llvm_float_type, + llvm_vector_width); logger->info("Running LLVM backend code generator"); - CodegenLLVMVisitor visitor(modfile, output_dir, llvm_opt_level, - llvm_float_type, llvm_vector_width, - llvm_math_library, !llvm_no_debug, - llvm_fast_math_flags, true); + CodegenLLVMVisitor visitor(modfile, output_dir, platform, + llvm_opt_level, !llvm_no_debug, + llvm_fast_math_flags); visitor.visit_program(*ast); ast_to_nmodl(*ast, filepath("llvm", "mod")); ast_to_json(*ast, filepath("llvm", "json")); if (llvm_benchmark) { + // \todo integrate Platform class here + if (llvm_gpu_name != "default") { + logger->warn("GPU benchmarking is not supported, targeting " + "CPU instead"); + } + logger->info("Running LLVM benchmark"); benchmark::LLVMBenchmark benchmark( visitor, modfile, output_dir, shared_lib_paths, diff --git a/test/unit/codegen/codegen_llvm_execution.cpp b/test/unit/codegen/codegen_llvm_execution.cpp index 41605ecbd3..4c9515f814 100644 --- a/test/unit/codegen/codegen_llvm_execution.cpp +++ b/test/unit/codegen/codegen_llvm_execution.cpp @@ -124,8 +124,12 @@ SCENARIO("Arithmetic expression", "[llvm][runner]") { const auto& ast = driver.parse_string(nmodl_text); SymtabVisitor().visit_program(*ast); + + codegen::Platform cpu_platform(/*use_single_precision=*/false, + /*instruction_width=*/1); codegen::CodegenLLVMVisitor llvm_visitor(/*mod_filename=*/"unknown", /*output_dir=*/".", + cpu_platform, /*opt_level_ir=*/0); llvm_visitor.visit_program(*ast); @@ -226,8 +230,12 @@ SCENARIO("Optimised arithmetic expression", "[llvm][runner]") { const auto& ast = driver.parse_string(nmodl_text); SymtabVisitor().visit_program(*ast); + + codegen::Platform cpu_platform(/*use_single_precision=*/false, + /*instruction_width=*/1); codegen::CodegenLLVMVisitor llvm_visitor(/*mod_filename=*/"unknown", /*output_dir=*/".", + cpu_platform, /*opt_level_ir=*/3); llvm_visitor.visit_program(*ast); @@ -299,11 +307,13 @@ SCENARIO("Simple scalar kernel", "[llvm][runner]") { SymtabVisitor().visit_program(*ast); NeuronSolveVisitor().visit_program(*ast); SolveBlockVisitor().visit_program(*ast); + + codegen::Platform cpu_platform(/*use_single_precision=*/false, + /*instruction_width=*/1); codegen::CodegenLLVMVisitor llvm_visitor(/*mod_filename=*/"unknown", /*output_dir=*/".", - /*opt_level_ir=*/0, - /*use_single_precision=*/false, - /*vector_width=*/1); + cpu_platform, + /*opt_level_ir=*/0); llvm_visitor.visit_program(*ast); llvm_visitor.wrap_kernel_functions(); @@ -381,11 +391,13 @@ SCENARIO("Simple vectorised kernel", "[llvm][runner]") { SymtabVisitor().visit_program(*ast); NeuronSolveVisitor().visit_program(*ast); 
SolveBlockVisitor().visit_program(*ast); + + codegen::Platform simd_cpu_platform(/*use_single_precision=*/false, + /*instruction_width=*/4); codegen::CodegenLLVMVisitor llvm_visitor(/*mod_filename=*/"unknown", /*output_dir=*/".", - /*opt_level_ir=*/3, - /*use_single_precision=*/false, - /*vector_width=*/4); + simd_cpu_platform, + /*opt_level_ir=*/3); llvm_visitor.visit_program(*ast); llvm_visitor.wrap_kernel_functions(); @@ -463,11 +475,13 @@ SCENARIO("Vectorised kernel with scatter instruction", "[llvm][runner]") { SymtabVisitor().visit_program(*ast); NeuronSolveVisitor().visit_program(*ast); SolveBlockVisitor().visit_program(*ast); + + codegen::Platform simd_cpu_platform(/*use_single_precision=*/false, + /*instruction_width=*/2); codegen::CodegenLLVMVisitor llvm_visitor(/*mod_filename=*/"unknown", /*output_dir=*/".", - /*opt_level_ir=*/0, - /*use_single_precision=*/false, - /*vector_width=*/2); + simd_cpu_platform, + /*opt_level_ir=*/0); llvm_visitor.visit_program(*ast); llvm_visitor.wrap_kernel_functions(); @@ -554,11 +568,13 @@ SCENARIO("Vectorised kernel with simple control flow", "[llvm][runner]") { SymtabVisitor().visit_program(*ast); NeuronSolveVisitor().visit_program(*ast); SolveBlockVisitor().visit_program(*ast); + + codegen::Platform simd_cpu_platform(/*use_single_precision=*/false, + /*instruction_width=*/2); codegen::CodegenLLVMVisitor llvm_visitor(/*mod_filename=*/"unknown", /*output_dir=*/".", - /*opt_level_ir=*/0, - /*use_single_precision=*/false, - /*vector_width=*/2); + simd_cpu_platform, + /*opt_level_ir=*/0); llvm_visitor.visit_program(*ast); llvm_visitor.wrap_kernel_functions(); diff --git a/test/unit/codegen/codegen_llvm_instance_struct.cpp b/test/unit/codegen/codegen_llvm_instance_struct.cpp index 6042aecfc8..fbb07dfbcd 100644 --- a/test/unit/codegen/codegen_llvm_instance_struct.cpp +++ b/test/unit/codegen/codegen_llvm_instance_struct.cpp @@ -39,11 +39,11 @@ codegen::CodegenInstanceData generate_instance_data(const std::string& text, SymtabVisitor().visit_program(*ast); NeuronSolveVisitor().visit_program(*ast); + codegen::Platform cpu_platform(use_single_precision, vector_width); codegen::CodegenLLVMVisitor llvm_visitor(/*mod_filename=*/"test", /*output_dir=*/".", - opt_level, - use_single_precision, - vector_width); + cpu_platform, + opt_level); llvm_visitor.visit_program(*ast); llvm_visitor.dump_module(); const auto& generated_instance_struct = llvm_visitor.get_instance_struct_ptr(); diff --git a/test/unit/codegen/codegen_llvm_ir.cpp b/test/unit/codegen/codegen_llvm_ir.cpp index 23f6977aea..34fcd8b0da 100644 --- a/test/unit/codegen/codegen_llvm_ir.cpp +++ b/test/unit/codegen/codegen_llvm_ir.cpp @@ -51,14 +51,12 @@ std::string run_llvm_visitor(const std::string& text, NeuronSolveVisitor().visit_program(*ast); SolveBlockVisitor().visit_program(*ast); - codegen::CodegenLLVMVisitor llvm_visitor(/*mod_filename=*/"unknown", - /*output_dir=*/".", - opt_level, - use_single_precision, - vector_width, - vec_lib, - /*add_debug_information=*/false, - fast_math_flags); + codegen::Platform cpu_platform(codegen::PlatformID::CPU, /*name=*/"default", + vec_lib, use_single_precision, vector_width); + codegen::CodegenLLVMVisitor llvm_visitor( + /*mod_filename=*/"unknown", + /*output_dir=*/".", cpu_platform, opt_level, + /*add_debug_information=*/false, fast_math_flags); llvm_visitor.visit_program(*ast); return llvm_visitor.dump_module(); From 0907d3bd4e97494da8aeec2a7a79de7d69e0cb4f Mon Sep 17 00:00:00 2001 From: George Mitenkov Date: Sun, 13 Mar 2022 00:34:41 +0100 Subject: [PATCH 
194/331] [LLVM][GPU] Added GPU-specific AST transformations

This commit adds a new AST node: `CodegenThreadId` that represents thread id
used in GPU computation. Thanks to the new platform class abstraction, the
code to generate compute body of NEURON block was readapted to support AST
transformations needed for GPU.

Example of the transformation:

```
GPU_ID id
INTEGER node_id
DOUBLE v
IF (id<node_count) {
    node_id = mech->node_index[id]
    v = mech->voltage[node_id]
    mech->m[id] = mech->y[id]+2
}
```
---
 .../llvm/codegen_llvm_helper_visitor.cpp | 208 +++++++++---------
 .../llvm/codegen_llvm_helper_visitor.hpp |  28 ++-
 src/codegen/llvm/codegen_llvm_visitor.cpp |   8 +-
 src/language/code_generator.cmake |   1 +
 src/language/codegen.yaml |  16 ++
 test/unit/codegen/codegen_llvm_ir.cpp |  63 +++++-
 6 files changed, 212 insertions(+), 112 deletions(-)

diff --git a/src/codegen/llvm/codegen_llvm_helper_visitor.cpp b/src/codegen/llvm/codegen_llvm_helper_visitor.cpp
index 654afd8ef5..8de61f726b 100644
--- a/src/codegen/llvm/codegen_llvm_helper_visitor.cpp
+++ b/src/codegen/llvm/codegen_llvm_helper_visitor.cpp
@@ -443,7 +443,7 @@ void CodegenLLVMHelperVisitor::ion_write_statements(BlockType type,
 * @param node Ast node under which variables to be converted to instance type
 */
void CodegenLLVMHelperVisitor::convert_to_instance_variable(ast::Node& node,
-                                                            std::string& index_var) {
+                                                            const std::string& index_var) {
    /// collect all variables in the node of type ast::VarName
    auto variables = collect_nodes(node, {ast::AstNodeType::VAR_NAME});
    for (const auto& v: variables) {
@@ -612,35 +612,29 @@ void CodegenLLVMHelperVisitor::visit_nrn_state_block(ast::NrnStateBlock& node) {
    /// statements for new function to be generated
    ast::StatementVector function_statements;

-    /// create variable definition for loop index and insert at the beginning
-    std::string loop_index_var = "id";
-    std::vector<std::string> induction_variables{"id"};
-    function_statements.push_back(
-        create_local_variable_statement(induction_variables, INTEGER_TYPE));
-
    /// create vectors of local variables that would be used in compute part
    std::vector<std::string> int_variables{"node_id"};
    std::vector<std::string> double_variables{"v"};

-    /// create now main compute part : for loop over channel instances
+    /// create now main compute part

-    /// loop body : initialization + solve blocks
-    ast::StatementVector loop_def_statements;
-    ast::StatementVector loop_index_statements;
-    ast::StatementVector loop_body_statements;
+    /// compute body : initialization + solve blocks
+    ast::StatementVector def_statements;
+    ast::StatementVector index_statements;
+    ast::StatementVector body_statements;
    {
        /// access node index and corresponding voltage
-        loop_index_statements.push_back(
+        index_statements.push_back(
            visitor::create_statement("node_id = node_index[{}]"_format(INDUCTION_VAR)));
-        loop_body_statements.push_back(
+        body_statements.push_back(
            visitor::create_statement("v = {}[node_id]"_format(VOLTAGE_VAR)));

        /// read ion variables
        ion_read_statements(BlockType::State,
                            int_variables,
                            double_variables,
-                            loop_index_statements,
-                            loop_body_statements);
+                            index_statements,
+                            body_statements);

        /// main compute node : extract solution expressions from the derivative block
        const auto& solutions = collect_nodes(node, {ast::AstNodeType::SOLUTION_EXPRESSION});
        for (const auto& statement: solutions) {
            const auto& solution = std::dynamic_pointer_cast<ast::SolutionExpression>(statement);
            const auto& block = std::dynamic_pointer_cast<ast::StatementBlock>(
                solution->get_node_to_solve());
-            append_statements_from_block(loop_body_statements, block);
+            append_statements_from_block(body_statements, block);
        }

        /// add breakpoint block if no current
        if (info.currents.empty() && info.breakpoint_node != nullptr) {
            auto block = info.breakpoint_node->get_statement_block();
-            append_statements_from_block(loop_body_statements, block);
+            append_statements_from_block(body_statements, block);
        }

        /// write ion statements
        ion_write_statements(BlockType::State,
                             int_variables,
                             double_variables,
-                             loop_index_statements,
-                             loop_body_statements);
+                             index_statements,
+                             body_statements);

        // \todo handle process_shadow_update_statement and wrote_conc_call yet
    }

-    ast::StatementVector loop_body;
-    loop_body.insert(loop_body.end(), loop_def_statements.begin(), loop_def_statements.end());
-    loop_body.insert(loop_body.end(), loop_index_statements.begin(), loop_index_statements.end());
-    loop_body.insert(loop_body.end(), loop_body_statements.begin(), loop_body_statements.end());
-
-    /// now construct a new code block which will become the body of the loop
-    auto loop_block = std::make_shared<ast::StatementBlock>(loop_body);
-
-    /// declare main FOR loop local variables
-    function_statements.push_back(create_local_variable_statement(int_variables, INTEGER_TYPE));
-    function_statements.push_back(create_local_variable_statement(double_variables, FLOAT_TYPE));
-
-    /// main loop possibly vectorized on vector_width
-    {
-        /// loop constructs : initialization, condition and increment
-        const auto& initialization = int_initialization_expression(INDUCTION_VAR);
-        const auto& condition = loop_count_expression(INDUCTION_VAR, NODECOUNT_VAR, vector_width);
-        const auto& increment = loop_increment_expression(INDUCTION_VAR, vector_width);
-
-        /// clone it
-        auto local_loop_block = std::shared_ptr<ast::StatementBlock>(loop_block->clone());
-
-        /// convert local statement to codegenvar statement
-        convert_local_statement(*local_loop_block);
-
-        auto for_loop_statement_main = std::make_shared<ast::CodegenForStatement>(initialization,
                                                                                   condition,
                                                                                   increment,
                                                                                   local_loop_block);
-
-        /// convert all variables inside loop body to instance variables
-        convert_to_instance_variable(*for_loop_statement_main, loop_index_var);
-
-        /// loop itself becomes one of the statement in the function
-        function_statements.push_back(for_loop_statement_main);
-    }
-
-    /// vectors containing renamed FOR loop local variables
-    std::vector<std::string> renamed_int_variables;
-    std::vector<std::string> renamed_double_variables;
-
-    /// remainder loop possibly vectorized on vector_width
-    if (vector_width > 1) {
-        /// loop constructs : initialization, condition and increment
-        const auto& condition =
-            loop_count_expression(INDUCTION_VAR, NODECOUNT_VAR, /*vector_width=*/1);
-        const auto& increment = loop_increment_expression(INDUCTION_VAR, /*vector_width=*/1);
-
-        /// rename local variables to avoid conflict with main loop
-        rename_local_variables(*loop_block);
-
-        /// convert local statement to codegenvar statement
-        convert_local_statement(*loop_block);
-
-        auto for_loop_statement_remainder =
-            std::make_shared<ast::CodegenForStatement>(nullptr, condition, increment, loop_block);
-
-        const auto& loop_statements = for_loop_statement_remainder->get_statement_block();
-        // \todo: Change RenameVisitor to take a vector of names to which it would append a single
-        // prefix.
-        for (const auto& name: int_variables) {
-            std::string new_name = epilogue_variable_prefix + name;
-            renamed_int_variables.push_back(new_name);
-            visitor::RenameVisitor v(name, new_name);
-            loop_statements->accept(v);
-        }
-        for (const auto& name: double_variables) {
-            std::string new_name = epilogue_variable_prefix + name;
-            renamed_double_variables.push_back(new_name);
-            visitor::RenameVisitor v(name, epilogue_variable_prefix + name);
-            loop_statements->accept(v);
-        }

+    /// create target-specific compute body
+    ast::StatementVector compute_body;
+    compute_body.insert(compute_body.end(), def_statements.begin(), def_statements.end());
+    compute_body.insert(compute_body.end(), index_statements.begin(), index_statements.end());
+    compute_body.insert(compute_body.end(), body_statements.begin(), body_statements.end());

-        /// declare remainder FOR loop local variables
-        function_statements.push_back(
-            create_local_variable_statement(renamed_int_variables, INTEGER_TYPE));
+    if (platform.is_gpu()) {
+        const auto& id_statement = std::make_shared<ast::CodegenThreadId>(create_varname(INDUCTION_VAR));
+        function_statements.push_back(id_statement);
+        create_gpu_compute_body(compute_body, function_statements, int_variables, double_variables);
+    } else {
+        // Create induction variable
+        std::vector<std::string> induction_variables{INDUCTION_VAR};
        function_statements.push_back(
-            create_local_variable_statement(renamed_double_variables, FLOAT_TYPE));
-
-        /// convert all variables inside loop body to instance variables
-        convert_to_instance_variable(*for_loop_statement_remainder, loop_index_var);
-
-        /// loop itself becomes one of the statement in the function
-        function_statements.push_back(for_loop_statement_remainder);
+            create_local_variable_statement(induction_variables, INTEGER_TYPE));
+        create_cpu_compute_body(compute_body, function_statements, int_variables, double_variables);
    }

    /// new block for the function
@@ -777,6 +703,84 @@ void CodegenLLVMHelperVisitor::visit_nrn_state_block(ast::NrnStateBlock& node) {
    std::cout << nmodl::to_nmodl(function) << std::endl;
}

+void CodegenLLVMHelperVisitor::create_gpu_compute_body(ast::StatementVector& body,
+                                                       ast::StatementVector& function_statements,
+                                                       std::vector<std::string>& int_variables,
+                                                       std::vector<std::string>& double_variables) {
+    // Then, create condition for thread id. For now - reuse the same functionality as for
+    auto kernel_block = std::make_shared<ast::StatementBlock>(body);
+    const auto& condition = loop_count_expression(INDUCTION_VAR, NODECOUNT_VAR, 1);
+    ast::ElseIfStatementVector else_ifs = {};
+    auto if_statement = std::make_shared<ast::IfStatement>(condition, kernel_block, else_ifs, nullptr);
+
+    convert_to_instance_variable(*if_statement, INDUCTION_VAR);
+
+    // Push variables and the loop to the function statements vector.
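+    // (Unlike the CPU path below, no remainder loop is needed here: each GPU
+    // thread handles a single instance, guarded by the IF (id<node_count)
+    // condition built above.)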
+    function_statements.push_back(create_local_variable_statement(int_variables, INTEGER_TYPE));
+    function_statements.push_back(create_local_variable_statement(double_variables, FLOAT_TYPE));
+    function_statements.push_back(if_statement);
+}
+
+void CodegenLLVMHelperVisitor::create_cpu_compute_body(ast::StatementVector& body,
+                                                       ast::StatementVector& function_statements,
+                                                       std::vector<std::string>& int_variables,
+                                                       std::vector<std::string>& double_variables) {
+    auto loop_block = std::make_shared<ast::StatementBlock>(body);
+    create_compute_body_loop(loop_block, function_statements, int_variables, double_variables);
+    if (platform.is_cpu_with_simd())
+        create_compute_body_loop(loop_block, function_statements, int_variables, double_variables, /*is_remainder_loop=*/true);
+}
+
+void CodegenLLVMHelperVisitor::create_compute_body_loop(std::shared_ptr<ast::StatementBlock>& block,
+                                                        ast::StatementVector& function_statements,
+                                                        std::vector<std::string>& int_variables,
+                                                        std::vector<std::string>& double_variables,
+                                                        bool is_remainder_loop) {
+    // First, check if we are creating a main or remainder loop. If it is a remainder loop, then
+    // no initialization is needed and instruction width is simply 1.
+    int width = is_remainder_loop ? 1 : platform.get_instruction_width();
+    const auto& initialization = is_remainder_loop ? nullptr : int_initialization_expression(INDUCTION_VAR);
+    const auto& condition = loop_count_expression(INDUCTION_VAR, NODECOUNT_VAR, width);
+    const auto& increment = loop_increment_expression(INDUCTION_VAR, width);
+
+    // Clone the statement block if needed since it can be used by the remainder loop.
+    auto loop_block = (is_remainder_loop || !platform.is_cpu_with_simd()) ? block : std::shared_ptr<ast::StatementBlock>(block->clone());
+
+    // Convert local statement to use CodegenVar statements and create a FOR loop node. Also, if creating 
+    // a remainder loop then rename variables to avoid conflicts.
+    if (is_remainder_loop)
+        rename_local_variables(*loop_block);
+    convert_local_statement(*loop_block);
+    auto for_loop = std::make_shared<ast::CodegenForStatement>(initialization,
+                                                               condition,
+                                                               increment,
+                                                               loop_block);
+
+    // Convert all variables inside loop body to be instance variables.
+    convert_to_instance_variable(*for_loop, INDUCTION_VAR);
+
+    // Rename variables if processing remainder loop.
+    if (is_remainder_loop) {
+        const auto& loop_statements = for_loop->get_statement_block();
+        auto rename = [&](std::vector<std::string>& vars) {
+            for (int i = 0; i < vars.size(); ++i) {
+                std::string old_name = vars[i];
+                std::string new_name = epilogue_variable_prefix + vars[i];
+                vars[i] = new_name;
+                visitor::RenameVisitor v(old_name, new_name);
+                loop_statements->accept(v);
+            }
+        };
+        rename(int_variables);
+        rename(double_variables);
+    }
+
+    // Push variables and the loop to the function statements vector.
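+    // (Illustration only: for instruction width 4, the two calls made from
+    // create_cpu_compute_body() yield roughly
+    //     for (id = 0; id+4 <= node_count; id += 4) { ... }   main SIMD loop
+    //     for (; id < node_count; id += 1) { ... }            remainder loop
+    // with locals renamed to epilogue_* in the remainder loop.)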
+    function_statements.push_back(create_local_variable_statement(int_variables, INTEGER_TYPE));
+    function_statements.push_back(create_local_variable_statement(double_variables, FLOAT_TYPE));
+    function_statements.push_back(for_loop);
+}
+
void CodegenLLVMHelperVisitor::remove_inlined_nodes(ast::Program& node) {
    auto program_symtab = node.get_model_symbol_table();
    const auto& func_proc_nodes =
diff --git a/src/codegen/llvm/codegen_llvm_helper_visitor.hpp b/src/codegen/llvm/codegen_llvm_helper_visitor.hpp
index 21aff4a92d..a40d7923cc 100644
--- a/src/codegen/llvm/codegen_llvm_helper_visitor.hpp
+++ b/src/codegen/llvm/codegen_llvm_helper_visitor.hpp
@@ -16,6 +16,7 @@
 #include "ast/instance_struct.hpp"
 #include "codegen/codegen_info.hpp"
+#include "codegen/llvm/target_platform.hpp"
 #include "symtab/symbol_table.hpp"
 #include "utils/logger.hpp"
 #include "visitors/ast_visitor.hpp"
@@ -101,8 +102,8 @@ struct InstanceVarHelper {
 * these will be common across all backends.
 */
class CodegenLLVMHelperVisitor: public visitor::AstVisitor {
-    /// explicit vectorisation width
-    int vector_width;
+    /// target platform
+    Platform platform;

    /// newly generated code generation specific functions
    CodegenFunctionVector codegen_functions;
@@ -135,8 +136,8 @@ class CodegenLLVMHelperVisitor: public visitor::AstVisitor {
    static const std::string VOLTAGE_VAR;
    static const std::string NODE_INDEX_VAR;

-    CodegenLLVMHelperVisitor(int vector_width)
-        : vector_width(vector_width) {}
+    CodegenLLVMHelperVisitor(Platform& platform)
+        : platform(platform) {}

    const InstanceVarHelper& get_instance_var_helper() {
        return instance_var_helper;
@@ -161,7 +162,7 @@
                             ast::StatementVector& index_statements,
                             ast::StatementVector& body_statements);

-    void convert_to_instance_variable(ast::Node& node, std::string& index_var);
+    void convert_to_instance_variable(ast::Node& node, const std::string& index_var);

    void convert_local_statement(ast::StatementBlock& node);
    void rename_local_variables(ast::StatementBlock& node);
@@ -173,6 +174,23 @@
    void visit_function_block(ast::FunctionBlock& node) override;
    void visit_nrn_state_block(ast::NrnStateBlock& node) override;
    void visit_program(ast::Program& node) override;
+
+  private:
+    /// Methods to populate `function_statements` with necessary AST constructs to form
+    /// a kernel for a specific target.
+    void create_gpu_compute_body(ast::StatementVector& body,
+                                 ast::StatementVector& function_statements,
+                                 std::vector<std::string>& int_variables,
+                                 std::vector<std::string>& double_variables);
+    void create_cpu_compute_body(ast::StatementVector& body,
+                                 ast::StatementVector& function_statements,
+                                 std::vector<std::string>& int_variables,
+                                 std::vector<std::string>& double_variables);
+    void create_compute_body_loop(std::shared_ptr<ast::StatementBlock>& block,
+                                  ast::StatementVector& function_statements,
+                                  std::vector<std::string>& int_variables,
+                                  std::vector<std::string>& double_variables,
+                                  bool is_remainder_loop = false);
};

/** @} */  // end of llvm_codegen_details
diff --git a/src/codegen/llvm/codegen_llvm_visitor.cpp b/src/codegen/llvm/codegen_llvm_visitor.cpp
index 0fa81de691..2f677cfbec 100644
--- a/src/codegen/llvm/codegen_llvm_visitor.cpp
+++ b/src/codegen/llvm/codegen_llvm_visitor.cpp
@@ -815,12 +815,18 @@ void CodegenLLVMVisitor::visit_program(const ast::Program& node) {
    // - convert function and procedure blocks into CodegenFunctions
    // - gather information about AST. For now, information about functions
    // and procedures is used only.
-    CodegenLLVMHelperVisitor v{platform.get_instruction_width()};
+    CodegenLLVMHelperVisitor v{platform};
    const auto& functions = v.get_codegen_functions(node);
    instance_var_helper = v.get_instance_var_helper();
    sym_tab = node.get_symbol_table();
    std::string kernel_id = v.get_kernel_id();

+    // \todo: implement GPU codegen functionality.
+    if (platform.is_gpu()) {
+        logger->warn("GPU code generation is not supported yet, aborting!");
+        return;
+    }
+
    // Initialize the builder for this NMODL program.
    ir_builder.initialize(*sym_tab, kernel_id);

diff --git a/src/language/code_generator.cmake b/src/language/code_generator.cmake
index 17123fc833..72b2754b1a 100644
--- a/src/language/code_generator.cmake
+++ b/src/language/code_generator.cmake
@@ -71,6 +71,7 @@ set(AST_GENERATED_SOURCES
    ${PROJECT_BINARY_DIR}/src/ast/codegen_instance_var.hpp
    ${PROJECT_BINARY_DIR}/src/ast/codegen_return_statement.hpp
    ${PROJECT_BINARY_DIR}/src/ast/codegen_struct.hpp
+    ${PROJECT_BINARY_DIR}/src/ast/codegen_thread_id.hpp
    ${PROJECT_BINARY_DIR}/src/ast/codegen_var.hpp
    ${PROJECT_BINARY_DIR}/src/ast/codegen_var_list_statement.hpp
    ${PROJECT_BINARY_DIR}/src/ast/codegen_var_type.hpp
diff --git a/src/language/codegen.yaml b/src/language/codegen.yaml
index 30bae4c5c5..245010f054 100644
--- a/src/language/codegen.yaml
+++ b/src/language/codegen.yaml
@@ -286,3 +286,19 @@
        - rhs:
            brief: "Expression for atomic operation"
            type: Expression
+  - CodegenThreadId:
+      brief: "Represents a generic thread id expression for GPU code generation"
+      description: |
+        For GPU code generation, we use a special AST node to encode the thread
+        id calculation. In NMODL, this expression is usually of the form:
+        \code{.cpp}
+        id = blockIdx.x * blockDim.x + threadIdx.x
+        \endcode
+        To be able to support multiple GPU backends, we choose to have a custom AST
+        node. Therefore, the code generation for this node is kept very simple,
+        mapping the expression to target-specific GPU intrinsics.
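+        (For the NVPTX backend, for example, this is lowered to reads of the
+        special registers ctaid.x, ntid.x and tid.x.)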
+      nmodl: "GPU_ID "
+      members:
+        - name:
+            brief: "Name of the thread id variable"
+            type: Identifier
diff --git a/test/unit/codegen/codegen_llvm_ir.cpp b/test/unit/codegen/codegen_llvm_ir.cpp
index 34fcd8b0da..e723c850a8 100644
--- a/test/unit/codegen/codegen_llvm_ir.cpp
+++ b/test/unit/codegen/codegen_llvm_ir.cpp
@@ -68,14 +68,14 @@ std::string run_llvm_visitor(const std::string& text,

std::vector<std::shared_ptr<ast::Ast>> run_llvm_visitor_helper(
    const std::string& text,
-    int vector_width,
+    codegen::Platform& platform,
    const std::vector<ast::AstNodeType>& nodes_to_collect) {
    NmodlDriver driver;
    const auto& ast = driver.parse_string(text);

    SymtabVisitor().visit_program(*ast);
    SolveBlockVisitor().visit_program(*ast);
-    CodegenLLVMHelperVisitor(vector_width).visit_program(*ast);
+    CodegenLLVMHelperVisitor(platform).visit_program(*ast);

    const auto& nodes = collect_nodes(*ast, nodes_to_collect);

@@ -1228,8 +1228,9 @@ SCENARIO("Scalar derivative block", "[visitor][llvm][derivative]") {
        })";

        THEN("a single scalar loops is constructed") {
+            codegen::Platform default_platform;
            auto result = run_llvm_visitor_helper(nmodl_text,
-                                                  /*vector_width=*/1,
+                                                  default_platform,
                                                  {ast::AstNodeType::CODEGEN_FOR_STATEMENT});
            REQUIRE(result.size() == 1);

@@ -1279,8 +1280,9 @@ SCENARIO("Vectorised derivative block", "[visitor][llvm][derivative]") {

        THEN("vector and epilogue scalar loops are constructed") {
+            codegen::Platform simd_platform(/*use_single_precision=*/false, /*instruction_width=*/8);
            auto result = run_llvm_visitor_helper(nmodl_text,
-                                                  /*vector_width=*/8,
+                                                  simd_platform,
                                                  {ast::AstNodeType::CODEGEN_FOR_STATEMENT});
            REQUIRE(result.size() == 2);

@@ -1523,3 +1525,56 @@ SCENARIO("Removal of inlined functions and procedures", "[visitor][llvm][inline]
        }
    }
}
+
+//=============================================================================
+// Basic GPU kernel AST generation
+//=============================================================================
+
+SCENARIO("GPU kernel body", "[visitor][llvm][gpu]") {
+    GIVEN("For GPU platforms") {
+        std::string nmodl_text = R"(
+            NEURON {
+                SUFFIX test
+                RANGE x, y
+            }
+
+            ASSIGNED { x y }
+
+            STATE { m }
+
+            BREAKPOINT {
+                SOLVE states METHOD cnexp
+            }
+
+            DERIVATIVE states {
+                m = y + 2
+            }
+        )";
+
+
+        std::string expected_kernel = R"(
+            VOID nrn_state_test(INSTANCE_STRUCT *mech){
+                GPU_ID id
+                INTEGER node_id
+                DOUBLE v
+                IF (id<node_count) {
+                    node_id = mech->node_index[id]
+                    v = mech->voltage[node_id]
+                    mech->m[id] = mech->y[id]+2
+                }
+            })";
+
+        THEN("a kernel with thread id and if statement is created") {
+            std::string name = "default";
+            std::string math_library = "none";
+            codegen::Platform gpu_platform(codegen::PlatformID::GPU, name, math_library);
+            auto result = run_llvm_visitor_helper(nmodl_text,
+                                                  gpu_platform,
+                                                  {ast::AstNodeType::CODEGEN_FUNCTION});
+            REQUIRE(result.size() == 1);
+
+            auto kernel = reindent_text(to_nmodl(result[0]));
+            REQUIRE(kernel == reindent_text(expected_kernel));
+        }
+    }
+}

From b4943fd23bee31085b178433ccb3e18b73b0860e Mon Sep 17 00:00:00 2001
From: George Mitenkov
Date: Sun, 13 Mar 2022 00:45:11 +0100
Subject: [PATCH 195/331] fixed comments

---
 src/codegen/llvm/codegen_llvm_helper_visitor.cpp | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/codegen/llvm/codegen_llvm_helper_visitor.cpp b/src/codegen/llvm/codegen_llvm_helper_visitor.cpp
index 8de61f726b..be64784d33 100644
--- a/src/codegen/llvm/codegen_llvm_helper_visitor.cpp
+++ b/src/codegen/llvm/codegen_llvm_helper_visitor.cpp
@@ -707,7 +707,7 @@ void CodegenLLVMHelperVisitor::create_gpu_compute_body(ast::StatementVector& bod
                                                       ast::StatementVector& function_statements,
                                                       std::vector<std::string>& int_variables,
                                                       std::vector<std::string>& double_variables) {
-    // Then, create condition for thread id. For now - reuse the same functionality as for
+    // Then, create condition for thread id. For now reuse the functionality from `loop_count_expression`.
    auto kernel_block = std::make_shared<ast::StatementBlock>(body);
    const auto& condition = loop_count_expression(INDUCTION_VAR, NODECOUNT_VAR, 1);
    ast::ElseIfStatementVector else_ifs = {};
    auto if_statement = std::make_shared<ast::IfStatement>(condition, kernel_block, else_ifs, nullptr);
@@ -746,7 +746,7 @@ void CodegenLLVMHelperVisitor::create_compute_body_loop(std::shared_ptr<ast::Sta
    auto loop_block = (is_remainder_loop || !platform.is_cpu_with_simd()) ? block : std::shared_ptr<ast::StatementBlock>(block->clone());

-    // Convert local statement to use CodegenVar statements and create a FOR loop node. Also, if creating 
+    // Convert local statement to use CodegenVar statements and create a FOR loop node. Also, if creating
    // a remainder loop then rename variables to avoid conflicts.
    if (is_remainder_loop)
        rename_local_variables(*loop_block);

From 06deff97a129f7e3ba45933f316c3a084782a055 Mon Sep 17 00:00:00 2001
From: George Mitenkov
Date: Sun, 13 Mar 2022 13:04:26 +0100
Subject: [PATCH 196/331] Added code generation for thread id

---
 src/codegen/llvm/codegen_llvm_visitor.cpp | 12 +++++-------
 src/codegen/llvm/codegen_llvm_visitor.hpp |  1 +
 src/codegen/llvm/llvm_ir_builder.cpp | 22 ++++++++++++++++++++++
 src/codegen/llvm/llvm_ir_builder.hpp |  3 +++
 4 files changed, 31 insertions(+), 7 deletions(-)

diff --git a/src/codegen/llvm/codegen_llvm_visitor.cpp b/src/codegen/llvm/codegen_llvm_visitor.cpp
index 2f677cfbec..0bd233ecbb 100644
--- a/src/codegen/llvm/codegen_llvm_visitor.cpp
+++ b/src/codegen/llvm/codegen_llvm_visitor.cpp
@@ -39,7 +39,7 @@ static bool is_supported_statement(const ast::Statement& statement) {
    return statement.is_codegen_atomic_statement() || statement.is_codegen_for_statement() ||
           statement.is_if_statement() || statement.is_codegen_return_statement() ||
           statement.is_codegen_var_list_statement() || statement.is_expression_statement() ||
-           statement.is_while_statement();
+           statement.is_while_statement() || statement.is_codegen_thread_id();
}

/// A utility to check that the kernel body can be vectorised.
@@ -694,6 +694,10 @@ void CodegenLLVMVisitor::visit_codegen_return_statement(const ast::CodegenReturn
    ir_builder.create_return(ret_value);
}

+void CodegenLLVMVisitor::visit_codegen_thread_id(const ast::CodegenThreadId& node) {
+    ir_builder.create_thread_id();
+}
+
void CodegenLLVMVisitor::visit_codegen_var_list_statement(
    const ast::CodegenVarListStatement& node) {
    llvm::Type* scalar_type = get_codegen_var_type(*node.get_var_type());
@@ -821,12 +825,6 @@ void CodegenLLVMVisitor::visit_program(const ast::Program& node) {
    sym_tab = node.get_symbol_table();
    std::string kernel_id = v.get_kernel_id();

-    // \todo: implement GPU codegen functionality.
-    if (platform.is_gpu()) {
-        logger->warn("GPU code generation is not supported yet, aborting!");
-        return;
-    }
-
    // Initialize the builder for this NMODL program.
ir_builder.initialize(*sym_tab, kernel_id); diff --git a/src/codegen/llvm/codegen_llvm_visitor.hpp b/src/codegen/llvm/codegen_llvm_visitor.hpp index 396d8cbb67..6ff79a0ddb 100644 --- a/src/codegen/llvm/codegen_llvm_visitor.hpp +++ b/src/codegen/llvm/codegen_llvm_visitor.hpp @@ -138,6 +138,7 @@ class CodegenLLVMVisitor: public visitor::ConstAstVisitor { void visit_codegen_for_statement(const ast::CodegenForStatement& node) override; void visit_codegen_function(const ast::CodegenFunction& node) override; void visit_codegen_return_statement(const ast::CodegenReturnStatement& node) override; + void visit_codegen_thread_id(const ast::CodegenThreadId& node) override; void visit_codegen_var_list_statement(const ast::CodegenVarListStatement& node) override; void visit_double(const ast::Double& node) override; void visit_function_block(const ast::FunctionBlock& node) override; diff --git a/src/codegen/llvm/llvm_ir_builder.cpp b/src/codegen/llvm/llvm_ir_builder.cpp index e7a6a4a60b..b88e995771 100644 --- a/src/codegen/llvm/llvm_ir_builder.cpp +++ b/src/codegen/llvm/llvm_ir_builder.cpp @@ -10,6 +10,7 @@ #include "llvm/ADT/StringSwitch.h" #include "llvm/IR/Function.h" +#include "llvm/IR/IntrinsicsNVPTX.h" #include "llvm/IR/ValueSymbolTable.h" namespace nmodl { @@ -554,6 +555,27 @@ void IRBuilder::maybe_replicate_value(llvm::Value* value) { } } +void IRBuilder::create_thread_id() { + llvm::Value* alloca_ptr = create_alloca(kernel_id, get_i32_type()); + + llvm::Module* m = builder.GetInsertBlock()->getParent()->getParent(); + auto create_call = [&](llvm::Intrinsic::ID id) { + llvm::Function* intrinsic = llvm::Intrinsic::getDeclaration(m, id); + return builder.CreateCall(intrinsic, {}); + }; + + // For now, this function only supports NVPTX backend, however it can be easily + // adjusted to generate thread id variable for any other platform. + llvm::Value* block_id = create_call(llvm::Intrinsic::nvvm_read_ptx_sreg_ctaid_x); + llvm::Value* block_dim = create_call(llvm::Intrinsic::nvvm_read_ptx_sreg_ntid_x); + llvm::Value* tmp = builder.CreateMul(block_id, block_dim); + + llvm::Value* tid = create_call(llvm::Intrinsic::nvvm_read_ptx_sreg_tid_x); + llvm::Value* id = builder.CreateAdd(tmp, tid); + + builder.CreateStore(id, alloca_ptr); +} + /****************************************************************************************/ /* LLVM block utilities */ diff --git a/src/codegen/llvm/llvm_ir_builder.hpp b/src/codegen/llvm/llvm_ir_builder.hpp index cf9e7f936d..aa9c7ab1e3 100644 --- a/src/codegen/llvm/llvm_ir_builder.hpp +++ b/src/codegen/llvm/llvm_ir_builder.hpp @@ -230,6 +230,9 @@ class IRBuilder { void create_scalar_or_vector_alloca(const std::string& name, llvm::Type* element_or_scalar_type); + /// Creates a variable of the form: id = blockIdx.x * blockDim.x + threadIdx.x + void create_thread_id(); + /// Generates LLVM IR for the given unary operator. 
void create_unary_op(llvm::Value* value, ast::UnaryOp op); From b506574ea8078fcca6b31af72675fdb716ad5a5e Mon Sep 17 00:00:00 2001 From: George Mitenkov Date: Sun, 13 Mar 2022 13:23:10 +0100 Subject: [PATCH 197/331] Added kernel annotation generation --- src/codegen/llvm/codegen_llvm_visitor.cpp | 28 +++++++++++++++++++---- src/codegen/llvm/codegen_llvm_visitor.hpp | 3 +++ 2 files changed, 26 insertions(+), 5 deletions(-) diff --git a/src/codegen/llvm/codegen_llvm_visitor.cpp b/src/codegen/llvm/codegen_llvm_visitor.cpp index 0bd233ecbb..86fe5b5443 100644 --- a/src/codegen/llvm/codegen_llvm_visitor.cpp +++ b/src/codegen/llvm/codegen_llvm_visitor.cpp @@ -64,6 +64,16 @@ static bool can_vectorize(const ast::CodegenForStatement& statement, symtab::Sym return unsupported.empty() && supported.size() <= 1; } +void CodegenLLVMVisitor::annotate_kernel_with_nvvm(llvm::Function* kernel) { + llvm::Metadata* metadata[] = { + llvm::ValueAsMetadata::get(kernel), + llvm::MDString::get(*context, "kernel"), + llvm::ValueAsMetadata::get( + llvm::ConstantInt::get(llvm::Type::getInt32Ty(*context), 1))}; + llvm::MDNode* node = llvm::MDNode::get(*context, metadata); + module->getOrInsertNamedMetadata("nvvm.annotations")->addOperand(node); +} + #if LLVM_VERSION_MAJOR >= 13 void CodegenLLVMVisitor::add_vectorizable_functions_from_vec_lib(llvm::TargetLibraryInfoImpl& tli, llvm::Triple& triple) { @@ -665,11 +675,19 @@ void CodegenLLVMVisitor::visit_codegen_function(const ast::CodegenFunction& node ir_builder.allocate_function_arguments(func, arguments); // Process function or procedure body. If the function is a compute kernel, enable - // vectorization. If so, the return statement is handled in a separate visitor. - if (platform.is_cpu_with_simd() && is_kernel_function(name)) { - ir_builder.generate_vector_ir(); - block->accept(*this); - ir_builder.generate_scalar_ir(); + // vectorization or add NVVM annotations. If this is the case, the return statement is + // handled in a separate visitor. + if (is_kernel_function(name)) { + if (platform.is_cpu_with_simd()) { + ir_builder.generate_vector_ir(); + block->accept(*this); + ir_builder.generate_scalar_ir(); + } else if (platform.is_gpu()) { + block->accept(*this); + annotate_kernel_with_nvvm(func); + } else { // scalar + block->accept(*this); + } } else { block->accept(*this); } diff --git a/src/codegen/llvm/codegen_llvm_visitor.hpp b/src/codegen/llvm/codegen_llvm_visitor.hpp index 6ff79a0ddb..67a3a6fab6 100644 --- a/src/codegen/llvm/codegen_llvm_visitor.hpp +++ b/src/codegen/llvm/codegen_llvm_visitor.hpp @@ -157,6 +157,9 @@ class CodegenLLVMVisitor: public visitor::ConstAstVisitor { void wrap_kernel_functions(); private: + // Annotates kernel function with NVVM metadata. + void annotate_kernel_with_nvvm(llvm::Function* kernel); + #if LLVM_VERSION_MAJOR >= 13 /// Populates target library info with the vector library definitions. 
void add_vectorizable_functions_from_vec_lib(llvm::TargetLibraryInfoImpl& tli,

From d097cf02502ad0ab3089d4a6fcbbf778ccbf01f4 Mon Sep 17 00:00:00 2001
From: George Mitenkov
Date: Sun, 13 Mar 2022 13:54:10 +0100
Subject: [PATCH 198/331] Added tests for annotations/intrinsics

---
 test/unit/codegen/codegen_llvm_ir.cpp | 74 +++++++++++++++++++++++++++
 1 file changed, 74 insertions(+)

diff --git a/test/unit/codegen/codegen_llvm_ir.cpp b/test/unit/codegen/codegen_llvm_ir.cpp
index e723c850a8..4d71a5a276 100644
--- a/test/unit/codegen/codegen_llvm_ir.cpp
+++ b/test/unit/codegen/codegen_llvm_ir.cpp
@@ -34,6 +34,32 @@ using nmodl::parser::NmodlDriver;
// Utility to get LLVM module as a string
//=============================================================================

+std::string run_gpu_llvm_visitor(const std::string& text,
+                                 int opt_level = 0,
+                                 bool use_single_precision = false,
+                                 std::string math_library = "none",
+                                 bool nmodl_inline = false) {
+    NmodlDriver driver;
+    const auto& ast = driver.parse_string(text);
+
+    SymtabVisitor().visit_program(*ast);
+    if (nmodl_inline) {
+        InlineVisitor().visit_program(*ast);
+    }
+    NeuronSolveVisitor().visit_program(*ast);
+    SolveBlockVisitor().visit_program(*ast);
+
+    codegen::Platform gpu_platform(codegen::PlatformID::GPU, /*name=*/"nvidia",
+                                   math_library, use_single_precision, 1);
+    codegen::CodegenLLVMVisitor llvm_visitor(
+        /*mod_filename=*/"unknown",
+        /*output_dir=*/".", gpu_platform, opt_level,
+        /*add_debug_information=*/false);
+
+    llvm_visitor.visit_program(*ast);
+    return llvm_visitor.dump_module();
+}
+
std::string run_llvm_visitor(const std::string& text,
                             int opt_level = 0,
                             bool use_single_precision = false,
@@ -1578,3 +1604,51 @@ SCENARIO("GPU kernel body", "[visitor][llvm][gpu]") {
        }
    }
}
+
+//=============================================================================
+// Basic NVVM/LLVM IR generation for GPU platforms
+//=============================================================================
+
+SCENARIO("GPU kernel body IR generation", "[visitor][llvm][gpu]") {
+    GIVEN("For GPU platforms") {
+        std::string nmodl_text = R"(
+            NEURON {
+                SUFFIX test
+                RANGE x, y
+            }
+
+            ASSIGNED { x y }
+
+            STATE { m }
+
+            BREAKPOINT {
+                SOLVE states METHOD cnexp
+            }
+
+            DERIVATIVE states {
+                m = y + 2
+            }
+        )";
+
+        THEN("kernel annotations are added and thread id intrinsics generated") {
+            std::string module_string = run_gpu_llvm_visitor(nmodl_text,
+                                                             /*opt_level=*/0,
+                                                             /*use_single_precision=*/false);
+            std::smatch m;
+
+            // Check kernel annotations are correctly created.
+            std::regex annotations(R"(!nvvm\.annotations = !\{!0\})");
+            std::regex kernel_data(R"(!0 = !\{void \(%.*__instance_var__type\*\)\* @nrn_state_.*, !\"kernel\", i32 1\})");
+            REQUIRE(std::regex_search(module_string, m, annotations));
+            REQUIRE(std::regex_search(module_string, m, kernel_data));
+
+            // Check thread/block id/dim intrinsics are created.
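+            // (These correspond to CUDA's blockIdx.x, blockDim.x and
+            // threadIdx.x, respectively.)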
+            std::regex block_id(R"(call i32 @llvm\.nvvm\.read\.ptx\.sreg\.ctaid\.x\(\))");
+            std::regex block_dim(R"(call i32 @llvm\.nvvm\.read\.ptx\.sreg\.ntid\.x\(\))");
+            std::regex tid(R"(call i32 @llvm\.nvvm\.read\.ptx\.sreg\.tid\.x\(\))");
+            REQUIRE(std::regex_search(module_string, m, block_id));
+            REQUIRE(std::regex_search(module_string, m, block_dim));
+            REQUIRE(std::regex_search(module_string, m, tid));
+        }
+    }
+}

From 98184f159aae3475886faed00d8ac3226f0e79a7 Mon Sep 17 00:00:00 2001
From: George Mitenkov
Date: Tue, 15 Mar 2022 12:46:11 +0100
Subject: [PATCH 199/331] [LLVM][refactoring] Added platform abstraction (#818)

* This commit introduces a handy `Platform` class that is designed
to incorporate target information for IR generation, such as precision,
vectorization width (if applicable), type of target (CPU/GPU), etc.

* In future, more functionality can be added to `Platform`, e.g. we can
move functionality of handling `llvm::Target`, math SIMD libraries, etc.

* Note: this is just a very basic implementation that enables easier
integration of GPU code generation.
---
 src/codegen/llvm/CMakeLists.txt |  4 +-
 src/codegen/llvm/codegen_llvm_visitor.cpp | 20 ++--
 src/codegen/llvm/codegen_llvm_visitor.hpp | 25 ++---
 src/codegen/llvm/llvm_ir_builder.cpp | 29 +++---
 src/codegen/llvm/llvm_ir_builder.hpp | 23 ++---
 src/codegen/llvm/main.cpp |  5 +-
 src/codegen/llvm/target_platform.cpp | 54 +++++++++++
 src/codegen/llvm/target_platform.hpp | 92 +++++++++++++++++++
 src/main.cpp | 24 +++--
 test/unit/codegen/codegen_llvm_execution.cpp | 40 +++++---
 .../codegen/codegen_llvm_instance_struct.cpp |  6 +-
 test/unit/codegen/codegen_llvm_ir.cpp | 14 ++-
 12 files changed, 246 insertions(+), 90 deletions(-)
 create mode 100644 src/codegen/llvm/target_platform.cpp
 create mode 100644 src/codegen/llvm/target_platform.hpp

diff --git a/src/codegen/llvm/CMakeLists.txt b/src/codegen/llvm/CMakeLists.txt
index 5c7eadc91c..198d90c1a3 100644
--- a/src/codegen/llvm/CMakeLists.txt
+++ b/src/codegen/llvm/CMakeLists.txt
@@ -11,7 +11,9 @@ set(LLVM_CODEGEN_SOURCE_FILES
    ${CMAKE_CURRENT_SOURCE_DIR}/llvm_ir_builder.cpp
    ${CMAKE_CURRENT_SOURCE_DIR}/llvm_ir_builder.hpp
    ${CMAKE_CURRENT_SOURCE_DIR}/llvm_utils.cpp
-    ${CMAKE_CURRENT_SOURCE_DIR}/llvm_utils.hpp)
+    ${CMAKE_CURRENT_SOURCE_DIR}/llvm_utils.hpp
+    ${CMAKE_CURRENT_SOURCE_DIR}/target_platform.cpp
+    ${CMAKE_CURRENT_SOURCE_DIR}/target_platform.hpp)

# =============================================================================
# LLVM codegen library and executable
diff --git a/src/codegen/llvm/codegen_llvm_visitor.cpp b/src/codegen/llvm/codegen_llvm_visitor.cpp
index bac6f4e0b2..0fa81de691 100644
--- a/src/codegen/llvm/codegen_llvm_visitor.cpp
+++ b/src/codegen/llvm/codegen_llvm_visitor.cpp
@@ -68,7 +68,7 @@
void CodegenLLVMVisitor::add_vectorizable_functions_from_vec_lib(llvm::TargetLibraryInfoImpl& tli,
                                                                 llvm::Triple& triple) {
    // Since LLVM does not support SLEEF as a vector library yet, process it separately.
- if (vector_library == "SLEEF") { + if (platform.get_math_library() == "SLEEF") { // clang-format off #define FIXED(w) llvm::ElementCount::getFixed(w) // clang-format on @@ -112,9 +112,9 @@ void CodegenLLVMVisitor::add_vectorizable_functions_from_vec_lib(llvm::TargetLib {"MASSV", VecLib::MASSV}, {"none", VecLib::NoLibrary}, {"SVML", VecLib::SVML}}; - const auto& library = llvm_supported_vector_libraries.find(vector_library); + const auto& library = llvm_supported_vector_libraries.find(platform.get_math_library()); if (library == llvm_supported_vector_libraries.end()) - throw std::runtime_error("Error: unknown vector library - " + vector_library + "\n"); + throw std::runtime_error("Error: unknown vector library - " + platform.get_math_library() + "\n"); // Add vectorizable functions to the target library info. switch (library->second) { @@ -542,7 +542,7 @@ void CodegenLLVMVisitor::visit_boolean(const ast::Boolean& node) { * \todo support this properly. */ void CodegenLLVMVisitor::visit_codegen_atomic_statement(const ast::CodegenAtomicStatement& node) { - if (vector_width > 1) + if (platform.is_cpu_with_simd()) logger->warn("Atomic operations are not supported"); // Support only assignment for now. @@ -555,7 +555,7 @@ void CodegenLLVMVisitor::visit_codegen_atomic_statement(const ast::CodegenAtomic throw std::runtime_error("Error: only 'VarName' assignment is supported\n"); // Process the assignment as if it was non-atomic. - if (vector_width > 1) + if (platform.is_cpu_with_simd()) logger->warn("Treating write as non-atomic"); write_to_variable(*var, rhs); } @@ -625,7 +625,7 @@ void CodegenLLVMVisitor::visit_codegen_for_statement(const ast::CodegenForStatem ir_builder.set_insertion_point(for_body); // If not processing remainder of the loop, start vectorization. - if (vector_width > 1 && main_loop_initialization) + if (platform.is_cpu_with_simd() && main_loop_initialization) ir_builder.generate_vector_ir(); // Generate code for the loop body and create the basic block for the increment. @@ -666,7 +666,7 @@ void CodegenLLVMVisitor::visit_codegen_function(const ast::CodegenFunction& node // Process function or procedure body. If the function is a compute kernel, enable // vectorization. If so, the return statement is handled in a separate visitor. - if (vector_width > 1 && is_kernel_function(name)) { + if (platform.is_cpu_with_simd() && is_kernel_function(name)) { ir_builder.generate_vector_ir(); block->accept(*this); ir_builder.generate_scalar_ir(); @@ -740,7 +740,7 @@ void CodegenLLVMVisitor::visit_function_call(const ast::FunctionCall& node) { void CodegenLLVMVisitor::visit_if_statement(const ast::IfStatement& node) { // If vectorizing the compute kernel with control flow, process it separately. - if (vector_width > 1 && ir_builder.vectorizing()) { + if (platform.is_cpu_with_simd() && ir_builder.vectorizing()) { create_vectorized_control_flow_block(node); return; } @@ -815,7 +815,7 @@ void CodegenLLVMVisitor::visit_program(const ast::Program& node) { // - convert function and procedure blocks into CodegenFunctions // - gather information about AST. For now, information about functions // and procedures is used only. 
-    CodegenLLVMHelperVisitor v{vector_width};
+    CodegenLLVMHelperVisitor v{platform.get_instruction_width()};
    const auto& functions = v.get_codegen_functions(node);
    instance_var_helper = v.get_instance_var_helper();
    sym_tab = node.get_symbol_table();
    std::string kernel_id = v.get_kernel_id();
@@ -864,7 +864,7 @@
    }

    // Optionally, replace LLVM math intrinsics with vector library calls.
-    if (vector_width > 1) {
+    if (platform.is_cpu_with_simd()) {
#if LLVM_VERSION_MAJOR < 13
        logger->warn(
            "This version of LLVM does not support replacement of LLVM intrinsics with vector "
diff --git a/src/codegen/llvm/codegen_llvm_visitor.hpp b/src/codegen/llvm/codegen_llvm_visitor.hpp
index 22b9fafd83..396d8cbb67 100644
--- a/src/codegen/llvm/codegen_llvm_visitor.hpp
+++ b/src/codegen/llvm/codegen_llvm_visitor.hpp
@@ -81,33 +81,22 @@ class CodegenLLVMVisitor: public visitor::ConstAstVisitor {
    /// Optimisation level for LLVM IR transformations.
    int opt_level_ir;

-    /// Vector library used for math functions.
-    std::string vector_library;
-
-    /// Explicit vectorisation width.
-    int vector_width;
+    /// Target platform for the code generation.
+    Platform platform;

  public:
    CodegenLLVMVisitor(const std::string& mod_filename,
                       const std::string& output_dir,
+                       Platform& platform,
                       int opt_level_ir,
-                       bool use_single_precision = false,
-                       int vector_width = 1,
-                       std::string vec_lib = "none",
                       bool add_debug_information = false,
-                       std::vector<std::string> fast_math_flags = {},
-                       bool llvm_assume_alias = false)
+                       std::vector<std::string> fast_math_flags = {})
        : mod_filename(mod_filename)
        , output_dir(output_dir)
+        , platform(platform)
        , opt_level_ir(opt_level_ir)
-        , vector_width(vector_width)
-        , vector_library(vec_lib)
        , add_debug_information(add_debug_information)
-        , ir_builder(*context,
-                     use_single_precision,
-                     vector_width,
-                     fast_math_flags,
-                     !llvm_assume_alias)
+        , ir_builder(*context, platform, fast_math_flags)
        , debug_builder(*module) {}

    /// Dumps the generated LLVM IR module to string.
@@ -139,7 +128,7 @@

    /// Returns vector width
    int get_vector_width() const {
-        return vector_width;
+        return platform.get_instruction_width();
    }

    // Visitors.
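A minimal usage sketch of the refactored interface, not part of the patch
itself (the mod file name and option values below are arbitrary, and the
two-argument `Platform` convenience constructor is the one added by this
patch):

```
codegen::Platform simd_cpu(/*use_single_precision=*/false,
                           /*instruction_width=*/8);
codegen::CodegenLLVMVisitor visitor(/*mod_filename=*/"hh",
                                    /*output_dir=*/".",
                                    simd_cpu,
                                    /*opt_level_ir=*/2);
```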
diff --git a/src/codegen/llvm/llvm_ir_builder.cpp b/src/codegen/llvm/llvm_ir_builder.cpp index 1015b437f3..e7a6a4a60b 100644 --- a/src/codegen/llvm/llvm_ir_builder.cpp +++ b/src/codegen/llvm/llvm_ir_builder.cpp @@ -41,13 +41,13 @@ llvm::Type* IRBuilder::get_i64_type() { } llvm::Type* IRBuilder::get_fp_type() { - if (fp_precision == single_precision) + if (platform.is_single_precision()) return llvm::Type::getFloatTy(builder.getContext()); return llvm::Type::getDoubleTy(builder.getContext()); } llvm::Type* IRBuilder::get_fp_ptr_type() { - if (fp_precision == single_precision) + if (platform.is_single_precision()) return llvm::Type::getFloatPtrTy(builder.getContext()); return llvm::Type::getDoublePtrTy(builder.getContext()); } @@ -92,7 +92,7 @@ llvm::Value* IRBuilder::pop_last_value() { /****************************************************************************************/ void IRBuilder::create_boolean_constant(int value) { - if (vector_width > 1 && vectorize) { + if (platform.is_cpu_with_simd() && vectorize) { value_stack.push_back(get_vector_constant(get_boolean_type(), value)); } else { value_stack.push_back(get_scalar_constant(get_boolean_type(), value)); @@ -100,7 +100,7 @@ void IRBuilder::create_boolean_constant(int value) { } void IRBuilder::create_fp_constant(const std::string& value) { - if (vector_width > 1 && vectorize) { + if (platform.is_cpu_with_simd() && vectorize) { value_stack.push_back(get_vector_constant(get_fp_type(), value)); } else { value_stack.push_back(get_scalar_constant(get_fp_type(), value)); @@ -112,7 +112,7 @@ llvm::Value* IRBuilder::create_global_string(const ast::String& node) { } void IRBuilder::create_i32_constant(int value) { - if (vector_width > 1 && vectorize) { + if (platform.is_cpu_with_simd() && vectorize) { value_stack.push_back(get_vector_constant(get_i32_type(), value)); } else { value_stack.push_back(get_scalar_constant(get_i32_type(), value)); @@ -126,6 +126,8 @@ llvm::Value* IRBuilder::get_scalar_constant(llvm::Type* type, V value) { template llvm::Value* IRBuilder::get_vector_constant(llvm::Type* type, V value) { + int vector_width = platform.get_instruction_width(); + ConstantVector constants; for (unsigned i = 0; i < vector_width; ++i) { const auto& element = C::get(type, value); @@ -206,9 +208,7 @@ void IRBuilder::set_kernel_attributes() { // > The `noalias` attribute indicates that the only memory accesses inside function are loads // > and stores from objects pointed to by its pointer-typed arguments, with arbitrary // > offsets. - if (assume_noalias) { - current_function->addParamAttr(0, llvm::Attribute::NoAlias); - } + current_function->addParamAttr(0, llvm::Attribute::NoAlias); // Finally, specify that the struct pointer does not capture and is read-only. current_function->addParamAttr(0, llvm::Attribute::NoCapture); @@ -227,7 +227,7 @@ void IRBuilder::set_loop_metadata(llvm::BranchInst* branch) { loop_metadata.push_back(nullptr); // If `vector_width` is 1, explicitly disable vectorization for benchmarking purposes. 
- if (vector_width == 1) { + if (platform.is_cpu() && platform.get_instruction_width() == 1) { llvm::MDString* name = llvm::MDString::get(context, "llvm.loop.vectorize.enable"); llvm::Value* false_value = llvm::ConstantInt::get(get_boolean_type(), 0); llvm::ValueAsMetadata* value = llvm::ValueAsMetadata::get(false_value); @@ -376,6 +376,7 @@ llvm::Value* IRBuilder::create_index(llvm::Value* value) { const auto& element_type = llvm::cast(vector_type->getElementType()); if (element_type->getBitWidth() == i64_type->getIntegerBitWidth()) return value; + int vector_width = platform.get_instruction_width(); return builder.CreateSExtOrTrunc(value, llvm::FixedVectorType::get(i64_type, vector_width)); } @@ -449,7 +450,8 @@ void IRBuilder::create_scalar_or_vector_alloca(const std::string& name, // Even if generating vectorised code, some variables still need to be scalar. Particularly, the // induction variable "id" and remainder loop variables (that start with "epilogue" prefix). llvm::Type* type; - if (vector_width > 1 && vectorize && name != kernel_id && name.rfind("epilogue", 0)) { + if (platform.is_cpu_with_simd() && vectorize && name != kernel_id && name.rfind("epilogue", 0)) { + int vector_width = platform.get_instruction_width(); type = llvm::FixedVectorType::get(element_or_scalar_type, vector_width); } else { type = element_or_scalar_type; @@ -495,7 +497,7 @@ llvm::Value* IRBuilder::load_to_or_store_from_array(const std::string& id_name, llvm::Value* element_ptr = create_inbounds_gep(array, id_value); // Find out if the vector code is generated. - bool generating_vector_ir = vector_width > 1 && vectorize; + bool generating_vector_ir = platform.is_cpu_with_simd() && vectorize; // If the vector code is generated, we need to distinguish between two cases. If the array is // indexed indirectly (i.e. not by an induction variable `kernel_id`), create gather/scatter @@ -523,7 +525,7 @@ llvm::Value* IRBuilder::load_to_or_store_from_array(const std::string& id_name, // to a vector pointer llvm::Type* vector_type = llvm::PointerType::get( llvm::FixedVectorType::get(element_ptr->getType()->getPointerElementType(), - vector_width), + platform.get_instruction_width()), /*AddressSpace=*/0); ptr = builder.CreateBitCast(element_ptr, vector_type); } else { @@ -541,11 +543,12 @@ llvm::Value* IRBuilder::load_to_or_store_from_array(const std::string& id_name, void IRBuilder::maybe_replicate_value(llvm::Value* value) { // If the value should not be vectorised, or it is already a vector, add it to the stack. - if (!vectorize || vector_width == 1 || value->getType()->isVectorTy()) { + if (!vectorize || !platform.is_cpu_with_simd() || value->getType()->isVectorTy()) { value_stack.push_back(value); } else { // Otherwise, we generate vectorized code inside the loop, so replicate the value to form a // vector. + int vector_width = platform.get_instruction_width(); llvm::Value* vector_value = builder.CreateVectorSplat(vector_width, value); value_stack.push_back(vector_value); } diff --git a/src/codegen/llvm/llvm_ir_builder.hpp b/src/codegen/llvm/llvm_ir_builder.hpp index b3005db0c7..cf9e7f936d 100644 --- a/src/codegen/llvm/llvm_ir_builder.hpp +++ b/src/codegen/llvm/llvm_ir_builder.hpp @@ -10,6 +10,7 @@ #include #include "codegen/llvm/codegen_llvm_helper_visitor.hpp" +#include "codegen/llvm/target_platform.hpp" #include "symtab/symbol_table.hpp" #include "llvm/IR/IRBuilder.h" @@ -52,14 +53,8 @@ class IRBuilder { /// Flag to indicate that the generated IR should be vectorized. 
bool vectorize; - /// Precision of the floating-point numbers (32 or 64 bit). - unsigned fp_precision; - - /// The vector width used for the vectorized code. - unsigned vector_width; - - /// Instance struct fields do not alias. - bool assume_noalias; + /// Target platform for which IR is built. + Platform platform; /// Masked value used to predicate vector instructions. llvm::Value* mask; @@ -72,21 +67,17 @@ class IRBuilder { public: IRBuilder(llvm::LLVMContext& context, - bool use_single_precision = false, - unsigned vector_width = 1, - std::vector fast_math_flags = {}, - bool assume_noalias = true) + Platform& platform, + std::vector fast_math_flags = {}) : builder(context) + , platform(platform) , symbol_table(nullptr) , current_function(nullptr) , vectorize(false) , alloca_ip(nullptr) - , fp_precision(use_single_precision ? single_precision : double_precision) - , vector_width(vector_width) , mask(nullptr) , kernel_id("") - , fast_math_flags(fast_math_flags) - , assume_noalias(assume_noalias) {} + , fast_math_flags(fast_math_flags) {} /// Transforms the fast math flags provided to the builder into LLVM's representation. llvm::FastMathFlags transform_to_fmf(std::vector& flags) { diff --git a/src/codegen/llvm/main.cpp b/src/codegen/llvm/main.cpp index 6d374999c3..92d8a486c1 100644 --- a/src/codegen/llvm/main.cpp +++ b/src/codegen/llvm/main.cpp @@ -47,8 +47,11 @@ int main(int argc, const char* argv[]) { logger->info("Running Symtab Visitor"); visitor::SymtabVisitor().visit_program(*ast); + // Use default platform for this toy example. + codegen::Platform platform; + logger->info("Running LLVM Visitor"); - codegen::CodegenLLVMVisitor llvm_visitor(filename, /*output_dir=*/".", /*opt_level_ir=*/0); + codegen::CodegenLLVMVisitor llvm_visitor(filename, /*output_dir=*/".", platform, /*opt_level_ir=*/0); llvm_visitor.visit_program(*ast); std::unique_ptr module = llvm_visitor.get_module(); diff --git a/src/codegen/llvm/target_platform.cpp b/src/codegen/llvm/target_platform.cpp new file mode 100644 index 0000000000..6cb8c7bb2b --- /dev/null +++ b/src/codegen/llvm/target_platform.cpp @@ -0,0 +1,54 @@ +/************************************************************************* + * Copyright (C) 2018-2020 Blue Brain Project + * + * This file is part of NMODL distributed under the terms of the GNU + * Lesser General Public License. See top-level LICENSE file for details. + *************************************************************************/ + +#include "codegen/llvm/target_platform.hpp" + +namespace nmodl { +namespace codegen { + +const std::string Platform::DEFAULT_PLATFORM_NAME = "default"; +const std::string Platform::DEFAULT_MATH_LIBRARY = "none"; + +bool Platform::is_default_platform() { + // Default platform is a CPU. + return platform_id == PlatformID::CPU && name == Platform::DEFAULT_PLATFORM_NAME; +} + +bool Platform::is_cpu() { + return platform_id == PlatformID::CPU; +} + +bool Platform::is_cpu_with_simd() { + return platform_id == PlatformID::CPU && instruction_width > 1; +} + +bool Platform::is_gpu() { + return platform_id == PlatformID::GPU; +} + +bool Platform::is_single_precision() { + return use_single_precision; +} + +std::string Platform::get_name() const { + return name; +} + +std::string Platform::get_math_library() const { + return math_library; +} + +int Platform::get_instruction_width() const { + return instruction_width; +} + +int Platform::get_precision() const { + return use_single_precision? 
32 : 64; +} + +} // namespace codegen +} // namespace nmodl diff --git a/src/codegen/llvm/target_platform.hpp b/src/codegen/llvm/target_platform.hpp new file mode 100644 index 0000000000..2eabbb1a4b --- /dev/null +++ b/src/codegen/llvm/target_platform.hpp @@ -0,0 +1,92 @@ +/************************************************************************* + * Copyright (C) 2018-2020 Blue Brain Project + * + * This file is part of NMODL distributed under the terms of the GNU + * Lesser General Public License. See top-level LICENSE file for details. + *************************************************************************/ + +#pragma once + +#include + +namespace nmodl { +namespace codegen { + +enum PlatformID { + CPU, + GPU +}; + +/** + * \class Platform + * \brief A class that represents the target platform. It is needed to + * reduce the amount of code passed to LLVM visitor and its helpers. + */ +class Platform { + public: + /// Default name of the target and math library. + static const std::string DEFAULT_PLATFORM_NAME; + static const std::string DEFAULT_MATH_LIBRARY; + + private: + /// Name of the platform. + const std::string name = Platform::DEFAULT_PLATFORM_NAME; + + /// Target-specific id to compare platforms easily. + PlatformID platform_id; + + /// User-provided width that is used to construct LLVM instructions + // and types. + int instruction_width = 1; + + /// Use single-precision floating-point types. + bool use_single_precision = false; + + /// A name of user-provided math library. + std::string math_library = Platform::DEFAULT_MATH_LIBRARY; + + public: + Platform(PlatformID platform_id, + const std::string& name, + std::string& math_library, + bool use_single_precision = false, + int instruction_width = 1) + : platform_id(platform_id) + , name(name) + , math_library(math_library) + , use_single_precision(use_single_precision) + , instruction_width(instruction_width) {} + + Platform(bool use_single_precision, + int instruction_width) + : platform_id(PlatformID::CPU) + , use_single_precision(use_single_precision) + , instruction_width(instruction_width) {} + + Platform() : platform_id(PlatformID::CPU) {} + + /// Checks if this platform is a default platform. + bool is_default_platform(); + + /// Checks if this platform is a CPU. + bool is_cpu(); + + /// Checks if this platform is a CPU with SIMD support. + bool is_cpu_with_simd(); + + /// Checks if this platform is a GPU. + bool is_gpu(); + + bool is_single_precision(); + + std::string get_name() const; + + std::string get_math_library() const; + + int get_instruction_width() const; + + int get_precision() const; +}; + +} // namespace codegen +} // namespace nmodl diff --git a/src/main.cpp b/src/main.cpp index 140e4c77d0..f2678fcb48 100644 --- a/src/main.cpp +++ b/src/main.cpp @@ -696,21 +696,29 @@ int main(int argc, const char* argv[]) { // information and not in LLVM visitor. int llvm_opt_level = llvm_benchmark ? 0 : llvm_opt_level_ir; - if (llvm_gpu_name != "default") { - logger->warn("GPU code generation is not supported, targeting " - "CPU instead"); - } + // Create platform abstraction. + PlatformID pid = llvm_gpu_name == "default" ? PlatformID::CPU + : PlatformID::GPU; + const std::string name = + llvm_gpu_name == "default" ? 
llvm_cpu_name : llvm_gpu_name; + Platform platform(pid, name, llvm_math_library, llvm_float_type, + llvm_vector_width); logger->info("Running LLVM backend code generator"); - CodegenLLVMVisitor visitor(modfile, output_dir, llvm_opt_level, - llvm_float_type, llvm_vector_width, - llvm_math_library, !llvm_no_debug, - llvm_fast_math_flags, true); + CodegenLLVMVisitor visitor(modfile, output_dir, platform, + llvm_opt_level, !llvm_no_debug, + llvm_fast_math_flags); visitor.visit_program(*ast); ast_to_nmodl(*ast, filepath("llvm", "mod")); ast_to_json(*ast, filepath("llvm", "json")); if (llvm_benchmark) { + // \todo integrate Platform class here + if (llvm_gpu_name != "default") { + logger->warn("GPU benchmarking is not supported, targeting " + "CPU instead"); + } + logger->info("Running LLVM benchmark"); benchmark::LLVMBenchmark benchmark( visitor, modfile, output_dir, shared_lib_paths, diff --git a/test/unit/codegen/codegen_llvm_execution.cpp b/test/unit/codegen/codegen_llvm_execution.cpp index 41605ecbd3..4c9515f814 100644 --- a/test/unit/codegen/codegen_llvm_execution.cpp +++ b/test/unit/codegen/codegen_llvm_execution.cpp @@ -124,8 +124,12 @@ SCENARIO("Arithmetic expression", "[llvm][runner]") { const auto& ast = driver.parse_string(nmodl_text); SymtabVisitor().visit_program(*ast); + + codegen::Platform cpu_platform(/*use_single_precision=*/false, + /*instruction_width=*/1); codegen::CodegenLLVMVisitor llvm_visitor(/*mod_filename=*/"unknown", /*output_dir=*/".", + cpu_platform, /*opt_level_ir=*/0); llvm_visitor.visit_program(*ast); @@ -226,8 +230,12 @@ SCENARIO("Optimised arithmetic expression", "[llvm][runner]") { const auto& ast = driver.parse_string(nmodl_text); SymtabVisitor().visit_program(*ast); + + codegen::Platform cpu_platform(/*use_single_precision=*/false, + /*instruction_width=*/1); codegen::CodegenLLVMVisitor llvm_visitor(/*mod_filename=*/"unknown", /*output_dir=*/".", + cpu_platform, /*opt_level_ir=*/3); llvm_visitor.visit_program(*ast); @@ -299,11 +307,13 @@ SCENARIO("Simple scalar kernel", "[llvm][runner]") { SymtabVisitor().visit_program(*ast); NeuronSolveVisitor().visit_program(*ast); SolveBlockVisitor().visit_program(*ast); + + codegen::Platform cpu_platform(/*use_single_precision=*/false, + /*instruction_width=*/1); codegen::CodegenLLVMVisitor llvm_visitor(/*mod_filename=*/"unknown", /*output_dir=*/".", - /*opt_level_ir=*/0, - /*use_single_precision=*/false, - /*vector_width=*/1); + cpu_platform, + /*opt_level_ir=*/0); llvm_visitor.visit_program(*ast); llvm_visitor.wrap_kernel_functions(); @@ -381,11 +391,13 @@ SCENARIO("Simple vectorised kernel", "[llvm][runner]") { SymtabVisitor().visit_program(*ast); NeuronSolveVisitor().visit_program(*ast); SolveBlockVisitor().visit_program(*ast); + + codegen::Platform simd_cpu_platform(/*use_single_precision=*/false, + /*instruction_width=*/4); codegen::CodegenLLVMVisitor llvm_visitor(/*mod_filename=*/"unknown", /*output_dir=*/".", - /*opt_level_ir=*/3, - /*use_single_precision=*/false, - /*vector_width=*/4); + simd_cpu_platform, + /*opt_level_ir=*/3); llvm_visitor.visit_program(*ast); llvm_visitor.wrap_kernel_functions(); @@ -463,11 +475,13 @@ SCENARIO("Vectorised kernel with scatter instruction", "[llvm][runner]") { SymtabVisitor().visit_program(*ast); NeuronSolveVisitor().visit_program(*ast); SolveBlockVisitor().visit_program(*ast); + + codegen::Platform simd_cpu_platform(/*use_single_precision=*/false, + /*instruction_width=*/2); codegen::CodegenLLVMVisitor llvm_visitor(/*mod_filename=*/"unknown", /*output_dir=*/".", - 
/*opt_level_ir=*/0, - /*use_single_precision=*/false, - /*vector_width=*/2); + simd_cpu_platform, + /*opt_level_ir=*/0); llvm_visitor.visit_program(*ast); llvm_visitor.wrap_kernel_functions(); diff --git a/test/unit/codegen/codegen_llvm_instance_struct.cpp b/test/unit/codegen/codegen_llvm_instance_struct.cpp index 6042aecfc8..fbb07dfbcd 100644 --- a/test/unit/codegen/codegen_llvm_instance_struct.cpp +++ b/test/unit/codegen/codegen_llvm_instance_struct.cpp @@ -39,11 +39,11 @@ codegen::CodegenInstanceData generate_instance_data(const std::string& text, SymtabVisitor().visit_program(*ast); NeuronSolveVisitor().visit_program(*ast); + codegen::Platform cpu_platform(use_single_precision, vector_width); codegen::CodegenLLVMVisitor llvm_visitor(/*mod_filename=*/"test", /*output_dir=*/".", - opt_level, - use_single_precision, - vector_width); + cpu_platform, + opt_level); llvm_visitor.visit_program(*ast); llvm_visitor.dump_module(); const auto& generated_instance_struct = llvm_visitor.get_instance_struct_ptr(); diff --git a/test/unit/codegen/codegen_llvm_ir.cpp b/test/unit/codegen/codegen_llvm_ir.cpp index 23f6977aea..34fcd8b0da 100644 --- a/test/unit/codegen/codegen_llvm_ir.cpp +++ b/test/unit/codegen/codegen_llvm_ir.cpp @@ -51,14 +51,12 @@ std::string run_llvm_visitor(const std::string& text, NeuronSolveVisitor().visit_program(*ast); SolveBlockVisitor().visit_program(*ast); - codegen::CodegenLLVMVisitor llvm_visitor(/*mod_filename=*/"unknown", - /*output_dir=*/".", - opt_level, - use_single_precision, - vector_width, - vec_lib, - /*add_debug_information=*/false, - fast_math_flags); + codegen::Platform cpu_platform(codegen::PlatformID::CPU, /*name=*/"default", + vec_lib, use_single_precision, vector_width); + codegen::CodegenLLVMVisitor llvm_visitor( + /*mod_filename=*/"unknown", + /*output_dir=*/".", cpu_platform, opt_level, + /*add_debug_information=*/false, fast_math_flags); llvm_visitor.visit_program(*ast); return llvm_visitor.dump_module(); From ad583562fa520236832991d4b2fdad00f6404cad Mon Sep 17 00:00:00 2001 From: George Mitenkov Date: Sun, 13 Mar 2022 00:34:41 +0100 Subject: [PATCH 200/331] [LLVM][GPU] Added GPU-specific AST transformations This commit adds a new AST node: `CodegenThreadId` that represents the thread id used in GPU computation. Thanks to the new platform class abstraction, the code that generates the compute body of the NEURON block was adapted to support the AST transformations needed for GPU. 
Example of the transformation: ``` GPU_ID id INTEGER node_id DOUBLE v IF (id<node_count) { node_id = mech->node_index[id] v = mech->voltage[node_id] mech->m[id] = mech->y[id]+2 } ``` --- .../llvm/codegen_llvm_helper_visitor.cpp | 208 +++++++++--------- .../llvm/codegen_llvm_helper_visitor.hpp | 28 ++- src/codegen/llvm/codegen_llvm_visitor.cpp | 8 +- src/language/code_generator.cmake | 1 + src/language/codegen.yaml | 16 ++ test/unit/codegen/codegen_llvm_ir.cpp | 63 +++++- 6 files changed, 212 insertions(+), 112 deletions(-) diff --git a/src/codegen/llvm/codegen_llvm_helper_visitor.cpp b/src/codegen/llvm/codegen_llvm_helper_visitor.cpp index 654afd8ef5..8de61f726b 100644 --- a/src/codegen/llvm/codegen_llvm_helper_visitor.cpp +++ b/src/codegen/llvm/codegen_llvm_helper_visitor.cpp @@ -443,7 +443,7 @@ void CodegenLLVMHelperVisitor::ion_write_statements(BlockType type, * @param node Ast node under which variables to be converted to instance type */ void CodegenLLVMHelperVisitor::convert_to_instance_variable(ast::Node& node, - std::string& index_var) { + const std::string& index_var) { /// collect all variables in the node of type ast::VarName auto variables = collect_nodes(node, {ast::AstNodeType::VAR_NAME}); for (const auto& v: variables) { @@ -612,35 +612,29 @@ void CodegenLLVMHelperVisitor::visit_nrn_state_block(ast::NrnStateBlock& node) { /// statements for new function to be generated ast::StatementVector function_statements; - /// create variable definition for loop index and insert at the beginning - std::string loop_index_var = "id"; - std::vector induction_variables{"id"}; - function_statements.push_back( - create_local_variable_statement(induction_variables, INTEGER_TYPE)); - /// create vectors of local variables that would be used in compute part std::vector int_variables{"node_id"}; std::vector double_variables{"v"}; - /// create now main compute part : for loop over channel instances + /// create now main compute part - /// loop body : initialization + solve blocks + /// compute body : initialization + solve blocks ast::StatementVector def_statements; ast::StatementVector index_statements; ast::StatementVector body_statements; { /// access node index and corresponding voltage - loop_index_statements.push_back( + index_statements.push_back( visitor::create_statement("node_id = node_index[{}]"_format(INDUCTION_VAR))); - loop_body_statements.push_back( + body_statements.push_back( visitor::create_statement("v = {}[node_id]"_format(VOLTAGE_VAR))); /// read ion variables ion_read_statements(BlockType::State, int_variables, double_variables, - loop_index_statements, - loop_body_statements); + index_statements, + body_statements); /// main compute node : extract solution expressions from the derivative block const auto& solutions = collect_nodes(node, {ast::AstNodeType::SOLUTION_EXPRESSION}); @@ -648,109 +642,41 @@ void CodegenLLVMHelperVisitor::visit_nrn_state_block(ast::NrnStateBlock& node) { const auto& solution = std::dynamic_pointer_cast(statement); const auto& block = std::dynamic_pointer_cast( solution->get_node_to_solve()); - append_statements_from_block(loop_body_statements, block); + append_statements_from_block(body_statements, block); } /// add breakpoint block if no current if (info.currents.empty() && info.breakpoint_node != nullptr) { auto block = info.breakpoint_node->get_statement_block(); - 
append_statements_from_block(loop_body_statements, block); + append_statements_from_block(body_statements, block); } /// write ion statements ion_write_statements(BlockType::State, int_variables, double_variables, - loop_index_statements, - loop_body_statements); + index_statements, + body_statements); // \todo handle process_shadow_update_statement and wrote_conc_call yet } - ast::StatementVector loop_body; - loop_body.insert(loop_body.end(), loop_def_statements.begin(), loop_def_statements.end()); - loop_body.insert(loop_body.end(), loop_index_statements.begin(), loop_index_statements.end()); - loop_body.insert(loop_body.end(), loop_body_statements.begin(), loop_body_statements.end()); - - /// now construct a new code block which will become the body of the loop - auto loop_block = std::make_shared(loop_body); - - /// declare main FOR loop local variables - function_statements.push_back(create_local_variable_statement(int_variables, INTEGER_TYPE)); - function_statements.push_back(create_local_variable_statement(double_variables, FLOAT_TYPE)); - - /// main loop possibly vectorized on vector_width - { - /// loop constructs : initialization, condition and increment - const auto& initialization = int_initialization_expression(INDUCTION_VAR); - const auto& condition = loop_count_expression(INDUCTION_VAR, NODECOUNT_VAR, vector_width); - const auto& increment = loop_increment_expression(INDUCTION_VAR, vector_width); - - /// clone it - auto local_loop_block = std::shared_ptr(loop_block->clone()); - - /// convert local statement to codegenvar statement - convert_local_statement(*local_loop_block); - - auto for_loop_statement_main = std::make_shared(initialization, - condition, - increment, - local_loop_block); - - /// convert all variables inside loop body to instance variables - convert_to_instance_variable(*for_loop_statement_main, loop_index_var); - - /// loop itself becomes one of the statement in the function - function_statements.push_back(for_loop_statement_main); - } - - /// vectors containing renamed FOR loop local variables - std::vector renamed_int_variables; - std::vector renamed_double_variables; - - /// remainder loop possibly vectorized on vector_width - if (vector_width > 1) { - /// loop constructs : initialization, condition and increment - const auto& condition = - loop_count_expression(INDUCTION_VAR, NODECOUNT_VAR, /*vector_width=*/1); - const auto& increment = loop_increment_expression(INDUCTION_VAR, /*vector_width=*/1); - - /// rename local variables to avoid conflict with main loop - rename_local_variables(*loop_block); - - /// convert local statement to codegenvar statement - convert_local_statement(*loop_block); - - auto for_loop_statement_remainder = - std::make_shared(nullptr, condition, increment, loop_block); - - const auto& loop_statements = for_loop_statement_remainder->get_statement_block(); - // \todo: Change RenameVisitor to take a vector of names to which it would append a single - // prefix. 
- for (const auto& name: int_variables) { - std::string new_name = epilogue_variable_prefix + name; - renamed_int_variables.push_back(new_name); - visitor::RenameVisitor v(name, new_name); - loop_statements->accept(v); - } - for (const auto& name: double_variables) { - std::string new_name = epilogue_variable_prefix + name; - renamed_double_variables.push_back(new_name); - visitor::RenameVisitor v(name, epilogue_variable_prefix + name); - loop_statements->accept(v); - } + /// create target-specific compute body + ast::StatementVector compute_body; + compute_body.insert(compute_body.end(), def_statements.begin(), def_statements.end()); + compute_body.insert(compute_body.end(), index_statements.begin(), index_statements.end()); + compute_body.insert(compute_body.end(), body_statements.begin(), body_statements.end()); - /// declare remainder FOR loop local variables - function_statements.push_back( - create_local_variable_statement(renamed_int_variables, INTEGER_TYPE)); + if (platform.is_gpu()) { + const auto& id_statement = std::make_shared(create_varname(INDUCTION_VAR)); + function_statements.push_back(id_statement); + create_gpu_compute_body(compute_body, function_statements, int_variables, double_variables); + } else { + // Create induction variable + std::vector induction_variables{INDUCTION_VAR}; function_statements.push_back( - create_local_variable_statement(renamed_double_variables, FLOAT_TYPE)); - - /// convert all variables inside loop body to instance variables - convert_to_instance_variable(*for_loop_statement_remainder, loop_index_var); - - /// loop itself becomes one of the statement in the function - function_statements.push_back(for_loop_statement_remainder); + create_local_variable_statement(induction_variables, INTEGER_TYPE)); + create_cpu_compute_body(compute_body, function_statements, int_variables, double_variables); } /// new block for the function @@ -777,6 +703,84 @@ void CodegenLLVMHelperVisitor::visit_nrn_state_block(ast::NrnStateBlock& node) { std::cout << nmodl::to_nmodl(function) << std::endl; } +void CodegenLLVMHelperVisitor::create_gpu_compute_body(ast::StatementVector& body, + ast::StatementVector& function_statements, + std::vector& int_variables, + std::vector& double_variables) { + // Then, create condition for thread id. For now - reuse the same functionality as for + auto kernel_block = std::make_shared(body); + const auto& condition = loop_count_expression(INDUCTION_VAR, NODECOUNT_VAR, 1); + ast::ElseIfStatementVector else_ifs = {}; + auto if_statement = std::make_shared(condition, kernel_block, else_ifs, nullptr); + + convert_to_instance_variable(*if_statement, INDUCTION_VAR); + + // Push variables and the loop to the function statements vector. 
+ function_statements.push_back(create_local_variable_statement(int_variables, INTEGER_TYPE)); + function_statements.push_back(create_local_variable_statement(double_variables, FLOAT_TYPE)); + function_statements.push_back(if_statement); +} + +void CodegenLLVMHelperVisitor::create_cpu_compute_body(ast::StatementVector& body, + ast::StatementVector& function_statements, + std::vector& int_variables, + std::vector& double_variables) { + auto loop_block = std::make_shared(body); + create_compute_body_loop(loop_block, function_statements, int_variables, double_variables); + if (platform.is_cpu_with_simd()) + create_compute_body_loop(loop_block, function_statements, int_variables, double_variables, /*is_remainder_loop=*/true); +} + +void CodegenLLVMHelperVisitor::create_compute_body_loop(std::shared_ptr& block, + ast::StatementVector& function_statements, + std::vector& int_variables, + std::vector& double_variables, + bool is_remainder_loop) { + // First, check if we are creating a main or remainder loop. If it is a remainder loop, then + // no initialization is needed and instruction width is simply 1. + int width = is_remainder_loop ? 1 : platform.get_instruction_width(); + const auto& initialization = is_remainder_loop ? nullptr : int_initialization_expression(INDUCTION_VAR); + const auto& condition = loop_count_expression(INDUCTION_VAR, NODECOUNT_VAR, width); + const auto& increment = loop_increment_expression(INDUCTION_VAR, width); + + // Clone the statement block if needed since it can be used by the remainder loop. + auto loop_block = (is_remainder_loop || !platform.is_cpu_with_simd()) ? block : std::shared_ptr(block->clone()); + + // Convert local statement to use CodegenVar statements and create a FOR loop node. Also, if creating + // a remainder loop then rename variables to avoid conflicts. + if (is_remainder_loop) + rename_local_variables(*loop_block); + convert_local_statement(*loop_block); + auto for_loop = std::make_shared(initialization, + condition, + increment, + loop_block); + + // Convert all variables inside loop body to be instance variables. + convert_to_instance_variable(*for_loop, INDUCTION_VAR); + + // Rename variables if processing remainder loop. + if (is_remainder_loop) { + const auto& loop_statements = for_loop->get_statement_block(); + auto rename = [&](std::vector& vars) { + for (int i = 0; i < vars.size(); ++i) { + std::string old_name = vars[i]; + std::string new_name = epilogue_variable_prefix + vars[i]; + vars[i] = new_name; + visitor::RenameVisitor v(old_name, new_name); + loop_statements->accept(v); + } + }; + rename(int_variables); + rename(double_variables); + } + + // Push variables and the loop to the function statements vector. 
+ function_statements.push_back(create_local_variable_statement(int_variables, INTEGER_TYPE)); + function_statements.push_back(create_local_variable_statement(double_variables, FLOAT_TYPE)); + function_statements.push_back(for_loop); +} + void CodegenLLVMHelperVisitor::remove_inlined_nodes(ast::Program& node) { auto program_symtab = node.get_model_symbol_table(); const auto& func_proc_nodes = diff --git a/src/codegen/llvm/codegen_llvm_helper_visitor.hpp b/src/codegen/llvm/codegen_llvm_helper_visitor.hpp index 21aff4a92d..a40d7923cc 100644 --- a/src/codegen/llvm/codegen_llvm_helper_visitor.hpp +++ b/src/codegen/llvm/codegen_llvm_helper_visitor.hpp @@ -16,6 +16,7 @@ #include "ast/instance_struct.hpp" #include "codegen/codegen_info.hpp" +#include "codegen/llvm/target_platform.hpp" #include "symtab/symbol_table.hpp" #include "utils/logger.hpp" #include "visitors/ast_visitor.hpp" @@ -101,8 +102,8 @@ struct InstanceVarHelper { * these will be common across all backends. */ class CodegenLLVMHelperVisitor: public visitor::AstVisitor { - /// explicit vectorisation width - int vector_width; + /// target platform + Platform platform; /// newly generated code generation specific functions CodegenFunctionVector codegen_functions; @@ -135,8 +136,8 @@ class CodegenLLVMHelperVisitor: public visitor::AstVisitor { static const std::string VOLTAGE_VAR; static const std::string NODE_INDEX_VAR; - CodegenLLVMHelperVisitor(int vector_width) - : vector_width(vector_width) {} + CodegenLLVMHelperVisitor(Platform& platform) + : platform(platform) {} const InstanceVarHelper& get_instance_var_helper() { return instance_var_helper; @@ -161,7 +162,7 @@ class CodegenLLVMHelperVisitor: public visitor::AstVisitor { ast::StatementVector& index_statements, ast::StatementVector& body_statements); - void convert_to_instance_variable(ast::Node& node, std::string& index_var); + void convert_to_instance_variable(ast::Node& node, const std::string& index_var); void convert_local_statement(ast::StatementBlock& node); void rename_local_variables(ast::StatementBlock& node); @@ -173,6 +174,23 @@ class CodegenLLVMHelperVisitor: public visitor::AstVisitor { void visit_function_block(ast::FunctionBlock& node) override; void visit_nrn_state_block(ast::NrnStateBlock& node) override; void visit_program(ast::Program& node) override; + + private: + /// Methods to populate`function_statements` with necessary AST constructs to form + /// a kernel for a specific target. + void create_gpu_compute_body(ast::StatementVector& body, + ast::StatementVector& function_statements, + std::vector& int_variables, + std::vector& double_variables); + void create_cpu_compute_body(ast::StatementVector& body, + ast::StatementVector& function_statements, + std::vector& int_variables, + std::vector& double_variables); + void create_compute_body_loop(std::shared_ptr& block, + ast::StatementVector& function_statements, + std::vector& int_variables, + std::vector& double_variables, + bool is_remainder_loop = false); }; /** @} */ // end of llvm_codegen_details diff --git a/src/codegen/llvm/codegen_llvm_visitor.cpp b/src/codegen/llvm/codegen_llvm_visitor.cpp index 0fa81de691..2f677cfbec 100644 --- a/src/codegen/llvm/codegen_llvm_visitor.cpp +++ b/src/codegen/llvm/codegen_llvm_visitor.cpp @@ -815,12 +815,18 @@ void CodegenLLVMVisitor::visit_program(const ast::Program& node) { // - convert function and procedure blocks into CodegenFunctions // - gather information about AST. For now, information about functions // and procedures is used only. 
- CodegenLLVMHelperVisitor v{platform.get_instruction_width()}; + CodegenLLVMHelperVisitor v{platform}; const auto& functions = v.get_codegen_functions(node); instance_var_helper = v.get_instance_var_helper(); sym_tab = node.get_symbol_table(); std::string kernel_id = v.get_kernel_id(); + // \todo: implement GPU codegen functionality. + if (platform.is_gpu()) { + logger->warn("GPU code generation is not supported yet, aborting!"); + return; + } + // Initialize the builder for this NMODL program. ir_builder.initialize(*sym_tab, kernel_id); diff --git a/src/language/code_generator.cmake b/src/language/code_generator.cmake index 17123fc833..72b2754b1a 100644 --- a/src/language/code_generator.cmake +++ b/src/language/code_generator.cmake @@ -71,6 +71,7 @@ set(AST_GENERATED_SOURCES ${PROJECT_BINARY_DIR}/src/ast/codegen_instance_var.hpp ${PROJECT_BINARY_DIR}/src/ast/codegen_return_statement.hpp ${PROJECT_BINARY_DIR}/src/ast/codegen_struct.hpp + ${PROJECT_BINARY_DIR}/src/ast/codegen_thread_id.hpp ${PROJECT_BINARY_DIR}/src/ast/codegen_var.hpp ${PROJECT_BINARY_DIR}/src/ast/codegen_var_list_statement.hpp ${PROJECT_BINARY_DIR}/src/ast/codegen_var_type.hpp diff --git a/src/language/codegen.yaml b/src/language/codegen.yaml index 30bae4c5c5..245010f054 100644 --- a/src/language/codegen.yaml +++ b/src/language/codegen.yaml @@ -286,3 +286,19 @@ - rhs: brief: "Expression for atomic operation" type: Expression + - CodegenThreadId: + brief: "Represents a generic thread id expression for GPU code generation" + description: | + For GPU code generation, we use a special AST node to encode the thread + id calculation. In NMODL, this expression is usually of the form: + \code{.cpp} + id = blockId.x * blockDim.x + threadId.x + \endcode + To be able to support multiple GPU backends, we choose to have a custom AST + node. Therefore, the code generation for this node is kept very simple, + mapping the expression to target-specific GPU intrinsics. 
+ nmodl: "GPU_ID " + members: + - name: + brief: "Name of the thread id variable" + type: Identifier diff --git a/test/unit/codegen/codegen_llvm_ir.cpp b/test/unit/codegen/codegen_llvm_ir.cpp index 34fcd8b0da..e723c850a8 100644 --- a/test/unit/codegen/codegen_llvm_ir.cpp +++ b/test/unit/codegen/codegen_llvm_ir.cpp @@ -68,14 +68,14 @@ std::string run_llvm_visitor(const std::string& text, std::vector> run_llvm_visitor_helper( const std::string& text, - int vector_width, + codegen::Platform& platform, const std::vector& nodes_to_collect) { NmodlDriver driver; const auto& ast = driver.parse_string(text); SymtabVisitor().visit_program(*ast); SolveBlockVisitor().visit_program(*ast); - CodegenLLVMHelperVisitor(vector_width).visit_program(*ast); + CodegenLLVMHelperVisitor(platform).visit_program(*ast); const auto& nodes = collect_nodes(*ast, nodes_to_collect); @@ -1228,8 +1228,9 @@ SCENARIO("Scalar derivative block", "[visitor][llvm][derivative]") { })"; THEN("a single scalar loop is constructed") { + codegen::Platform default_platform; auto result = run_llvm_visitor_helper(nmodl_text, - /*vector_width=*/1, + default_platform, {ast::AstNodeType::CODEGEN_FOR_STATEMENT}); REQUIRE(result.size() == 1); @@ -1279,8 +1280,9 @@ SCENARIO("Vectorised derivative block", "[visitor][llvm][derivative]") { THEN("vector and epilogue scalar loops are constructed") { + codegen::Platform simd_platform(/*use_single_precision=*/false, /*instruction_width=*/8); auto result = run_llvm_visitor_helper(nmodl_text, - /*vector_width=*/8, + simd_platform, {ast::AstNodeType::CODEGEN_FOR_STATEMENT}); REQUIRE(result.size() == 2); @@ -1523,3 +1525,56 @@ SCENARIO("Removal of inlined functions and procedures", "[visitor][llvm][inline] } } } + +//============================================================================= +// Basic GPU kernel AST generation +//============================================================================= + +SCENARIO("GPU kernel body", "[visitor][llvm][gpu]") { + GIVEN("For GPU platforms") { + std::string nmodl_text = R"( + NEURON { + SUFFIX test + RANGE x, y + } + + ASSIGNED { x y } + + STATE { m } + + BREAKPOINT { + SOLVE states METHOD cnexp + } + + DERIVATIVE states { + m = y + 2 + } + )"; + + + std::string expected_kernel = R"( + VOID nrn_state_test(INSTANCE_STRUCT *mech){ + GPU_ID id + INTEGER node_id + DOUBLE v + IF (id<node_count) { + node_id = mech->node_index[id] + v = mech->voltage[node_id] + mech->m[id] = mech->y[id]+2 + } + })"; + + THEN("a kernel with thread id and if statement is created") { + std::string name = "default"; + std::string math_library = "none"; + codegen::Platform gpu_platform(codegen::PlatformID::GPU, name, math_library); + auto result = run_llvm_visitor_helper(nmodl_text, + gpu_platform, + {ast::AstNodeType::CODEGEN_FUNCTION}); + REQUIRE(result.size() == 1); + + auto kernel = reindent_text(to_nmodl(result[0])); + REQUIRE(kernel == reindent_text(expected_kernel)); + } + } +} From 0530417523f82b78a8ca60a6d99d06aa46b2a3e2 Mon Sep 17 00:00:00 2001 From: George Mitenkov Date: Sun, 13 Mar 2022 00:45:11 +0100 Subject: [PATCH 201/331] fixed comments --- src/codegen/llvm/codegen_llvm_helper_visitor.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/codegen/llvm/codegen_llvm_helper_visitor.cpp b/src/codegen/llvm/codegen_llvm_helper_visitor.cpp index 8de61f726b..be64784d33 100644 --- a/src/codegen/llvm/codegen_llvm_helper_visitor.cpp +++ b/src/codegen/llvm/codegen_llvm_helper_visitor.cpp @@ -707,7 +707,7 @@ void 
CodegenLLVMHelperVisitor::create_gpu_compute_body(ast::StatementVector& bod ast::StatementVector& function_statements, std::vector& int_variables, std::vector& double_variables) { - // Then, create condition for thread id. For now - reuse the same functionality as for + // Then, create condition for thread id. For now reuse the functionality from `loop_count_expression`. auto kernel_block = std::make_shared(body); const auto& condition = loop_count_expression(INDUCTION_VAR, NODECOUNT_VAR, 1); ast::ElseIfStatementVector else_ifs = {}; @@ -746,7 +746,7 @@ void CodegenLLVMHelperVisitor::create_compute_body_loop(std::shared_ptr(block->clone()); - // Convert local statement to use CodegenVar statements and create a FOR loop node. Also, if creating + // Convert local statement to use CodegenVar statements and create a FOR loop node. Also, if creating // a remainder loop then rename variables to avoid conflicts. if (is_remainder_loop) rename_local_variables(*loop_block); From 1d4117ac4353714cc89ae056690a733b16214ee9 Mon Sep 17 00:00:00 2001 From: Ioannis Magkanaris Date: Tue, 15 Mar 2022 23:27:16 +0100 Subject: [PATCH 202/331] Initial work for gpu runner --- test/benchmark/jit_driver.cpp | 58 +++++++++++++++++++++++++++++++++++ test/benchmark/jit_driver.hpp | 37 ++++++++++++++++++++++ 2 files changed, 95 insertions(+) diff --git a/test/benchmark/jit_driver.cpp b/test/benchmark/jit_driver.cpp index f91b41cda0..ff7f87f164 100644 --- a/test/benchmark/jit_driver.cpp +++ b/test/benchmark/jit_driver.cpp @@ -201,5 +201,63 @@ void JITDriver::init(const std::string& cpu, BenchmarkInfo* benchmark_info) { llvm::orc::DumpObjects(benchmark_info->output_dir, benchmark_info->filename)); } } + +DeviceInfo get_device_info() { + DeviceInfo device_info; + checkCudaErrors(cuDeviceGetCount(&device_info.count)); + checkCudaErrors(cuDeviceGet(&device, 0)); + char name[128]; + checkCudaErrors(cuDeviceGetName(name, 128, device)); + device_info.name = std::string(name); + int devMajor, devMinor; + checkCudaErrors(cuDeviceGetAttribute(&device_info.compute_version_major, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, device)); + checkCudaErrors(cuDeviceGetAttribute(&device_info.compute_version_major, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, device)); + if (devMajor < 2) { + throw std::runtime_error("ERROR: Device 0 is not SM 2.0 or greater"); + } +} + + +void GPUJITDriver::init(const std::string& gpu, BenchmarkInfo* benchmark_info) { + // CUDA initialization + checkCudaErrors(cuInit(0)); + device_info = get_device_info(); + + // Save the LLVM IR module to string + std::string kernel_llvm_ir; + llvm::raw_string_ostream os(kernel_llvm_ir); + os << *module; + os.flush(); + + // Create NVVM program object + nvvmCreateProgram(&prog); + + // Add custom IR to program + nvvmAddModuleToProgram(prog, kernel_llvm_ir, kernel_llvm_ir.size(), "nmodl_llvm_ir"); + + // Declare compile options + const char *options[] = { "-ftz=1" }; + + // Compile the program + nvvmCompileProgram(prog, 1, options); + + // Get compiled module + char* compiled_module; + size_t compiled_module_size; + nvvmGetCompiledResultSize(prog, &compiled_module_size); + std::cout << "Compiled module size: " << compiled_module_size << "\n"; + compiled_module = (char*)malloc(compiled_module_size); + nvvmGetCompiledResult(prog, compiled_module); + + // Create driver context + checkCudaErrors(cuCtxCreate(&context, 0, device)); + + // Create module for object + checkCudaErrors(cuModuleLoadDataEx(&cudaModule, kernel_llvm_ir.c_str(), 0, 0, 0)); + + // Get kernel function + 
checkCudaErrors(cuModuleGetFunction(&function, cudaModule, "kernel")); +} + } // namespace runner } // namespace nmodl diff --git a/test/benchmark/jit_driver.hpp b/test/benchmark/jit_driver.hpp index 7106311523..dc8e740e50 100644 --- a/test/benchmark/jit_driver.hpp +++ b/test/benchmark/jit_driver.hpp @@ -19,6 +19,10 @@ #include "llvm/ExecutionEngine/Orc/LLJIT.h" #include "llvm/Support/Host.h" +#ifdef NMODL_LLVM_CUDA_BACKEND +#include "cuda.h" +#endif + namespace nmodl { namespace runner { @@ -94,6 +98,39 @@ class JITDriver { } }; +#ifdef NMODL_LLVM_CUDA_BACKEND +void checkCudaErrors(CUresult err) { + assert(err == CUDA_SUCCESS); +} + +class DeviceInfo { + int count; + std::string name; + int compute_version_major; + int compute_version_minor; +} + +class GPUJITDriver: public JITDriver { + nvvmProgram prog; + CUdevice device; + CUmodule cudaModule; + CUcontext context; + CUfunction function; + CUlinkState linker; + DeviceInfo device_info; + + /// Gets available GPU device information + DeviceInfo get_device_info(); + + public: + explicit GPUJITDriver(std::unique_ptr m) + : JITDriver(std::move(m)) {} + + /// Initializes the CUDA GPU JIT driver. + void init(const std::string& cpu, BenchmarkInfo* benchmark_info = nullptr); +}; +#endif + /** * \class BaseRunner * \brief A base runner class that provides functionality to execute an From d0d10515058aff023653634186015f4a7b83ef34 Mon Sep 17 00:00:00 2001 From: Ioannis Magkanaris Date: Wed, 16 Mar 2022 17:01:41 +0100 Subject: [PATCH 203/331] Compile module and load it in the GPUJITDriver --- CMakeLists.txt | 8 ++++-- test/benchmark/CMakeLists.txt | 3 +++ test/benchmark/jit_driver.cpp | 50 ++++++++++++++++++++++------------- test/benchmark/jit_driver.hpp | 14 +++++----- test/unit/CMakeLists.txt | 1 - 5 files changed, 46 insertions(+), 30 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 6c5aae4a7e..b15448b052 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -5,7 +5,7 @@ # See top-level LICENSE file for details. 
# ============================================================================= -cmake_minimum_required(VERSION 3.17 FATAL_ERROR) +cmake_minimum_required(VERSION 3.18 FATAL_ERROR) project(NMODL LANGUAGES CXX) @@ -165,6 +165,10 @@ if(NMODL_ENABLE_LLVM) if(NMODL_ENABLE_LLVM_CUDA) enable_language(CUDA) find_package(CUDAToolkit) + set(CUDA_NVVM_INCLUDE_DIR ${CUDAToolkit_LIBRARY_ROOT}/nvvm/include) + set(CUDA_NVVM_LIBRARY_DIR ${CUDAToolkit_LIBRARY_ROOT}/nvvm/lib64) + set(NMODL_CUDA_INCLUDE_DIRECTORIES ${CUDAToolkit_INCLUDE_DIRS} ${CUDA_NVVM_INCLUDE_DIR}) + include_directories(${NMODL_CUDA_INCLUDE_DIRECTORIES}) add_definitions(-DNMODL_LLVM_CUDA_BACKEND) endif() endif() @@ -276,7 +280,7 @@ endif() message(STATUS "LLVM CUDA Codegen | ${NMODL_ENABLE_LLVM_CUDA}") if(NMODL_ENABLE_LLVM_CUDA) message(STATUS " CUDA VERSION | ${CUDAToolkit_VERSION}") - message(STATUS " INCLUDE | ${CUDAToolkit_INCLUDE_DIRS}") + message(STATUS " INCLUDE | ${NMODL_CUDA_INCLUDE_DIRECTORIES}") message(STATUS " LIBRARY | ${CUDAToolkit_LIBRARY_DIR}") endif() if(NMODL_CLANG_FORMAT) diff --git a/test/benchmark/CMakeLists.txt b/test/benchmark/CMakeLists.txt index 4441d53251..684ca0c58c 100644 --- a/test/benchmark/CMakeLists.txt +++ b/test/benchmark/CMakeLists.txt @@ -11,6 +11,9 @@ set(LLVM_BENCHMARK_SOURCE_FILES include_directories(${LLVM_INCLUDE_DIRS}) add_library(llvm_benchmark STATIC ${LLVM_BENCHMARK_SOURCE_FILES}) add_dependencies(llvm_benchmark lexer util visitor) +if(NMODL_ENABLE_LLVM_CUDA) + target_link_libraries(llvm_benchmark PRIVATE CUDA::cudart CUDA::nvrtc ${CUDA_NVVM_LIBRARY_DIR}/libnvvm.so) +endif() if(NMODL_ENABLE_JIT_EVENT_LISTENERS) target_compile_definitions(llvm_benchmark PUBLIC NMODL_HAVE_JIT_EVENT_LISTENERS) diff --git a/test/benchmark/jit_driver.cpp b/test/benchmark/jit_driver.cpp index ff7f87f164..78028e72ea 100644 --- a/test/benchmark/jit_driver.cpp +++ b/test/benchmark/jit_driver.cpp @@ -201,27 +201,38 @@ void JITDriver::init(const std::string& cpu, BenchmarkInfo* benchmark_info) { llvm::orc::DumpObjects(benchmark_info->output_dir, benchmark_info->filename)); } } +#ifdef NMODL_LLVM_CUDA_BACKEND +void checkCudaErrors(CUresult err) { + if (err != CUDA_SUCCESS) { + const char *ret = NULL; + cuGetErrorName(err, &ret); + throw std::runtime_error("CUDA error: " + std::string(ret)); + } +} -DeviceInfo get_device_info() { - DeviceInfo device_info; +void checkNVVMErrors(nvvmResult err) { + if (err != NVVM_SUCCESS) { + throw std::runtime_error("NVVM Error: " + std::string(nvvmGetErrorString(err))); + } +} +void GPUJITDriver::init(const std::string& gpu, BenchmarkInfo* benchmark_info) { + // CUDA initialization + checkCudaErrors(cuInit(0)); checkCudaErrors(cuDeviceGetCount(&device_info.count)); checkCudaErrors(cuDeviceGet(&device, 0)); + char name[128]; checkCudaErrors(cuDeviceGetName(name, 128, device)); - device_info.name = std::string(name); - int devMajor, devMinor; + device_info.name = name; + std::cout << "Using CUDA Device [0]: " << device_info.name << "\n"; + checkCudaErrors(cuDeviceGetAttribute(&device_info.compute_version_major, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, device)); - checkCudaErrors(cuDeviceGetAttribute(&device_info.compute_version_major, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, device)); - if (devMajor < 2) { + checkCudaErrors(cuDeviceGetAttribute(&device_info.compute_version_minor, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, device)); + std::cout << "Device Compute Capability: " + << device_info.compute_version_major << "." 
<< device_info.compute_version_minor << "\n"; + if (device_info.compute_version_major < 2) { throw std::runtime_error("ERROR: Device 0 is not SM 2.0 or greater"); } -} - - -void GPUJITDriver::init(const std::string& gpu, BenchmarkInfo* benchmark_info) { - // CUDA initialization - checkCudaErrors(cuInit(0)); - device_info = get_device_info(); // Save the LLVM IR module to string std::string kernel_llvm_ir; @@ -233,10 +244,10 @@ void GPUJITDriver::init(const std::string& gpu, BenchmarkInfo* benchmark_info) { nvvmCreateProgram(&prog); // Add custom IR to program - nvvmAddModuleToProgram(prog, kernel_llvm_ir, kernel_llvm_ir.size(), "nmodl_llvm_ir"); + nvvmAddModuleToProgram(prog, kernel_llvm_ir.c_str(), kernel_llvm_ir.size(), "nmodl_llvm_ir"); // Declare compile options - const char *options[] = { "-ftz=1" }; + const char *options[] = { "-arch=compute_60" }; // Compile the program nvvmCompileProgram(prog, 1, options); @@ -245,19 +256,20 @@ void GPUJITDriver::init(const std::string& gpu, BenchmarkInfo* benchmark_info) { char* compiled_module; size_t compiled_module_size; nvvmGetCompiledResultSize(prog, &compiled_module_size); - std::cout << "Compiled module size: " << compiled_module_size << "\n"; compiled_module = (char*)malloc(compiled_module_size); nvvmGetCompiledResult(prog, compiled_module); + ptx_compiled_module = std::string(compiled_module, compiled_module_size); // Create driver context checkCudaErrors(cuCtxCreate(&context, 0, device)); // Create module for object - checkCudaErrors(cuModuleLoadDataEx(&cudaModule, kernel_llvm_ir.c_str(), 0, 0, 0)); + checkCudaErrors(cuModuleLoadDataEx(&cudaModule, compiled_module, 0, 0, 0)); - // Get kernel function - checkCudaErrors(cuModuleGetFunction(&function, cudaModule, "kernel")); + // // Get kernel function + // checkCudaErrors(cuModuleGetFunction(&function, cudaModule, "kernel")); } +#endif } // namespace runner } // namespace nmodl diff --git a/test/benchmark/jit_driver.hpp b/test/benchmark/jit_driver.hpp index dc8e740e50..7df631b430 100644 --- a/test/benchmark/jit_driver.hpp +++ b/test/benchmark/jit_driver.hpp @@ -21,6 +21,7 @@ #ifdef NMODL_LLVM_CUDA_BACKEND #include "cuda.h" +#include "nvvm.h" #endif namespace nmodl { @@ -49,7 +50,7 @@ struct BenchmarkInfo { * \brief Driver to execute a MOD file function via LLVM IR backend. */ class JITDriver { - private: + protected: std::unique_ptr context = std::make_unique(); std::unique_ptr jit; @@ -99,16 +100,12 @@ class JITDriver { }; #ifdef NMODL_LLVM_CUDA_BACKEND -void checkCudaErrors(CUresult err) { - assert(err == CUDA_SUCCESS); -} - -class DeviceInfo { +struct DeviceInfo { int count; std::string name; int compute_version_major; int compute_version_minor; -} +}; class GPUJITDriver: public JITDriver { nvvmProgram prog; @@ -118,6 +115,7 @@ class GPUJITDriver: public JITDriver { CUfunction function; CUlinkState linker; DeviceInfo device_info; + std::string ptx_compiled_module; /// Gets available GPU device information DeviceInfo get_device_info(); @@ -127,7 +125,7 @@ class GPUJITDriver: public JITDriver { : JITDriver(std::move(m)) {} /// Initializes the CUDA GPU JIT driver. 
- void init(const std::string& cpu, BenchmarkInfo* benchmark_info = nullptr); + void init(const std::string& gpu, BenchmarkInfo* benchmark_info = nullptr); }; #endif diff --git a/test/unit/CMakeLists.txt b/test/unit/CMakeLists.txt index 4e30d48f1e..9f2d99e1d9 100644 --- a/test/unit/CMakeLists.txt +++ b/test/unit/CMakeLists.txt @@ -116,7 +116,6 @@ if(NMODL_ENABLE_LLVM) add_executable(test_llvm_runner visitor/main.cpp codegen/codegen_data_helper.cpp codegen/codegen_llvm_execution.cpp) if(NMODL_ENABLE_LLVM_CUDA) - include_directories(${CUDAToolkit_INCLUDE_DIRS}) target_link_libraries(benchmark_data PRIVATE CUDA::cudart) target_link_libraries(testllvm CUDA::cudart) target_link_libraries(test_llvm_runner CUDA::cudart) From 35e46feb9acf9caac48b30ccc142f084148b9cd5 Mon Sep 17 00:00:00 2001 From: Omar Awile Date: Thu, 17 Mar 2022 00:41:25 +0100 Subject: [PATCH 204/331] Get rid of one unnecessary shared_ptr (#826) (#827) The field ast in GlobalToRangeVisitor doesn't need to be a shared pointer, so we store it as a reference that is initialized at construction time. --- src/main.cpp | 2 +- src/visitors/global_var_visitor.cpp | 2 +- src/visitors/global_var_visitor.hpp | 6 +++--- test/unit/visitor/global_to_range.cpp | 2 +- 4 files changed, 6 insertions(+), 6 deletions(-) diff --git a/src/main.cpp b/src/main.cpp index f2678fcb48..19dc4e5cbc 100644 --- a/src/main.cpp +++ b/src/main.cpp @@ -485,7 +485,7 @@ int main(int argc, const char* argv[]) { // make sure to run the GlobalToRange visitor after all the // reinitializations of Symtab logger->info("Running GlobalToRange visitor"); - GlobalToRangeVisitor(ast).visit_program(*ast); + GlobalToRangeVisitor(*ast).visit_program(*ast); SymtabVisitor(update_symtab).visit_program(*ast); ast_to_nmodl(*ast, filepath("global_to_range", "mod")); } diff --git a/src/visitors/global_var_visitor.cpp b/src/visitors/global_var_visitor.cpp index 7ce72436e1..3488fdd6d0 100644 --- a/src/visitors/global_var_visitor.cpp +++ b/src/visitors/global_var_visitor.cpp @@ -27,7 +27,7 @@ void GlobalToRangeVisitor::visit_neuron_block(ast::NeuronBlock& node) { auto& statement_block = node.get_statement_block(); auto& statements = (*statement_block).get_statements(); - const auto& symbol_table = ast->get_symbol_table(); + const auto& symbol_table = ast.get_symbol_table(); for (auto& statement: statements) { /// only process global statements diff --git a/src/visitors/global_var_visitor.hpp b/src/visitors/global_var_visitor.hpp index 4774958fbf..c16b99decd 100644 --- a/src/visitors/global_var_visitor.hpp +++ b/src/visitors/global_var_visitor.hpp @@ -61,7 +61,7 @@ namespace visitor { class GlobalToRangeVisitor: public AstVisitor { private: /// ast::Ast* node - std::shared_ptr ast; + const ast::Program& ast; public: /// \name Ctor & dtor @@ -71,8 +71,8 @@ class GlobalToRangeVisitor: public AstVisitor { GlobalToRangeVisitor() = delete; /// Constructor that takes as parameter the AST - explicit GlobalToRangeVisitor(std::shared_ptr node) - : ast(std::move(node)) {} + explicit GlobalToRangeVisitor(const ast::Program& node) + : ast(node) {} /// \} diff --git a/test/unit/visitor/global_to_range.cpp b/test/unit/visitor/global_to_range.cpp index 8fbc4a3199..6b77c47a9d 100644 --- a/test/unit/visitor/global_to_range.cpp +++ b/test/unit/visitor/global_to_range.cpp @@ -33,7 +33,7 @@ std::shared_ptr run_global_to_var_visitor(const std::string& text) SymtabVisitor().visit_program(*ast); PerfVisitor().visit_program(*ast); - GlobalToRangeVisitor(ast).visit_program(*ast); + 
GlobalToRangeVisitor(*ast).visit_program(*ast); SymtabVisitor().visit_program(*ast); return ast; } From b9b14184ba36e9d457c9bf59f208aaa202d7dd49 Mon Sep 17 00:00:00 2001 From: Ioannis Magkanaris Date: Thu, 17 Mar 2022 18:44:10 +0100 Subject: [PATCH 205/331] more changes to support gpu execution --- src/codegen/llvm/target_platform.cpp | 2 +- src/main.cpp | 58 +++++++++++++++++++++++----- test/benchmark/jit_driver.cpp | 3 +- test/benchmark/jit_driver.hpp | 54 ++++++++++++++++++++------ test/benchmark/llvm_benchmark.cpp | 12 ++++-- test/benchmark/llvm_benchmark.hpp | 34 ++++++++++++++-- 6 files changed, 132 insertions(+), 31 deletions(-) diff --git a/src/codegen/llvm/target_platform.cpp b/src/codegen/llvm/target_platform.cpp index 6cb8c7bb2b..be2e2540c9 100644 --- a/src/codegen/llvm/target_platform.cpp +++ b/src/codegen/llvm/target_platform.cpp @@ -15,7 +15,7 @@ const std::string Platform::DEFAULT_MATH_LIBRARY = "none"; bool Platform::is_default_platform() { // Default platform is a CPU. - return platform_id == PlatformID::CPU && name == Platform::DEFAULT_PLATFORM_NAME; + return platform_id == PlatformID::CPU && name == Platform::DEFAULT_PLATFORM_NAME; } bool Platform::is_cpu() { diff --git a/src/main.cpp b/src/main.cpp index f2678fcb48..cfe5d2e792 100644 --- a/src/main.cpp +++ b/src/main.cpp @@ -197,6 +197,24 @@ int main(int argc, const char* argv[]) { /// llvm vector width if generating code for CPUs int llvm_vector_width = 1; + /// X dimension of grid in blocks for GPU execution + int llvm_cuda_grid_dim_x = 1; + + /// Y dimension of grid in blocks for GPU execution + int llvm_cuda_grid_dim_y = 1; + + /// Z dimension of grid in blocks for GPU execution + int llvm_cuda_grid_dim_z = 1; + + /// X dimension of block in threads for GPU execution + int llvm_cuda_block_dim_x = 1; + + /// Y dimension of block in threads for GPU execution + int llvm_cuda_block_dim_y = 1; + + /// Z dimension of block in threads for GPU execution + int llvm_cuda_block_dim_z = 1; + /// run llvm benchmark bool llvm_benchmark(false); @@ -386,6 +404,24 @@ int main(int argc, const char* argv[]) { benchmark_opt->add_option("--repeat", num_experiments, "Number of experiments for benchmarking ({})"_format(num_experiments))->ignore_case(); + benchmark_opt->add_option("--gridDimX", + gridDimX, + "Grid dimension X ({})"_format(gridDimX))->ignore_case(); + benchmark_opt->add_option("--gridDimY", + gridDimY, + "Grid dimension Y ({})"_format(gridDimY))->ignore_case(); + benchmark_opt->add_option("--gridDimZ", + gridDimZ, + "Grid dimension Z ({})"_format(gridDimZ))->ignore_case(); + benchmark_opt->add_option("--blockDimX", + blockDimX, + "Block dimension X ({})"_format(blockDimX))->ignore_case(); + benchmark_opt->add_option("--blockDimY", + blockDimY, + "Block dimension Y ({})"_format(blockDimY))->ignore_case(); + benchmark_opt->add_option("--blockDimZ", + blockDimZ, + "Block dimension Z ({})"_format(blockDimZ))->ignore_case(); #endif // clang-format on @@ -704,6 +740,8 @@ int main(int argc, const char* argv[]) { Platform platform(pid, name, llvm_math_library, llvm_float_type, llvm_vector_width); + + logger->info("Running LLVM backend code generator"); CodegenLLVMVisitor visitor(modfile, output_dir, platform, llvm_opt_level, !llvm_no_debug, @@ -713,17 +751,19 @@ int main(int argc, const char* argv[]) { ast_to_json(*ast, filepath("llvm", "json")); if (llvm_benchmark) { - // \todo integrate Platform class here - if (llvm_gpu_name != "default") { - logger->warn("GPU benchmarking is not supported, targeting " - "CPU instead"); - } - 
logger->info("Running LLVM benchmark"); + if (llvm_gpu_name == "cuda"){ + const GPUExecutionParameters gpu_execution_parameters{llvm_cuda_grid_dim_x, llvm_cuda_grid_dim_y, llvm_cuda_grid_dim_z, llvm_cuda_block_dim_x, llvm_cuda_block_dim_y, llvm_cuda_block_dim_z}; + benchmark::LLVMBenchmark benchmark( visitor, modfile, output_dir, shared_lib_paths, - num_experiments, instance_size, llvm_cpu_name, - llvm_opt_level_ir, llvm_opt_level_codegen); + num_experiments, instance_size, platform, + llvm_opt_level_ir, llvm_opt_level_codegen, gpu_execution_parameters); + } else { + benchmark::LLVMBenchmark benchmark( visitor, modfile, output_dir, shared_lib_paths, + num_experiments, instance_size, platform, + llvm_opt_level_ir, llvm_opt_level_codegen); + } benchmark.run(ast); } } diff --git a/test/benchmark/jit_driver.cpp b/test/benchmark/jit_driver.cpp index 78028e72ea..96b73c3613 100644 --- a/test/benchmark/jit_driver.cpp +++ b/test/benchmark/jit_driver.cpp @@ -201,7 +201,7 @@ void JITDriver::init(const std::string& cpu, BenchmarkInfo* benchmark_info) { llvm::orc::DumpObjects(benchmark_info->output_dir, benchmark_info->filename)); } } -void GPUJITDriver::init(const std::string& gpu, BenchmarkInfo* benchmark_info) { +void CUDADriver::init(const std::string& gpu, BenchmarkInfo* benchmark_info) { // CUDA initialization checkCudaErrors(cuInit(0)); checkCudaErrors(cuDeviceGetCount(&device_info.count)); diff --git a/test/benchmark/jit_driver.hpp b/test/benchmark/jit_driver.hpp index 7df631b430..81e4b9846f 100644 --- a/test/benchmark/jit_driver.hpp +++ b/test/benchmark/jit_driver.hpp @@ -22,6 +22,7 @@ #ifdef NMODL_LLVM_CUDA_BACKEND #include "cuda.h" #include "nvvm.h" +#include "gpu_parameters.hpp" #endif +using nmodl::cuda_details::GPUExecutionParameters; + namespace nmodl { namespace runner { @@ -108,7 +110,26 @@ struct DeviceInfo { int count; std::string name; int compute_version_major; int compute_version_minor; }; +/** + * @brief Throw meaningful error in case CUDA API call fails + * + * Checks whether a call to the CUDA API was successful and if not it throws a runtime_error with + * the error message from CUDA. + * + * @param err Return value of the CUDA API call + */ +void checkCudaErrors(CUresult err); + +/** + * @brief Throw meaningful error in case NVVM API call fails + * + * Checks whether a call to the NVVM API was successful and if not it throws a runtime_error with + * the error message from NVVM. + * + * @param err Return value of the NVVM API call + */ +void checkNVVMErrors(nvvmResult err); + class CUDADriver: public JITDriver { nvvmProgram prog; CUdevice device; CUmodule cudaModule; CUcontext context; CUfunction function; CUlinkState linker; DeviceInfo device_info; std::string ptx_compiled_module; + public: + explicit CUDADriver(std::unique_ptr m) + : JITDriver(std::move(m)) {} + + /// Initializes the CUDA GPU JIT driver. + void init(const std::string& gpu, BenchmarkInfo* benchmark_info = nullptr); + + /// Lookups the entry-point without arguments in the JIT and executes it, returning the result. 
+ template + ReturnType execute_without_arguments(const std::string& entry_point, const GPUExecutionParameters& gpu_execution_parameters) { + // Get kernel function + checkCudaErrors(cuModuleGetFunction(&function, cudaModule, entry_point.c_str())); + + // Kernel launch + void *kernel_parameters = {} + checkCudaErrors(cuLaunchKernel(function, gpu_execution_parameters.grid_dim_x, gpu_execution_parameters.grid_dim_y, gpu_execution_parameters.grid_dim_z, + gpu_execution_parameters.block_dim_x, gpu_execution_parameters.block_dim_y, gpu_execution_parameters.block_dim_z, + gpu_execution_parameters.shared_mem_bytes, nullptr, kernel_parameters, nullptr)); + + auto (*res)() = (ReturnType(*)())(intptr_t) expected_symbol->getAddress(); + ReturnType result = res(); + return result; + } + + /// Lookups the entry-point with an argument in the JIT and executes it, returning the result. + template + ReturnType execute_with_arguments(const std::string& entry_point, ArgType arg) { + auto expected_symbol = jit->lookup(entry_point); + if (!expected_symbol) + throw std::runtime_error("Error: entry-point symbol not found in JIT\n"); + + auto (*res)(ArgType) = (ReturnType(*)(ArgType))(intptr_t) expected_symbol->getAddress(); + ReturnType result = res(arg); + return result; + } }; #endif @@ -134,12 +161,13 @@ class GPUJITDriver: public JITDriver { * \brief A runner with benchmarking functionality. It takes user-specified CPU * features into account, as well as it can link against shared libraries. */ class BenchmarkRunner: public BaseRunner { private: /// Benchmarking information passed to JIT driver. BenchmarkInfo benchmark_info; - /// CPU to target. - std::string cpu; + /// Backend to target. + std::string backend; public: BenchmarkRunner(std::unique_ptr m, std::string filename, std::string output_dir, - std::string cpu, + std::string backend, std::vector lib_paths = {}, int opt_level_ir = 0, int opt_level_codegen = 0) : BaseRunner(std::move(m)) - , cpu(cpu) + , backend(backend) , benchmark_info{filename, output_dir, lib_paths, opt_level_ir, opt_level_codegen} {} virtual void initialize_driver() { - driver->init(cpu, &benchmark_info); + driver->init(backend, &benchmark_info); } }; diff --git a/test/benchmark/llvm_benchmark.cpp b/test/benchmark/llvm_benchmark.cpp index 0e94ae231b..4483bf00a7 100644 --- a/test/benchmark/llvm_benchmark.cpp +++ b/test/benchmark/llvm_benchmark.cpp @@ -43,8 +43,8 @@ void LLVMBenchmark::run_benchmark(const std::shared_ptr& node) { llvm_visitor.find_kernel_names(kernel_names); // Get feature's string and turn them off depending on the cpu. - std::string cpu_name = cpu == "default" ? 
llvm::sys::getHostCPUName().str() : cpu; - logger->info("CPU: {}", cpu_name); + const auto backend_name = platform.get_name(); + logger->info("Backend: {}", backend_name); std::unique_ptr m = llvm_visitor.get_module(); @@ -52,7 +52,7 @@ void LLVMBenchmark::run_benchmark(const std::shared_ptr& node) { // Create the benchmark runner and initialize it. std::string filename = "v" + std::to_string(llvm_visitor.get_vector_width()) + "_" + mod_filename; runner::BenchmarkRunner runner( - std::move(m), filename, output_dir, cpu_name, shared_libs, opt_level_ir, opt_level_codegen); + std::move(m), filename, output_dir, backend_name, shared_libs, opt_level_ir, opt_level_codegen); runner.initialize_driver(); // Benchmark every kernel. @@ -75,7 +75,11 @@ void LLVMBenchmark::run_benchmark(const std::shared_ptr& node) { // Record the execution time of the kernel. std::string wrapper_name = "__" + kernel_name + "_wrapper"; auto start = std::chrono::steady_clock::now(); - runner.run_with_argument(kernel_name, instance_data.base_ptr); + if (backend_name == "cuda") { + runner.run_with_argument(kernel_name, instance_data.base_ptr, gpu_execution_parameters); + } else { + runner.run_with_argument(kernel_name, instance_data.base_ptr); + } auto end = std::chrono::steady_clock::now(); std::chrono::duration diff = end - start; diff --git a/test/benchmark/llvm_benchmark.hpp b/test/benchmark/llvm_benchmark.hpp index cc9dd3bcf0..cdef661f92 100644 --- a/test/benchmark/llvm_benchmark.hpp +++ b/test/benchmark/llvm_benchmark.hpp @@ -11,8 +11,12 @@ #include #include "codegen/llvm/codegen_llvm_visitor.hpp" +#include "gpu_parameters.hpp" #include "utils/logger.hpp" +using nmodl::codegen::Platform; +using nmodl::cuda_details::GPUExecutionParameters; + namespace nmodl { namespace benchmark { @@ -41,8 +45,10 @@ class LLVMBenchmark { /// The size of the instance struct for benchmarking. int instance_size; - /// CPU to target. - std::string cpu; + /// Target platform for the code generation. + Platform platform; + + GPUExecutionParameters gpu_execution_parameters; /// Optimisation level for IR generation. int opt_level_ir; @@ -60,7 +66,7 @@ class LLVMBenchmark { std::vector shared_libs, int num_experiments, int instance_size, - const std::string& cpu, + const Platform& platform, int opt_level_ir, int opt_level_codegen) : llvm_visitor(llvm_visitor) @@ -69,9 +75,29 @@ class LLVMBenchmark { , shared_libs(shared_libs) , num_experiments(num_experiments) , instance_size(instance_size) - , cpu(cpu) + , platform(platform) , opt_level_ir(opt_level_ir) , opt_level_codegen(opt_level_codegen) {} + LLVMBenchmark(codegen::CodegenLLVMVisitor& llvm_visitor, + const std::string& mod_filename, + const std::string& output_dir, + std::vector shared_libs, + int num_experiments, + int instance_size, + const Platform& platform, + int opt_level_ir, + int opt_level_codegen, + const GPUExecutionParameters& gpu_exec_params) + : llvm_visitor(llvm_visitor) + , mod_filename(mod_filename) + , output_dir(output_dir) + , shared_libs(shared_libs) + , num_experiments(num_experiments) + , instance_size(instance_size) + , platform(platform) + , opt_level_ir(opt_level_ir) + , opt_level_codegen(opt_level_codegen) + , gpu_execution_parameters(gpu_exec_params) {} /// Runs the benchmark. 
void run(const std::shared_ptr& node); From 7e9484da902daebb67162108fe8ca07b5343ef17 Mon Sep 17 00:00:00 2001 From: Ioannis Magkanaris Date: Fri, 18 Mar 2022 13:42:31 +0100 Subject: [PATCH 206/331] Template BaseRunner --- src/main.cpp | 97 +++++++++++++++---------------- test/benchmark/gpu_parameters.hpp | 32 ++++++++++ test/benchmark/jit_driver.hpp | 66 +++++++++++++-------- test/benchmark/llvm_benchmark.cpp | 4 +- test/benchmark/llvm_benchmark.hpp | 3 +- 5 files changed, 126 insertions(+), 76 deletions(-) create mode 100644 test/benchmark/gpu_parameters.hpp diff --git a/src/main.cpp b/src/main.cpp index cfe5d2e792..4fe4ef389b 100644 --- a/src/main.cpp +++ b/src/main.cpp @@ -405,23 +405,23 @@ int main(int argc, const char* argv[]) { num_experiments, "Number of experiments for benchmarking ({})"_format(num_experiments))->ignore_case(); benchmark_opt->add_option("--gridDimX", - gridDimX, - "Grid dimension X ({})"_format(gridDimX))->ignore_case(); + llvm_cuda_grid_dim_x, + "Grid dimension X ({})"_format(llvm_cuda_grid_dim_x))->ignore_case(); benchmark_opt->add_option("--gridDimY", - gridDimY, - "Grid dimension Y ({})"_format(gridDimY))->ignore_case(); + llvm_cuda_grid_dim_y, + "Grid dimension Y ({})"_format(llvm_cuda_grid_dim_y))->ignore_case(); benchmark_opt->add_option("--gridDimZ", - gridDimZ, - "Grid dimension Z ({})"_format(gridDimZ))->ignore_case(); + llvm_cuda_grid_dim_z, + "Grid dimension Z ({})"_format(llvm_cuda_grid_dim_z))->ignore_case(); benchmark_opt->add_option("--blockDimX", - blockDimX, - "Block dimension X ({})"_format(blockDimX))->ignore_case(); + llvm_cuda_block_dim_x, + "Block dimension X ({})"_format(llvm_cuda_block_dim_x))->ignore_case(); benchmark_opt->add_option("--blockDimY", - blockDimY, - "Block dimension Y ({})"_format(blockDimY))->ignore_case(); + llvm_cuda_block_dim_y, + "Block dimension Y ({})"_format(llvm_cuda_block_dim_y))->ignore_case(); benchmark_opt->add_option("--blockDimZ", - blockDimZ, - "Block dimension Z ({})"_format(blockDimZ))->ignore_case(); + llvm_cuda_block_dim_z, + "Block dimension Z ({})"_format(llvm_cuda_block_dim_z))->ignore_case(); #endif // clang-format on @@ -728,44 +728,43 @@ int main(int argc, const char* argv[]) { #ifdef NMODL_LLVM_BACKEND if (llvm_ir || llvm_benchmark) { - // If benchmarking, we want to optimize the IR with target - // information and not in LLVM visitor. - int llvm_opt_level = llvm_benchmark ? 0 : llvm_opt_level_ir; - - // Create platform abstraction. - PlatformID pid = llvm_gpu_name == "default" ? PlatformID::CPU - : PlatformID::GPU; - const std::string name = - llvm_gpu_name == "default" ? 
llvm_cpu_name : llvm_gpu_name; - Platform platform(pid, name, llvm_math_library, llvm_float_type, - llvm_vector_width); - - - - logger->info("Running LLVM backend code generator"); - CodegenLLVMVisitor visitor(modfile, output_dir, platform, - llvm_opt_level, !llvm_no_debug, - llvm_fast_math_flags); - visitor.visit_program(*ast); - ast_to_nmodl(*ast, filepath("llvm", "mod")); - ast_to_json(*ast, filepath("llvm", "json")); - - if (llvm_benchmark) { - logger->info("Running LLVM benchmark"); - if (llvm_gpu_name == "cuda"){ - const GPUExecutionParameters gpu_execution_parameters{llvm_cuda_grid_dim_x, llvm_cuda_grid_dim_y, llvm_cuda_grid_dim_z, llvm_cuda_block_dim_x, llvm_cuda_block_dim_y, llvm_cuda_block_dim_z}; - benchmark::LLVMBenchmark benchmark( - visitor, modfile, output_dir, shared_lib_paths, - num_experiments, instance_size, platform, - llvm_opt_level_ir, llvm_opt_level_codegen, gpu_execution_parameters); - } else { - benchmark::LLVMBenchmark benchmark( - visitor, modfile, output_dir, shared_lib_paths, - num_experiments, instance_size, platform, - llvm_opt_level_ir, llvm_opt_level_codegen); + // If benchmarking, we want to optimize the IR with target + // information and not in LLVM visitor. + int llvm_opt_level = llvm_benchmark ? 0 : llvm_opt_level_ir; + + // Create platform abstraction. + PlatformID pid = llvm_gpu_name == "default" ? PlatformID::CPU + : PlatformID::GPU; + const std::string name = + llvm_gpu_name == "default" ? llvm_cpu_name : llvm_gpu_name; + Platform platform(pid, name, llvm_math_library, llvm_float_type, + llvm_vector_width); + + logger->info("Running LLVM backend code generator"); + CodegenLLVMVisitor visitor(modfile, output_dir, platform, + llvm_opt_level, !llvm_no_debug, + llvm_fast_math_flags); + visitor.visit_program(*ast); + ast_to_nmodl(*ast, filepath("llvm", "mod")); + ast_to_json(*ast, filepath("llvm", "json")); + + if (llvm_benchmark) { + logger->info("Running LLVM benchmark"); + if (llvm_gpu_name == "cuda"){ + const GPUExecutionParameters gpu_execution_parameters{llvm_cuda_grid_dim_x, llvm_cuda_grid_dim_y, llvm_cuda_grid_dim_z, llvm_cuda_block_dim_x, llvm_cuda_block_dim_y, llvm_cuda_block_dim_z}; + benchmark::LLVMBenchmark benchmark( + visitor, modfile, output_dir, shared_lib_paths, + num_experiments, instance_size, platform, + llvm_opt_level_ir, llvm_opt_level_codegen, gpu_execution_parameters); + benchmark.run(ast); + } else { + benchmark::LLVMBenchmark benchmark( + visitor, modfile, output_dir, shared_lib_paths, + num_experiments, instance_size, platform, + llvm_opt_level_ir, llvm_opt_level_codegen); + benchmark.run(ast); + } } - benchmark.run(ast); - } } #endif } diff --git a/test/benchmark/gpu_parameters.hpp b/test/benchmark/gpu_parameters.hpp new file mode 100644 index 0000000000..fa32bf7602 --- /dev/null +++ b/test/benchmark/gpu_parameters.hpp @@ -0,0 +1,32 @@ +/************************************************************************* + * Copyright (C) 2018-2022 Blue Brain Project + * + * This file is part of NMODL distributed under the terms of the GNU + * Lesser General Public License. See top-level LICENSE file for details. 
+ *************************************************************************/
+
+#pragma once
+
+/**
+ * \dir
+ * \brief GPU execution parameters struct
+ *
+ * \file
+ * \brief \copybrief nmodl::cuda_details::GPUExecutionParameters
+ */
+
+namespace nmodl {
+namespace cuda_details {
+
+struct GPUExecutionParameters {
+    int gridDimX;
+    int gridDimY;
+    int gridDimZ;
+    int blockDimX;
+    int blockDimY;
+    int blockDimZ;
+    int sharedMemBytes;
+};
+
+}  // namespace cuda_details
+}  // namespace nmodl
diff --git a/test/benchmark/jit_driver.hpp b/test/benchmark/jit_driver.hpp
index 81e4b9846f..13eb225bbb 100644
--- a/test/benchmark/jit_driver.hpp
+++ b/test/benchmark/jit_driver.hpp
@@ -25,6 +25,8 @@
 #include "gpu_parameters.hpp"
 #endif
 
+using nmodl::cuda_details::GPUExecutionParameters;
+
 namespace nmodl {
 namespace runner {
 
@@ -108,6 +110,26 @@ struct DeviceInfo {
     int compute_version_minor;
 };
 
+/**
+ * @brief Throw meaningful error in case CUDA API call fails
+ *
+ * Checks whether a call to the CUDA API was successful and, if not, throws a runtime_error with
+ * the error message from CUDA.
+ *
+ * @param err Return value of the CUDA API call
+ */
+void checkCudaErrors(CUresult err);
+
+/**
+ * @brief Throw meaningful error in case NVVM API call fails
+ *
+ * Checks whether a call to the NVVM API was successful and, if not, throws a runtime_error with
+ * the error message from NVVM.
+ *
+ * @param err Return value of the NVVM API call
+ */
+void checkNVVMErrors(nvvmResult err);
+
 class CUDADriver: public JITDriver {
     nvvmProgram prog;
     CUdevice device;
     CUmodule cudaModule;
@@ -126,24 +148,20 @@ class CUDADriver: public JITDriver {
     void init(const std::string& gpu, BenchmarkInfo* benchmark_info = nullptr);
 
     /// Lookups the entry-point without arguments in the JIT and executes it, returning the result.
     template <typename ReturnType>
     ReturnType execute_without_arguments(const std::string& entry_point, const GPUExecutionParameters& gpu_execution_parameters) {
         // Get kernel function
         checkCudaErrors(cuModuleGetFunction(&function, cudaModule, entry_point.c_str()));

         // Kernel launch
-        void *kernel_parameters = {}
-        checkCudaErrors(cuLaunchKernel(function, gpu_execution_parameters.grid_dim_x, gpu_execution_parameters.grid_dim_y, gpu_execution_parameters.grid_dim_z,
-                                       gpu_execution_parameters.block_dim_x, gpu_execution_parameters.block_dim_y, gpu_execution_parameters.block_dim_z,
-                                       gpu_execution_parameters.shared_mem_bytes, nullptr, kernel_parameters, nullptr));
-
-        auto (*res)() = (ReturnType(*)())(intptr_t) expected_symbol->getAddress();
-        ReturnType result = res();
-        return result;
+        void** kernel_parameters = nullptr;
+        checkCudaErrors(cuLaunchKernel(function, gpu_execution_parameters.gridDimX, gpu_execution_parameters.gridDimY, gpu_execution_parameters.gridDimZ,
+                                       gpu_execution_parameters.blockDimX, gpu_execution_parameters.blockDimY, gpu_execution_parameters.blockDimZ,
+                                       gpu_execution_parameters.sharedMemBytes, nullptr, kernel_parameters, nullptr));
     }

     /// Lookups the entry-point with an argument in the JIT and executes it, returning the result.
     template <typename ReturnType, typename ArgType>
     ReturnType execute_with_arguments(const std::string& entry_point, ArgType arg) {
         auto expected_symbol = jit->lookup(entry_point);
         if (!expected_symbol)
@@ -161,13 +179,13 @@
  * \brief A base runner class that provides functionality to execute an
  * entry point in the LLVM IR module.
*/ -template +template class BaseRunner { protected: - std::unique_ptr driver; + std::unique_ptr driver; - explicit BaseRunner(std::unique_ptr m) - : driver(std::make_unique(std::move(m))) {} + explicit BaseRunner(std::unique_ptr m) + : driver(std::make_unique(std::move(m))) {} public: /// Sets up the JIT driver. @@ -190,14 +208,14 @@ class BaseRunner { * \class TestRunner * \brief A simple runner for testing purposes. */ -template -class TestRunner: public BaseRunner { +template +class TestRunner: public BaseRunner { public: - explicit TestRunner(std::unique_ptr m) - : BaseRunner(std::move(m)) {} + explicit TestRunner(std::unique_ptr m) + : BaseRunner(std::move(m)) {} virtual void initialize_driver() { - driver->init(llvm::sys::getHostCPUName().str()); + this->driver->init(llvm::sys::getHostCPUName().str()); } }; @@ -206,8 +224,8 @@ class TestRunner: public BaseRunner { * \brief A runner with benchmarking functionality. It takes user-specified CPU * features into account, as well as it can link against shared libraries. */ -template -class BenchmarkRunner: public BaseRunner { +template +class BenchmarkRunner: public BaseRunner { private: /// Benchmarking information passed to JIT driver. BenchmarkInfo benchmark_info; @@ -216,19 +234,19 @@ class BenchmarkRunner: public BaseRunner { std::string backend; public: - BenchmarkRunner(std::unique_ptr m, + BenchmarkRunner(std::unique_ptr m, std::string filename, std::string output_dir, std::string backend, std::vector lib_paths = {}, int opt_level_ir = 0, int opt_level_codegen = 0) - : BaseRunner(std::move(m)) + : BaseRunner(std::move(m)) , backend(backend) , benchmark_info{filename, output_dir, lib_paths, opt_level_ir, opt_level_codegen} {} virtual void initialize_driver() { - driver->init(backend, &benchmark_info); + this->driver->init(backend, &benchmark_info); } }; diff --git a/test/benchmark/llvm_benchmark.cpp b/test/benchmark/llvm_benchmark.cpp index 4483bf00a7..1078d1a4c6 100644 --- a/test/benchmark/llvm_benchmark.cpp +++ b/test/benchmark/llvm_benchmark.cpp @@ -51,7 +51,7 @@ void LLVMBenchmark::run_benchmark(const std::shared_ptr& node) { // Create the benchmark runner and initialize it. std::string filename = "v" + std::to_string(llvm_visitor.get_vector_width()) + "_" + mod_filename; - runner::BenchmarkRunner runner( + runner::BenchmarkRunner<> runner( std::move(m), filename, output_dir, backend_name, shared_libs, opt_level_ir, opt_level_codegen); runner.initialize_driver(); @@ -76,7 +76,7 @@ void LLVMBenchmark::run_benchmark(const std::shared_ptr& node) { std::string wrapper_name = "__" + kernel_name + "_wrapper"; auto start = std::chrono::steady_clock::now(); if (backend_name == "cuda") { - runner.run_with_argument(kernel_name, instance_data.base_ptr, gpu_execution_parameters); + runner.run_with_argument(kernel_name, instance_data.base_ptr, gpu_execution_parameters); } else { runner.run_with_argument(kernel_name, instance_data.base_ptr); } diff --git a/test/benchmark/llvm_benchmark.hpp b/test/benchmark/llvm_benchmark.hpp index cdef661f92..833c37923f 100644 --- a/test/benchmark/llvm_benchmark.hpp +++ b/test/benchmark/llvm_benchmark.hpp @@ -48,6 +48,7 @@ class LLVMBenchmark { /// Target platform for the code generation. Platform platform; + /// The GPU execution parameters needed to configure the kernels' execution. GPUExecutionParameters gpu_execution_parameters; /// Optimisation level for IR generation. 
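To see how the pieces introduced by this patch fit together end to end, here is an illustrative sketch (not part of the patch): the CLI-provided launch bounds travel through the templated runner into the CUDA kernel launch. `module`, `output_dir`, `shared_libs`, `instance_data` and the kernel name "nrn_state_expsyn" are hypothetical stand-ins for values the benchmark obtains from the LLVM visitor, `find_kernel_names()` and `CodegenDataHelper`:

```cpp
using nmodl::cuda_details::GPUExecutionParameters;

// Launch configuration as it would arrive from the new --gridDim*/--blockDim* options.
GPUExecutionParameters params{/*gridDimX=*/128, /*gridDimY=*/1, /*gridDimZ=*/1,
                              /*blockDimX=*/256, /*blockDimY=*/1, /*blockDimZ=*/1,
                              /*sharedMemBytes=*/0};

// Instantiate the templated runner with the CUDA driver instead of the default JITDriver.
runner::BenchmarkRunner<runner::CUDADriver> runner(std::move(module),
                                                   "v1_expsyn.mod",  // hypothetical filename
                                                   output_dir,
                                                   /*backend=*/"cuda",
                                                   shared_libs);
runner.initialize_driver();

// Launch one kernel on the instance data, as the benchmark loop does.
runner.run_with_argument<int, void*>("nrn_state_expsyn", instance_data.base_ptr, params);
```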
@@ -86,7 +87,7 @@ class LLVMBenchmark { int instance_size, const Platform& platform, int opt_level_ir, - int opt_level_codegen + int opt_level_codegen, const GPUExecutionParameters& gpu_exec_params) : llvm_visitor(llvm_visitor) , mod_filename(mod_filename) From 9f4a1428d78f9da051cc934730cc05b913385174 Mon Sep 17 00:00:00 2001 From: Ioannis Magkanaris Date: Fri, 18 Mar 2022 17:02:16 +0100 Subject: [PATCH 207/331] Fixed compilation issues with templates --- src/codegen/llvm/main.cpp | 2 +- test/benchmark/jit_driver.hpp | 49 ++++++++---- test/benchmark/llvm_benchmark.cpp | 79 +++++++++++++++++--- test/benchmark/llvm_benchmark.hpp | 7 +- test/unit/codegen/codegen_llvm_execution.cpp | 12 +-- 5 files changed, 117 insertions(+), 32 deletions(-) diff --git a/src/codegen/llvm/main.cpp b/src/codegen/llvm/main.cpp index 92d8a486c1..4f25474dbd 100644 --- a/src/codegen/llvm/main.cpp +++ b/src/codegen/llvm/main.cpp @@ -67,7 +67,7 @@ int main(int argc, const char* argv[]) { throw std::runtime_error( "Error: entry-point functions with non-double return type are not supported\n"); - TestRunner runner(std::move(module)); + TestRunner<> runner(std::move(module)); runner.initialize_driver(); // Since only double type is supported, provide explicit double type to the running function. diff --git a/test/benchmark/jit_driver.hpp b/test/benchmark/jit_driver.hpp index 13eb225bbb..3c7d8f8ebd 100644 --- a/test/benchmark/jit_driver.hpp +++ b/test/benchmark/jit_driver.hpp @@ -100,6 +100,18 @@ class JITDriver { ReturnType result = res(arg); return result; } + + /// Lookups the entry-point with an argument in the JIT and executes it, returning the result. + template + ReturnType execute_with_arguments(const std::string& entry_point, ArgType1 arg1, ArgType2 arg2) { + auto expected_symbol = jit->lookup(entry_point); + if (!expected_symbol) + throw std::runtime_error("Error: entry-point symbol not found in JIT\n"); + + auto (*res)(ArgType1, ArgType2) = (ReturnType(*)(ArgType1, ArgType2))(intptr_t) expected_symbol->getAddress(); + ReturnType result = res(arg1, arg2); + return result; + } }; #ifdef NMODL_LLVM_CUDA_BACKEND @@ -130,7 +142,9 @@ void checkCudaErrors(CUresult err); */ void checkNVVMErrors(nvvmResult err); -class CUDADriver: public JITDriver { +class CUDADriver { + /// LLVM IR module to execute. + std::unique_ptr module; nvvmProgram prog; CUdevice device; CUmodule cudaModule; @@ -142,14 +156,14 @@ class CUDADriver: public JITDriver { public: explicit CUDADriver(std::unique_ptr m) - : JITDriver(std::move(m)) {} + : module(std::move(m)) {} /// Initializes the CUDA GPU JIT driver. void init(const std::string& gpu, BenchmarkInfo* benchmark_info = nullptr); - /// Lookups the entry-point without arguments in the JIT and executes it, returning the result. - template - ReturnType execute_without_arguments(const std::string& entry_point, const GPUExecutionParameters& gpu_execution_parameters) { + /// Lookups the entry-point without arguments in the CUDA module and executes it. + template + ReturnType execute_with_arguments(const std::string& entry_point, const GPUExecutionParameters& gpu_execution_parameters) { // Get kernel function checkCudaErrors(cuModuleGetFunction(&function, cudaModule, entry_point.c_str())); @@ -160,16 +174,17 @@ class CUDADriver: public JITDriver { gpu_execution_parameters.sharedMemBytes, nullptr, kernel_parameters, nullptr)); } - /// Lookups the entry-point with an argument in the JIT and executes it, returning the result. 
- template - ReturnType execute_with_arguments(const std::string& entry_point, ArgType arg) { - auto expected_symbol = jit->lookup(entry_point); - if (!expected_symbol) - throw std::runtime_error("Error: entry-point symbol not found in JIT\n"); + /// Lookups the entry-point with arguments in the CUDA module and executes it. + template + ReturnType execute_with_arguments(const std::string& entry_point, ArgType1 arg1, ArgType2 gpu_execution_parameters) { + // Get kernel function + checkCudaErrors(cuModuleGetFunction(&function, cudaModule, entry_point.c_str())); - auto (*res)(ArgType) = (ReturnType(*)(ArgType))(intptr_t) expected_symbol->getAddress(); - ReturnType result = res(arg); - return result; + // Kernel launch + void *kernel_parameters[] = {&arg1}; + checkCudaErrors(cuLaunchKernel(function, gpu_execution_parameters.gridDimX, gpu_execution_parameters.gridDimY, gpu_execution_parameters.gridDimY, + gpu_execution_parameters.blockDimX, gpu_execution_parameters.blockDimY, gpu_execution_parameters.blockDimY, + gpu_execution_parameters.sharedMemBytes, nullptr, kernel_parameters, nullptr)); } }; #endif @@ -202,6 +217,12 @@ class BaseRunner { ReturnType run_with_argument(const std::string& entry_point, ArgType arg) { return driver->template execute_with_arguments(entry_point, arg); } + + /// Runs the entry-point function with a pointer to the data as an argument. + template + ReturnType run_with_argument(const std::string& entry_point, ArgType1 arg1, ArgType2 arg2) { + return driver->template execute_with_arguments(entry_point, arg1, arg2); + } }; /** diff --git a/test/benchmark/llvm_benchmark.cpp b/test/benchmark/llvm_benchmark.cpp index 1078d1a4c6..bba01fe743 100644 --- a/test/benchmark/llvm_benchmark.cpp +++ b/test/benchmark/llvm_benchmark.cpp @@ -22,7 +22,11 @@ void LLVMBenchmark::run(const std::shared_ptr& node) { // create functions generate_llvm(node); // Finally, run the benchmark and log the measurements. - run_benchmark(node); + if (platform.get_name() == "cuda") { + run_benchmark_on_gpu(node); + } else { + run_benchmark_on_cpu(node); + } } void LLVMBenchmark::generate_llvm(const std::shared_ptr& node) { @@ -36,15 +40,15 @@ void LLVMBenchmark::generate_llvm(const std::shared_ptr& node) { logger->info("Created LLVM IR module from NMODL AST in {} sec", diff.count()); } -void LLVMBenchmark::run_benchmark(const std::shared_ptr& node) { +void LLVMBenchmark::run_benchmark_on_cpu(const std::shared_ptr& node) { // Set the codegen data helper and find the kernels. auto codegen_data = codegen::CodegenDataHelper(node, llvm_visitor.get_instance_struct_ptr()); std::vector kernel_names; llvm_visitor.find_kernel_names(kernel_names); // Get feature's string and turn them off depending on the cpu. - const auto backend_name = platform.get_name(); - logger->info("Backend: {}", backend_name); + const auto cpu_name = platform.get_name(); + logger->info("CPU: {}", cpu_name); std::unique_ptr m = llvm_visitor.get_module(); @@ -52,7 +56,7 @@ void LLVMBenchmark::run_benchmark(const std::shared_ptr& node) { std::string filename = "v" + std::to_string(llvm_visitor.get_vector_width()) + "_" + mod_filename; runner::BenchmarkRunner<> runner( - std::move(m), filename, output_dir, backend_name, shared_libs, opt_level_ir, opt_level_codegen); + std::move(m), filename, output_dir, cpu_name, shared_libs, opt_level_ir, opt_level_codegen); runner.initialize_driver(); // Benchmark every kernel. @@ -75,11 +79,68 @@ void LLVMBenchmark::run_benchmark(const std::shared_ptr& node) { // Record the execution time of the kernel. 
std::string wrapper_name = "__" + kernel_name + "_wrapper"; auto start = std::chrono::steady_clock::now(); - if (backend_name == "cuda") { - runner.run_with_argument(kernel_name, instance_data.base_ptr, gpu_execution_parameters); - } else { - runner.run_with_argument(kernel_name, instance_data.base_ptr); + runner.run_with_argument(kernel_name, instance_data.base_ptr); + auto end = std::chrono::steady_clock::now(); + std::chrono::duration diff = end - start; + + // Log the time taken for each run. + logger->info("Experiment {} compute time = {:.6f} sec", i, diff.count()); + + // Update statistics. + time_sum += diff.count(); + time_squared_sum += diff.count() * diff.count(); + time_min = std::min(time_min, diff.count()); + time_max = std::max(time_max, diff.count()); + } + // Log the average time taken for the kernel. + double time_mean = time_sum / num_experiments; + logger->info("Average compute time = {:.6f}", time_mean); + logger->info("Compute time variance = {:g}", + time_squared_sum / num_experiments - time_mean * time_mean); + logger->info("Minimum compute time = {:.6f}", time_min); + logger->info("Maximum compute time = {:.6f}\n", time_max); + } +} + +void LLVMBenchmark::run_benchmark_on_gpu(const std::shared_ptr& node) { + // Set the codegen data helper and find the kernels. + auto codegen_data = codegen::CodegenDataHelper(node, llvm_visitor.get_instance_struct_ptr()); + std::vector kernel_names; + llvm_visitor.find_kernel_names(kernel_names); + + // Get feature's string and turn them off depending on the cpu. + const auto gpu_name = platform.get_name(); + logger->info("GPU: {}", gpu_name); + + std::unique_ptr m = llvm_visitor.get_module(); + + // Create the benchmark runner and initialize it. + std::string filename = "cuda_" + mod_filename; + runner::BenchmarkRunner runner( + std::move(m), filename, output_dir, gpu_name, shared_libs, opt_level_ir, opt_level_codegen); + runner.initialize_driver(); + + // Benchmark every kernel. + for (const auto& kernel_name: kernel_names) { + // For every kernel run the benchmark `num_experiments` times. + double time_min = std::numeric_limits::max(); + double time_max = 0.0; + double time_sum = 0.0; + double time_squared_sum = 0.0; + for (int i = 0; i < num_experiments; ++i) { + // Initialise the data. + auto instance_data = codegen_data.create_data(instance_size, /*seed=*/1); + + // Log instance size once. + if (i == 0) { + double size_mbs = instance_data.num_bytes / (1024.0 * 1024.0); + logger->info("Benchmarking kernel '{}' with {} MBs dataset", kernel_name, size_mbs); } + + // Record the execution time of the kernel. + std::string wrapper_name = "__" + kernel_name + "_wrapper"; + auto start = std::chrono::steady_clock::now(); + runner.run_with_argument(kernel_name, instance_data.base_ptr, gpu_execution_parameters); auto end = std::chrono::steady_clock::now(); std::chrono::duration diff = end - start; diff --git a/test/benchmark/llvm_benchmark.hpp b/test/benchmark/llvm_benchmark.hpp index 833c37923f..52232f7787 100644 --- a/test/benchmark/llvm_benchmark.hpp +++ b/test/benchmark/llvm_benchmark.hpp @@ -107,8 +107,11 @@ class LLVMBenchmark { /// Visits the AST to construct the LLVM IR module. void generate_llvm(const std::shared_ptr& node); - /// Runs the main body of the benchmark, executing the compute kernels. - void run_benchmark(const std::shared_ptr& node); + /// Runs the main body of the benchmark, executing the compute kernels on CPU. 
+ void run_benchmark_on_cpu(const std::shared_ptr& node); + + /// Runs the main body of the benchmark, executing the compute kernels on GPU. + void run_benchmark_on_gpu(const std::shared_ptr& node); /// Sets the log output stream (file or console). void set_log_output(); diff --git a/test/unit/codegen/codegen_llvm_execution.cpp b/test/unit/codegen/codegen_llvm_execution.cpp index 4c9515f814..4ab271e740 100644 --- a/test/unit/codegen/codegen_llvm_execution.cpp +++ b/test/unit/codegen/codegen_llvm_execution.cpp @@ -134,7 +134,7 @@ SCENARIO("Arithmetic expression", "[llvm][runner]") { llvm_visitor.visit_program(*ast); std::unique_ptr m = llvm_visitor.get_module(); - TestRunner runner(std::move(m)); + TestRunner<> runner(std::move(m)); runner.initialize_driver(); THEN("functions are evaluated correctly") { @@ -240,7 +240,7 @@ SCENARIO("Optimised arithmetic expression", "[llvm][runner]") { llvm_visitor.visit_program(*ast); std::unique_ptr m = llvm_visitor.get_module(); - TestRunner runner(std::move(m)); + TestRunner<> runner(std::move(m)); runner.initialize_driver(); THEN("optimizations preserve function results") { @@ -337,7 +337,7 @@ SCENARIO("Simple scalar kernel", "[llvm][runner]") { // Set up the JIT runner. std::unique_ptr module = llvm_visitor.get_module(); - TestRunner runner(std::move(module)); + TestRunner<> runner(std::move(module)); runner.initialize_driver(); THEN("Values in struct have changed according to the formula") { @@ -427,7 +427,7 @@ SCENARIO("Simple vectorised kernel", "[llvm][runner]") { // Set up the JIT runner. std::unique_ptr module = llvm_visitor.get_module(); - TestRunner runner(std::move(module)); + TestRunner<> runner(std::move(module)); runner.initialize_driver(); THEN("Values in struct have changed according to the formula") { @@ -505,7 +505,7 @@ SCENARIO("Vectorised kernel with scatter instruction", "[llvm][runner]") { // Set up the JIT runner. std::unique_ptr module = llvm_visitor.get_module(); - TestRunner runner(std::move(module)); + TestRunner<> runner(std::move(module)); runner.initialize_driver(); THEN("Ion values in struct have been updated correctly") { @@ -606,7 +606,7 @@ SCENARIO("Vectorised kernel with simple control flow", "[llvm][runner]") { // Set up the JIT runner. std::unique_ptr module = llvm_visitor.get_module(); - TestRunner runner(std::move(module)); + TestRunner<> runner(std::move(module)); runner.initialize_driver(); THEN("Masked instructions are generated") { From d5159610cb532d23460d49ac1520d043c663dd91 Mon Sep 17 00:00:00 2001 From: Ioannis Magkanaris Date: Fri, 18 Mar 2022 17:05:10 +0100 Subject: [PATCH 208/331] More small fixes --- test/benchmark/jit_driver.hpp | 4 ++-- test/benchmark/llvm_benchmark.cpp | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/test/benchmark/jit_driver.hpp b/test/benchmark/jit_driver.hpp index 3c7d8f8ebd..f04093dd06 100644 --- a/test/benchmark/jit_driver.hpp +++ b/test/benchmark/jit_driver.hpp @@ -162,8 +162,8 @@ class CUDADriver { void init(const std::string& gpu, BenchmarkInfo* benchmark_info = nullptr); /// Lookups the entry-point without arguments in the CUDA module and executes it. 
- template - ReturnType execute_with_arguments(const std::string& entry_point, const GPUExecutionParameters& gpu_execution_parameters) { + template + ReturnType execute_with_arguments(const std::string& entry_point, ArgType2 gpu_execution_parameters) { // Get kernel function checkCudaErrors(cuModuleGetFunction(&function, cudaModule, entry_point.c_str())); diff --git a/test/benchmark/llvm_benchmark.cpp b/test/benchmark/llvm_benchmark.cpp index bba01fe743..a454d040b5 100644 --- a/test/benchmark/llvm_benchmark.cpp +++ b/test/benchmark/llvm_benchmark.cpp @@ -140,7 +140,7 @@ void LLVMBenchmark::run_benchmark_on_gpu(const std::shared_ptr& no // Record the execution time of the kernel. std::string wrapper_name = "__" + kernel_name + "_wrapper"; auto start = std::chrono::steady_clock::now(); - runner.run_with_argument(kernel_name, instance_data.base_ptr, gpu_execution_parameters); + runner.run_with_argument(kernel_name, instance_data.base_ptr, gpu_execution_parameters); auto end = std::chrono::steady_clock::now(); std::chrono::duration diff = end - start; From b69f2c78653e5f1ebdba44d08a69dfab7101a7f5 Mon Sep 17 00:00:00 2001 From: Ioannis Magkanaris Date: Fri, 18 Mar 2022 18:24:17 +0100 Subject: [PATCH 209/331] Separated CUDADriver from JITDriver files --- src/main.cpp | 55 +++-- test/benchmark/CMakeLists.txt | 1 + test/benchmark/benchmark_info.hpp | 29 +++ test/benchmark/cuda_driver.cpp | 93 ++++++++ test/benchmark/cuda_driver.hpp | 212 +++++++++++++++++++ test/benchmark/gpu_parameters.hpp | 4 +- test/benchmark/jit_driver.cpp | 71 ------- test/benchmark/jit_driver.hpp | 156 ++------------ test/benchmark/llvm_benchmark.cpp | 9 +- test/unit/codegen/codegen_llvm_execution.cpp | 12 +- 10 files changed, 403 insertions(+), 239 deletions(-) create mode 100644 test/benchmark/benchmark_info.hpp create mode 100644 test/benchmark/cuda_driver.cpp create mode 100644 test/benchmark/cuda_driver.hpp diff --git a/src/main.cpp b/src/main.cpp index 4fe4ef389b..dc3fc69d68 100644 --- a/src/main.cpp +++ b/src/main.cpp @@ -733,35 +733,52 @@ int main(int argc, const char* argv[]) { int llvm_opt_level = llvm_benchmark ? 0 : llvm_opt_level_ir; // Create platform abstraction. - PlatformID pid = llvm_gpu_name == "default" ? PlatformID::CPU - : PlatformID::GPU; - const std::string name = - llvm_gpu_name == "default" ? llvm_cpu_name : llvm_gpu_name; - Platform platform(pid, name, llvm_math_library, llvm_float_type, - llvm_vector_width); + PlatformID pid = llvm_gpu_name == "default" ? PlatformID::CPU : PlatformID::GPU; + const std::string name = llvm_gpu_name == "default" ? 
llvm_cpu_name : llvm_gpu_name;
+            Platform platform(pid, name, llvm_math_library, llvm_float_type, llvm_vector_width);
 
             logger->info("Running LLVM backend code generator");
-            CodegenLLVMVisitor visitor(modfile, output_dir, platform,
-                                       llvm_opt_level, !llvm_no_debug,
-                                       llvm_fast_math_flags);
+            CodegenLLVMVisitor visitor(modfile,
+                                       output_dir,
+                                       platform,
+                                       llvm_opt_level,
+                                       !llvm_no_debug,
+                                       llvm_fast_math_flags);
             visitor.visit_program(*ast);
             ast_to_nmodl(*ast, filepath("llvm", "mod"));
             ast_to_json(*ast, filepath("llvm", "json"));
 
             if (llvm_benchmark) {
                 logger->info("Running LLVM benchmark");
-                if (llvm_gpu_name == "cuda"){
-                    const GPUExecutionParameters gpu_execution_parameters{llvm_cuda_grid_dim_x, llvm_cuda_grid_dim_y, llvm_cuda_grid_dim_z, llvm_cuda_block_dim_x, llvm_cuda_block_dim_y, llvm_cuda_block_dim_z};
-                    benchmark::LLVMBenchmark benchmark(
-                        visitor, modfile, output_dir, shared_lib_paths,
-                        num_experiments, instance_size, platform,
-                        llvm_opt_level_ir, llvm_opt_level_codegen, gpu_execution_parameters);
+                if (llvm_gpu_name == "cuda") {
+                    const GPUExecutionParameters gpu_execution_parameters{
+                        llvm_cuda_grid_dim_x,
+                        llvm_cuda_grid_dim_y,
+                        llvm_cuda_grid_dim_z,
+                        llvm_cuda_block_dim_x,
+                        llvm_cuda_block_dim_y,
+                        llvm_cuda_block_dim_z};
+                    benchmark::LLVMBenchmark benchmark(visitor,
+                                                       modfile,
+                                                       output_dir,
+                                                       shared_lib_paths,
+                                                       num_experiments,
+                                                       instance_size,
+                                                       platform,
+                                                       llvm_opt_level_ir,
+                                                       llvm_opt_level_codegen,
+                                                       gpu_execution_parameters);
                     benchmark.run(ast);
                 } else {
-                    benchmark::LLVMBenchmark benchmark(
-                        visitor, modfile, output_dir, shared_lib_paths,
-                        num_experiments, instance_size, platform,
-                        llvm_opt_level_ir, llvm_opt_level_codegen);
+                    benchmark::LLVMBenchmark benchmark(visitor,
+                                                       modfile,
+                                                       output_dir,
+                                                       shared_lib_paths,
+                                                       num_experiments,
+                                                       instance_size,
+                                                       platform,
+                                                       llvm_opt_level_ir,
+                                                       llvm_opt_level_codegen);
                     benchmark.run(ast);
                 }
             }
diff --git a/test/benchmark/CMakeLists.txt b/test/benchmark/CMakeLists.txt
index 684ca0c58c..2c83f2326c 100644
--- a/test/benchmark/CMakeLists.txt
+++ b/test/benchmark/CMakeLists.txt
@@ -3,6 +3,7 @@
 # =============================================================================
 set(LLVM_BENCHMARK_SOURCE_FILES
     ${CMAKE_CURRENT_SOURCE_DIR}/jit_driver.cpp ${CMAKE_CURRENT_SOURCE_DIR}/jit_driver.hpp
+    ${CMAKE_CURRENT_SOURCE_DIR}/cuda_driver.cpp ${CMAKE_CURRENT_SOURCE_DIR}/cuda_driver.hpp
     ${CMAKE_CURRENT_SOURCE_DIR}/llvm_benchmark.cpp ${CMAKE_CURRENT_SOURCE_DIR}/llvm_benchmark.hpp)
 # =============================================================================
diff --git a/test/benchmark/benchmark_info.hpp b/test/benchmark/benchmark_info.hpp
new file mode 100644
index 0000000000..c63219e4e8
--- /dev/null
+++ b/test/benchmark/benchmark_info.hpp
@@ -0,0 +1,29 @@
+/*************************************************************************
+ * Copyright (C) 2018-2022 Blue Brain Project
+ *
+ * This file is part of NMODL distributed under the terms of the GNU
+ * Lesser General Public License. See top-level LICENSE file for details.
+ *************************************************************************/
+
+#pragma once
+
+#include <string>
+#include <vector>
+
+/// A struct to hold the information for benchmarking.
+struct BenchmarkInfo {
+    /// Object filename to dump.
+    std::string filename;
+
+    /// Object file output directory.
+    std::string output_dir;
+
+    /// Shared libraries' paths to link against.
+    std::vector<std::string> shared_lib_paths;
+
+    /// Optimisation level for IR.
+    int opt_level_ir;
+
+    /// Optimisation level for machine code generation.
+ int opt_level_codegen; +}; \ No newline at end of file diff --git a/test/benchmark/cuda_driver.cpp b/test/benchmark/cuda_driver.cpp new file mode 100644 index 0000000000..b9ec0e3eca --- /dev/null +++ b/test/benchmark/cuda_driver.cpp @@ -0,0 +1,93 @@ +/************************************************************************* + * Copyright (C) 2018-2022 Blue Brain Project + * + * This file is part of NMODL distributed under the terms of the GNU + * Lesser General Public License. See top-level LICENSE file for details. + *************************************************************************/ + +#ifdef NMODL_LLVM_CUDA_BACKEND + +#include "cuda_driver.hpp" +#include "codegen/llvm/codegen_llvm_visitor.hpp" +#include "codegen/llvm/llvm_utils.hpp" +#include "utils/common_utils.hpp" + +namespace nmodl { +namespace runner { + +void checkCudaErrors(CUresult err) { + if (err != CUDA_SUCCESS) { + const char* ret = NULL; + cuGetErrorName(err, &ret); + throw std::runtime_error("CUDA error: " + std::string(ret)); + } +} + +void checkNVVMErrors(nvvmResult err) { + if (err != NVVM_SUCCESS) { + throw std::runtime_error("NVVM Error: " + std::string(nvvmGetErrorString(err))); + } +} +void CUDADriver::init(const std::string& gpu, BenchmarkInfo* benchmark_info) { + // CUDA initialization + checkCudaErrors(cuInit(0)); + checkCudaErrors(cuDeviceGetCount(&device_info.count)); + checkCudaErrors(cuDeviceGet(&device, 0)); + + char name[128]; + checkCudaErrors(cuDeviceGetName(name, 128, device)); + device_info.name = name; + std::cout << "Using CUDA Device [0]: " << device_info.name << "\n"; + + checkCudaErrors(cuDeviceGetAttribute(&device_info.compute_version_major, + CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, + device)); + checkCudaErrors(cuDeviceGetAttribute(&device_info.compute_version_minor, + CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, + device)); + std::cout << "Device Compute Capability: " << device_info.compute_version_major << "." 
+ << device_info.compute_version_minor << "\n"; + if (device_info.compute_version_major < 2) { + throw std::runtime_error("ERROR: Device 0 is not SM 2.0 or greater"); + } + + // Save the LLVM IR module to string + std::string kernel_llvm_ir; + llvm::raw_string_ostream os(kernel_llvm_ir); + os << *module; + os.flush(); + + // Create NVVM program object + nvvmCreateProgram(&prog); + + // Add custom IR to program + nvvmAddModuleToProgram(prog, kernel_llvm_ir.c_str(), kernel_llvm_ir.size(), "nmodl_llvm_ir"); + + // Declare compile options + const char* options[] = {"-arch=compute_60"}; + + // Compile the program + nvvmCompileProgram(prog, 1, options); + + // Get compiled module + char* compiled_module; + size_t compiled_module_size; + nvvmGetCompiledResultSize(prog, &compiled_module_size); + compiled_module = (char*) malloc(compiled_module_size); + nvvmGetCompiledResult(prog, compiled_module); + ptx_compiled_module = std::string(compiled_module, compiled_module_size); + + // Create driver context + checkCudaErrors(cuCtxCreate(&context, 0, device)); + + // Create module for object + checkCudaErrors(cuModuleLoadDataEx(&cudaModule, compiled_module, 0, 0, 0)); + + // // Get kernel function + // checkCudaErrors(cuModuleGetFunction(&function, cudaModule, "kernel")); +} + +} // namespace runner +} // namespace nmodl + +#endif diff --git a/test/benchmark/cuda_driver.hpp b/test/benchmark/cuda_driver.hpp new file mode 100644 index 0000000000..574fc7305b --- /dev/null +++ b/test/benchmark/cuda_driver.hpp @@ -0,0 +1,212 @@ +/************************************************************************* + * Copyright (C) 2018-2022 Blue Brain Project + * + * This file is part of NMODL distributed under the terms of the GNU + * Lesser General Public License. See top-level LICENSE file for details. + *************************************************************************/ + +#pragma once + +/** + * \dir + * \brief Implementation of CUDA and NVVM-based execution engine to run functions from MOD files + * + * \file + * \brief \copybrief nmodl::runner::CUDADriver + */ + +#ifdef NMODL_LLVM_CUDA_BACKEND + +#include +#include + +#include "llvm/IR/Module.h" + +#include "benchmark_info.hpp" +#include "cuda.h" +#include "gpu_parameters.hpp" +#include "nvvm.h" + +using nmodl::cuda_details::GPUExecutionParameters; + +namespace nmodl { +namespace runner { + +struct DeviceInfo { + int count; + std::string name; + int compute_version_major; + int compute_version_minor; +}; + +/** + * @brief Throw meaningful error in case CUDA API call fails + * + * Checks whether a call to the CUDA API was succsful and if not it throws a runntime_error with + * the error message from CUDA. + * + * @param err Return value of the CUDA API call + */ +void checkCudaErrors(CUresult err); + +/** + * @brief Throw meaningful error in case NVVM API call fails + * + * Checks whether a call to the NVVM API was succsful and if not it throws a runntime_error with + * the error message from NVVM. + * + * @param err Return value of the NVVM API call + */ +void checkNVVMErrors(nvvmResult err); + +/** + * \class CUDADriver + * \brief Driver to execute a MOD file function via the CUDA and NVVM backend. + */ +class CUDADriver { + /// LLVM IR module to execute. 
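+    /// NVVM lowers this module to PTX in init(); the driver owns it from then on.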
+    std::unique_ptr<llvm::Module> module;
+    nvvmProgram prog;
+    CUdevice device;
+    CUmodule cudaModule;
+    CUcontext context;
+    CUfunction function;
+    CUlinkState linker;
+    DeviceInfo device_info;
+    std::string ptx_compiled_module;
+
+  public:
+    explicit CUDADriver(std::unique_ptr<llvm::Module> m)
+        : module(std::move(m)) {}
+
+    /// Initializes the CUDA GPU JIT driver.
+    void init(const std::string& gpu, BenchmarkInfo* benchmark_info = nullptr);
+
+    /// Lookups the entry-point without arguments in the CUDA module and executes it.
+    void execute_without_arguments(const std::string& entry_point,
+                                   const GPUExecutionParameters& gpu_execution_parameters) {
+        // Get kernel function
+        checkCudaErrors(cuModuleGetFunction(&function, cudaModule, entry_point.c_str()));
+
+        // Kernel launch
+        void** kernel_parameters = nullptr;
+        checkCudaErrors(cuLaunchKernel(function,
+                                       gpu_execution_parameters.gridDimX,
+                                       gpu_execution_parameters.gridDimY,
+                                       gpu_execution_parameters.gridDimZ,
+                                       gpu_execution_parameters.blockDimX,
+                                       gpu_execution_parameters.blockDimY,
+                                       gpu_execution_parameters.blockDimZ,
+                                       gpu_execution_parameters.sharedMemBytes,
+                                       nullptr,
+                                       kernel_parameters,
+                                       nullptr));
+    }
+
+    /// Lookups the entry-point with arguments in the CUDA module and executes it.
+    template <typename ArgType>
+    void execute_with_arguments(const std::string& entry_point,
+                                ArgType arg,
+                                const GPUExecutionParameters& gpu_execution_parameters) {
+        // Get kernel function
+        checkCudaErrors(cuModuleGetFunction(&function, cudaModule, entry_point.c_str()));
+
+        // Kernel launch
+        void* kernel_parameters[] = {&arg};
+        checkCudaErrors(cuLaunchKernel(function,
+                                       gpu_execution_parameters.gridDimX,
+                                       gpu_execution_parameters.gridDimY,
+                                       gpu_execution_parameters.gridDimZ,
+                                       gpu_execution_parameters.blockDimX,
+                                       gpu_execution_parameters.blockDimY,
+                                       gpu_execution_parameters.blockDimZ,
+                                       gpu_execution_parameters.sharedMemBytes,
+                                       nullptr,
+                                       kernel_parameters,
+                                       nullptr));
+    }
+};
+
+/**
+ * \class BaseGPURunner
+ * \brief A base runner class that provides functionality to execute an
+ * entry point in the CUDA module.
+ */
+class BaseGPURunner {
+  protected:
+    std::unique_ptr<CUDADriver> driver;
+
+    explicit BaseGPURunner(std::unique_ptr<llvm::Module> m)
+        : driver(std::make_unique<CUDADriver>(std::move(m))) {}
+
+  public:
+    /// Sets up the CUDA driver.
+    virtual void initialize_driver() = 0;
+
+    /// Runs the entry-point function without arguments.
+    void run_without_arguments(const std::string& entry_point,
+                               const GPUExecutionParameters& gpu_execution_parameters) {
+        return driver->execute_without_arguments(entry_point, gpu_execution_parameters);
+    }
+
+    /// Runs the entry-point function with a pointer to the data as an argument.
+    template <typename ArgType>
+    void run_with_argument(const std::string& entry_point,
+                           ArgType arg,
+                           const GPUExecutionParameters& gpu_execution_parameters) {
+        return driver->template execute_with_arguments<ArgType>(entry_point, arg, gpu_execution_parameters);
+    }
+};
+
+/**
+ * \class TestGPURunner
+ * \brief A simple runner for testing purposes.
+ */
+class TestGPURunner: public BaseGPURunner {
+    /// GPU backend to target.
+    std::string backend;
+
+  public:
+    explicit TestGPURunner(std::unique_ptr<llvm::Module> m, std::string backend)
+        : BaseGPURunner(std::move(m))
+        , backend(backend) {}
+
+    virtual void initialize_driver() {
+        this->driver->init(backend);
+    }
+};
+
+/**
+ * \class BenchmarkGPURunner
+ * \brief A runner with benchmarking functionality. It takes user-specified GPU
+ * features into account and can link against shared libraries.
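+ * Shared library paths, the output directory and the optimisation levels are
+ * forwarded to the CUDA driver through BenchmarkInfo.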
+ */
+class BenchmarkGPURunner: public BaseGPURunner {
+  private:
+    /// Benchmarking information passed to the CUDA driver.
+    BenchmarkInfo benchmark_info;
+
+    /// Backend to target.
+    std::string backend;
+
+  public:
+    BenchmarkGPURunner(std::unique_ptr<llvm::Module> m,
+                       std::string filename,
+                       std::string output_dir,
+                       std::string backend,
+                       std::vector<std::string> lib_paths = {},
+                       int opt_level_ir = 0,
+                       int opt_level_codegen = 0)
+        : BaseGPURunner(std::move(m))
+        , backend(backend)
+        , benchmark_info{filename, output_dir, lib_paths, opt_level_ir, opt_level_codegen} {}
+
+    virtual void initialize_driver() {
+        this->driver->init(backend, &benchmark_info);
+    }
+};
+
+
+}  // namespace runner
+}  // namespace nmodl
+
+#endif
diff --git a/test/benchmark/gpu_parameters.hpp b/test/benchmark/gpu_parameters.hpp
index fa32bf7602..7d52b28757 100644
--- a/test/benchmark/gpu_parameters.hpp
+++ b/test/benchmark/gpu_parameters.hpp
@@ -28,5 +28,5 @@ struct GPUExecutionParameters {
     int sharedMemBytes;
 };
 
-} // namespace cuda_details
-} // namespace nmodl
+}  // namespace cuda_details
+}  // namespace nmodl
diff --git a/test/benchmark/jit_driver.cpp b/test/benchmark/jit_driver.cpp
index 96b73c3613..f91b41cda0 100644
--- a/test/benchmark/jit_driver.cpp
+++ b/test/benchmark/jit_driver.cpp
@@ -201,76 +201,5 @@ void JITDriver::init(const std::string& cpu, BenchmarkInfo* benchmark_info) {
             llvm::orc::DumpObjects(benchmark_info->output_dir, benchmark_info->filename));
     }
 }
-
-#ifdef NMODL_LLVM_CUDA_BACKEND
-void checkCudaErrors(CUresult err) {
-    if (err != CUDA_SUCCESS) {
-        const char *ret = NULL;
-        cuGetErrorName(err, &ret);
-        throw std::runtime_error("CUDA error: " + std::string(ret));
-    }
-}
-
-void checkNVVMErrors(nvvmResult err) {
-    if (err != NVVM_SUCCESS) {
-        throw std::runtime_error("NVVM Error: " + std::string(nvvmGetErrorString(err)));
-    }
-}
-void CUDADriver::init(const std::string& gpu, BenchmarkInfo* benchmark_info) {
-    // CUDA initialization
-    checkCudaErrors(cuInit(0));
-    checkCudaErrors(cuDeviceGetCount(&device_info.count));
-    checkCudaErrors(cuDeviceGet(&device, 0));
-
-    char name[128];
-    checkCudaErrors(cuDeviceGetName(name, 128, device));
-    device_info.name = name;
-    std::cout << "Using CUDA Device [0]: " << device_info.name << "\n";
-
-    checkCudaErrors(cuDeviceGetAttribute(&device_info.compute_version_major, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, device));
-    checkCudaErrors(cuDeviceGetAttribute(&device_info.compute_version_minor, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, device));
-    std::cout << "Device Compute Capability: "
-              << device_info.compute_version_major << "."
<< device_info.compute_version_minor << "\n"; - if (device_info.compute_version_major < 2) { - throw std::runtime_error("ERROR: Device 0 is not SM 2.0 or greater"); - } - - // Save the LLVM IR module to string - std::string kernel_llvm_ir; - llvm::raw_string_ostream os(kernel_llvm_ir); - os << *module; - os.flush(); - - // Create NVVM program object - nvvmCreateProgram(&prog); - - // Add custom IR to program - nvvmAddModuleToProgram(prog, kernel_llvm_ir.c_str(), kernel_llvm_ir.size(), "nmodl_llvm_ir"); - - // Declare compile options - const char *options[] = { "-arch=compute_60" }; - - // Compile the program - nvvmCompileProgram(prog, 1, options); - - // Get compiled module - char* compiled_module; - size_t compiled_module_size; - nvvmGetCompiledResultSize(prog, &compiled_module_size); - compiled_module = (char*)malloc(compiled_module_size); - nvvmGetCompiledResult(prog, compiled_module); - ptx_compiled_module = std::string(compiled_module, compiled_module_size); - - // Create driver context - checkCudaErrors(cuCtxCreate(&context, 0, device)); - - // Create module for object - checkCudaErrors(cuModuleLoadDataEx(&cudaModule, compiled_module, 0, 0, 0)); - - // // Get kernel function - // checkCudaErrors(cuModuleGetFunction(&function, cudaModule, "kernel")); -} -#endif - } // namespace runner } // namespace nmodl diff --git a/test/benchmark/jit_driver.hpp b/test/benchmark/jit_driver.hpp index f04093dd06..96b46a447c 100644 --- a/test/benchmark/jit_driver.hpp +++ b/test/benchmark/jit_driver.hpp @@ -15,45 +15,21 @@ * \brief \copybrief nmodl::runner::JITDriver */ +#include "benchmark_info.hpp" + #include "llvm/ExecutionEngine/JITEventListener.h" #include "llvm/ExecutionEngine/Orc/LLJIT.h" #include "llvm/Support/Host.h" -#ifdef NMODL_LLVM_CUDA_BACKEND -#include "cuda.h" -#include "nvvm.h" -#include "gpu_parameters.hpp" -#endif - -using nmodl::cuda_details::GPUExecutionParameters; - namespace nmodl { namespace runner { -/// A struct to hold the information for benchmarking. -struct BenchmarkInfo { - /// Object filename to dump. - std::string filename; - - /// Object file output directory. - std::string output_dir; - - /// Shared libraries' paths to link against. - std::vector shared_lib_paths; - - /// Optimisation level for IT. - int opt_level_ir; - - /// Optimisation level for machine code generation. - int opt_level_codegen; -}; - /** * \class JITDriver * \brief Driver to execute a MOD file function via LLVM IR backend. */ class JITDriver { - protected: + private: std::unique_ptr context = std::make_unique(); std::unique_ptr jit; @@ -100,107 +76,19 @@ class JITDriver { ReturnType result = res(arg); return result; } - - /// Lookups the entry-point with an argument in the JIT and executes it, returning the result. - template - ReturnType execute_with_arguments(const std::string& entry_point, ArgType1 arg1, ArgType2 arg2) { - auto expected_symbol = jit->lookup(entry_point); - if (!expected_symbol) - throw std::runtime_error("Error: entry-point symbol not found in JIT\n"); - - auto (*res)(ArgType1, ArgType2) = (ReturnType(*)(ArgType1, ArgType2))(intptr_t) expected_symbol->getAddress(); - ReturnType result = res(arg1, arg2); - return result; - } -}; - -#ifdef NMODL_LLVM_CUDA_BACKEND -struct DeviceInfo { - int count; - std::string name; - int compute_version_major; - int compute_version_minor; }; -/** - * @brief Throw meaningful error in case CUDA API call fails - * - * Checks whether a call to the CUDA API was succsful and if not it throws a runntime_error with - * the error message from CUDA. 
- * - * @param err Return value of the CUDA API call - */ -void checkCudaErrors(CUresult err); - -/** - * @brief Throw meaningful error in case NVVM API call fails - * - * Checks whether a call to the NVVM API was succsful and if not it throws a runntime_error with - * the error message from NVVM. - * - * @param err Return value of the NVVM API call - */ -void checkNVVMErrors(nvvmResult err); - -class CUDADriver { - /// LLVM IR module to execute. - std::unique_ptr module; - nvvmProgram prog; - CUdevice device; - CUmodule cudaModule; - CUcontext context; - CUfunction function; - CUlinkState linker; - DeviceInfo device_info; - std::string ptx_compiled_module; - - public: - explicit CUDADriver(std::unique_ptr m) - : module(std::move(m)) {} - - /// Initializes the CUDA GPU JIT driver. - void init(const std::string& gpu, BenchmarkInfo* benchmark_info = nullptr); - - /// Lookups the entry-point without arguments in the CUDA module and executes it. - template - ReturnType execute_with_arguments(const std::string& entry_point, ArgType2 gpu_execution_parameters) { - // Get kernel function - checkCudaErrors(cuModuleGetFunction(&function, cudaModule, entry_point.c_str())); - - // Kernel launch - void *kernel_parameters[] = {}; - checkCudaErrors(cuLaunchKernel(function, gpu_execution_parameters.gridDimX, gpu_execution_parameters.gridDimY, gpu_execution_parameters.gridDimY, - gpu_execution_parameters.blockDimX, gpu_execution_parameters.blockDimY, gpu_execution_parameters.blockDimY, - gpu_execution_parameters.sharedMemBytes, nullptr, kernel_parameters, nullptr)); - } - - /// Lookups the entry-point with arguments in the CUDA module and executes it. - template - ReturnType execute_with_arguments(const std::string& entry_point, ArgType1 arg1, ArgType2 gpu_execution_parameters) { - // Get kernel function - checkCudaErrors(cuModuleGetFunction(&function, cudaModule, entry_point.c_str())); - - // Kernel launch - void *kernel_parameters[] = {&arg1}; - checkCudaErrors(cuLaunchKernel(function, gpu_execution_parameters.gridDimX, gpu_execution_parameters.gridDimY, gpu_execution_parameters.gridDimY, - gpu_execution_parameters.blockDimX, gpu_execution_parameters.blockDimY, gpu_execution_parameters.blockDimY, - gpu_execution_parameters.sharedMemBytes, nullptr, kernel_parameters, nullptr)); - } -}; -#endif - /** * \class BaseRunner * \brief A base runner class that provides functionality to execute an * entry point in the LLVM IR module. */ -template class BaseRunner { protected: - std::unique_ptr driver; + std::unique_ptr driver; - explicit BaseRunner(std::unique_ptr m) - : driver(std::make_unique(std::move(m))) {} + explicit BaseRunner(std::unique_ptr m) + : driver(std::make_unique(std::move(m))) {} public: /// Sets up the JIT driver. @@ -217,26 +105,19 @@ class BaseRunner { ReturnType run_with_argument(const std::string& entry_point, ArgType arg) { return driver->template execute_with_arguments(entry_point, arg); } - - /// Runs the entry-point function with a pointer to the data as an argument. - template - ReturnType run_with_argument(const std::string& entry_point, ArgType1 arg1, ArgType2 arg2) { - return driver->template execute_with_arguments(entry_point, arg1, arg2); - } }; /** * \class TestRunner * \brief A simple runner for testing purposes. 
*/ -template -class TestRunner: public BaseRunner { +class TestRunner: public BaseRunner { public: - explicit TestRunner(std::unique_ptr m) - : BaseRunner(std::move(m)) {} + explicit TestRunner(std::unique_ptr m) + : BaseRunner(std::move(m)) {} virtual void initialize_driver() { - this->driver->init(llvm::sys::getHostCPUName().str()); + driver->init(llvm::sys::getHostCPUName().str()); } }; @@ -245,29 +126,28 @@ class TestRunner: public BaseRunner { * \brief A runner with benchmarking functionality. It takes user-specified CPU * features into account, as well as it can link against shared libraries. */ -template -class BenchmarkRunner: public BaseRunner { +class BenchmarkRunner: public BaseRunner { private: /// Benchmarking information passed to JIT driver. BenchmarkInfo benchmark_info; - /// Beckend to target. - std::string backend; + /// CPU to target. + std::string cpu; public: - BenchmarkRunner(std::unique_ptr m, + BenchmarkRunner(std::unique_ptr m, std::string filename, std::string output_dir, - std::string backend, + std::string cpu, std::vector lib_paths = {}, int opt_level_ir = 0, int opt_level_codegen = 0) - : BaseRunner(std::move(m)) - , backend(backend) + : BaseRunner(std::move(m)) + , cpu(cpu) , benchmark_info{filename, output_dir, lib_paths, opt_level_ir, opt_level_codegen} {} virtual void initialize_driver() { - this->driver->init(backend, &benchmark_info); + driver->init(cpu, &benchmark_info); } }; diff --git a/test/benchmark/llvm_benchmark.cpp b/test/benchmark/llvm_benchmark.cpp index a454d040b5..cd85ae7b35 100644 --- a/test/benchmark/llvm_benchmark.cpp +++ b/test/benchmark/llvm_benchmark.cpp @@ -9,6 +9,7 @@ #include "codegen/llvm/codegen_llvm_visitor.hpp" #include "llvm_benchmark.hpp" +#include "test/benchmark/cuda_driver.hpp" #include "test/benchmark/jit_driver.hpp" #include "llvm/Support/Host.h" @@ -55,7 +56,7 @@ void LLVMBenchmark::run_benchmark_on_cpu(const std::shared_ptr& no // Create the benchmark runner and initialize it. std::string filename = "v" + std::to_string(llvm_visitor.get_vector_width()) + "_" + mod_filename; - runner::BenchmarkRunner<> runner( + runner::BenchmarkRunner runner( std::move(m), filename, output_dir, cpu_name, shared_libs, opt_level_ir, opt_level_codegen); runner.initialize_driver(); @@ -116,7 +117,7 @@ void LLVMBenchmark::run_benchmark_on_gpu(const std::shared_ptr& no // Create the benchmark runner and initialize it. std::string filename = "cuda_" + mod_filename; - runner::BenchmarkRunner runner( + runner::BenchmarkGPURunner runner( std::move(m), filename, output_dir, gpu_name, shared_libs, opt_level_ir, opt_level_codegen); runner.initialize_driver(); @@ -140,7 +141,9 @@ void LLVMBenchmark::run_benchmark_on_gpu(const std::shared_ptr& no // Record the execution time of the kernel. 
std::string wrapper_name = "__" + kernel_name + "_wrapper"; auto start = std::chrono::steady_clock::now(); - runner.run_with_argument(kernel_name, instance_data.base_ptr, gpu_execution_parameters); + runner.run_with_argument(kernel_name, + instance_data.base_ptr, + gpu_execution_parameters); auto end = std::chrono::steady_clock::now(); std::chrono::duration diff = end - start; diff --git a/test/unit/codegen/codegen_llvm_execution.cpp b/test/unit/codegen/codegen_llvm_execution.cpp index 4ab271e740..4c9515f814 100644 --- a/test/unit/codegen/codegen_llvm_execution.cpp +++ b/test/unit/codegen/codegen_llvm_execution.cpp @@ -134,7 +134,7 @@ SCENARIO("Arithmetic expression", "[llvm][runner]") { llvm_visitor.visit_program(*ast); std::unique_ptr m = llvm_visitor.get_module(); - TestRunner<> runner(std::move(m)); + TestRunner runner(std::move(m)); runner.initialize_driver(); THEN("functions are evaluated correctly") { @@ -240,7 +240,7 @@ SCENARIO("Optimised arithmetic expression", "[llvm][runner]") { llvm_visitor.visit_program(*ast); std::unique_ptr m = llvm_visitor.get_module(); - TestRunner<> runner(std::move(m)); + TestRunner runner(std::move(m)); runner.initialize_driver(); THEN("optimizations preserve function results") { @@ -337,7 +337,7 @@ SCENARIO("Simple scalar kernel", "[llvm][runner]") { // Set up the JIT runner. std::unique_ptr module = llvm_visitor.get_module(); - TestRunner<> runner(std::move(module)); + TestRunner runner(std::move(module)); runner.initialize_driver(); THEN("Values in struct have changed according to the formula") { @@ -427,7 +427,7 @@ SCENARIO("Simple vectorised kernel", "[llvm][runner]") { // Set up the JIT runner. std::unique_ptr module = llvm_visitor.get_module(); - TestRunner<> runner(std::move(module)); + TestRunner runner(std::move(module)); runner.initialize_driver(); THEN("Values in struct have changed according to the formula") { @@ -505,7 +505,7 @@ SCENARIO("Vectorised kernel with scatter instruction", "[llvm][runner]") { // Set up the JIT runner. std::unique_ptr module = llvm_visitor.get_module(); - TestRunner<> runner(std::move(module)); + TestRunner runner(std::move(module)); runner.initialize_driver(); THEN("Ion values in struct have been updated correctly") { @@ -606,7 +606,7 @@ SCENARIO("Vectorised kernel with simple control flow", "[llvm][runner]") { // Set up the JIT runner. std::unique_ptr module = llvm_visitor.get_module(); - TestRunner<> runner(std::move(module)); + TestRunner runner(std::move(module)); runner.initialize_driver(); THEN("Masked instructions are generated") { From 0882fc33952b1408b60b2073d1c77e7647fa5f5c Mon Sep 17 00:00:00 2001 From: Ioannis Magkanaris Date: Fri, 18 Mar 2022 18:27:57 +0100 Subject: [PATCH 210/331] Small fixes and setting the compute arch in options --- src/codegen/llvm/main.cpp | 2 +- test/benchmark/cuda_driver.cpp | 9 +++++---- 2 files changed, 6 insertions(+), 5 deletions(-) diff --git a/src/codegen/llvm/main.cpp b/src/codegen/llvm/main.cpp index 4f25474dbd..92d8a486c1 100644 --- a/src/codegen/llvm/main.cpp +++ b/src/codegen/llvm/main.cpp @@ -67,7 +67,7 @@ int main(int argc, const char* argv[]) { throw std::runtime_error( "Error: entry-point functions with non-double return type are not supported\n"); - TestRunner<> runner(std::move(module)); + TestRunner runner(std::move(module)); runner.initialize_driver(); // Since only double type is supported, provide explicit double type to the running function. 
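Before the next change, it helps to see the whole pipeline that CUDADriver::init() implements in one place. The following is a condensed, illustrative sketch of that flow, not a verbatim extract: error checks, device queries and PTX dumping are elided, and `llvm_ir` stands for the textual print-out of the generated LLVM module.

```cpp
// LLVM IR -> NVVM -> PTX -> loaded CUmodule, condensed from cuda_driver.cpp.
nvvmProgram prog;
nvvmCreateProgram(&prog);
nvvmAddModuleToProgram(prog, llvm_ir.c_str(), llvm_ir.size(), "nmodl_llvm_ir");

// The patch below derives this option from the device's compute capability
// instead of hard-coding compute_60.
const char* options[] = {"-arch=compute_60"};
nvvmCompileProgram(prog, 1, options);

// Fetch the compiled PTX as a string.
size_t ptx_size;
nvvmGetCompiledResultSize(prog, &ptx_size);
std::string ptx(ptx_size, '\0');
nvvmGetCompiledResult(prog, &ptx[0]);

// Load the PTX into a CUDA module on device 0.
CUdevice device;
CUcontext context;
CUmodule cuda_module;
cuInit(0);
cuDeviceGet(&device, /*ordinal=*/0);
cuCtxCreate(&context, 0, device);
cuModuleLoadDataEx(&cuda_module, ptx.c_str(), 0, nullptr, nullptr);
```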
diff --git a/test/benchmark/cuda_driver.cpp b/test/benchmark/cuda_driver.cpp index b9ec0e3eca..438c8a80da 100644 --- a/test/benchmark/cuda_driver.cpp +++ b/test/benchmark/cuda_driver.cpp @@ -10,8 +10,11 @@ #include "cuda_driver.hpp" #include "codegen/llvm/codegen_llvm_visitor.hpp" #include "codegen/llvm/llvm_utils.hpp" +#include "fmt/format.h" #include "utils/common_utils.hpp" +using fmt::literals::operator""_format; + namespace nmodl { namespace runner { @@ -64,7 +67,8 @@ void CUDADriver::init(const std::string& gpu, BenchmarkInfo* benchmark_info) { nvvmAddModuleToProgram(prog, kernel_llvm_ir.c_str(), kernel_llvm_ir.size(), "nmodl_llvm_ir"); // Declare compile options - const char* options[] = {"-arch=compute_60"}; + const auto arch_option = "-arch=compute_{}0"_format(device_info.compute_version_major); + const char* options[] = {arch_option.c_str()}; // Compile the program nvvmCompileProgram(prog, 1, options); @@ -82,9 +86,6 @@ void CUDADriver::init(const std::string& gpu, BenchmarkInfo* benchmark_info) { // Create module for object checkCudaErrors(cuModuleLoadDataEx(&cudaModule, compiled_module, 0, 0, 0)); - - // // Get kernel function - // checkCudaErrors(cuModuleGetFunction(&function, cudaModule, "kernel")); } } // namespace runner From 0ec082625842d6cc88efab040e1126612e761175 Mon Sep 17 00:00:00 2001 From: Ioannis Magkanaris Date: Mon, 21 Mar 2022 12:29:46 +0100 Subject: [PATCH 211/331] Tried workflow with test kernel and linking with libdevice --- test/benchmark/cuda_driver.cpp | 38 ++++++++++++++++++++++++++++++++-- 1 file changed, 36 insertions(+), 2 deletions(-) diff --git a/test/benchmark/cuda_driver.cpp b/test/benchmark/cuda_driver.cpp index 438c8a80da..3610fec858 100644 --- a/test/benchmark/cuda_driver.cpp +++ b/test/benchmark/cuda_driver.cpp @@ -7,12 +7,16 @@ #ifdef NMODL_LLVM_CUDA_BACKEND +#include + #include "cuda_driver.hpp" #include "codegen/llvm/codegen_llvm_visitor.hpp" #include "codegen/llvm/llvm_utils.hpp" #include "fmt/format.h" #include "utils/common_utils.hpp" +#include + using fmt::literals::operator""_format; namespace nmodl { @@ -31,6 +35,31 @@ void checkNVVMErrors(nvvmResult err) { throw std::runtime_error("NVVM Error: " + std::string(nvvmGetErrorString(err))); } } + +std::string load_file_to_string(const std::string& filename) { + std::ifstream t(filename); + if (!t.is_open()) { + throw std::runtime_error("File {} not found"_format(filename)); + } + std::string str((std::istreambuf_iterator(t)), + std::istreambuf_iterator()); + return str; +} + +void load_libraries(const nvvmProgram& program, const BenchmarkInfo& benchmark_info) { + for (const auto& lib_path : benchmark_info.shared_lib_paths) { + const auto lib_name = lib_path.substr(lib_path.find_last_of("/\\") + 1); + std::regex libdevice_bitcode_name{"libdevice.*.bc"}; + if (!std::regex_match(lib_name, libdevice_bitcode_name)) { + throw std::runtime_error("Only libdevice is supported for now"); + } + // Load libdevice module to the NVVM program + const auto libdevice_module = load_file_to_string(lib_path); + const auto libdevice_module_size = libdevice_module.size(); + checkNVVMErrors(nvvmAddModuleToProgram(program, libdevice_module.c_str(), libdevice_module_size, "libdevice")); + } +} + void CUDADriver::init(const std::string& gpu, BenchmarkInfo* benchmark_info) { // CUDA initialization checkCudaErrors(cuInit(0)); @@ -63,6 +92,10 @@ void CUDADriver::init(const std::string& gpu, BenchmarkInfo* benchmark_info) { // Create NVVM program object nvvmCreateProgram(&prog); + // Load the external libraries modules 
to the NVVM program + // Currently only libdevice is supported + load_libraries(prog, *benchmark_info); + // Add custom IR to program nvvmAddModuleToProgram(prog, kernel_llvm_ir.c_str(), kernel_llvm_ir.size(), "nmodl_llvm_ir"); @@ -79,13 +112,14 @@ void CUDADriver::init(const std::string& gpu, BenchmarkInfo* benchmark_info) { nvvmGetCompiledResultSize(prog, &compiled_module_size); compiled_module = (char*) malloc(compiled_module_size); nvvmGetCompiledResult(prog, compiled_module); - ptx_compiled_module = std::string(compiled_module, compiled_module_size); + ptx_compiled_module = std::string(compiled_module); + free(compiled_module); // Create driver context checkCudaErrors(cuCtxCreate(&context, 0, device)); // Create module for object - checkCudaErrors(cuModuleLoadDataEx(&cudaModule, compiled_module, 0, 0, 0)); + checkCudaErrors(cuModuleLoadDataEx(&cudaModule, ptx_compiled_module.c_str(), 0, 0, 0)); } } // namespace runner From 0f72a74a9b82b355b7a04538cd77d738c1ef1cea Mon Sep 17 00:00:00 2001 From: Ioannis Magkanaris Date: Mon, 21 Mar 2022 12:38:47 +0100 Subject: [PATCH 212/331] Make clang-format happy and only compile gpu benchmark if cuda backend is enabled --- test/benchmark/cuda_driver.cpp | 13 ++++++------- test/benchmark/llvm_benchmark.cpp | 7 +++++++ 2 files changed, 13 insertions(+), 7 deletions(-) diff --git a/test/benchmark/cuda_driver.cpp b/test/benchmark/cuda_driver.cpp index 3610fec858..b7f23db50c 100644 --- a/test/benchmark/cuda_driver.cpp +++ b/test/benchmark/cuda_driver.cpp @@ -7,16 +7,15 @@ #ifdef NMODL_LLVM_CUDA_BACKEND +#include #include -#include "cuda_driver.hpp" #include "codegen/llvm/codegen_llvm_visitor.hpp" #include "codegen/llvm/llvm_utils.hpp" +#include "cuda_driver.hpp" #include "fmt/format.h" #include "utils/common_utils.hpp" -#include - using fmt::literals::operator""_format; namespace nmodl { @@ -41,13 +40,12 @@ std::string load_file_to_string(const std::string& filename) { if (!t.is_open()) { throw std::runtime_error("File {} not found"_format(filename)); } - std::string str((std::istreambuf_iterator(t)), - std::istreambuf_iterator()); + std::string str((std::istreambuf_iterator(t)), std::istreambuf_iterator()); return str; } void load_libraries(const nvvmProgram& program, const BenchmarkInfo& benchmark_info) { - for (const auto& lib_path : benchmark_info.shared_lib_paths) { + for (const auto& lib_path: benchmark_info.shared_lib_paths) { const auto lib_name = lib_path.substr(lib_path.find_last_of("/\\") + 1); std::regex libdevice_bitcode_name{"libdevice.*.bc"}; if (!std::regex_match(lib_name, libdevice_bitcode_name)) { @@ -56,7 +54,8 @@ void load_libraries(const nvvmProgram& program, const BenchmarkInfo& benchmark_i // Load libdevice module to the NVVM program const auto libdevice_module = load_file_to_string(lib_path); const auto libdevice_module_size = libdevice_module.size(); - checkNVVMErrors(nvvmAddModuleToProgram(program, libdevice_module.c_str(), libdevice_module_size, "libdevice")); + checkNVVMErrors(nvvmAddModuleToProgram( + program, libdevice_module.c_str(), libdevice_module_size, "libdevice")); } } diff --git a/test/benchmark/llvm_benchmark.cpp b/test/benchmark/llvm_benchmark.cpp index cd85ae7b35..6442acf61f 100644 --- a/test/benchmark/llvm_benchmark.cpp +++ b/test/benchmark/llvm_benchmark.cpp @@ -103,6 +103,7 @@ void LLVMBenchmark::run_benchmark_on_cpu(const std::shared_ptr& no } } +#ifdef NMODL_LLVM_CUDA_BACKEND void LLVMBenchmark::run_benchmark_on_gpu(const std::shared_ptr& node) { // Set the codegen data helper and find the kernels. 
auto codegen_data = codegen::CodegenDataHelper(node, llvm_visitor.get_instance_struct_ptr()); @@ -165,6 +166,12 @@ void LLVMBenchmark::run_benchmark_on_gpu(const std::shared_ptr& no logger->info("Maximum compute time = {:.6f}\n", time_max); } } +#else +void LLVMBenchmark::run_benchmark_on_gpu(const std::shared_ptr<ast::Program>& node) { + throw std::runtime_error( + "GPU benchmarking is not supported if NMODL is not built with CUDA backend enabled."); +} +#endif } // namespace benchmark } // namespace nmodl From 881d85eba6773d2b920b28e9cf2c6c1507c60997 Mon Sep 17 00:00:00 2001 From: Ioannis Magkanaris Date: Mon, 21 Mar 2022 13:02:43 +0100 Subject: [PATCH 213/331] Improved logs a bit --- test/benchmark/cuda_driver.cpp | 6 +++--- test/benchmark/llvm_benchmark.cpp | 2 +- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/test/benchmark/cuda_driver.cpp b/test/benchmark/cuda_driver.cpp index b7f23db50c..2c545949c8 100644 --- a/test/benchmark/cuda_driver.cpp +++ b/test/benchmark/cuda_driver.cpp @@ -68,7 +68,7 @@ void CUDADriver::init(const std::string& gpu, BenchmarkInfo* benchmark_info) { char name[128]; checkCudaErrors(cuDeviceGetName(name, 128, device)); device_info.name = name; - std::cout << "Using CUDA Device [0]: " << device_info.name << "\n"; + logger->info("Using CUDA Device [0]: {}"_format(device_info.name)); checkCudaErrors(cuDeviceGetAttribute(&device_info.compute_version_major, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, device)); checkCudaErrors(cuDeviceGetAttribute(&device_info.compute_version_minor, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, device)); - std::cout << "Device Compute Capability: " << device_info.compute_version_major << "." - << device_info.compute_version_minor << "\n"; + logger->info("Device Compute Capability: {}.{}"_format(device_info.compute_version_major, + device_info.compute_version_minor)); if (device_info.compute_version_major < 2) { throw std::runtime_error("ERROR: Device 0 is not SM 2.0 or greater"); } diff --git a/test/benchmark/llvm_benchmark.cpp b/test/benchmark/llvm_benchmark.cpp index 6442acf61f..c15d0e873c 100644 --- a/test/benchmark/llvm_benchmark.cpp +++ b/test/benchmark/llvm_benchmark.cpp @@ -112,7 +112,7 @@ void LLVMBenchmark::run_benchmark_on_gpu(const std::shared_ptr& no // Get feature's string and turn them off depending on the cpu. const auto gpu_name = platform.get_name(); - logger->info("GPU: {}", gpu_name); + logger->info("GPU backend: {}", gpu_name); std::unique_ptr<llvm::Module> m = llvm_visitor.get_module(); From 7de726cc93efeae4b1741bfcd6f0a642a64a0488 Mon Sep 17 00:00:00 2001 From: Ioannis Magkanaris Date: Mon, 21 Mar 2022 13:10:17 +0100 Subject: [PATCH 214/331] Added newline at EOF --- test/benchmark/benchmark_info.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/benchmark/benchmark_info.hpp b/test/benchmark/benchmark_info.hpp index c63219e4e8..4fddf21015 100644 --- a/test/benchmark/benchmark_info.hpp +++ b/test/benchmark/benchmark_info.hpp @@ -26,4 +26,4 @@ struct BenchmarkInfo { /// Optimisation level for machine code generation. 
int opt_level_codegen; -}; \ No newline at end of file +}; From 18df661cb2e6d8896601e7b34466b7c05979ba81 Mon Sep 17 00:00:00 2001 From: Ioannis Magkanaris Date: Mon, 21 Mar 2022 13:20:43 +0100 Subject: [PATCH 215/331] Use cmake 3.18 in the CI --- azure-pipelines.yml | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/azure-pipelines.yml b/azure-pipelines.yml index 4a490b9cb4..ece44244f3 100644 --- a/azure-pipelines.yml +++ b/azure-pipelines.yml @@ -52,8 +52,8 @@ jobs: chmod +x llvm.sh sudo ./llvm.sh 13 env: - CMAKE_VER: 'v3.17.0' - CMAKE_PKG: 'cmake-3.17.0-Linux-x86_64' + CMAKE_VER: 'v3.18.0' + CMAKE_PKG: 'cmake-3.18.0-Linux-x86_64' displayName: 'Install Dependencies' - script: | export PATH=$(pwd)/$CMAKE_PKG/bin:/home/vsts/.local/bin:$PATH @@ -71,7 +71,7 @@ jobs: make install #this is needed for the integration tests env CTEST_OUTPUT_ON_FAILURE=1 make test env: - CMAKE_PKG: 'cmake-3.17.0-Linux-x86_64' + CMAKE_PKG: 'cmake-3.18.0-Linux-x86_64' displayName: 'Build and Run Unit Tests' - script: | export PATH=$(pwd)/$CMAKE_PKG/bin:/home/vsts/.local/bin:$PATH @@ -94,7 +94,7 @@ jobs: fi ./bin/nrnivmodl-core $(Build.Repository.LocalPath)/test/integration/mod env: - CMAKE_PKG: 'cmake-3.17.0-Linux-x86_64' + CMAKE_PKG: 'cmake-3.18.0-Linux-x86_64' SHELL: 'bash' displayName: 'Build Neuron and Run Integration Tests' - script: | @@ -118,7 +118,7 @@ jobs: fi ./bin/nrnivmodl-core $(Build.Repository.LocalPath)/test/integration/mod env: - CMAKE_PKG: 'cmake-3.17.0-Linux-x86_64' + CMAKE_PKG: 'cmake-3.18.0-Linux-x86_64' displayName: 'Build CoreNEURON and Run Integration Tests with ISPC compiler' - job: 'osx1015' pool: From 7aceb5bee2a3e28372ed452bd4078b9315865110 Mon Sep 17 00:00:00 2001 From: Ioannis Magkanaris Date: Mon, 21 Mar 2022 17:40:41 +0100 Subject: [PATCH 216/331] Added optimization option and printing PTX to file --- test/benchmark/benchmark_info.hpp | 2 +- test/benchmark/cuda_driver.cpp | 57 ++++++++++++++++++++++++------- test/benchmark/cuda_driver.hpp | 8 +++-- 3 files changed, 51 insertions(+), 16 deletions(-) diff --git a/test/benchmark/benchmark_info.hpp b/test/benchmark/benchmark_info.hpp index 4fddf21015..d02d33ce2e 100644 --- a/test/benchmark/benchmark_info.hpp +++ b/test/benchmark/benchmark_info.hpp @@ -12,7 +12,7 @@ /// A struct to hold the information for benchmarking. struct BenchmarkInfo { - /// Object filename to dump. + /// Object or PTX filename to dump. std::string filename; /// Object file output directory. 
diff --git a/test/benchmark/cuda_driver.cpp b/test/benchmark/cuda_driver.cpp index 2c545949c8..f479b3614e 100644 --- a/test/benchmark/cuda_driver.cpp +++ b/test/benchmark/cuda_driver.cpp @@ -21,7 +21,7 @@ using fmt::literals::operator""_format; namespace nmodl { namespace runner { -void checkCudaErrors(CUresult err) { +void CUDADriver::checkCudaErrors(CUresult err) { if (err != CUDA_SUCCESS) { const char* ret = NULL; cuGetErrorName(err, &ret); @@ -29,9 +29,14 @@ void checkCudaErrors(CUresult err) { } } -void checkNVVMErrors(nvvmResult err) { +void CUDADriver::checkNVVMErrors(nvvmResult err) { if (err != NVVM_SUCCESS) { - throw std::runtime_error("NVVM Error: " + std::string(nvvmGetErrorString(err))); + size_t program_log_size; + nvvmGetProgramLogSize(prog, &program_log_size); + auto program_log = (char*) malloc(program_log_size); + nvvmGetProgramLog(prog, program_log); + throw std::runtime_error( + "Compilation Log:\n {}\nNVVM Error: {}\n"_format(program_log, nvvmGetErrorString(err))); } } @@ -44,8 +49,8 @@ std::string load_file_to_string(const std::string& filename) { return str; } -void load_libraries(const nvvmProgram& program, const BenchmarkInfo& benchmark_info) { - for (const auto& lib_path: benchmark_info.shared_lib_paths) { +void CUDADriver::load_libraries(BenchmarkInfo* benchmark_info) { + for (const auto& lib_path: benchmark_info->shared_lib_paths) { const auto lib_name = lib_path.substr(lib_path.find_last_of("/\\") + 1); std::regex libdevice_bitcode_name{"libdevice.*.bc"}; if (!std::regex_match(lib_name, libdevice_bitcode_name)) { @@ -55,9 +60,26 @@ void load_libraries(const nvvmProgram& program, const BenchmarkInfo& benchmark_i const auto libdevice_module = load_file_to_string(lib_path); const auto libdevice_module_size = libdevice_module.size(); checkNVVMErrors(nvvmAddModuleToProgram( - program, libdevice_module.c_str(), libdevice_module_size, "libdevice")); + prog, libdevice_module.c_str(), libdevice_module_size, "libdevice")); } } +auto get_compilation_options(int compute_version_major, BenchmarkInfo* benchmark_info) { + std::vector compilation_options; + // Set the correct architecture to generate the PTX for + // Architectures should be based on the major compute capability of the GPU + const std::string arch_option{"-arch=compute_{}0"_format(compute_version_major)}; + compilation_options.push_back(arch_option); + // Set the correct optimization level + const std::string optimization_option{"-opt={}"_format(benchmark_info->opt_level_codegen)}; + compilation_options.push_back(optimization_option); + return compilation_options; +} + +void print_ptx_to_file(const std::string& ptx_compiled_module, const std::string& filename) { + std::ofstream ptx_file(filename); + ptx_file << ptx_compiled_module; + ptx_file.close(); +} void CUDADriver::init(const std::string& gpu, BenchmarkInfo* benchmark_info) { // CUDA initialization @@ -89,21 +111,28 @@ void CUDADriver::init(const std::string& gpu, BenchmarkInfo* benchmark_info) { os.flush(); // Create NVVM program object - nvvmCreateProgram(&prog); + checkNVVMErrors(nvvmCreateProgram(&prog)); // Load the external libraries modules to the NVVM program // Currently only libdevice is supported - load_libraries(prog, *benchmark_info); + load_libraries(benchmark_info); // Add custom IR to program - nvvmAddModuleToProgram(prog, kernel_llvm_ir.c_str(), kernel_llvm_ir.size(), "nmodl_llvm_ir"); + checkNVVMErrors(nvvmAddModuleToProgram( + prog, kernel_llvm_ir.c_str(), kernel_llvm_ir.size(), "nmodl_llvm_ir")); // Declare compile options - const auto 
arch_option = "-arch=compute_{}0"_format(device_info.compute_version_major); - const char* options[] = {arch_option.c_str()}; - + auto compilation_options = get_compilation_options(device_info.compute_version_major, + benchmark_info); + // transform compilation options to vector of const char* + std::vector compilation_options_c_str; + for (const auto& option: compilation_options) { + compilation_options_c_str.push_back(option.c_str()); + } // Compile the program - nvvmCompileProgram(prog, 1, options); + checkNVVMErrors(nvvmCompileProgram(prog, + compilation_options_c_str.size(), + compilation_options_c_str.data())); // Get compiled module char* compiled_module; @@ -113,6 +142,8 @@ void CUDADriver::init(const std::string& gpu, BenchmarkInfo* benchmark_info) { nvvmGetCompiledResult(prog, compiled_module); ptx_compiled_module = std::string(compiled_module); free(compiled_module); + print_ptx_to_file(ptx_compiled_module, + benchmark_info->output_dir + "/" + benchmark_info->filename + ".ptx"); // Create driver context checkCudaErrors(cuCtxCreate(&context, 0, device)); diff --git a/test/benchmark/cuda_driver.hpp b/test/benchmark/cuda_driver.hpp index 574fc7305b..91c91c0370 100644 --- a/test/benchmark/cuda_driver.hpp +++ b/test/benchmark/cuda_driver.hpp @@ -75,6 +75,10 @@ class CUDADriver { DeviceInfo device_info; std::string ptx_compiled_module; + void checkCudaErrors(CUresult err); + void checkNVVMErrors(nvvmResult err); + void load_libraries(BenchmarkInfo* benchmark_info); + public: explicit CUDADriver(std::unique_ptr m) : module(std::move(m)) {} @@ -171,7 +175,7 @@ class TestGPURunner: public BaseGPURunner { : BaseGPURunner(std::move(m)) {} virtual void initialize_driver() { - this->driver->init(backend); + driver->init(backend); } }; @@ -201,7 +205,7 @@ class BenchmarkGPURunner: public BaseGPURunner { , benchmark_info{filename, output_dir, lib_paths, opt_level_ir, opt_level_codegen} {} virtual void initialize_driver() { - this->driver->init(backend, &benchmark_info); + driver->init(backend, &benchmark_info); } }; From d0af9b4fccc792447f83635bc4b0861c53f18a45 Mon Sep 17 00:00:00 2001 From: Ioannis Magkanaris Date: Mon, 21 Mar 2022 17:54:46 +0100 Subject: [PATCH 217/331] Use the kernel real name in nvvm anotation in the generated LLVM IR --- src/codegen/llvm/codegen_llvm_visitor.cpp | 13 ++++++------- src/codegen/llvm/codegen_llvm_visitor.hpp | 2 +- 2 files changed, 7 insertions(+), 8 deletions(-) diff --git a/src/codegen/llvm/codegen_llvm_visitor.cpp b/src/codegen/llvm/codegen_llvm_visitor.cpp index 86fe5b5443..0c7a72ee47 100644 --- a/src/codegen/llvm/codegen_llvm_visitor.cpp +++ b/src/codegen/llvm/codegen_llvm_visitor.cpp @@ -64,12 +64,11 @@ static bool can_vectorize(const ast::CodegenForStatement& statement, symtab::Sym return unsupported.empty() && supported.size() <= 1; } -void CodegenLLVMVisitor::annotate_kernel_with_nvvm(llvm::Function* kernel) { - llvm::Metadata* metadata[] = { - llvm::ValueAsMetadata::get(kernel), - llvm::MDString::get(*context, "kernel"), - llvm::ValueAsMetadata::get( - llvm::ConstantInt::get(llvm::Type::getInt32Ty(*context), 1))}; +void CodegenLLVMVisitor::annotate_kernel_with_nvvm(const std::string& kernel_name, llvm::Function* kernel) { + llvm::Metadata* metadata[] = {llvm::ValueAsMetadata::get(kernel), + llvm::MDString::get(*context, kernel_name), + llvm::ValueAsMetadata::get( + llvm::ConstantInt::get(llvm::Type::getInt32Ty(*context), 1))}; llvm::MDNode* node = llvm::MDNode::get(*context, metadata); 
module->getOrInsertNamedMetadata("nvvm.annotations")->addOperand(node); } @@ -684,7 +683,7 @@ void CodegenLLVMVisitor::visit_codegen_function(const ast::CodegenFunction& node ir_builder.generate_scalar_ir(); } else if (platform.is_gpu()) { block->accept(*this); - annotate_kernel_with_nvvm(func); + annotate_kernel_with_nvvm(name, func); } else { // scalar block->accept(*this); } diff --git a/src/codegen/llvm/codegen_llvm_visitor.hpp b/src/codegen/llvm/codegen_llvm_visitor.hpp index 67a3a6fab6..ba3781e14e 100644 --- a/src/codegen/llvm/codegen_llvm_visitor.hpp +++ b/src/codegen/llvm/codegen_llvm_visitor.hpp @@ -158,7 +158,7 @@ class CodegenLLVMVisitor: public visitor::ConstAstVisitor { private: // Annotates kernel function with NVVM metadata. - void annotate_kernel_with_nvvm(llvm::Function* kernel); + void annotate_kernel_with_nvvm(const std::string& kernel_name, llvm::Function* kernel); #if LLVM_VERSION_MAJOR >= 13 /// Populates target library info with the vector library definitions. From 44b7f005cdf7c551847b32451da1920bfced9638 Mon Sep 17 00:00:00 2001 From: Ioannis Magkanaris Date: Mon, 21 Mar 2022 22:09:14 +0100 Subject: [PATCH 218/331] Revert "Use the kernel real name in nvvm anotation in the generated LLVM IR" This reverts commit d0af9b4fccc792447f83635bc4b0861c53f18a45. --- src/codegen/llvm/codegen_llvm_visitor.cpp | 13 +++++++------ src/codegen/llvm/codegen_llvm_visitor.hpp | 2 +- 2 files changed, 8 insertions(+), 7 deletions(-) diff --git a/src/codegen/llvm/codegen_llvm_visitor.cpp b/src/codegen/llvm/codegen_llvm_visitor.cpp index 0c7a72ee47..86fe5b5443 100644 --- a/src/codegen/llvm/codegen_llvm_visitor.cpp +++ b/src/codegen/llvm/codegen_llvm_visitor.cpp @@ -64,11 +64,12 @@ static bool can_vectorize(const ast::CodegenForStatement& statement, symtab::Sym return unsupported.empty() && supported.size() <= 1; } -void CodegenLLVMVisitor::annotate_kernel_with_nvvm(const std::string& kernel_name, llvm::Function* kernel) { - llvm::Metadata* metadata[] = {llvm::ValueAsMetadata::get(kernel), - llvm::MDString::get(*context, kernel_name), - llvm::ValueAsMetadata::get( - llvm::ConstantInt::get(llvm::Type::getInt32Ty(*context), 1))}; +void CodegenLLVMVisitor::annotate_kernel_with_nvvm(llvm::Function* kernel) { + llvm::Metadata* metadata[] = { + llvm::ValueAsMetadata::get(kernel), + llvm::MDString::get(*context, "kernel"), + llvm::ValueAsMetadata::get( + llvm::ConstantInt::get(llvm::Type::getInt32Ty(*context), 1))}; llvm::MDNode* node = llvm::MDNode::get(*context, metadata); module->getOrInsertNamedMetadata("nvvm.annotations")->addOperand(node); } @@ -683,7 +684,7 @@ void CodegenLLVMVisitor::visit_codegen_function(const ast::CodegenFunction& node ir_builder.generate_scalar_ir(); } else if (platform.is_gpu()) { block->accept(*this); - annotate_kernel_with_nvvm(name, func); + annotate_kernel_with_nvvm(func); } else { // scalar block->accept(*this); } diff --git a/src/codegen/llvm/codegen_llvm_visitor.hpp b/src/codegen/llvm/codegen_llvm_visitor.hpp index ba3781e14e..67a3a6fab6 100644 --- a/src/codegen/llvm/codegen_llvm_visitor.hpp +++ b/src/codegen/llvm/codegen_llvm_visitor.hpp @@ -158,7 +158,7 @@ class CodegenLLVMVisitor: public visitor::ConstAstVisitor { private: // Annotates kernel function with NVVM metadata. - void annotate_kernel_with_nvvm(const std::string& kernel_name, llvm::Function* kernel); + void annotate_kernel_with_nvvm(llvm::Function* kernel); #if LLVM_VERSION_MAJOR >= 13 /// Populates target library info with the vector library definitions. 
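[Note: the CUDADriver changes in patches 210-216 above follow the standard libNVVM flow: create a program, add libdevice bitcode and the generated LLVM IR as modules, compile to PTX, and hand the PTX to the CUDA driver API. A condensed sketch of that flow is shown below; the helper name is illustrative, error handling is elided, and only documented libNVVM entry points are used.]

```
// Minimal libNVVM compile pipeline mirroring CUDADriver::init; illustrative only.
#include <nvvm.h>
#include <string>

std::string compile_ir_to_ptx(const std::string& llvm_ir,
                              const std::string& libdevice_bc,
                              int compute_major) {
    nvvmProgram prog;
    nvvmCreateProgram(&prog);
    // libdevice bitcode must be registered before the program is compiled.
    nvvmAddModuleToProgram(prog, libdevice_bc.c_str(), libdevice_bc.size(), "libdevice");
    nvvmAddModuleToProgram(prog, llvm_ir.c_str(), llvm_ir.size(), "nmodl_llvm_ir");
    // Target the device's major compute capability, as patch 210 does.
    const std::string arch = "-arch=compute_" + std::to_string(compute_major) + "0";
    const char* options[] = {arch.c_str()};
    nvvmCompileProgram(prog, 1, options);
    // Retrieve the generated PTX (the reported size includes the trailing NUL).
    size_t ptx_size = 0;
    nvvmGetCompiledResultSize(prog, &ptx_size);
    std::string ptx(ptx_size, '\0');
    nvvmGetCompiledResult(prog, &ptx[0]);
    nvvmDestroyProgram(&prog);
    return ptx;
}
```

[The returned PTX string is what the driver then loads with cuModuleLoadDataEx, as in patch 211 above.]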
From 8bdefb906118cd81610fc8000ff9813eb19778b3 Mon Sep 17 00:00:00 2001 From: George Mitenkov Date: Tue, 22 Mar 2022 08:05:21 +0100 Subject: [PATCH 219/331] Replaced IF with FOR loop --- .../llvm/codegen_llvm_helper_visitor.cpp | 97 ++++++++++--------- .../llvm/codegen_llvm_helper_visitor.hpp | 9 ++ src/language/code_generator.cmake | 1 + src/language/codegen.yaml | 37 ++++--- test/unit/codegen/codegen_llvm_ir.cpp | 23 ++--- 5 files changed, 90 insertions(+), 77 deletions(-) diff --git a/src/codegen/llvm/codegen_llvm_helper_visitor.cpp b/src/codegen/llvm/codegen_llvm_helper_visitor.cpp index be64784d33..bfadc24857 100644 --- a/src/codegen/llvm/codegen_llvm_helper_visitor.cpp +++ b/src/codegen/llvm/codegen_llvm_helper_visitor.cpp @@ -549,48 +549,64 @@ void CodegenLLVMHelperVisitor::visit_function_block(ast::FunctionBlock& node) { create_function_for_node(node); } -/** - * Create loop increment expression `id = id + width` - * \todo : same as int_initialization_expression() - */ -static std::shared_ptr<ast::Expression> loop_increment_expression(const std::string& induction_var, - int vector_width) { - // first create id + x +std::shared_ptr<ast::Expression> +CodegenLLVMHelperVisitor::loop_initialization_expression(const std::string& induction_var, + bool is_remainder_loop) { + if (platform.is_gpu()) { + const auto& id = create_varname(induction_var); + const auto& tid = new ast::CodegenThreadId(); + return std::make_shared<ast::BinaryExpression>(id, ast::BinaryOperator(ast::BOP_ASSIGN), tid); + } + + // Otherwise, platform is CPU. Since the loop can be a remainder loop, check if + // we need to initialize at all. + if (is_remainder_loop) + return nullptr; + return int_initialization_expression(induction_var); +} + +std::shared_ptr<ast::Expression> +CodegenLLVMHelperVisitor::loop_increment_expression(const std::string& induction_var, + bool is_remainder_loop) { const auto& id = create_varname(induction_var); - const auto& inc = new ast::Integer(vector_width, nullptr); + + // For GPU platforms, increment by grid stride. + if (platform.is_gpu()) { + const auto& stride = new ast::CodegenGridStride(); + const auto& inc_expr = + new ast::BinaryExpression(id, ast::BinaryOperator(ast::BOP_ADDITION), stride); + return std::make_shared<ast::BinaryExpression>(id->clone(), + ast::BinaryOperator(ast::BOP_ASSIGN), + inc_expr); + } + + // Otherwise, proceed with increment for CPU loop. + const int width = is_remainder_loop ? 1 : platform.get_instruction_width(); + const auto& inc = new ast::Integer(width, nullptr); const auto& inc_expr = new ast::BinaryExpression(id, ast::BinaryOperator(ast::BOP_ADDITION), inc); - // now create id = id + x return std::make_shared<ast::BinaryExpression>(id->clone(), ast::BinaryOperator(ast::BOP_ASSIGN), inc_expr); } -/** - * Create loop count comparison expression - * - * Based on if loop is vectorised or not, the condition for loop - * is different. For example: - * - serial loop : `id < node_count` - * - vector loop : `id < (node_count - vector_width + 1)` - * - * \todo : same as int_initialization_expression() - */ -static std::shared_ptr<ast::Expression> loop_count_expression(const std::string& induction_var, - const std::string& node_count, - int vector_width) { +std::shared_ptr<ast::Expression> +CodegenLLVMHelperVisitor::loop_count_expression(const std::string& induction_var, + const std::string& node_count, + bool is_remainder_loop) { + const int width = is_remainder_loop ? 
1 : platform.get_instruction_width(); const auto& id = create_varname(induction_var); const auto& mech_node_count = create_varname(node_count); // For non-vectorised loop, the condition is id < mech->node_count - if (vector_width == 1) { + if (width == 1) { return std::make_shared(id->clone(), ast::BinaryOperator(ast::BOP_LESS), mech_node_count); } - // For vectorised loop, the condition is id < mech->node_count - vector_width + 1 - const auto& remainder = new ast::Integer(vector_width - 1, /*macro=*/nullptr); + // For vectorised loop, the condition is id < mech->node_count - width + 1 + const auto& remainder = new ast::Integer(width - 1, /*macro=*/nullptr); const auto& count = new ast::BinaryExpression(mech_node_count, ast::BinaryOperator(ast::BOP_SUBTRACTION), remainder); @@ -667,15 +683,13 @@ void CodegenLLVMHelperVisitor::visit_nrn_state_block(ast::NrnStateBlock& node) { compute_body.insert(compute_body.end(), index_statements.begin(), index_statements.end()); compute_body.insert(compute_body.end(), body_statements.begin(), body_statements.end()); + std::vector induction_variables{INDUCTION_VAR}; + function_statements.push_back( + create_local_variable_statement(induction_variables, INTEGER_TYPE)); + if (platform.is_gpu()) { - const auto& id_statement = std::make_shared(create_varname(INDUCTION_VAR)); - function_statements.push_back(id_statement); create_gpu_compute_body(compute_body, function_statements, int_variables, double_variables); } else { - // Create induction variable - std::vector induction_variables{INDUCTION_VAR}; - function_statements.push_back( - create_local_variable_statement(induction_variables, INTEGER_TYPE)); create_cpu_compute_body(compute_body, function_statements, int_variables, double_variables); } @@ -707,18 +721,10 @@ void CodegenLLVMHelperVisitor::create_gpu_compute_body(ast::StatementVector& bod ast::StatementVector& function_statements, std::vector& int_variables, std::vector& double_variables) { - // Then, create condition for thread id. For now reuse the functionality from `loop_count_expression`. auto kernel_block = std::make_shared(body); - const auto& condition = loop_count_expression(INDUCTION_VAR, NODECOUNT_VAR, 1); - ast::ElseIfStatementVector else_ifs = {}; - auto if_statement = std::make_shared(condition, kernel_block, else_ifs, nullptr); - - convert_to_instance_variable(*if_statement, INDUCTION_VAR); - // Push variables and the loop to the function statements vector. - function_statements.push_back(create_local_variable_statement(int_variables, INTEGER_TYPE)); - function_statements.push_back(create_local_variable_statement(double_variables, FLOAT_TYPE)); - function_statements.push_back(if_statement); + // dispatch loop creation with right parameters + create_compute_body_loop(kernel_block, function_statements, int_variables, double_variables); } void CodegenLLVMHelperVisitor::create_cpu_compute_body(ast::StatementVector& body, @@ -736,12 +742,9 @@ void CodegenLLVMHelperVisitor::create_compute_body_loop(std::shared_ptr& int_variables, std::vector& double_variables, bool is_remainder_loop) { - // First, check if we are creating a main or remainder loop. If it is a remainder loop, then - // no initialization is needed and instruction width is simply 1. - int width = is_remainder_loop ? 1 : platform.get_instruction_width(); - const auto& initialization = is_remainder_loop ? 
nullptr : int_initialization_expression(INDUCTION_VAR); - const auto& condition = loop_count_expression(INDUCTION_VAR, NODECOUNT_VAR, width); - const auto& increment = loop_increment_expression(INDUCTION_VAR, width); + const auto& initialization = loop_initialization_expression(INDUCTION_VAR, is_remainder_loop); + const auto& condition = loop_count_expression(INDUCTION_VAR, NODECOUNT_VAR, is_remainder_loop); + const auto& increment = loop_increment_expression(INDUCTION_VAR, is_remainder_loop); // Clone the statement block if needed since it can be used by the remainder loop. auto loop_block = (is_remainder_loop || !platform.is_cpu_with_simd()) ? block : std::shared_ptr(block->clone()); diff --git a/src/codegen/llvm/codegen_llvm_helper_visitor.hpp b/src/codegen/llvm/codegen_llvm_helper_visitor.hpp index a40d7923cc..2aa7f2fe03 100644 --- a/src/codegen/llvm/codegen_llvm_helper_visitor.hpp +++ b/src/codegen/llvm/codegen_llvm_helper_visitor.hpp @@ -176,6 +176,15 @@ class CodegenLLVMHelperVisitor: public visitor::AstVisitor { void visit_program(ast::Program& node) override; private: + /// Methods to create target-specific loop constructs. + std::shared_ptr loop_initialization_expression(const std::string& induction_var, + bool is_remainder_loop); + std::shared_ptr loop_count_expression(const std::string& induction_var, + const std::string& node_count, + bool is_remainder_loop); + std::shared_ptr loop_increment_expression(const std::string& induction_var, + bool is_remainder_loop); + /// Methods to populate`function_statements` with necessary AST constructs to form /// a kernel for a specific target. void create_gpu_compute_body(ast::StatementVector& body, diff --git a/src/language/code_generator.cmake b/src/language/code_generator.cmake index 72b2754b1a..24953e9182 100644 --- a/src/language/code_generator.cmake +++ b/src/language/code_generator.cmake @@ -68,6 +68,7 @@ set(AST_GENERATED_SOURCES ${PROJECT_BINARY_DIR}/src/ast/codegen_atomic_statement.hpp ${PROJECT_BINARY_DIR}/src/ast/codegen_for_statement.hpp ${PROJECT_BINARY_DIR}/src/ast/codegen_function.hpp + ${PROJECT_BINARY_DIR}/src/ast/codegen_grid_stride.hpp ${PROJECT_BINARY_DIR}/src/ast/codegen_instance_var.hpp ${PROJECT_BINARY_DIR}/src/ast/codegen_return_statement.hpp ${PROJECT_BINARY_DIR}/src/ast/codegen_struct.hpp diff --git a/src/language/codegen.yaml b/src/language/codegen.yaml index 245010f054..93f1fca50d 100644 --- a/src/language/codegen.yaml +++ b/src/language/codegen.yaml @@ -199,6 +199,27 @@ brief: "member functions of the class/struct" type: CodegenFunction vector: true + - CodegenThreadId: + brief: "Represents thread id expression for GPU code generation" + description: | + For GPU code generation, we use a special AST node to enocde the initial + thread id calculation. In NMODL, this expression is usually of the form: + \code{.cpp} + tid = blockId.x * blockDim.x + threadId.x + \endcode + To be able to support multiple GPU backends, we choose to have a custom AST + node. Therefore, the code generation for this node is kept very simple, + mapping expression to target-specific GPU inrinsics. + nmodl: "THREAD_ID" + - CodegenGridStride: + brief: "Represents grid stride for GPU code generation" + description: | + For GPU code generation, we use a special AST node to enocde the loop + increment expression. 
In NMODL, this expression is usually of the form: + \code{.cpp} + for (int i = tid; i < n; i += blockDim.x * gridDim.x) + \endcode + nmodl: "GRID_STRIDE" - Statement: brief: "Statement base class" children: @@ -286,19 +307,3 @@ - rhs: brief: "Expression for atomic operation" type: Expression - - CodegenThreadId: - brief: "Represents a generic thread id expression for GPU code generation" - description: | - For GPU code generation, we use a special AST node to enocde the thread - id calculation. In NMODL, this expression is usually of the form: - \code{.cpp} - id = blockId.x * blockDim.x + threadId.x - \endcode - To be able to support multiple GPU backends, we choose to have a custom AST - node. Therefore, the code generation for this node is kept very simple, - mapping expression to target-specific GPU inrinsics. - nmodl: "GPU_ID " - members: - - name: - brief: "Name of the thread id variable" - type: Identifier diff --git a/test/unit/codegen/codegen_llvm_ir.cpp b/test/unit/codegen/codegen_llvm_ir.cpp index e723c850a8..bf87c868e1 100644 --- a/test/unit/codegen/codegen_llvm_ir.cpp +++ b/test/unit/codegen/codegen_llvm_ir.cpp @@ -1552,29 +1552,24 @@ SCENARIO("GPU kernel body", "[visitor][llvm][gpu]") { )"; - std::string expected_kernel = R"( - VOID nrn_state_test(INSTANCE_STRUCT *mech){ - GPU_ID id - INTEGER node_id - DOUBLE v - IF (id<mech->node_count) { - node_id = mech->node_index[id] - v = mech->voltage[node_id] - mech->m[id] = mech->y[id]+2 - } + std::string expected_loop = R"( + for(id = THREAD_ID; id<mech->node_count; id = id+GRID_STRIDE) { + node_id = mech->node_index[id] + v = mech->voltage[node_id] + mech->m[id] = mech->y[id]+2 })"; - THEN("a kernel with thread id and if statement is created") { + THEN("a loop with GPU-specific AST nodes is constructed") { std::string name = "default"; std::string math_library = "none"; codegen::Platform gpu_platform(codegen::PlatformID::GPU, name, math_library); auto result = run_llvm_visitor_helper(nmodl_text, gpu_platform, - {ast::AstNodeType::CODEGEN_FUNCTION}); + {ast::AstNodeType::CODEGEN_FOR_STATEMENT}); REQUIRE(result.size() == 1); - auto kernel = reindent_text(to_nmodl(result[0])); - REQUIRE(kernel == reindent_text(expected_kernel)); + auto loop = reindent_text(to_nmodl(result[0])); + REQUIRE(loop == reindent_text(expected_loop)); } } } From 2c57d4daa1e9a0692a22b98a73a200130f303127 Mon Sep 17 00:00:00 2001 From: George Mitenkov Date: Sun, 13 Mar 2022 13:04:26 +0100 Subject: [PATCH 220/331] Added code generation for thread id --- src/codegen/llvm/codegen_llvm_visitor.cpp | 12 +++++------- src/codegen/llvm/codegen_llvm_visitor.hpp | 1 + src/codegen/llvm/llvm_ir_builder.cpp | 22 ++++++++++++++++++++++ src/codegen/llvm/llvm_ir_builder.hpp | 3 +++ 4 files changed, 31 insertions(+), 7 deletions(-) diff --git a/src/codegen/llvm/codegen_llvm_visitor.cpp b/src/codegen/llvm/codegen_llvm_visitor.cpp index 2f677cfbec..0bd233ecbb 100644 --- a/src/codegen/llvm/codegen_llvm_visitor.cpp +++ b/src/codegen/llvm/codegen_llvm_visitor.cpp @@ -39,7 +39,7 @@ static bool is_supported_statement(const ast::Statement& statement) { return statement.is_codegen_atomic_statement() || statement.is_codegen_for_statement() || statement.is_if_statement() || statement.is_codegen_return_statement() || statement.is_codegen_var_list_statement() || statement.is_expression_statement() || - statement.is_while_statement(); + statement.is_while_statement() || statement.is_codegen_thread_id(); } /// A utility to check that the kernel body can be vectorised. 
@@ -694,6 +694,10 @@ void CodegenLLVMVisitor::visit_codegen_return_statement(const ast::CodegenReturn ir_builder.create_return(ret_value); } +void CodegenLLVMVisitor::visit_codegen_thread_id(const ast::CodegenThreadId& node) { + ir_builder.create_thread_id(); +} + void CodegenLLVMVisitor::visit_codegen_var_list_statement( const ast::CodegenVarListStatement& node) { llvm::Type* scalar_type = get_codegen_var_type(*node.get_var_type()); @@ -821,12 +825,6 @@ void CodegenLLVMVisitor::visit_program(const ast::Program& node) { sym_tab = node.get_symbol_table(); std::string kernel_id = v.get_kernel_id(); - // \todo: implement GPU codegen functionality. - if (platform.is_gpu()) { - logger->warn("GPU code generation is not supported yet, aborting!"); - return; - } - // Initialize the builder for this NMODL program. ir_builder.initialize(*sym_tab, kernel_id); diff --git a/src/codegen/llvm/codegen_llvm_visitor.hpp b/src/codegen/llvm/codegen_llvm_visitor.hpp index 396d8cbb67..6ff79a0ddb 100644 --- a/src/codegen/llvm/codegen_llvm_visitor.hpp +++ b/src/codegen/llvm/codegen_llvm_visitor.hpp @@ -138,6 +138,7 @@ class CodegenLLVMVisitor: public visitor::ConstAstVisitor { void visit_codegen_for_statement(const ast::CodegenForStatement& node) override; void visit_codegen_function(const ast::CodegenFunction& node) override; void visit_codegen_return_statement(const ast::CodegenReturnStatement& node) override; + void visit_codegen_thread_id(const ast::CodegenThreadId& node) override; void visit_codegen_var_list_statement(const ast::CodegenVarListStatement& node) override; void visit_double(const ast::Double& node) override; void visit_function_block(const ast::FunctionBlock& node) override; diff --git a/src/codegen/llvm/llvm_ir_builder.cpp b/src/codegen/llvm/llvm_ir_builder.cpp index e7a6a4a60b..b88e995771 100644 --- a/src/codegen/llvm/llvm_ir_builder.cpp +++ b/src/codegen/llvm/llvm_ir_builder.cpp @@ -10,6 +10,7 @@ #include "llvm/ADT/StringSwitch.h" #include "llvm/IR/Function.h" +#include "llvm/IR/IntrinsicsNVPTX.h" #include "llvm/IR/ValueSymbolTable.h" namespace nmodl { @@ -554,6 +555,27 @@ void IRBuilder::maybe_replicate_value(llvm::Value* value) { } } +void IRBuilder::create_thread_id() { + llvm::Value* alloca_ptr = create_alloca(kernel_id, get_i32_type()); + + llvm::Module* m = builder.GetInsertBlock()->getParent()->getParent(); + auto create_call = [&](llvm::Intrinsic::ID id) { + llvm::Function* intrinsic = llvm::Intrinsic::getDeclaration(m, id); + return builder.CreateCall(intrinsic, {}); + }; + + // For now, this function only supports NVPTX backend, however it can be easily + // adjusted to generate thread id variable for any other platform. 
+ llvm::Value* block_id = create_call(llvm::Intrinsic::nvvm_read_ptx_sreg_ctaid_x); + llvm::Value* block_dim = create_call(llvm::Intrinsic::nvvm_read_ptx_sreg_ntid_x); + llvm::Value* tmp = builder.CreateMul(block_id, block_dim); + + llvm::Value* tid = create_call(llvm::Intrinsic::nvvm_read_ptx_sreg_tid_x); + llvm::Value* id = builder.CreateAdd(tmp, tid); + + builder.CreateStore(id, alloca_ptr); +} + /****************************************************************************************/ /* LLVM block utilities */ diff --git a/src/codegen/llvm/llvm_ir_builder.hpp b/src/codegen/llvm/llvm_ir_builder.hpp index cf9e7f936d..aa9c7ab1e3 100644 --- a/src/codegen/llvm/llvm_ir_builder.hpp +++ b/src/codegen/llvm/llvm_ir_builder.hpp @@ -230,6 +230,9 @@ class IRBuilder { void create_scalar_or_vector_alloca(const std::string& name, llvm::Type* element_or_scalar_type); + /// Creates a variable of the form: id = blockIdx.x * blockDim.x + threadIdx.x + void create_thread_id(); + /// Generates LLVM IR for the given unary operator. void create_unary_op(llvm::Value* value, ast::UnaryOp op); From 97e3940881f981a5584ca4b8ed34d9687ff802c8 Mon Sep 17 00:00:00 2001 From: George Mitenkov Date: Sun, 13 Mar 2022 13:23:10 +0100 Subject: [PATCH 221/331] Added kernel annotation generation --- src/codegen/llvm/codegen_llvm_visitor.cpp | 28 +++++++++++++++++++---- src/codegen/llvm/codegen_llvm_visitor.hpp | 3 +++ 2 files changed, 26 insertions(+), 5 deletions(-) diff --git a/src/codegen/llvm/codegen_llvm_visitor.cpp b/src/codegen/llvm/codegen_llvm_visitor.cpp index 0bd233ecbb..86fe5b5443 100644 --- a/src/codegen/llvm/codegen_llvm_visitor.cpp +++ b/src/codegen/llvm/codegen_llvm_visitor.cpp @@ -64,6 +64,16 @@ static bool can_vectorize(const ast::CodegenForStatement& statement, symtab::Sym return unsupported.empty() && supported.size() <= 1; } +void CodegenLLVMVisitor::annotate_kernel_with_nvvm(llvm::Function* kernel) { + llvm::Metadata* metadata[] = { + llvm::ValueAsMetadata::get(kernel), + llvm::MDString::get(*context, "kernel"), + llvm::ValueAsMetadata::get( + llvm::ConstantInt::get(llvm::Type::getInt32Ty(*context), 1))}; + llvm::MDNode* node = llvm::MDNode::get(*context, metadata); + module->getOrInsertNamedMetadata("nvvm.annotations")->addOperand(node); +} + #if LLVM_VERSION_MAJOR >= 13 void CodegenLLVMVisitor::add_vectorizable_functions_from_vec_lib(llvm::TargetLibraryInfoImpl& tli, llvm::Triple& triple) { @@ -665,11 +675,19 @@ void CodegenLLVMVisitor::visit_codegen_function(const ast::CodegenFunction& node ir_builder.allocate_function_arguments(func, arguments); // Process function or procedure body. If the function is a compute kernel, enable - // vectorization. If so, the return statement is handled in a separate visitor. - if (platform.is_cpu_with_simd() && is_kernel_function(name)) { - ir_builder.generate_vector_ir(); - block->accept(*this); - ir_builder.generate_scalar_ir(); + // vectorization or add NVVM annotations. If this is the case, the return statement is + // handled in a separate visitor. 
+ if (is_kernel_function(name)) { + if (platform.is_cpu_with_simd()) { + ir_builder.generate_vector_ir(); + block->accept(*this); + ir_builder.generate_scalar_ir(); + } else if (platform.is_gpu()) { + block->accept(*this); + annotate_kernel_with_nvvm(func); + } else { // scalar + block->accept(*this); + } } else { block->accept(*this); } diff --git a/src/codegen/llvm/codegen_llvm_visitor.hpp b/src/codegen/llvm/codegen_llvm_visitor.hpp index 6ff79a0ddb..67a3a6fab6 100644 --- a/src/codegen/llvm/codegen_llvm_visitor.hpp +++ b/src/codegen/llvm/codegen_llvm_visitor.hpp @@ -157,6 +157,9 @@ class CodegenLLVMVisitor: public visitor::ConstAstVisitor { void wrap_kernel_functions(); private: + // Annotates kernel function with NVVM metadata. + void annotate_kernel_with_nvvm(llvm::Function* kernel); + #if LLVM_VERSION_MAJOR >= 13 /// Populates target library info with the vector library definitions. void add_vectorizable_functions_from_vec_lib(llvm::TargetLibraryInfoImpl& tli, From dbd8cc9df5940baf4a96aa3fd8beb0c0acf7e6a1 Mon Sep 17 00:00:00 2001 From: George Mitenkov Date: Sun, 13 Mar 2022 13:54:10 +0100 Subject: [PATCH 222/331] Added tests for annotations/intrinsics --- test/unit/codegen/codegen_llvm_ir.cpp | 74 +++++++++++++++++++++++++++ 1 file changed, 74 insertions(+) diff --git a/test/unit/codegen/codegen_llvm_ir.cpp b/test/unit/codegen/codegen_llvm_ir.cpp index bf87c868e1..e16fdefdae 100644 --- a/test/unit/codegen/codegen_llvm_ir.cpp +++ b/test/unit/codegen/codegen_llvm_ir.cpp @@ -34,6 +34,32 @@ using nmodl::parser::NmodlDriver; // Utility to get LLVM module as a string //============================================================================= +std::string run_gpu_llvm_visitor(const std::string& text, + int opt_level = 0, + bool use_single_precision = false, + std::string math_library = "none", + bool nmodl_inline = false) { + NmodlDriver driver; + const auto& ast = driver.parse_string(text); + + SymtabVisitor().visit_program(*ast); + if (nmodl_inline) { + InlineVisitor().visit_program(*ast); + } + NeuronSolveVisitor().visit_program(*ast); + SolveBlockVisitor().visit_program(*ast); + + codegen::Platform gpu_platform(codegen::PlatformID::GPU, /*name=*/"nvidia", + math_library, use_single_precision, 1); + codegen::CodegenLLVMVisitor llvm_visitor( + /*mod_filename=*/"unknown", + /*output_dir=*/".", gpu_platform, opt_level, + /*add_debug_information=*/false); + + llvm_visitor.visit_program(*ast); + return llvm_visitor.dump_module(); +} + std::string run_llvm_visitor(const std::string& text, int opt_level = 0, bool use_single_precision = false, @@ -1573,3 +1599,51 @@ SCENARIO("GPU kernel body", "[visitor][llvm][gpu]") { } } } + +//============================================================================= +// Basic NVVM/LLVM IR generation for GPU platforms +//============================================================================= + +SCENARIO("GPU kernel body IR generation", "[visitor][llvm][gpu]") { + GIVEN("For GPU platforms") { + std::string nmodl_text = R"( + NEURON { + SUFFIX test + RANGE x, y + } + + ASSIGNED { x y } + + STATE { m } + + BREAKPOINT { + SOLVE states METHOD cnexp + } + + DERIVATIVE states { + m = y + 2 + } + )"; + + THEN("kernel annotations are added and thread id intrinsics generated") { + std::string module_string = run_gpu_llvm_visitor(nmodl_text, + /*opt_level=*/0, + /*use_single_precision=*/false); + std::smatch m; + + // Check kernel annotations are correclty created. 
+ std::regex annotations(R"(!nvvm\.annotations = !\{!0\})"); + std::regex kernel_data(R"(!0 = !\{void \(%.*__instance_var__type\*\)\* @nrn_state_.*, !\"kernel\", i32 1\})"); + REQUIRE(std::regex_search(module_string, m, annotations)); + REQUIRE(std::regex_search(module_string, m, kernel_data)); + + // Check thread/block id/dim instrinsics are created. + std::regex block_id(R"(call i32 @llvm\.nvvm\.read\.ptx\.sreg\.ctaid\.x\(\))"); + std::regex block_dim(R"(call i32 @llvm\.nvvm\.read\.ptx\.sreg\.ntid\.x\(\))"); + std::regex tid(R"(call i32 @llvm\.nvvm\.read\.ptx\.sreg\.tid\.x\(\))"); + REQUIRE(std::regex_search(module_string, m, block_id)); + REQUIRE(std::regex_search(module_string, m, block_dim)); + REQUIRE(std::regex_search(module_string, m, tid)); + } + } +} From b562d19b3b35657e59e38d9f4032050fe9602c62 Mon Sep 17 00:00:00 2001 From: George Mitenkov Date: Tue, 22 Mar 2022 08:22:28 +0100 Subject: [PATCH 223/331] Addapted code generation for GPU expressions --- src/codegen/llvm/codegen_llvm_visitor.cpp | 4 ++++ src/codegen/llvm/codegen_llvm_visitor.hpp | 1 + src/codegen/llvm/llvm_ir_builder.cpp | 18 +++++++++++++++--- src/codegen/llvm/llvm_ir_builder.hpp | 5 ++++- test/unit/codegen/codegen_llvm_ir.cpp | 2 ++ 5 files changed, 26 insertions(+), 4 deletions(-) diff --git a/src/codegen/llvm/codegen_llvm_visitor.cpp b/src/codegen/llvm/codegen_llvm_visitor.cpp index 86fe5b5443..72c8be7cc5 100644 --- a/src/codegen/llvm/codegen_llvm_visitor.cpp +++ b/src/codegen/llvm/codegen_llvm_visitor.cpp @@ -703,6 +703,10 @@ void CodegenLLVMVisitor::visit_codegen_function(const ast::CodegenFunction& node ir_builder.clear_function(); } +void CodegenLLVMVisitor::visit_codegen_grid_stride(const ast::CodegenGridStride& node) { + ir_builder.create_grid_stride(); +} + void CodegenLLVMVisitor::visit_codegen_return_statement(const ast::CodegenReturnStatement& node) { if (!node.get_statement()->is_name()) throw std::runtime_error("Error: CodegenReturnStatement must contain a name node\n"); diff --git a/src/codegen/llvm/codegen_llvm_visitor.hpp b/src/codegen/llvm/codegen_llvm_visitor.hpp index 67a3a6fab6..27150ff296 100644 --- a/src/codegen/llvm/codegen_llvm_visitor.hpp +++ b/src/codegen/llvm/codegen_llvm_visitor.hpp @@ -137,6 +137,7 @@ class CodegenLLVMVisitor: public visitor::ConstAstVisitor { void visit_codegen_atomic_statement(const ast::CodegenAtomicStatement& node) override; void visit_codegen_for_statement(const ast::CodegenForStatement& node) override; void visit_codegen_function(const ast::CodegenFunction& node) override; + void visit_codegen_grid_stride(const ast::CodegenGridStride& node) override; void visit_codegen_return_statement(const ast::CodegenReturnStatement& node) override; void visit_codegen_thread_id(const ast::CodegenThreadId& node) override; void visit_codegen_var_list_statement(const ast::CodegenVarListStatement& node) override; diff --git a/src/codegen/llvm/llvm_ir_builder.cpp b/src/codegen/llvm/llvm_ir_builder.cpp index b88e995771..c851f02970 100644 --- a/src/codegen/llvm/llvm_ir_builder.cpp +++ b/src/codegen/llvm/llvm_ir_builder.cpp @@ -555,9 +555,21 @@ void IRBuilder::maybe_replicate_value(llvm::Value* value) { } } -void IRBuilder::create_thread_id() { - llvm::Value* alloca_ptr = create_alloca(kernel_id, get_i32_type()); +void IRBuilder::create_grid_stride() { + llvm::Module* m = builder.GetInsertBlock()->getParent()->getParent(); + auto create_call = [&](llvm::Intrinsic::ID id) { + llvm::Function* intrinsic = llvm::Intrinsic::getDeclaration(m, id); + return builder.CreateCall(intrinsic, 
{}); + }; + llvm::Value* block_dim = create_call(llvm::Intrinsic::nvvm_read_ptx_sreg_ntid_x); + llvm::Value* grid_dim = create_call(llvm::Intrinsic::nvvm_read_ptx_sreg_nctaid_x); + llvm::Value* stride = builder.CreateMul(block_dim, grid_dim); + + value_stack.push_back(stride); +} + +void IRBuilder::create_thread_id() { llvm::Module* m = builder.GetInsertBlock()->getParent()->getParent(); auto create_call = [&](llvm::Intrinsic::ID id) { llvm::Function* intrinsic = llvm::Intrinsic::getDeclaration(m, id); @@ -573,7 +585,7 @@ void IRBuilder::create_thread_id() { llvm::Value* tid = create_call(llvm::Intrinsic::nvvm_read_ptx_sreg_tid_x); llvm::Value* id = builder.CreateAdd(tmp, tid); - builder.CreateStore(id, alloca_ptr); + value_stack.push_back(id); } diff --git a/src/codegen/llvm/llvm_ir_builder.hpp b/src/codegen/llvm/llvm_ir_builder.hpp index aa9c7ab1e3..1b144afcfd 100644 --- a/src/codegen/llvm/llvm_ir_builder.hpp +++ b/src/codegen/llvm/llvm_ir_builder.hpp @@ -230,7 +230,10 @@ class IRBuilder { void create_scalar_or_vector_alloca(const std::string& name, llvm::Type* element_or_scalar_type); - /// Creates a variable of the form: id = blockIdx.x * blockDim.x + threadIdx.x + /// Creates an expression of the form: blockDim.x * gridDim.x + void create_grid_stride(); + + /// Creates an expression of the form: blockIdx.x * blockDim.x + threadIdx.x void create_thread_id(); /// Generates LLVM IR for the given unary operator. diff --git a/test/unit/codegen/codegen_llvm_ir.cpp b/test/unit/codegen/codegen_llvm_ir.cpp index e16fdefdae..f15e924481 100644 --- a/test/unit/codegen/codegen_llvm_ir.cpp +++ b/test/unit/codegen/codegen_llvm_ir.cpp @@ -1641,9 +1641,11 @@ SCENARIO("GPU kernel body IR generation", "[visitor][llvm][gpu]") { std::regex block_id(R"(call i32 @llvm\.nvvm\.read\.ptx\.sreg\.ctaid\.x\(\))"); std::regex block_dim(R"(call i32 @llvm\.nvvm\.read\.ptx\.sreg\.ntid\.x\(\))"); std::regex tid(R"(call i32 @llvm\.nvvm\.read\.ptx\.sreg\.tid\.x\(\))"); + std::regex grid_dim(R"(call i32 @llvm\.nvvm\.read\.ptx\.sreg\.nctaid\.x\(\))"); REQUIRE(std::regex_search(module_string, m, block_id)); REQUIRE(std::regex_search(module_string, m, block_dim)); REQUIRE(std::regex_search(module_string, m, tid)); + REQUIRE(std::regex_search(module_string, m, grid_dim)); } } } From 483b3632c6bc1dd1d98dfa1f8293680126c919b1 Mon Sep 17 00:00:00 2001 From: George Mitenkov Date: Tue, 22 Mar 2022 08:27:48 +0100 Subject: [PATCH 224/331] Fixed lists of supported statements --- src/codegen/llvm/codegen_llvm_visitor.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/codegen/llvm/codegen_llvm_visitor.cpp b/src/codegen/llvm/codegen_llvm_visitor.cpp index 72c8be7cc5..25fb173842 100644 --- a/src/codegen/llvm/codegen_llvm_visitor.cpp +++ b/src/codegen/llvm/codegen_llvm_visitor.cpp @@ -39,7 +39,7 @@ static bool is_supported_statement(const ast::Statement& statement) { return statement.is_codegen_atomic_statement() || statement.is_codegen_for_statement() || statement.is_if_statement() || statement.is_codegen_return_statement() || statement.is_codegen_var_list_statement() || statement.is_expression_statement() || - statement.is_while_statement() || statement.is_codegen_thread_id(); + statement.is_while_statement(); } /// A utility to check that the kernel body can be vectorised. 
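[Note: taken together, create_thread_id() and create_grid_stride() above emit the NVVM intrinsics behind the canonical CUDA grid-stride loop, and the nvvm.annotations metadata marks the function as a kernel entry point. In CUDA C++, the kernel shape being generated corresponds roughly to the sketch below; the struct and field names are illustrative stand-ins for the generated instance struct, not actual NMODL output.]

```
// Illustrative CUDA C++ equivalent of the kernel emitted as LLVM IR;
// InstanceData stands in for the generated INSTANCE_STRUCT.
struct InstanceData {
    int node_count;
    int* node_index;
    double* voltage;
    double* m;
    double* y;
};

__global__ void nrn_state_example(InstanceData* mech) {
    // create_thread_id(): blockIdx.x * blockDim.x + threadIdx.x
    int id = blockIdx.x * blockDim.x + threadIdx.x;
    // create_grid_stride(): blockDim.x * gridDim.x
    int stride = blockDim.x * gridDim.x;
    for (; id < mech->node_count; id += stride) {
        int node_id = mech->node_index[id];
        double v = mech->voltage[node_id];
        (void) v;  // v feeds mechanisms that read the membrane potential
        mech->m[id] = mech->y[id] + 2.0;  // body of the DERIVATIVE block
    }
}
```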
From f874146a674e031ee4e6d373def570e46a98f7b6 Mon Sep 17 00:00:00 2001 From: George Mitenkov Date: Tue, 22 Mar 2022 12:04:42 +0100 Subject: [PATCH 225/331] [LLVM][GPU] Added GPU-specific AST transformations (#819) This commit adds a new AST node: `CodegenThreadId` that represents the thread id used in GPU computation. Thanks to the new platform class abstraction, the code to generate the compute body of the NEURON block was readapted to support AST transformations needed for GPU. Example of the transformation: ``` GPU_ID id INTEGER node_id DOUBLE v for(id = THREAD_ID; id<mech->node_count; id = id+GRID_STRIDE) { node_id = mech->node_index[id] v = mech->voltage[node_id] mech->m[id] = mech->y[id]+2 } ``` Co-authored-by: Pramod Kumbhar Co-authored-by: Ioannis Magkanaris --- .../llvm/codegen_llvm_helper_visitor.cpp | 261 +++++++++--------- .../llvm/codegen_llvm_helper_visitor.hpp | 37 ++- src/codegen/llvm/codegen_llvm_visitor.cpp | 8 +- src/language/code_generator.cmake | 2 + src/language/codegen.yaml | 21 ++ test/unit/codegen/codegen_llvm_ir.cpp | 58 +++- 6 files changed, 250 insertions(+), 137 deletions(-) diff --git a/src/codegen/llvm/codegen_llvm_helper_visitor.cpp b/src/codegen/llvm/codegen_llvm_helper_visitor.cpp index 654afd8ef5..ae36c1bce2 100644 --- a/src/codegen/llvm/codegen_llvm_helper_visitor.cpp +++ b/src/codegen/llvm/codegen_llvm_helper_visitor.cpp @@ -443,7 +443,7 @@ void CodegenLLVMHelperVisitor::ion_write_statements(BlockType type, * @param node Ast node under which variables to be converted to instance type */ void CodegenLLVMHelperVisitor::convert_to_instance_variable(ast::Node& node, - std::string& index_var) { + const std::string& index_var) { /// collect all variables in the node of type ast::VarName auto variables = collect_nodes(node, {ast::AstNodeType::VAR_NAME}); for (const auto& v: variables) { @@ -549,48 +549,64 @@ void CodegenLLVMHelperVisitor::visit_function_block(ast::FunctionBlock& node) { create_function_for_node(node); } -/** - * Create loop increment expression `id = id + width` - * \todo : same as int_initialization_expression() - */ -static std::shared_ptr<ast::Expression> loop_increment_expression(const std::string& induction_var, - int vector_width) { - // first create id + x +std::shared_ptr<ast::Expression> +CodegenLLVMHelperVisitor::loop_initialization_expression(const std::string& induction_var, + bool is_remainder_loop) { + if (platform.is_gpu()) { + const auto& id = create_varname(induction_var); + const auto& tid = new ast::CodegenThreadId(); + return std::make_shared<ast::BinaryExpression>(id, ast::BinaryOperator(ast::BOP_ASSIGN), tid); + } + + // Otherwise, platform is CPU. Since the loop can be a remainder loop, check if + // we need to initialize at all. + if (is_remainder_loop) + return nullptr; + return int_initialization_expression(induction_var); +} + +std::shared_ptr<ast::Expression> +CodegenLLVMHelperVisitor::loop_increment_expression(const std::string& induction_var, + bool is_remainder_loop) { const auto& id = create_varname(induction_var); - const auto& inc = new ast::Integer(vector_width, nullptr); + + // For GPU platforms, increment by grid stride. + if (platform.is_gpu()) { + const auto& stride = new ast::CodegenGridStride(); + const auto& inc_expr = + new ast::BinaryExpression(id, ast::BinaryOperator(ast::BOP_ADDITION), stride); + return std::make_shared<ast::BinaryExpression>(id->clone(), + ast::BinaryOperator(ast::BOP_ASSIGN), + inc_expr); + } + + // Otherwise, proceed with increment for CPU loop. + const int width = is_remainder_loop ? 1 : platform.get_instruction_width(); + const auto& inc = new ast::Integer(width, nullptr); const auto& inc_expr = new ast::BinaryExpression(id, ast::BinaryOperator(ast::BOP_ADDITION), inc); - // now create id = id + x return std::make_shared<ast::BinaryExpression>(id->clone(), ast::BinaryOperator(ast::BOP_ASSIGN), inc_expr); } -/** - * Create loop count comparison expression - * - * Based on if loop is vectorised or not, the condition for loop - * is different. For example: - * - serial loop : `id < node_count` - * - vector loop : `id < (node_count - vector_width + 1)` - * - * \todo : same as int_initialization_expression() - */ -static std::shared_ptr<ast::Expression> loop_count_expression(const std::string& induction_var, - const std::string& node_count, - int vector_width) { +std::shared_ptr<ast::Expression> +CodegenLLVMHelperVisitor::loop_count_expression(const std::string& induction_var, + const std::string& node_count, + bool is_remainder_loop) { + const int width = is_remainder_loop ? 
1 : platform.get_instruction_width(); + const auto& inc = new ast::Integer(width, nullptr); const auto& inc_expr = new ast::BinaryExpression(id, ast::BinaryOperator(ast::BOP_ADDITION), inc); - // now create id = id + x return std::make_shared(id->clone(), ast::BinaryOperator(ast::BOP_ASSIGN), inc_expr); } -/** - * Create loop count comparison expression - * - * Based on if loop is vectorised or not, the condition for loop - * is different. For example: - * - serial loop : `id < node_count` - * - vector loop : `id < (node_count - vector_width + 1)` - * - * \todo : same as int_initialization_expression() - */ -static std::shared_ptr loop_count_expression(const std::string& induction_var, - const std::string& node_count, - int vector_width) { +std::shared_ptr +CodegenLLVMHelperVisitor::loop_count_expression(const std::string& induction_var, + const std::string& node_count, + bool is_remainder_loop) { + const int width = is_remainder_loop ? 1 : platform.get_instruction_width(); const auto& id = create_varname(induction_var); const auto& mech_node_count = create_varname(node_count); // For non-vectorised loop, the condition is id < mech->node_count - if (vector_width == 1) { + if (width == 1) { return std::make_shared(id->clone(), ast::BinaryOperator(ast::BOP_LESS), mech_node_count); } - // For vectorised loop, the condition is id < mech->node_count - vector_width + 1 - const auto& remainder = new ast::Integer(vector_width - 1, /*macro=*/nullptr); + // For vectorised loop, the condition is id < mech->node_count - width + 1 + const auto& remainder = new ast::Integer(width - 1, /*macro=*/nullptr); const auto& count = new ast::BinaryExpression(mech_node_count, ast::BinaryOperator(ast::BOP_SUBTRACTION), remainder); @@ -612,35 +628,29 @@ void CodegenLLVMHelperVisitor::visit_nrn_state_block(ast::NrnStateBlock& node) { /// statements for new function to be generated ast::StatementVector function_statements; - /// create variable definition for loop index and insert at the beginning - std::string loop_index_var = "id"; - std::vector induction_variables{"id"}; - function_statements.push_back( - create_local_variable_statement(induction_variables, INTEGER_TYPE)); - /// create vectors of local variables that would be used in compute part std::vector int_variables{"node_id"}; std::vector double_variables{"v"}; - /// create now main compute part : for loop over channel instances + /// create now main compute part - /// loop body : initialization + solve blocks - ast::StatementVector loop_def_statements; - ast::StatementVector loop_index_statements; - ast::StatementVector loop_body_statements; + /// compute body : initialization + solve blocks + ast::StatementVector def_statements; + ast::StatementVector index_statements; + ast::StatementVector body_statements; { /// access node index and corresponding voltage - loop_index_statements.push_back( + index_statements.push_back( visitor::create_statement("node_id = node_index[{}]"_format(INDUCTION_VAR))); - loop_body_statements.push_back( + body_statements.push_back( visitor::create_statement("v = {}[node_id]"_format(VOLTAGE_VAR))); /// read ion variables ion_read_statements(BlockType::State, int_variables, double_variables, - loop_index_statements, - loop_body_statements); + index_statements, + body_statements); /// main compute node : extract solution expressions from the derivative block const auto& solutions = collect_nodes(node, {ast::AstNodeType::SOLUTION_EXPRESSION}); @@ -648,109 +658,39 @@ void 
CodegenLLVMHelperVisitor::visit_nrn_state_block(ast::NrnStateBlock& node) { const auto& solution = std::dynamic_pointer_cast(statement); const auto& block = std::dynamic_pointer_cast( solution->get_node_to_solve()); - append_statements_from_block(loop_body_statements, block); + append_statements_from_block(body_statements, block); } /// add breakpoint block if no current if (info.currents.empty() && info.breakpoint_node != nullptr) { auto block = info.breakpoint_node->get_statement_block(); - append_statements_from_block(loop_body_statements, block); + append_statements_from_block(body_statements, block); } /// write ion statements ion_write_statements(BlockType::State, int_variables, double_variables, - loop_index_statements, - loop_body_statements); + index_statements, + body_statements); // \todo handle process_shadow_update_statement and wrote_conc_call yet } - ast::StatementVector loop_body; - loop_body.insert(loop_body.end(), loop_def_statements.begin(), loop_def_statements.end()); - loop_body.insert(loop_body.end(), loop_index_statements.begin(), loop_index_statements.end()); - loop_body.insert(loop_body.end(), loop_body_statements.begin(), loop_body_statements.end()); - - /// now construct a new code block which will become the body of the loop - auto loop_block = std::make_shared(loop_body); - - /// declare main FOR loop local variables - function_statements.push_back(create_local_variable_statement(int_variables, INTEGER_TYPE)); - function_statements.push_back(create_local_variable_statement(double_variables, FLOAT_TYPE)); - - /// main loop possibly vectorized on vector_width - { - /// loop constructs : initialization, condition and increment - const auto& initialization = int_initialization_expression(INDUCTION_VAR); - const auto& condition = loop_count_expression(INDUCTION_VAR, NODECOUNT_VAR, vector_width); - const auto& increment = loop_increment_expression(INDUCTION_VAR, vector_width); - - /// clone it - auto local_loop_block = std::shared_ptr(loop_block->clone()); - - /// convert local statement to codegenvar statement - convert_local_statement(*local_loop_block); - - auto for_loop_statement_main = std::make_shared(initialization, - condition, - increment, - local_loop_block); - - /// convert all variables inside loop body to instance variables - convert_to_instance_variable(*for_loop_statement_main, loop_index_var); - - /// loop itself becomes one of the statement in the function - function_statements.push_back(for_loop_statement_main); - } - - /// vectors containing renamed FOR loop local variables - std::vector renamed_int_variables; - std::vector renamed_double_variables; - - /// remainder loop possibly vectorized on vector_width - if (vector_width > 1) { - /// loop constructs : initialization, condition and increment - const auto& condition = - loop_count_expression(INDUCTION_VAR, NODECOUNT_VAR, /*vector_width=*/1); - const auto& increment = loop_increment_expression(INDUCTION_VAR, /*vector_width=*/1); - - /// rename local variables to avoid conflict with main loop - rename_local_variables(*loop_block); - - /// convert local statement to codegenvar statement - convert_local_statement(*loop_block); - - auto for_loop_statement_remainder = - std::make_shared(nullptr, condition, increment, loop_block); + /// create target-specific compute body + ast::StatementVector compute_body; + compute_body.insert(compute_body.end(), def_statements.begin(), def_statements.end()); + compute_body.insert(compute_body.end(), index_statements.begin(), index_statements.end()); + 
compute_body.insert(compute_body.end(), body_statements.begin(), body_statements.end()); - const auto& loop_statements = for_loop_statement_remainder->get_statement_block(); - // \todo: Change RenameVisitor to take a vector of names to which it would append a single - // prefix. - for (const auto& name: int_variables) { - std::string new_name = epilogue_variable_prefix + name; - renamed_int_variables.push_back(new_name); - visitor::RenameVisitor v(name, new_name); - loop_statements->accept(v); - } - for (const auto& name: double_variables) { - std::string new_name = epilogue_variable_prefix + name; - renamed_double_variables.push_back(new_name); - visitor::RenameVisitor v(name, epilogue_variable_prefix + name); - loop_statements->accept(v); - } - - /// declare remainder FOR loop local variables - function_statements.push_back( - create_local_variable_statement(renamed_int_variables, INTEGER_TYPE)); - function_statements.push_back( - create_local_variable_statement(renamed_double_variables, FLOAT_TYPE)); - - /// convert all variables inside loop body to instance variables - convert_to_instance_variable(*for_loop_statement_remainder, loop_index_var); + std::vector induction_variables{INDUCTION_VAR}; + function_statements.push_back( + create_local_variable_statement(induction_variables, INTEGER_TYPE)); - /// loop itself becomes one of the statement in the function - function_statements.push_back(for_loop_statement_remainder); + if (platform.is_gpu()) { + create_gpu_compute_body(compute_body, function_statements, int_variables, double_variables); + } else { + create_cpu_compute_body(compute_body, function_statements, int_variables, double_variables); } /// new block for the function @@ -777,6 +717,73 @@ void CodegenLLVMHelperVisitor::visit_nrn_state_block(ast::NrnStateBlock& node) { std::cout << nmodl::to_nmodl(function) << std::endl; } +void CodegenLLVMHelperVisitor::create_gpu_compute_body(ast::StatementVector& body, + ast::StatementVector& function_statements, + std::vector& int_variables, + std::vector& double_variables) { + auto kernel_block = std::make_shared(body); + + // dispatch loop creation with right parameters + create_compute_body_loop(kernel_block, function_statements, int_variables, double_variables); +} + +void CodegenLLVMHelperVisitor::create_cpu_compute_body(ast::StatementVector& body, + ast::StatementVector& function_statements, + std::vector& int_variables, + std::vector& double_variables) { + auto loop_block = std::make_shared(body); + create_compute_body_loop(loop_block, function_statements, int_variables, double_variables); + if (platform.is_cpu_with_simd()) + create_compute_body_loop(loop_block, function_statements, int_variables, double_variables, /*is_remainder_loop=*/true); +} + +void CodegenLLVMHelperVisitor::create_compute_body_loop(std::shared_ptr& block, + ast::StatementVector& function_statements, + std::vector& int_variables, + std::vector& double_variables, + bool is_remainder_loop) { + const auto& initialization = loop_initialization_expression(INDUCTION_VAR, is_remainder_loop); + const auto& condition = loop_count_expression(INDUCTION_VAR, NODECOUNT_VAR, is_remainder_loop); + const auto& increment = loop_increment_expression(INDUCTION_VAR, is_remainder_loop); + + // Clone the statement block if needed since it can be used by the remainder loop. + auto loop_block = (is_remainder_loop || !platform.is_cpu_with_simd()) ? block : std::shared_ptr(block->clone()); + + // Convert local statement to use CodegenVar statements and create a FOR loop node. 
Also, if creating + // a remainder loop then rename variables to avoid conflicts. + if (is_remainder_loop) + rename_local_variables(*loop_block); + convert_local_statement(*loop_block); + auto for_loop = std::make_shared(initialization, + condition, + increment, + loop_block); + + // Convert all variables inside loop body to be instance variables. + convert_to_instance_variable(*for_loop, INDUCTION_VAR); + + // Rename variables if processing remainder loop. + if (is_remainder_loop) { + const auto& loop_statements = for_loop->get_statement_block(); + auto rename = [&](std::vector& vars) { + for (int i = 0; i < vars.size(); ++i) { + std::string old_name = vars[i]; + std::string new_name = epilogue_variable_prefix + vars[i]; + vars[i] = new_name; + visitor::RenameVisitor v(old_name, new_name); + loop_statements->accept(v); + } + }; + rename(int_variables); + rename(double_variables); + } + + // Push variables and the loop to the function statements vector. + function_statements.push_back(create_local_variable_statement(int_variables, INTEGER_TYPE)); + function_statements.push_back(create_local_variable_statement(double_variables, FLOAT_TYPE)); + function_statements.push_back(for_loop); +} + void CodegenLLVMHelperVisitor::remove_inlined_nodes(ast::Program& node) { auto program_symtab = node.get_model_symbol_table(); const auto& func_proc_nodes = diff --git a/src/codegen/llvm/codegen_llvm_helper_visitor.hpp b/src/codegen/llvm/codegen_llvm_helper_visitor.hpp index 21aff4a92d..2aa7f2fe03 100644 --- a/src/codegen/llvm/codegen_llvm_helper_visitor.hpp +++ b/src/codegen/llvm/codegen_llvm_helper_visitor.hpp @@ -16,6 +16,7 @@ #include "ast/instance_struct.hpp" #include "codegen/codegen_info.hpp" +#include "codegen/llvm/target_platform.hpp" #include "symtab/symbol_table.hpp" #include "utils/logger.hpp" #include "visitors/ast_visitor.hpp" @@ -101,8 +102,8 @@ struct InstanceVarHelper { * these will be common across all backends. */ class CodegenLLVMHelperVisitor: public visitor::AstVisitor { - /// explicit vectorisation width - int vector_width; + /// target platform + Platform platform; /// newly generated code generation specific functions CodegenFunctionVector codegen_functions; @@ -135,8 +136,8 @@ class CodegenLLVMHelperVisitor: public visitor::AstVisitor { static const std::string VOLTAGE_VAR; static const std::string NODE_INDEX_VAR; - CodegenLLVMHelperVisitor(int vector_width) - : vector_width(vector_width) {} + CodegenLLVMHelperVisitor(Platform& platform) + : platform(platform) {} const InstanceVarHelper& get_instance_var_helper() { return instance_var_helper; @@ -161,7 +162,7 @@ class CodegenLLVMHelperVisitor: public visitor::AstVisitor { ast::StatementVector& index_statements, ast::StatementVector& body_statements); - void convert_to_instance_variable(ast::Node& node, std::string& index_var); + void convert_to_instance_variable(ast::Node& node, const std::string& index_var); void convert_local_statement(ast::StatementBlock& node); void rename_local_variables(ast::StatementBlock& node); @@ -173,6 +174,32 @@ class CodegenLLVMHelperVisitor: public visitor::AstVisitor { void visit_function_block(ast::FunctionBlock& node) override; void visit_nrn_state_block(ast::NrnStateBlock& node) override; void visit_program(ast::Program& node) override; + + private: + /// Methods to create target-specific loop constructs. 
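+    /// For SIMD targets, `create_compute_body_loop` is invoked twice: once for the
+    /// main loop stepping by the platform's instruction width and once for a scalar
+    /// remainder (epilogue) loop whose locals carry the epilogue prefix. As an
+    /// illustrative sketch only, where `W` stands for the instruction width and is
+    /// not an identifier in the generated code:
+    /// \code{.cpp}
+    ///     int id;
+    ///     for (id = 0; id + W <= node_count; id = id + W) { /* vectorised body */ }
+    ///     for (; id < node_count; id = id + 1) { /* epilogue body */ }
+    /// \endcode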
+    std::shared_ptr<ast::Expression> loop_initialization_expression(const std::string& induction_var,
+                                                                    bool is_remainder_loop);
+    std::shared_ptr<ast::Expression> loop_count_expression(const std::string& induction_var,
+                                                           const std::string& node_count,
+                                                           bool is_remainder_loop);
+    std::shared_ptr<ast::Expression> loop_increment_expression(const std::string& induction_var,
+                                                               bool is_remainder_loop);
+
+    /// Methods to populate `function_statements` with necessary AST constructs to form
+    /// a kernel for a specific target.
+    void create_gpu_compute_body(ast::StatementVector& body,
+                                 ast::StatementVector& function_statements,
+                                 std::vector<std::string>& int_variables,
+                                 std::vector<std::string>& double_variables);
+    void create_cpu_compute_body(ast::StatementVector& body,
+                                 ast::StatementVector& function_statements,
+                                 std::vector<std::string>& int_variables,
+                                 std::vector<std::string>& double_variables);
+    void create_compute_body_loop(std::shared_ptr<ast::StatementBlock>& block,
+                                  ast::StatementVector& function_statements,
+                                  std::vector<std::string>& int_variables,
+                                  std::vector<std::string>& double_variables,
+                                  bool is_remainder_loop = false);
 };

 /** @} */  // end of llvm_codegen_details

diff --git a/src/codegen/llvm/codegen_llvm_visitor.cpp b/src/codegen/llvm/codegen_llvm_visitor.cpp
index 0fa81de691..2f677cfbec 100644
--- a/src/codegen/llvm/codegen_llvm_visitor.cpp
+++ b/src/codegen/llvm/codegen_llvm_visitor.cpp
@@ -815,12 +815,18 @@ void CodegenLLVMVisitor::visit_program(const ast::Program& node) {
     //  - convert function and procedure blocks into CodegenFunctions
     //  - gather information about AST. For now, information about functions
     //    and procedures is used only.
-    CodegenLLVMHelperVisitor v{platform.get_instruction_width()};
+    CodegenLLVMHelperVisitor v{platform};
     const auto& functions = v.get_codegen_functions(node);
     instance_var_helper = v.get_instance_var_helper();
     sym_tab = node.get_symbol_table();
     std::string kernel_id = v.get_kernel_id();

+    // \todo: implement GPU codegen functionality.
+    if (platform.is_gpu()) {
+        logger->warn("GPU code generation is not supported yet, aborting!");
+        return;
+    }
+
     // Initialize the builder for this NMODL program.
     ir_builder.initialize(*sym_tab, kernel_id);

diff --git a/src/language/code_generator.cmake b/src/language/code_generator.cmake
index 17123fc833..24953e9182 100644
--- a/src/language/code_generator.cmake
+++ b/src/language/code_generator.cmake
@@ -68,9 +68,11 @@ set(AST_GENERATED_SOURCES
     ${PROJECT_BINARY_DIR}/src/ast/codegen_atomic_statement.hpp
     ${PROJECT_BINARY_DIR}/src/ast/codegen_for_statement.hpp
     ${PROJECT_BINARY_DIR}/src/ast/codegen_function.hpp
+    ${PROJECT_BINARY_DIR}/src/ast/codegen_grid_stride.hpp
     ${PROJECT_BINARY_DIR}/src/ast/codegen_instance_var.hpp
     ${PROJECT_BINARY_DIR}/src/ast/codegen_return_statement.hpp
     ${PROJECT_BINARY_DIR}/src/ast/codegen_struct.hpp
+    ${PROJECT_BINARY_DIR}/src/ast/codegen_thread_id.hpp
     ${PROJECT_BINARY_DIR}/src/ast/codegen_var.hpp
     ${PROJECT_BINARY_DIR}/src/ast/codegen_var_list_statement.hpp
     ${PROJECT_BINARY_DIR}/src/ast/codegen_var_type.hpp
diff --git a/src/language/codegen.yaml b/src/language/codegen.yaml
index 30bae4c5c5..93f1fca50d 100644
--- a/src/language/codegen.yaml
+++ b/src/language/codegen.yaml
@@ -199,6 +199,27 @@
         brief: "member functions of the class/struct"
         type: CodegenFunction
         vector: true
+  - CodegenThreadId:
+      brief: "Represents thread id expression for GPU code generation"
+      description: |
+        For GPU code generation, we use a special AST node to encode the initial
+        thread id calculation. In NMODL, this expression is usually of the form:
+        \code{.cpp}
+        tid = blockIdx.x * blockDim.x + threadIdx.x
+        \endcode
+        To be able to support multiple GPU backends, we choose to have a custom AST
+        node. Therefore, the code generation for this node is kept very simple,
+        mapping the expression to target-specific GPU intrinsics.
+      nmodl: "THREAD_ID"
+  - CodegenGridStride:
+      brief: "Represents grid stride for GPU code generation"
+      description: |
+        For GPU code generation, we use a special AST node to encode the loop
+        increment expression. In NMODL, this expression is usually of the form:
+        \code{.cpp}
+        for (int i = tid; i < n; i += blockDim.x * gridDim.x)
+        \endcode
+      nmodl: "GRID_STRIDE"
   - Statement:
       brief: "Statement base class"
       children:
diff --git a/test/unit/codegen/codegen_llvm_ir.cpp b/test/unit/codegen/codegen_llvm_ir.cpp
index 34fcd8b0da..bf87c868e1 100644
--- a/test/unit/codegen/codegen_llvm_ir.cpp
+++ b/test/unit/codegen/codegen_llvm_ir.cpp
@@ -68,14 +68,14 @@ std::string run_llvm_visitor(const std::string& text,

 std::vector<std::shared_ptr<const ast::Ast>> run_llvm_visitor_helper(
     const std::string& text,
-    int vector_width,
+    codegen::Platform& platform,
     const std::vector<ast::AstNodeType>& nodes_to_collect) {
     NmodlDriver driver;
     const auto& ast = driver.parse_string(text);

     SymtabVisitor().visit_program(*ast);
     SolveBlockVisitor().visit_program(*ast);
-    CodegenLLVMHelperVisitor(vector_width).visit_program(*ast);
+    CodegenLLVMHelperVisitor(platform).visit_program(*ast);

     const auto& nodes = collect_nodes(*ast, nodes_to_collect);

@@ -1228,8 +1228,9 @@ SCENARIO("Scalar derivative block", "[visitor][llvm][derivative]") {
         })";

         THEN("a single scalar loop is constructed") {
+            codegen::Platform default_platform;
             auto result = run_llvm_visitor_helper(nmodl_text,
-                                                  /*vector_width=*/1,
+                                                  default_platform,
                                                   {ast::AstNodeType::CODEGEN_FOR_STATEMENT});
             REQUIRE(result.size() == 1);

@@ -1279,8 +1280,9 @@ SCENARIO("Vectorised derivative block", "[visitor][llvm][derivative]") {

         THEN("vector and epilogue scalar loops are constructed") {
+            codegen::Platform simd_platform(/*use_single_precision=*/false,
+                                            /*instruction_width=*/8);
             auto result = run_llvm_visitor_helper(nmodl_text,
-                                                  /*vector_width=*/8,
+                                                  simd_platform,
                                                   {ast::AstNodeType::CODEGEN_FOR_STATEMENT});
             REQUIRE(result.size() == 2);

@@ -1523,3 +1525,51 @@ SCENARIO("Removal of inlined functions and procedures", "[visitor][llvm][inline]
         }
     }
 }
+
+//=============================================================================
+// Basic GPU kernel AST generation
+//=============================================================================
+
+SCENARIO("GPU kernel body", "[visitor][llvm][gpu]") {
+    GIVEN("For GPU platforms") {
+        std::string nmodl_text = R"(
+            NEURON {
+                SUFFIX test
+                RANGE x, y
+            }
+
+            ASSIGNED { x y }
+
+            STATE { m }
+
+            BREAKPOINT {
+                SOLVE states METHOD cnexp
+            }
+
+            DERIVATIVE states {
+                m = y + 2
+            }
+        )";
+
+        std::string expected_loop = R"(
+            for(id = THREAD_ID; id<mech->node_count; id = id+GRID_STRIDE) {
+                node_id = mech->node_index[id]
+                v = mech->voltage[node_id]
+                mech->m[id] = mech->y[id]+2
+            })";
+
+        THEN("a loop with GPU-specific AST nodes is constructed") {
+            std::string name = "default";
+            std::string math_library = "none";
+            codegen::Platform gpu_platform(codegen::PlatformID::GPU, name, math_library);
+            auto result = run_llvm_visitor_helper(nmodl_text,
+                                                  gpu_platform,
+                                                  {ast::AstNodeType::CODEGEN_FOR_STATEMENT});
+            REQUIRE(result.size() == 1);
+
+            auto loop = reindent_text(to_nmodl(result[0]));
+            REQUIRE(loop == reindent_text(expected_loop));
+        }
+    }
+}
From 
295fa25cc7ae4ce4de30c209b5e85636a774547a Mon Sep 17 00:00:00 2001 From: Pramod S Kumbhar Date: Tue, 22 Mar 2022 13:55:44 +0100 Subject: [PATCH 226/331] fix merge issue: gpu code generation is now enabled by this PR --- src/codegen/llvm/codegen_llvm_visitor.cpp | 6 ------ 1 file changed, 6 deletions(-) diff --git a/src/codegen/llvm/codegen_llvm_visitor.cpp b/src/codegen/llvm/codegen_llvm_visitor.cpp index 4d41b521e6..25fb173842 100644 --- a/src/codegen/llvm/codegen_llvm_visitor.cpp +++ b/src/codegen/llvm/codegen_llvm_visitor.cpp @@ -847,12 +847,6 @@ void CodegenLLVMVisitor::visit_program(const ast::Program& node) { sym_tab = node.get_symbol_table(); std::string kernel_id = v.get_kernel_id(); - // \todo: implement GPU codegen functionality. - if (platform.is_gpu()) { - logger->warn("GPU code generation is not supported yet, aborting!"); - return; - } - // Initialize the builder for this NMODL program. ir_builder.initialize(*sym_tab, kernel_id); From 400aaec8e73096fb4efda41792597a75e57fd83a Mon Sep 17 00:00:00 2001 From: Ioannis Magkanaris Date: Tue, 22 Mar 2022 14:22:09 +0100 Subject: [PATCH 227/331] Small fix from merge --- test/unit/codegen/codegen_llvm_ir.cpp | 6 ------ 1 file changed, 6 deletions(-) diff --git a/test/unit/codegen/codegen_llvm_ir.cpp b/test/unit/codegen/codegen_llvm_ir.cpp index c8d72b23db..f15e924481 100644 --- a/test/unit/codegen/codegen_llvm_ir.cpp +++ b/test/unit/codegen/codegen_llvm_ir.cpp @@ -1641,17 +1641,11 @@ SCENARIO("GPU kernel body IR generation", "[visitor][llvm][gpu]") { std::regex block_id(R"(call i32 @llvm\.nvvm\.read\.ptx\.sreg\.ctaid\.x\(\))"); std::regex block_dim(R"(call i32 @llvm\.nvvm\.read\.ptx\.sreg\.ntid\.x\(\))"); std::regex tid(R"(call i32 @llvm\.nvvm\.read\.ptx\.sreg\.tid\.x\(\))"); -<<<<<<< HEAD - REQUIRE(std::regex_search(module_string, m, block_id)); - REQUIRE(std::regex_search(module_string, m, block_dim)); - REQUIRE(std::regex_search(module_string, m, tid)); -======= std::regex grid_dim(R"(call i32 @llvm\.nvvm\.read\.ptx\.sreg\.nctaid\.x\(\))"); REQUIRE(std::regex_search(module_string, m, block_id)); REQUIRE(std::regex_search(module_string, m, block_dim)); REQUIRE(std::regex_search(module_string, m, tid)); REQUIRE(std::regex_search(module_string, m, grid_dim)); ->>>>>>> georgemitenkov/llvm-gpu-codegen } } } From b3de1e1f2eb8f667d89c4fc440e28bb9fd700aeb Mon Sep 17 00:00:00 2001 From: George Mitenkov Date: Tue, 22 Mar 2022 14:38:14 +0100 Subject: [PATCH 228/331] [LLVM][GPU] Basic code generation for NVPTX backend (#820) * Kernel annotations are now generated when targeting GPU platforms * Lowering of `CodegenThreadId` and `CodegenGridStride` was implemented using NVVM intrinsics to get thread/block id/dimensions and grid stride * Adapted code generation for GPU expressions * Added tests for annotations/intrinsics * GPU code generation is now enabled by this PR Co-authored-by: Pramod Kumbhar Co-authored-by: Pramod S Kumbhar --- src/codegen/llvm/codegen_llvm_visitor.cpp | 42 +++++++++---- src/codegen/llvm/codegen_llvm_visitor.hpp | 5 ++ src/codegen/llvm/llvm_ir_builder.cpp | 34 ++++++++++ src/codegen/llvm/llvm_ir_builder.hpp | 6 ++ test/unit/codegen/codegen_llvm_ir.cpp | 76 +++++++++++++++++++++++ 5 files changed, 152 insertions(+), 11 deletions(-) diff --git a/src/codegen/llvm/codegen_llvm_visitor.cpp b/src/codegen/llvm/codegen_llvm_visitor.cpp index 2f677cfbec..25fb173842 100644 --- a/src/codegen/llvm/codegen_llvm_visitor.cpp +++ b/src/codegen/llvm/codegen_llvm_visitor.cpp @@ -64,6 +64,16 @@ static bool can_vectorize(const 
ast::CodegenForStatement& statement, symtab::Sym return unsupported.empty() && supported.size() <= 1; } +void CodegenLLVMVisitor::annotate_kernel_with_nvvm(llvm::Function* kernel) { + llvm::Metadata* metadata[] = { + llvm::ValueAsMetadata::get(kernel), + llvm::MDString::get(*context, "kernel"), + llvm::ValueAsMetadata::get( + llvm::ConstantInt::get(llvm::Type::getInt32Ty(*context), 1))}; + llvm::MDNode* node = llvm::MDNode::get(*context, metadata); + module->getOrInsertNamedMetadata("nvvm.annotations")->addOperand(node); +} + #if LLVM_VERSION_MAJOR >= 13 void CodegenLLVMVisitor::add_vectorizable_functions_from_vec_lib(llvm::TargetLibraryInfoImpl& tli, llvm::Triple& triple) { @@ -665,11 +675,19 @@ void CodegenLLVMVisitor::visit_codegen_function(const ast::CodegenFunction& node ir_builder.allocate_function_arguments(func, arguments); // Process function or procedure body. If the function is a compute kernel, enable - // vectorization. If so, the return statement is handled in a separate visitor. - if (platform.is_cpu_with_simd() && is_kernel_function(name)) { - ir_builder.generate_vector_ir(); - block->accept(*this); - ir_builder.generate_scalar_ir(); + // vectorization or add NVVM annotations. If this is the case, the return statement is + // handled in a separate visitor. + if (is_kernel_function(name)) { + if (platform.is_cpu_with_simd()) { + ir_builder.generate_vector_ir(); + block->accept(*this); + ir_builder.generate_scalar_ir(); + } else if (platform.is_gpu()) { + block->accept(*this); + annotate_kernel_with_nvvm(func); + } else { // scalar + block->accept(*this); + } } else { block->accept(*this); } @@ -685,6 +703,10 @@ void CodegenLLVMVisitor::visit_codegen_function(const ast::CodegenFunction& node ir_builder.clear_function(); } +void CodegenLLVMVisitor::visit_codegen_grid_stride(const ast::CodegenGridStride& node) { + ir_builder.create_grid_stride(); +} + void CodegenLLVMVisitor::visit_codegen_return_statement(const ast::CodegenReturnStatement& node) { if (!node.get_statement()->is_name()) throw std::runtime_error("Error: CodegenReturnStatement must contain a name node\n"); @@ -694,6 +716,10 @@ void CodegenLLVMVisitor::visit_codegen_return_statement(const ast::CodegenReturn ir_builder.create_return(ret_value); } +void CodegenLLVMVisitor::visit_codegen_thread_id(const ast::CodegenThreadId& node) { + ir_builder.create_thread_id(); +} + void CodegenLLVMVisitor::visit_codegen_var_list_statement( const ast::CodegenVarListStatement& node) { llvm::Type* scalar_type = get_codegen_var_type(*node.get_var_type()); @@ -821,12 +847,6 @@ void CodegenLLVMVisitor::visit_program(const ast::Program& node) { sym_tab = node.get_symbol_table(); std::string kernel_id = v.get_kernel_id(); - // \todo: implement GPU codegen functionality. - if (platform.is_gpu()) { - logger->warn("GPU code generation is not supported yet, aborting!"); - return; - } - // Initialize the builder for this NMODL program. 
ir_builder.initialize(*sym_tab, kernel_id); diff --git a/src/codegen/llvm/codegen_llvm_visitor.hpp b/src/codegen/llvm/codegen_llvm_visitor.hpp index 396d8cbb67..27150ff296 100644 --- a/src/codegen/llvm/codegen_llvm_visitor.hpp +++ b/src/codegen/llvm/codegen_llvm_visitor.hpp @@ -137,7 +137,9 @@ class CodegenLLVMVisitor: public visitor::ConstAstVisitor { void visit_codegen_atomic_statement(const ast::CodegenAtomicStatement& node) override; void visit_codegen_for_statement(const ast::CodegenForStatement& node) override; void visit_codegen_function(const ast::CodegenFunction& node) override; + void visit_codegen_grid_stride(const ast::CodegenGridStride& node) override; void visit_codegen_return_statement(const ast::CodegenReturnStatement& node) override; + void visit_codegen_thread_id(const ast::CodegenThreadId& node) override; void visit_codegen_var_list_statement(const ast::CodegenVarListStatement& node) override; void visit_double(const ast::Double& node) override; void visit_function_block(const ast::FunctionBlock& node) override; @@ -156,6 +158,9 @@ class CodegenLLVMVisitor: public visitor::ConstAstVisitor { void wrap_kernel_functions(); private: + // Annotates kernel function with NVVM metadata. + void annotate_kernel_with_nvvm(llvm::Function* kernel); + #if LLVM_VERSION_MAJOR >= 13 /// Populates target library info with the vector library definitions. void add_vectorizable_functions_from_vec_lib(llvm::TargetLibraryInfoImpl& tli, diff --git a/src/codegen/llvm/llvm_ir_builder.cpp b/src/codegen/llvm/llvm_ir_builder.cpp index e7a6a4a60b..c851f02970 100644 --- a/src/codegen/llvm/llvm_ir_builder.cpp +++ b/src/codegen/llvm/llvm_ir_builder.cpp @@ -10,6 +10,7 @@ #include "llvm/ADT/StringSwitch.h" #include "llvm/IR/Function.h" +#include "llvm/IR/IntrinsicsNVPTX.h" #include "llvm/IR/ValueSymbolTable.h" namespace nmodl { @@ -554,6 +555,39 @@ void IRBuilder::maybe_replicate_value(llvm::Value* value) { } } +void IRBuilder::create_grid_stride() { + llvm::Module* m = builder.GetInsertBlock()->getParent()->getParent(); + auto create_call = [&](llvm::Intrinsic::ID id) { + llvm::Function* intrinsic = llvm::Intrinsic::getDeclaration(m, id); + return builder.CreateCall(intrinsic, {}); + }; + + llvm::Value* block_dim = create_call(llvm::Intrinsic::nvvm_read_ptx_sreg_ntid_x); + llvm::Value* grid_dim = create_call(llvm::Intrinsic::nvvm_read_ptx_sreg_nctaid_x); + llvm::Value* stride = builder.CreateMul(block_dim, grid_dim); + + value_stack.push_back(stride); +} + +void IRBuilder::create_thread_id() { + llvm::Module* m = builder.GetInsertBlock()->getParent()->getParent(); + auto create_call = [&](llvm::Intrinsic::ID id) { + llvm::Function* intrinsic = llvm::Intrinsic::getDeclaration(m, id); + return builder.CreateCall(intrinsic, {}); + }; + + // For now, this function only supports NVPTX backend, however it can be easily + // adjusted to generate thread id variable for any other platform. 
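+    // The special-register reads below (ctaid.x, ntid.x, tid.x) are the NVVM
+    // equivalent of the CUDA expression:
+    //     id = blockIdx.x * blockDim.x + threadIdx.x;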
+    llvm::Value* block_id = create_call(llvm::Intrinsic::nvvm_read_ptx_sreg_ctaid_x);
+    llvm::Value* block_dim = create_call(llvm::Intrinsic::nvvm_read_ptx_sreg_ntid_x);
+    llvm::Value* tmp = builder.CreateMul(block_id, block_dim);
+
+    llvm::Value* tid = create_call(llvm::Intrinsic::nvvm_read_ptx_sreg_tid_x);
+    llvm::Value* id = builder.CreateAdd(tmp, tid);
+
+    value_stack.push_back(id);
+}
+

 /****************************************************************************************/
 /*                               LLVM block utilities                                   */

diff --git a/src/codegen/llvm/llvm_ir_builder.hpp b/src/codegen/llvm/llvm_ir_builder.hpp
index cf9e7f936d..1b144afcfd 100644
--- a/src/codegen/llvm/llvm_ir_builder.hpp
+++ b/src/codegen/llvm/llvm_ir_builder.hpp
@@ -230,6 +230,12 @@ class IRBuilder {
     void create_scalar_or_vector_alloca(const std::string& name,
                                         llvm::Type* element_or_scalar_type);

+    /// Creates an expression of the form: blockDim.x * gridDim.x
+    void create_grid_stride();
+
+    /// Creates an expression of the form: blockIdx.x * blockDim.x + threadIdx.x
+    void create_thread_id();
+
     /// Generates LLVM IR for the given unary operator.
     void create_unary_op(llvm::Value* value, ast::UnaryOp op);

diff --git a/test/unit/codegen/codegen_llvm_ir.cpp b/test/unit/codegen/codegen_llvm_ir.cpp
index bf87c868e1..f15e924481 100644
--- a/test/unit/codegen/codegen_llvm_ir.cpp
+++ b/test/unit/codegen/codegen_llvm_ir.cpp
@@ -34,6 +34,32 @@ using nmodl::parser::NmodlDriver;
 // Utility to get LLVM module as a string
 //=============================================================================

+std::string run_gpu_llvm_visitor(const std::string& text,
+                                 int opt_level = 0,
+                                 bool use_single_precision = false,
+                                 std::string math_library = "none",
+                                 bool nmodl_inline = false) {
+    NmodlDriver driver;
+    const auto& ast = driver.parse_string(text);
+
+    SymtabVisitor().visit_program(*ast);
+    if (nmodl_inline) {
+        InlineVisitor().visit_program(*ast);
+    }
+    NeuronSolveVisitor().visit_program(*ast);
+    SolveBlockVisitor().visit_program(*ast);
+
+    codegen::Platform gpu_platform(codegen::PlatformID::GPU, /*name=*/"nvidia",
+                                   math_library, use_single_precision, 1);
+    codegen::CodegenLLVMVisitor llvm_visitor(
+        /*mod_filename=*/"unknown",
+        /*output_dir=*/".", gpu_platform, opt_level,
+        /*add_debug_information=*/false);
+
+    llvm_visitor.visit_program(*ast);
+    return llvm_visitor.dump_module();
+}
+
 std::string run_llvm_visitor(const std::string& text,
                              int opt_level = 0,
                              bool use_single_precision = false,
@@ -1573,3 +1599,53 @@ SCENARIO("GPU kernel body", "[visitor][llvm][gpu]") {
         }
     }
 }
+
+//=============================================================================
+// Basic NVVM/LLVM IR generation for GPU platforms
+//=============================================================================
+
+SCENARIO("GPU kernel body IR generation", "[visitor][llvm][gpu]") {
+    GIVEN("For GPU platforms") {
+        std::string nmodl_text = R"(
+            NEURON {
+                SUFFIX test
+                RANGE x, y
+            }
+
+            ASSIGNED { x y }
+
+            STATE { m }
+
+            BREAKPOINT {
+                SOLVE states METHOD cnexp
+            }
+
+            DERIVATIVE states {
+                m = y + 2
+            }
+        )";
+
+        THEN("kernel annotations are added and thread id intrinsics generated") {
+            std::string module_string = run_gpu_llvm_visitor(nmodl_text,
+                                                             /*opt_level=*/0,
+                                                             /*use_single_precision=*/false);
+            std::smatch m;
+
+            // Check kernel annotations are correctly created.
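+            // For a mod file with SUFFIX test, the matched IR looks roughly like:
+            //     !nvvm.annotations = !{!0}
+            //     !0 = !{void (%test__instance_var__type*)* @nrn_state_test, !"kernel", i32 1}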
+            std::regex annotations(R"(!nvvm\.annotations = !\{!0\})");
+            std::regex kernel_data(R"(!0 = !\{void \(%.*__instance_var__type\*\)\* @nrn_state_.*, !\"kernel\", i32 1\})");
+            REQUIRE(std::regex_search(module_string, m, annotations));
+            REQUIRE(std::regex_search(module_string, m, kernel_data));
+
+            // Check thread/block id/dim intrinsics are created.
+            std::regex block_id(R"(call i32 @llvm\.nvvm\.read\.ptx\.sreg\.ctaid\.x\(\))");
+            std::regex block_dim(R"(call i32 @llvm\.nvvm\.read\.ptx\.sreg\.ntid\.x\(\))");
+            std::regex tid(R"(call i32 @llvm\.nvvm\.read\.ptx\.sreg\.tid\.x\(\))");
+            std::regex grid_dim(R"(call i32 @llvm\.nvvm\.read\.ptx\.sreg\.nctaid\.x\(\))");
+            REQUIRE(std::regex_search(module_string, m, block_id));
+            REQUIRE(std::regex_search(module_string, m, block_dim));
+            REQUIRE(std::regex_search(module_string, m, tid));
+            REQUIRE(std::regex_search(module_string, m, grid_dim));
+        }
+    }
+}

From fad728d2b1da4542619ae1a9a58e52c8e2cc202e Mon Sep 17 00:00:00 2001
From: Ioannis Magkanaris <ioannis.magkanaris@epfl.ch>
Date: Wed, 23 Mar 2022 18:10:34 +0100
Subject: [PATCH 229/331] More changes to make the LLVM IR generated by NMODL
 executable on GPU

- Added hardcoded triple for GPU
- Run without debug flags
- Removed kernel attributes on GPU kernels

Execute with:
./bin/nmodl ../test.mod --output "llvm_cuda" llvm --no-debug --ir gpu --name "cuda" --math-library libdevice benchmark --run --libs "/home/magkanar/spack_software/linux-ubuntu20.04-skylake/gcc-11.1.0/cuda-11.4.2-ygshzt/nvvm/libdevice/libdevice.10.bc" --opt-level-codegen 3 --instance-size 10000000 --repeat 1
---
 src/codegen/llvm/codegen_llvm_visitor.cpp |  8 +++++++-
 test/benchmark/cuda_driver.cpp            | 11 +++++++++++
 test/benchmark/cuda_driver.hpp            |  4 ++++
 test/benchmark/llvm_benchmark.cpp         |  3 +--
 4 files changed, 23 insertions(+), 3 deletions(-)

diff --git a/src/codegen/llvm/codegen_llvm_visitor.cpp b/src/codegen/llvm/codegen_llvm_visitor.cpp
index 72c8be7cc5..c27a8443e0 100644
--- a/src/codegen/llvm/codegen_llvm_visitor.cpp
+++ b/src/codegen/llvm/codegen_llvm_visitor.cpp
@@ -468,6 +468,10 @@ void CodegenLLVMVisitor::write_to_variable(const ast::VarName& node, llvm::Value
 }

 void CodegenLLVMVisitor::wrap_kernel_functions() {
+    // Wrappers are not used on GPU
+    if (platform.is_gpu()) {
+        return;
+    }
     // First, identify all kernels.
     std::vector<std::string> kernel_names;
     find_kernel_names(kernel_names);
@@ -695,7 +699,9 @@ void CodegenLLVMVisitor::visit_codegen_function(const ast::CodegenFunction& node
     // If function is a compute kernel, add a void terminator explicitly, since there is no
     // `CodegenReturnVar` node. Also, set the necessary attributes.
     if (is_kernel_function(name)) {
-        ir_builder.set_kernel_attributes();
+        if (!platform.is_gpu()) {
+            ir_builder.set_kernel_attributes();
+        }
         ir_builder.create_return();
     }

diff --git a/test/benchmark/cuda_driver.cpp b/test/benchmark/cuda_driver.cpp
index f479b3614e..b78a7274d3 100644
--- a/test/benchmark/cuda_driver.cpp
+++ b/test/benchmark/cuda_driver.cpp
@@ -63,6 +63,7 @@ void CUDADriver::load_libraries(BenchmarkInfo* benchmark_info) {
             prog, libdevice_module.c_str(), libdevice_module_size, "libdevice"));
     }
 }
+
 auto get_compilation_options(int compute_version_major, BenchmarkInfo* benchmark_info) {
     std::vector<std::string> compilation_options;
     // Set the correct architecture to generate the PTX for
@@ -81,6 +82,14 @@ void print_ptx_to_file(const std::string& ptx_compiled_module, const std::string
     ptx_file.close();
 }

+/// Sets the target triple and the data layout of the module.
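+/// Assuming a 64-bit CUDA device, the layout string below is the one expected by
+/// NVVM/libdevice; if the module kept the host's default triple and data layout,
+/// the NVPTX backend could reject or miscompile it.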
+void set_triple_and_data_layout(llvm::Module& module) { + module.setDataLayout( + "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v16:16:16-" + "v32:32:32-v64:64:64-v128:128:128-n16:32:64"); + module.setTargetTriple("nvptx64-nvidia-cuda"); +} + void CUDADriver::init(const std::string& gpu, BenchmarkInfo* benchmark_info) { // CUDA initialization checkCudaErrors(cuInit(0)); @@ -104,6 +113,8 @@ void CUDADriver::init(const std::string& gpu, BenchmarkInfo* benchmark_info) { throw std::runtime_error("ERROR: Device 0 is not SM 2.0 or greater"); } + set_triple_and_data_layout(*module); + // Save the LLVM IR module to string std::string kernel_llvm_ir; llvm::raw_string_ostream os(kernel_llvm_ir); diff --git a/test/benchmark/cuda_driver.hpp b/test/benchmark/cuda_driver.hpp index 91c91c0370..2874b046c6 100644 --- a/test/benchmark/cuda_driver.hpp +++ b/test/benchmark/cuda_driver.hpp @@ -24,6 +24,7 @@ #include "benchmark_info.hpp" #include "cuda.h" +#include "cuda_runtime.h" #include "gpu_parameters.hpp" #include "nvvm.h" @@ -105,6 +106,7 @@ class CUDADriver { nullptr, kernel_parameters, nullptr)); + cudaDeviceSynchronize(); } /// Lookups the entry-point with arguments in the CUDA module and executes it. @@ -113,6 +115,7 @@ class CUDADriver { ArgType arg, const GPUExecutionParameters& gpu_execution_parameters) { // Get kernel function + logger->info("Executing kernel {}", entry_point); checkCudaErrors(cuModuleGetFunction(&function, cudaModule, entry_point.c_str())); // Kernel launch @@ -128,6 +131,7 @@ class CUDADriver { nullptr, kernel_parameters, nullptr)); + cudaDeviceSynchronize(); } }; diff --git a/test/benchmark/llvm_benchmark.cpp b/test/benchmark/llvm_benchmark.cpp index c15d0e873c..958189c4a4 100644 --- a/test/benchmark/llvm_benchmark.cpp +++ b/test/benchmark/llvm_benchmark.cpp @@ -80,7 +80,7 @@ void LLVMBenchmark::run_benchmark_on_cpu(const std::shared_ptr& no // Record the execution time of the kernel. std::string wrapper_name = "__" + kernel_name + "_wrapper"; auto start = std::chrono::steady_clock::now(); - runner.run_with_argument(kernel_name, instance_data.base_ptr); + runner.run_with_argument(wrapper_name, instance_data.base_ptr); auto end = std::chrono::steady_clock::now(); std::chrono::duration diff = end - start; @@ -140,7 +140,6 @@ void LLVMBenchmark::run_benchmark_on_gpu(const std::shared_ptr& no } // Record the execution time of the kernel. 
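+        // On GPU the kernel is launched directly under its original name; the
+        // `__<kernel>_wrapper` entry point is only used on the CPU path above.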
- std::string wrapper_name = "__" + kernel_name + "_wrapper"; auto start = std::chrono::steady_clock::now(); runner.run_with_argument(kernel_name, instance_data.base_ptr, From 26161f90aa73ec95d1437a5fd662ab965e21a476 Mon Sep 17 00:00:00 2001 From: Ioannis Magkanaris Date: Wed, 23 Mar 2022 18:17:52 +0100 Subject: [PATCH 230/331] Small cleanup --- src/codegen/llvm/codegen_llvm_visitor.cpp | 2 +- src/language/codegen.yaml | 16 ---------------- 2 files changed, 1 insertion(+), 17 deletions(-) diff --git a/src/codegen/llvm/codegen_llvm_visitor.cpp b/src/codegen/llvm/codegen_llvm_visitor.cpp index c27a8443e0..63278b63e4 100644 --- a/src/codegen/llvm/codegen_llvm_visitor.cpp +++ b/src/codegen/llvm/codegen_llvm_visitor.cpp @@ -39,7 +39,7 @@ static bool is_supported_statement(const ast::Statement& statement) { return statement.is_codegen_atomic_statement() || statement.is_codegen_for_statement() || statement.is_if_statement() || statement.is_codegen_return_statement() || statement.is_codegen_var_list_statement() || statement.is_expression_statement() || - statement.is_while_statement() || statement.is_codegen_thread_id(); + statement.is_while_statement(); } /// A utility to check that the kernel body can be vectorised. diff --git a/src/language/codegen.yaml b/src/language/codegen.yaml index dd0d034356..93f1fca50d 100644 --- a/src/language/codegen.yaml +++ b/src/language/codegen.yaml @@ -307,19 +307,3 @@ - rhs: brief: "Expression for atomic operation" type: Expression - - CodegenThreadId: - brief: "Represents a generic thread id expression for GPU code generation" - description: | - For GPU code generation, we use a special AST node to enocde the thread - id calculation. In NMODL, this expression is usually of the form: - \code{.cpp} - id = blockId.x * blockDim.x + threadId.x - \endcode - To be able to support multiple GPU backends, we choose to have a custom AST - node. Therefore, the code generation for this node is kept very simple, - mapping expression to target-specific GPU inrinsics. 
- nmodl: "GPU_ID " - members: - - name: - brief: "Name of the thread id variable" - type: Identifier From d68671e86fe98482f0028dc66af65391f813d1ae Mon Sep 17 00:00:00 2001 From: Ioannis Magkanaris Date: Wed, 23 Mar 2022 18:37:41 +0100 Subject: [PATCH 231/331] Print kernel wrappers and nrn_init based on Instance Struct (#551) * Setup all the parameters of `__instance_var__type` in `setup_instance` * Generate wrapper functions for `nrn_cur_`, `nrn_init_` and `nrn_state_` * Print LLVM IR code to `.ll` * Only assign the correct pointer to the index variables * Added unit test and integration tests for oacc and ispc backend Co-authored-by: Pramod Kumbhar --- src/codegen/codegen_c_visitor.cpp | 42 +++- src/codegen/codegen_c_visitor.hpp | 33 ++- src/codegen/codegen_info.cpp | 16 +- src/codegen/codegen_info.hpp | 2 +- src/codegen/codegen_ispc_visitor.hpp | 2 +- src/codegen/codegen_naming.hpp | 6 + .../llvm/codegen_llvm_helper_visitor.cpp | 18 +- .../llvm/codegen_llvm_helper_visitor.hpp | 8 +- src/codegen/llvm/codegen_llvm_visitor.cpp | 228 ++++++++++++++++- src/codegen/llvm/codegen_llvm_visitor.hpp | 156 +++++++++++- src/printer/code_printer.cpp | 7 + src/printer/code_printer.hpp | 2 + test/integration/CMakeLists.txt | 3 + test/unit/CMakeLists.txt | 11 +- test/unit/codegen/codegen_llvm_visitor.cpp | 231 ++++++++++++++++++ 15 files changed, 709 insertions(+), 56 deletions(-) create mode 100644 test/unit/codegen/codegen_llvm_visitor.cpp diff --git a/src/codegen/codegen_c_visitor.cpp b/src/codegen/codegen_c_visitor.cpp index 79ee3967e0..2dfdedec51 100644 --- a/src/codegen/codegen_c_visitor.cpp +++ b/src/codegen/codegen_c_visitor.cpp @@ -712,6 +712,22 @@ bool CodegenCVisitor::is_constant_variable(const std::string& name) const { is_constant = true; } } + // Check whether the variable exists in the codegen_int_variables of the CodegenInfo struct + // which hold information whether the variables are const or not + const auto& int_variable_it = std::find_if(info.codegen_int_variables.begin(), + info.codegen_int_variables.end(), + [&name](const IndexVariableInfo& var) { + return var.symbol->get_name() == name; + }); + const auto& const_variable_it = std::find_if(info.constant_variables.begin(), + info.constant_variables.end(), + [&name](const IndexVariableInfo& var) { + return var.symbol->get_name() == name; + }); + is_constant = is_constant || + (int_variable_it != info.codegen_int_variables.end() && + int_variable_it->is_constant) || + const_variable_it != info.constant_variables.end(); return is_constant; } @@ -803,6 +819,9 @@ std::string CodegenCVisitor::get_parameter_str(const ParamVector& params) { return param; } +void CodegenCVisitor::print_backend_compute_routine_decl() { + // backend specific, do nothing +} void CodegenCVisitor::print_channel_iteration_task_begin(BlockType type) { // backend specific, do nothing @@ -915,13 +934,19 @@ bool CodegenCVisitor::shadow_vector_setup_required() { } +void CodegenCVisitor::print_channel_iteration_loop(const std::string& start = "start", + const std::string& end = "end") { + printer->start_block("for (int id = {}; id < {}; id++)"_format(start, end)); +} + + /** * \details For CPU backend we iterate over all node counts. For cuda we use thread * index to check if block needs to be executed or not. 
*/ void CodegenCVisitor::print_channel_iteration_block_begin(BlockType type) { print_channel_iteration_block_parallel_hint(type); - printer->start_block("for (int id = start; id < end; id++)"); + print_channel_iteration_loop(); } @@ -989,7 +1014,7 @@ void CodegenCVisitor::print_atomic_reduction_pragma() { void CodegenCVisitor::print_shadow_reduction_block_begin() { - printer->start_block("for (int id = start; id < end; id++)"); + print_channel_iteration_loop(); } @@ -4431,11 +4456,13 @@ void CodegenCVisitor::print_g_unused() const { void CodegenCVisitor::print_compute_functions() { print_top_verbatim_blocks(); print_function_prototypes(); - for (const auto& procedure: info.procedures) { - print_procedure(*procedure); - } - for (const auto& function: info.functions) { - print_function(*function); + if (print_procedures_and_functions) { + for (const auto& procedure: info.procedures) { + print_procedure(*procedure); + } + for (const auto& function: info.functions) { + print_function(*function); + } } for (size_t i = 0; i < info.before_after_blocks.size(); i++) { print_before_after_block(info.before_after_blocks[i], i); @@ -4444,6 +4471,7 @@ void CodegenCVisitor::print_compute_functions() { auto block = callback->get_node_to_solve().get(); print_derivimplicit_kernel(block); } + print_backend_compute_routine_decl(); print_net_send_buffering(); print_net_init(); print_watch_activate(); diff --git a/src/codegen/codegen_c_visitor.hpp b/src/codegen/codegen_c_visitor.hpp index 096b0b845d..b02c2941c7 100644 --- a/src/codegen/codegen_c_visitor.hpp +++ b/src/codegen/codegen_c_visitor.hpp @@ -165,6 +165,11 @@ class CodegenCVisitor: public visitor::ConstAstVisitor { */ int current_watch_statement = 0; + /** + * Bool to select whether procedures and functions should be printed in the generated file + */ + bool print_procedures_and_functions = true; + /** * Data type of floating point variables */ @@ -262,6 +267,10 @@ class CodegenCVisitor: public visitor::ConstAstVisitor { return codegen::naming::DEFAULT_INTEGER_TYPE; } + /** + * Instance Struct type name suffix + */ + std::string instance_struct_type_suffix = "Instance"; /** * Checks if given function name is \c net_send @@ -295,7 +304,7 @@ class CodegenCVisitor: public visitor::ConstAstVisitor { * Name of structure that wraps range variables */ std::string instance_struct() const { - return "{}_Instance"_format(info.mod_suffix); + return "{}_{}"_format(info.mod_suffix, instance_struct_type_suffix); } @@ -1090,6 +1099,18 @@ class CodegenCVisitor: public visitor::ConstAstVisitor { void print_net_event_call(const ast::FunctionCall& node); + /** + * Print the for loop statement going through all the mechanism instances + */ + void print_channel_iteration_loop(const std::string& start, const std::string& end); + + + /** + * Print backend compute routines declaration for various backends + */ + virtual void print_backend_compute_routine_decl(); + + /** * Print channel iterations from which tasks are created * @@ -1643,19 +1664,19 @@ class CodegenCVisitor: public visitor::ConstAstVisitor { * \param skip_init_check \c true if we want the generated code to execute the initialization * conditionally */ - void print_nrn_init(bool skip_init_check = true); + virtual void print_nrn_init(bool skip_init_check = true); /** * Print nrn_state / state update function definition */ - void print_nrn_state(); + virtual void print_nrn_state(); /** * Print nrn_cur / current update function definition */ - void print_nrn_cur(); + virtual void print_nrn_cur(); /** * Print fast 
membrane current calculation code @@ -1744,12 +1765,12 @@ class CodegenCVisitor: public visitor::ConstAstVisitor { /** * Print the structure that wraps all range and int variables required for the NMODL */ - void print_mechanism_range_var_structure(); + virtual void print_mechanism_range_var_structure(); /** * Print the function that initialize instance structure */ - void print_instance_variable_setup(); + virtual void print_instance_variable_setup(); void visit_binary_expression(const ast::BinaryExpression& node) override; void visit_binary_operator(const ast::BinaryOperator& node) override; diff --git a/src/codegen/codegen_info.cpp b/src/codegen/codegen_info.cpp index 26696fbc18..522922552e 100644 --- a/src/codegen/codegen_info.cpp +++ b/src/codegen/codegen_info.cpp @@ -251,7 +251,7 @@ void CodegenInfo::get_int_variables() { // not have doubles between read/write. Same // name variables are allowed for (const auto& var: ion.reads) { - const std::string name = "ion_" + var; + const std::string name = naming::ION_VARNAME_PREFIX + var; codegen_int_variables.emplace_back(make_symbol(name)); codegen_int_variables.back().is_constant = true; ion_vars[name] = codegen_int_variables.size() - 1; @@ -261,16 +261,17 @@ void CodegenInfo::get_int_variables() { std::shared_ptr ion_di_dv_var = nullptr; for (const auto& var: ion.writes) { - const std::string name = "ion_" + var; + const std::string name = naming::ION_VARNAME_PREFIX + var; const auto ion_vars_it = ion_vars.find(name); if (ion_vars_it != ion_vars.end()) { codegen_int_variables[ion_vars_it->second].is_constant = false; } else { - codegen_int_variables.emplace_back(make_symbol("ion_" + var)); + codegen_int_variables.emplace_back(make_symbol(naming::ION_VARNAME_PREFIX + var)); } if (ion.is_ionic_current(var)) { - ion_di_dv_var = make_symbol("ion_di" + ion.name + "dv"); + ion_di_dv_var = make_symbol(std::string(naming::ION_VARNAME_PREFIX) + "di" + + ion.name + "dv"); } if (ion.is_intra_cell_conc(var) || ion.is_extra_cell_conc(var)) { need_style = true; @@ -347,10 +348,11 @@ void CodegenInfo::get_int_variables() { void CodegenInfo::get_shadow_variables() { for (const auto& ion: ions) { for (const auto& var: ion.writes) { - codegen_shadow_variables.push_back({make_symbol(shadow_varname("ion_" + var))}); + codegen_shadow_variables.push_back( + {make_symbol(shadow_varname(naming::ION_VARNAME_PREFIX + var))}); if (ion.is_ionic_current(var)) { - codegen_shadow_variables.push_back( - {make_symbol(shadow_varname("ion_di" + ion.name + "dv"))}); + codegen_shadow_variables.push_back({make_symbol(shadow_varname( + std::string(naming::ION_VARNAME_PREFIX) + "di" + ion.name + "dv"))}); } } } diff --git a/src/codegen/codegen_info.hpp b/src/codegen/codegen_info.hpp index 70940e7428..5cbb8eb2aa 100644 --- a/src/codegen/codegen_info.hpp +++ b/src/codegen/codegen_info.hpp @@ -565,7 +565,7 @@ struct CodegenInfo { if (artificial_cell) { return false; } - return nrn_state_block != nullptr || currents.empty(); + return nrn_state_block != nullptr || breakpoint_exist(); } /** diff --git a/src/codegen/codegen_ispc_visitor.hpp b/src/codegen/codegen_ispc_visitor.hpp index e14b05e000..d5d11cd113 100644 --- a/src/codegen/codegen_ispc_visitor.hpp +++ b/src/codegen/codegen_ispc_visitor.hpp @@ -165,7 +165,7 @@ class CodegenIspcVisitor: public CodegenCVisitor { void print_procedure(const ast::ProcedureBlock& node) override; - void print_backend_compute_routine_decl(); + void print_backend_compute_routine_decl() override; /// print wrapper function that calls ispc kernel diff 
--git a/src/codegen/codegen_naming.hpp b/src/codegen/codegen_naming.hpp index 910d35e4c1..b6c8aa9df1 100644 --- a/src/codegen/codegen_naming.hpp +++ b/src/codegen/codegen_naming.hpp @@ -158,6 +158,12 @@ static constexpr char THREAD_ARGS_PROTO[] = "_threadargsproto_"; /// prefix for ion variable static constexpr char ION_VARNAME_PREFIX[] = "ion_"; +/// name of the mechanism instance parameter in LLVM IR +static constexpr char MECH_INSTANCE_VAR[] = "mech"; +static constexpr char MECH_NODECOUNT_VAR[] = "node_count"; + +/// name of induction variable used in the kernel. +static constexpr char INDUCTION_VAR[] = "id"; /// commonly used variables in verbatim block and how they /// should be mapped to new code generation backends diff --git a/src/codegen/llvm/codegen_llvm_helper_visitor.cpp b/src/codegen/llvm/codegen_llvm_helper_visitor.cpp index ae36c1bce2..22ce0c3de8 100644 --- a/src/codegen/llvm/codegen_llvm_helper_visitor.cpp +++ b/src/codegen/llvm/codegen_llvm_helper_visitor.cpp @@ -244,7 +244,7 @@ std::shared_ptr CodegenLLVMHelperVisitor::create_instance_s add_var_with_type(naming::NTHREAD_DT_VARIABLE, FLOAT_TYPE, /*is_pointer=*/0); add_var_with_type(naming::CELSIUS_VARIABLE, FLOAT_TYPE, /*is_pointer=*/0); add_var_with_type(naming::SECOND_ORDER_VARIABLE, INTEGER_TYPE, /*is_pointer=*/0); - add_var_with_type(NODECOUNT_VAR, INTEGER_TYPE, /*is_pointer=*/0); + add_var_with_type(naming::MECH_NODECOUNT_VAR, INTEGER_TYPE, /*is_pointer=*/0); return std::make_shared(codegen_vars); } @@ -462,7 +462,7 @@ void CodegenLLVMHelperVisitor::convert_to_instance_variable(ast::Node& node, /// instance_var_helper check of instance variables from mod file as well /// as extra variables like ion index variables added for code generation if (instance_var_helper.is_an_instance_variable(variable_name)) { - auto name = new ast::Name(new ast::String(MECH_INSTANCE_VAR)); + auto name = new ast::Name(new ast::String(naming::MECH_INSTANCE_VAR)); auto var = std::make_shared(name, variable->clone()); variable->set_name(var); } @@ -641,7 +641,7 @@ void CodegenLLVMHelperVisitor::visit_nrn_state_block(ast::NrnStateBlock& node) { { /// access node index and corresponding voltage index_statements.push_back( - visitor::create_statement("node_id = node_index[{}]"_format(INDUCTION_VAR))); + visitor::create_statement("node_id = node_index[{}]"_format(naming::INDUCTION_VAR))); body_statements.push_back( visitor::create_statement("v = {}[node_id]"_format(VOLTAGE_VAR))); @@ -683,7 +683,7 @@ void CodegenLLVMHelperVisitor::visit_nrn_state_block(ast::NrnStateBlock& node) { compute_body.insert(compute_body.end(), index_statements.begin(), index_statements.end()); compute_body.insert(compute_body.end(), body_statements.begin(), body_statements.end()); - std::vector induction_variables{INDUCTION_VAR}; + std::vector induction_variables{naming::INDUCTION_VAR}; function_statements.push_back( create_local_variable_statement(induction_variables, INTEGER_TYPE)); @@ -705,7 +705,7 @@ void CodegenLLVMHelperVisitor::visit_nrn_state_block(ast::NrnStateBlock& node) { ast::CodegenVarWithTypeVector code_arguments; auto instance_var_type = new ast::CodegenVarType(ast::AstNodeType::INSTANCE_STRUCT); - auto instance_var_name = new ast::Name(new ast::String(MECH_INSTANCE_VAR)); + auto instance_var_name = new ast::Name(new ast::String(naming::MECH_INSTANCE_VAR)); auto instance_var = new ast::CodegenVarWithType(instance_var_type, 1, instance_var_name); code_arguments.emplace_back(instance_var); @@ -742,9 +742,9 @@ void 
CodegenLLVMHelperVisitor::create_compute_body_loop(std::shared_ptr& int_variables, std::vector& double_variables, bool is_remainder_loop) { - const auto& initialization = loop_initialization_expression(INDUCTION_VAR, is_remainder_loop); - const auto& condition = loop_count_expression(INDUCTION_VAR, NODECOUNT_VAR, is_remainder_loop); - const auto& increment = loop_increment_expression(INDUCTION_VAR, is_remainder_loop); + const auto& initialization = loop_initialization_expression(naming::INDUCTION_VAR, is_remainder_loop); + const auto& condition = loop_count_expression(naming::INDUCTION_VAR, NODECOUNT_VAR, is_remainder_loop); + const auto& increment = loop_increment_expression(naming::INDUCTION_VAR, is_remainder_loop); // Clone the statement block if needed since it can be used by the remainder loop. auto loop_block = (is_remainder_loop || !platform.is_cpu_with_simd()) ? block : std::shared_ptr(block->clone()); @@ -760,7 +760,7 @@ void CodegenLLVMHelperVisitor::create_compute_body_loop(std::shared_ptrget_value(); // Otherwise, the length is taken from the macro. - const auto& macro = sym_tab->lookup(integer->get_macro()->get_node_name()); + const auto& macro = program_symtab->lookup(integer->get_macro()->get_node_name()); return static_cast(*macro->get_value()); } @@ -755,7 +752,7 @@ void CodegenLLVMVisitor::visit_function_call(const ast::FunctionCall& node) { if (func) { create_function_call(func, name, node.get_arguments()); } else { - auto symbol = sym_tab->lookup(name); + auto symbol = program_symtab->lookup(name); if (symbol && symbol->has_any_property(symtab::syminfo::NmodlType::extern_method)) { create_external_function_call(name, node.get_arguments()); } else { @@ -844,11 +841,11 @@ void CodegenLLVMVisitor::visit_program(const ast::Program& node) { CodegenLLVMHelperVisitor v{platform}; const auto& functions = v.get_codegen_functions(node); instance_var_helper = v.get_instance_var_helper(); - sym_tab = node.get_symbol_table(); + program_symtab = node.get_symbol_table(); std::string kernel_id = v.get_kernel_id(); // Initialize the builder for this NMODL program. - ir_builder.initialize(*sym_tab, kernel_id); + ir_builder.initialize(*program_symtab, kernel_id); // Create compile unit if adding debug information to the module. if (add_debug_information) { @@ -861,6 +858,9 @@ void CodegenLLVMVisitor::visit_program(const ast::Program& node) { create_function_declaration(*func); } + // Set the AST symbol table. + program_symtab = node.get_symbol_table(); + // Proceed with code generation. Right now, we do not do // node.visit_children(*this); // The reason is that the node may contain AST nodes for which the visitor functions have been @@ -921,6 +921,218 @@ void CodegenLLVMVisitor::visit_program(const ast::Program& node) { } logger->debug("Dumping generated IR...\n" + dump_module()); + // Setup CodegenHelper for C++ wrapper file + setup(node); + print_wrapper_routines(); + print_target_file(); +} + +void CodegenLLVMVisitor::print_mechanism_range_var_structure() { + printer->add_newline(2); + printer->add_line("/** Instance Struct passed as argument to LLVM IR kernels */"); + printer->start_block("struct {} "_format(instance_struct())); + for (const auto& variable: instance_var_helper.instance->get_codegen_vars()) { + auto is_pointer = variable->get_is_pointer(); + auto name = to_nmodl(variable->get_name()); + auto qualifier = is_constant_variable(name) ? k_const() : ""; + auto nmodl_type = variable->get_type()->get_type(); + auto pointer = is_pointer ? 
"*" : ""; + auto var_name = variable->get_node_name(); + switch (nmodl_type) { +#define DISPATCH(type, c_type) \ + case type: \ + printer->add_line("{}{}{} {}{};"_format( \ + qualifier, c_type, pointer, is_pointer ? ptr_type_qualifier() : "", var_name)); \ + break; + + DISPATCH(ast::AstNodeType::DOUBLE, "double"); + DISPATCH(ast::AstNodeType::INTEGER, "int"); + +#undef DISPATCH + default: + throw std::runtime_error("Error: unsupported type found in instance struct"); + } + } + printer->end_block(); + printer->add_text(";"); + printer->add_newline(); +} + +void CodegenLLVMVisitor::print_instance_variable_setup() { + if (range_variable_setup_required()) { + print_setup_range_variable(); + } + + if (shadow_vector_setup_required()) { + print_shadow_vector_setup(); + } + printer->add_newline(2); + printer->add_line("/** initialize mechanism instance variables */"); + printer->start_block("static inline void setup_instance(NrnThread* nt, Memb_list* ml) "); + printer->add_line("{0}* inst = ({0}*) mem_alloc(1, sizeof({0}));"_format(instance_struct())); + if (channel_task_dependency_enabled() && !info.codegen_shadow_variables.empty()) { + printer->add_line("setup_shadow_vectors(inst, ml);"); + } + + std::string stride; + printer->add_line("int pnodecount = ml->_nodecount_padded;"); + stride = "*pnodecount"; + + printer->add_line("Datum* indexes = ml->pdata;"); + + std::string float_type = default_float_data_type(); + std::string int_type = default_int_data_type(); + std::string float_type_pointer = float_type + "*"; + std::string int_type_pointer = int_type + "*"; + + int id = 0; + std::vector variables_to_free; + + for (auto& var: info.codegen_float_variables) { + auto name = var->get_name(); + auto range_var_type = get_range_var_float_type(var); + if (float_type == range_var_type) { + auto variable = "ml->data+{}{}"_format(id, stride); + auto device_variable = get_variable_device_pointer(variable, float_type_pointer); + printer->add_line("inst->{} = {};"_format(name, device_variable)); + } else { + printer->add_line("inst->{} = setup_range_variable(ml->data+{}{}, pnodecount);"_format( + name, id, stride)); + variables_to_free.push_back(name); + } + id += var->get_length(); + } + + for (auto& var: info.codegen_int_variables) { + auto name = var.symbol->get_name(); + std::string variable = name; + std::string type = ""; + if (var.is_index || var.is_integer) { + variable = "ml->pdata"; + type = int_type_pointer; + } else if (var.is_vdata) { + variable = "nt->_vdata"; + type = "void**"; + } else { + variable = "nt->_data"; + type = info.artificial_cell ? 
"void*" : float_type_pointer; + } + auto device_variable = get_variable_device_pointer(variable, type); + printer->add_line("inst->{} = {};"_format(name, device_variable)); + } + + int index_id = 0; + // for integer variables, there should be index + for (const auto& int_var: info.codegen_int_variables) { + std::string var_name = int_var.symbol->get_name() + "_index"; + // Create for loop that instantiates the ion__index with + // indexes[*pdnodecount] + printer->add_line("inst->{} = indexes+{}*pnodecount;"_format(var_name, index_id)); + index_id++; + } + + // Pass voltage pointer to the the instance struct + printer->add_line("inst->voltage = nt->_actual_v;"); + + // Pass ml->nodeindices pointer to node_index + printer->add_line("inst->node_index = ml->nodeindices;"); + + // Setup global variables + printer->add_line("inst->{0} = nt->{0};"_format(naming::NTHREAD_T_VARIABLE)); + printer->add_line("inst->{0} = nt->{0};"_format(naming::NTHREAD_DT_VARIABLE)); + printer->add_line("inst->{0} = {0};"_format(naming::CELSIUS_VARIABLE)); + printer->add_line("inst->{0} = {0};"_format(naming::SECOND_ORDER_VARIABLE)); + printer->add_line("inst->{} = ml->nodecount;"_format(naming::MECH_NODECOUNT_VAR)); + + printer->add_line("ml->instance = inst;"); + printer->end_block(3); + + printer->add_line("/** cleanup mechanism instance variables */"); + printer->start_block("static inline void cleanup_instance(Memb_list* ml) "); + printer->add_line("{0}* inst = ({0}*) ml->instance;"_format(instance_struct())); + if (range_variable_setup_required()) { + for (auto& var: variables_to_free) { + printer->add_line("mem_free((void*)inst->{});"_format(var)); + } + } + printer->add_line("mem_free((void*)inst);"); + printer->end_block(1); +} + +CodegenLLVMVisitor::ParamVector CodegenLLVMVisitor::get_compute_function_parameter() { + auto params = ParamVector(); + params.emplace_back(param_type_qualifier(), + "{}*"_format(instance_struct()), + ptr_type_qualifier(), + "inst"); + return params; +} + +void CodegenLLVMVisitor::print_backend_compute_routine_decl() { + auto params = get_compute_function_parameter(); + auto compute_function = compute_method_name(BlockType::Initial); + + printer->add_newline(2); + printer->add_line("extern void {}({});"_format(compute_function, get_parameter_str(params))); + + if (info.nrn_cur_required()) { + compute_function = compute_method_name(BlockType::Equation); + printer->add_line( + "extern void {}({});"_format(compute_function, get_parameter_str(params))); + } + + if (info.nrn_state_required()) { + compute_function = compute_method_name(BlockType::State); + printer->add_line( + "extern void {}({});"_format(compute_function, get_parameter_str(params))); + } +} + +// Copied from CodegenIspcVisitor +void CodegenLLVMVisitor::print_wrapper_routine(const std::string& wrapper_function, + BlockType type) { + static const auto args = "NrnThread* nt, Memb_list* ml, int type"; + const auto function_name = method_name(wrapper_function); + auto compute_function = compute_method_name(type); + + printer->add_newline(2); + printer->start_block("void {}({})"_format(function_name, args)); + printer->add_line("int nodecount = ml->nodecount;"); + // clang-format off + printer->add_line("{0}* {1}inst = ({0}*) ml->instance;"_format(instance_struct(), ptr_type_qualifier())); + // clang-format on + + if (type == BlockType::Initial) { + printer->add_newline(); + printer->add_line("setup_instance(nt, ml);"); + printer->add_newline(); + printer->start_block("if (_nrn_skip_initmodel)"); + 
printer->add_line("return;"); + printer->end_block(); + printer->add_newline(); + } + + printer->add_line("{}(inst);"_format(compute_function)); + printer->end_block(); + printer->add_newline(); +} + +void CodegenLLVMVisitor::print_nrn_init(bool skip_init_check) { + print_wrapper_routine(naming::NRN_INIT_METHOD, BlockType::Initial); +} + +void CodegenLLVMVisitor::print_nrn_cur() { + print_wrapper_routine(naming::NRN_CUR_METHOD, BlockType::Equation); +} + +void CodegenLLVMVisitor::print_nrn_state() { + print_wrapper_routine(naming::NRN_STATE_METHOD, BlockType::State); +} + +void CodegenLLVMVisitor::print_wrapper_routines() { + printer = wrapper_printer; + wrapper_codegen = true; + CodegenCVisitor::print_codegen_routines(); } void CodegenLLVMVisitor::visit_procedure_block(const ast::ProcedureBlock& node) { diff --git a/src/codegen/llvm/codegen_llvm_visitor.hpp b/src/codegen/llvm/codegen_llvm_visitor.hpp index 27150ff296..683cc7972a 100644 --- a/src/codegen/llvm/codegen_llvm_visitor.hpp +++ b/src/codegen/llvm/codegen_llvm_visitor.hpp @@ -18,6 +18,7 @@ #include #include +#include "codegen/codegen_c_visitor.hpp" #include "codegen/llvm/codegen_llvm_helper_visitor.hpp" #include "codegen/llvm/llvm_debug_builder.hpp" #include "codegen/llvm/llvm_ir_builder.hpp" @@ -49,13 +50,16 @@ namespace codegen { * \class CodegenLLVMVisitor * \brief %Visitor for transforming NMODL AST to LLVM IR */ -class CodegenLLVMVisitor: public visitor::ConstAstVisitor { +class CodegenLLVMVisitor: public CodegenCVisitor { /// Name of mod file (without .mod suffix). std::string mod_filename; /// Output directory for code generation. std::string output_dir; + /// flag to indicate if visitor should print the the wrapper code + bool wrapper_codegen = false; + private: /// Underlying LLVM context. std::unique_ptr context = std::make_unique(); @@ -72,9 +76,6 @@ class CodegenLLVMVisitor: public visitor::ConstAstVisitor { /// Add debug information to the module. bool add_debug_information; - /// Pointer to AST symbol table. - symtab::SymbolTable* sym_tab; - /// Instance variable helper. InstanceVarHelper instance_var_helper; @@ -91,13 +92,45 @@ class CodegenLLVMVisitor: public visitor::ConstAstVisitor { int opt_level_ir, bool add_debug_information = false, std::vector fast_math_flags = {}) - : mod_filename(mod_filename) + : CodegenCVisitor(mod_filename, + output_dir, + platform.is_single_precision() ? "float" : "double", + false, + ".ll", + ".cpp") + , mod_filename(mod_filename) , output_dir(output_dir) , platform(platform) , opt_level_ir(opt_level_ir) , add_debug_information(add_debug_information) , ir_builder(*context, platform, fast_math_flags) - , debug_builder(*module) {} + , debug_builder(*module) { + instance_struct_type_suffix = "_instance_var__type"; + print_procedures_and_functions = false; + } + + CodegenLLVMVisitor(const std::string& mod_filename, + std::ostream& stream, + Platform& platform, + int opt_level_ir, + bool add_debug_information = false, + std::vector fast_math_flags = {}) + : CodegenCVisitor(mod_filename, + stream, + platform.is_single_precision() ? "float" : "double", + false, + ".ll", + ".cpp") + , mod_filename(mod_filename) + , output_dir(".") + , platform(platform) + , opt_level_ir(opt_level_ir) + , add_debug_information(add_debug_information) + , ir_builder(*context, platform, fast_math_flags) + , debug_builder(*module) { + instance_struct_type_suffix = "_instance_var__type"; + print_procedures_and_functions = false; + } /// Dumps the generated LLVM IR module to string. 
std::string dump_module() const {
@@ -108,6 +141,10 @@ class CodegenLLVMVisitor: public visitor::ConstAstVisitor {
         return str;
     }
 
+    void print_target_file() const {
+        target_printer->add_multi_line(dump_module());
+    }
+
     /// Fills the container with the names of kernel functions from the MOD file.
     void find_kernel_names(std::vector& container);
 
@@ -153,6 +190,113 @@ class CodegenLLVMVisitor: public visitor::ConstAstVisitor {
     void visit_var_name(const ast::VarName& node) override;
     void visit_while_statement(const ast::WhileStatement& node) override;
 
+    /*
+     * Override functions from CodegenCVisitor so that they dispatch to the ones from
+     * visitor::ConstAstVisitor, as was originally the case for CodegenLLVMVisitor
+     */
+    void visit_binary_operator(const ast::BinaryOperator& node) override {
+        visitor::ConstAstVisitor::visit_binary_operator(node);
+    }
+    void visit_else_if_statement(const ast::ElseIfStatement& node) override {
+        visitor::ConstAstVisitor::visit_else_if_statement(node);
+    }
+    void visit_else_statement(const ast::ElseStatement& node) override {
+        visitor::ConstAstVisitor::visit_else_statement(node);
+    }
+    void visit_float(const ast::Float& node) override {
+        visitor::ConstAstVisitor::visit_float(node);
+    }
+    void visit_from_statement(const ast::FromStatement& node) override {
+        visitor::ConstAstVisitor::visit_from_statement(node);
+    }
+    void visit_eigen_newton_solver_block(const ast::EigenNewtonSolverBlock& node) override {
+        visitor::ConstAstVisitor::visit_eigen_newton_solver_block(node);
+    }
+    void visit_eigen_linear_solver_block(const ast::EigenLinearSolverBlock& node) override {
+        visitor::ConstAstVisitor::visit_eigen_linear_solver_block(node);
+    }
+    void visit_indexed_name(const ast::IndexedName& node) override {
+        visitor::ConstAstVisitor::visit_indexed_name(node);
+    }
+    void visit_local_list_statement(const ast::LocalListStatement& node) override {
+        visitor::ConstAstVisitor::visit_local_list_statement(node);
+    }
+    void visit_name(const ast::Name& node) override {
+        visitor::ConstAstVisitor::visit_name(node);
+    }
+    void visit_paren_expression(const ast::ParenExpression& node) override {
+        visitor::ConstAstVisitor::visit_paren_expression(node);
+    }
+    void visit_prime_name(const ast::PrimeName& node) override {
+        visitor::ConstAstVisitor::visit_prime_name(node);
+    }
+    void visit_string(const ast::String& node) override {
+        visitor::ConstAstVisitor::visit_string(node);
+    }
+    void visit_solution_expression(const ast::SolutionExpression& node) override {
+        visitor::ConstAstVisitor::visit_solution_expression(node);
+    }
+    void visit_unary_operator(const ast::UnaryOperator& node) override {
+        visitor::ConstAstVisitor::visit_unary_operator(node);
+    }
+    void visit_unit(const ast::Unit& node) override {
+        visitor::ConstAstVisitor::visit_unit(node);
+    }
+    void visit_verbatim(const ast::Verbatim& node) override {
+        visitor::ConstAstVisitor::visit_verbatim(node);
+    }
+    void visit_watch_statement(const ast::WatchStatement& node) override {
+        visitor::ConstAstVisitor::visit_watch_statement(node);
+    }
+    void visit_derivimplicit_callback(const ast::DerivimplicitCallback& node) override {
+        visitor::ConstAstVisitor::visit_derivimplicit_callback(node);
+    }
+    void visit_for_netcon(const ast::ForNetcon& node) override {
+        visitor::ConstAstVisitor::visit_for_netcon(node);
+    }
+
+    /*
+     * Functions related to printing the wrapper cpp file
+     */
+    void print_wrapper_routines() override;
+    void print_wrapper_headers_include();
+    void print_data_structures();
+    void print_mechanism_range_var_structure() override;
+    void
print_instance_variable_setup() override; + + /** + * Print the \c nrn\_init function definition + * \param skip_init_check \c true if we want the generated code to execute the initialization + * conditionally + */ + void print_nrn_init(bool skip_init_check = true) override; + /** + * Print nrn_state / state update function definition + */ + void print_nrn_state() override; + /** + * Print nrn_cur / current update function definition + */ + void print_nrn_cur() override; + /* + * Declare the external compute functions (nrn_init, nrn_cur and nrn_state) + */ + void print_backend_compute_routine_decl() override; + /* + * Define the wrappers for the external compute functions (nrn_init, nrn_cur and nrn_state) + */ + void print_backend_compute_routine(); + /* + * Print the wrapper routine based on the parameters given + * \param wrapper_function The name of the function to wrap + * \param type The \c BlockType that this function is based on + */ + void print_wrapper_routine(const std::string& wrapper_function, BlockType type); + /* + * Function that returns a vector of Parameters needed to be passed to the compute routines. + * The first argument should be an object of \c mechanism_instance_struct_type_name + */ + CodegenLLVMVisitor::ParamVector get_compute_function_parameter(); /// Wraps all kernel function calls into wrapper functions that use `void*` to pass the data to /// the kernel. void wrap_kernel_functions(); diff --git a/src/printer/code_printer.cpp b/src/printer/code_printer.cpp index a754ccff30..1f8a0ec9a1 100644 --- a/src/printer/code_printer.cpp +++ b/src/printer/code_printer.cpp @@ -40,6 +40,13 @@ void CodePrinter::start_block(std::string&& text) { indent_level++; } +void CodePrinter::start_block(const std::string& text) { + add_indent(); + *result << text << " {"; + add_newline(); + indent_level++; +} + void CodePrinter::add_indent() { *result << std::string(indent_level * NUM_SPACES, ' '); } diff --git a/src/printer/code_printer.hpp b/src/printer/code_printer.hpp index b0d6d5d1fd..505a153bc6 100644 --- a/src/printer/code_printer.hpp +++ b/src/printer/code_printer.hpp @@ -66,6 +66,8 @@ class CodePrinter { void start_block(std::string&&); + void start_block(const std::string& text); + void add_text(const std::string&); void add_line(const std::string&, int num_new_lines = 1); diff --git a/test/integration/CMakeLists.txt b/test/integration/CMakeLists.txt index 3343748dcf..cedbdf2f3e 100644 --- a/test/integration/CMakeLists.txt +++ b/test/integration/CMakeLists.txt @@ -11,4 +11,7 @@ file(GLOB modfiles "${NMODL_PROJECT_SOURCE_DIR}/test/integration/mod/*.mod") foreach(modfile ${modfiles}) get_filename_component(modfile_name "${modfile}" NAME) add_test(NAME ${modfile_name} COMMAND ${PROJECT_BINARY_DIR}/bin/nmodl ${modfile}) + add_test(NAME ${modfile_name}_oacc COMMAND ${PROJECT_BINARY_DIR}/bin/nmodl ${modfile} host --c + acc --oacc) + add_test(NAME ${modfile_name}_ispc COMMAND ${PROJECT_BINARY_DIR}/bin/nmodl ${modfile} host --ispc) endforeach() diff --git a/test/unit/CMakeLists.txt b/test/unit/CMakeLists.txt index 4e30d48f1e..8174215729 100644 --- a/test/unit/CMakeLists.txt +++ b/test/unit/CMakeLists.txt @@ -111,8 +111,9 @@ if(NMODL_ENABLE_LLVM) add_library(benchmark_data STATIC codegen/codegen_data_helper.cpp) add_dependencies(benchmark_data lexer) - add_executable(testllvm visitor/main.cpp codegen/codegen_llvm_ir.cpp - codegen/codegen_data_helper.cpp codegen/codegen_llvm_instance_struct.cpp) + add_executable( + testllvm visitor/main.cpp codegen/codegen_llvm_ir.cpp 
codegen/codegen_data_helper.cpp + codegen/codegen_llvm_instance_struct.cpp codegen/codegen_llvm_visitor.cpp) add_executable(test_llvm_runner visitor/main.cpp codegen/codegen_data_helper.cpp codegen/codegen_llvm_execution.cpp) if(NMODL_ENABLE_LLVM_CUDA) @@ -156,8 +157,10 @@ endif() set(testvisitor_env "PYTHONPATH=${PROJECT_BINARY_DIR}/lib:$ENV{PYTHONPATH}") if(NOT LINK_AGAINST_PYTHON) list(APPEND testvisitor_env "NMODL_PYLIB=$ENV{NMODL_PYLIB}") - list(APPEND testvisitor_env - "NMODL_WRAPLIB=${PROJECT_BINARY_DIR}/lib/nmodl/libpywrapper${CMAKE_SHARED_LIBRARY_SUFFIX}") + list( + APPEND + testvisitor_env + "NMODL_WRAPLIB=${PROJECT_BINARY_DIR}/lib/nmodl/libpywrapper${CMAKE_SHARED_LIBRARY_SUFFIX}") endif() foreach( diff --git a/test/unit/codegen/codegen_llvm_visitor.cpp b/test/unit/codegen/codegen_llvm_visitor.cpp new file mode 100644 index 0000000000..d2a058b3c5 --- /dev/null +++ b/test/unit/codegen/codegen_llvm_visitor.cpp @@ -0,0 +1,231 @@ +/************************************************************************* + * Copyright (C) 2019-2021 Blue Brain Project + * + * This file is part of NMODL distributed under the terms of the GNU + * Lesser General Public License. See top-level LICENSE file for details. + *************************************************************************/ + +#include + +#include "ast/program.hpp" +#include "codegen/codegen_helper_visitor.hpp" +#include "codegen/llvm/codegen_llvm_visitor.hpp" +#include "config/config.h" +#include "parser/nmodl_driver.hpp" +#include "test/unit/utils/test_utils.hpp" +#include "visitors/neuron_solve_visitor.hpp" +#include "visitors/solve_block_visitor.hpp" +#include "visitors/symtab_visitor.hpp" +#include "visitors/units_visitor.hpp" + +using namespace nmodl; +using namespace visitor; +using namespace codegen; + +using nmodl::NrnUnitsLib; +using nmodl::parser::NmodlDriver; +using nmodl::test_utils::reindent_text; + +/// Run LLVM codegen visitor and get instance struct declaration and setup of C++ wrapper +std::string get_wrapper_instance_struct(const std::string& nmodl_text) { + const auto& ast = NmodlDriver().parse_string(nmodl_text); + std::stringbuf strbuf; + std::ostream oss(&strbuf); + /// directory where units lib file is located + std::string units_dir(NrnUnitsLib::get_path()); + /// parse units of text + UnitsVisitor(units_dir).visit_program(*ast); + SymtabVisitor().visit_program(*ast); + NeuronSolveVisitor().visit_program(*ast); + SolveBlockVisitor().visit_program(*ast); + + /// create LLVM and C++ wrapper code generation visitor + codegen::Platform cpu_platform(/*use_single_precision=*/false, /*instruction_width=*/1); + codegen::CodegenLLVMVisitor llvm_visitor("hh.mod", oss, cpu_platform, 0); + llvm_visitor.visit_program(*ast); + strbuf.str(""); + llvm_visitor.print_mechanism_range_var_structure(); + llvm_visitor.print_instance_variable_setup(); + return strbuf.str(); +} + +SCENARIO("Check instance struct declaration and setup in wrapper", + "[codegen][llvm][instance_struct]") { + GIVEN("hh: simple mod file") { + std::string nmodl_text = R"( + TITLE hh.mod squid sodium, potassium, and leak channels + + UNITS { + (mA) = (milliamp) + (mV) = (millivolt) + (S) = (siemens) + } + + NEURON { + SUFFIX hh + USEION na READ ena WRITE ina + USEION k READ ek WRITE ik + NONSPECIFIC_CURRENT il + RANGE gnabar, gkbar, gl, el, gna, gk + RANGE minf, hinf, ninf, mtau, htau, ntau + THREADSAFE : assigned GLOBALs will be per thread + } + + PARAMETER { + gnabar = .12 (S/cm2) <0,1e9> + gkbar = .036 (S/cm2) <0,1e9> + gl = .0003 (S/cm2) <0,1e9> + el = 
-54.3 (mV) + } + + STATE { + m h n + } + + ASSIGNED { + v (mV) + celsius (degC) + ena (mV) + ek (mV) + gna (S/cm2) + gk (S/cm2) + ina (mA/cm2) + ik (mA/cm2) + il (mA/cm2) + minf hinf ninf + mtau (ms) htau (ms) ntau (ms) + } + + BREAKPOINT { + SOLVE states METHOD cnexp + gna = gnabar*m*m*m*h + ina = gna*(v - ena) + gk = gkbar*n*n*n*n + ik = gk*(v - ek) + il = gl*(v - el) + } + + DERIVATIVE states { + m' = (minf-m)/mtau + h' = (hinf-h)/htau + n' = (ninf-n)/ntau + } + )"; + + std::string generated_instance_struct_declaration = R"( + struct hh__instance_var__type { + const double* __restrict__ gnabar; + const double* __restrict__ gkbar; + const double* __restrict__ gl; + const double* __restrict__ el; + double* __restrict__ gna; + double* __restrict__ gk; + double* __restrict__ il; + double* __restrict__ minf; + double* __restrict__ hinf; + double* __restrict__ ninf; + double* __restrict__ mtau; + double* __restrict__ htau; + double* __restrict__ ntau; + double* __restrict__ m; + double* __restrict__ h; + double* __restrict__ n; + double* __restrict__ Dm; + double* __restrict__ Dh; + double* __restrict__ Dn; + double* __restrict__ ena; + double* __restrict__ ek; + double* __restrict__ ina; + double* __restrict__ ik; + double* __restrict__ v_unused; + double* __restrict__ g_unused; + const double* __restrict__ ion_ena; + double* __restrict__ ion_ina; + double* __restrict__ ion_dinadv; + const double* __restrict__ ion_ek; + double* __restrict__ ion_ik; + double* __restrict__ ion_dikdv; + int* __restrict__ ion_ena_index; + int* __restrict__ ion_ina_index; + int* __restrict__ ion_dinadv_index; + int* __restrict__ ion_ek_index; + int* __restrict__ ion_ik_index; + int* __restrict__ ion_dikdv_index; + double* __restrict__ voltage; + int* __restrict__ node_index; + double t; + double dt; + double celsius; + int secondorder; + int node_count; + }; + )"; + std::string generated_instance_struct_setup = R"( + static inline void setup_instance(NrnThread* nt, Memb_list* ml) { + hh__instance_var__type* inst = (hh__instance_var__type*) mem_alloc(1, sizeof(hh__instance_var__type)); + int pnodecount = ml->_nodecount_padded; + Datum* indexes = ml->pdata; + inst->gnabar = ml->data+0*pnodecount; + inst->gkbar = ml->data+1*pnodecount; + inst->gl = ml->data+2*pnodecount; + inst->el = ml->data+3*pnodecount; + inst->gna = ml->data+4*pnodecount; + inst->gk = ml->data+5*pnodecount; + inst->il = ml->data+6*pnodecount; + inst->minf = ml->data+7*pnodecount; + inst->hinf = ml->data+8*pnodecount; + inst->ninf = ml->data+9*pnodecount; + inst->mtau = ml->data+10*pnodecount; + inst->htau = ml->data+11*pnodecount; + inst->ntau = ml->data+12*pnodecount; + inst->m = ml->data+13*pnodecount; + inst->h = ml->data+14*pnodecount; + inst->n = ml->data+15*pnodecount; + inst->Dm = ml->data+16*pnodecount; + inst->Dh = ml->data+17*pnodecount; + inst->Dn = ml->data+18*pnodecount; + inst->ena = ml->data+19*pnodecount; + inst->ek = ml->data+20*pnodecount; + inst->ina = ml->data+21*pnodecount; + inst->ik = ml->data+22*pnodecount; + inst->v_unused = ml->data+23*pnodecount; + inst->g_unused = ml->data+24*pnodecount; + inst->ion_ena = nt->_data; + inst->ion_ina = nt->_data; + inst->ion_dinadv = nt->_data; + inst->ion_ek = nt->_data; + inst->ion_ik = nt->_data; + inst->ion_dikdv = nt->_data; + inst->ion_ena_index = indexes+0*pnodecount; + inst->ion_ina_index = indexes+1*pnodecount; + inst->ion_dinadv_index = indexes+2*pnodecount; + inst->ion_ek_index = indexes+3*pnodecount; + inst->ion_ik_index = indexes+4*pnodecount; + inst->ion_dikdv_index = 
indexes+5*pnodecount;
+                inst->voltage = nt->_actual_v;
+                inst->node_index = ml->nodeindices;
+                inst->t = nt->t;
+                inst->dt = nt->dt;
+                inst->celsius = celsius;
+                inst->secondorder = secondorder;
+                inst->node_count = ml->nodecount;
+                ml->instance = inst;
+            }
+        )";
+
+        THEN("index and nt variables") {
+            auto result_instance_struct_declaration_setup = reindent_text(
+                get_wrapper_instance_struct(nmodl_text));
+            std::cout << "Result\n" << result_instance_struct_declaration_setup << std::endl;
+
+            auto expected_instance_struct_declaration = reindent_text(
+                generated_instance_struct_declaration);
+            auto expected_instance_struct_setup = reindent_text(generated_instance_struct_setup);
+
+            REQUIRE(result_instance_struct_declaration_setup.find(
+                        expected_instance_struct_declaration) != std::string::npos);
+            REQUIRE(result_instance_struct_declaration_setup.find(expected_instance_struct_setup) !=
+                    std::string::npos);
+        }
+    }
+}

From b6463d328af632dbacb6ef0e04a519f89d9fcd87 Mon Sep 17 00:00:00 2001
From: George Mitenkov
Date: Thu, 24 Mar 2022 10:20:52 +0100
Subject: [PATCH 232/331] Added NVPTX-specific optimization passes for PTX
 generation

---
 cmake/LLVMHelper.cmake                    |  3 +
 src/codegen/llvm/codegen_llvm_visitor.cpp | 21 ++++++-
 src/codegen/llvm/llvm_utils.cpp           | 67 +++++++++++++++++++++++
 src/codegen/llvm/llvm_utils.hpp           |  6 ++
 4 files changed, 94 insertions(+), 3 deletions(-)

diff --git a/cmake/LLVMHelper.cmake b/cmake/LLVMHelper.cmake
index 9e4af5d503..717a597f95 100644
--- a/cmake/LLVMHelper.cmake
+++ b/cmake/LLVMHelper.cmake
@@ -15,6 +15,9 @@ set(NMODL_LLVM_COMPONENTS
   ipo
   mc
   native
+  nvptxcodegen
+  nvptxdesc
+  nvptxinfo
   orcjit
   target
   transformutils
diff --git a/src/codegen/llvm/codegen_llvm_visitor.cpp b/src/codegen/llvm/codegen_llvm_visitor.cpp
index 589e069ec4..b2f2f04e90 100644
--- a/src/codegen/llvm/codegen_llvm_visitor.cpp
+++ b/src/codegen/llvm/codegen_llvm_visitor.cpp
@@ -883,8 +883,9 @@ void CodegenLLVMVisitor::visit_program(const ast::Program& node) {
         throw std::runtime_error("Error: incorrect IR has been generated!\n" + ostream.str());
     }
 
-    if (opt_level_ir) {
-        logger->info("Running LLVM optimisation passes");
+    // Handle optimization passes for GPUs separately.
+    if (platform.is_cpu() && opt_level_ir) {
+        logger->info("Running LLVM optimisation passes for CPU platforms");
         utils::initialise_optimisation_passes();
         utils::optimise_module(*module, opt_level_ir);
     }
@@ -915,12 +916,26 @@
 #endif
     }
 
+    // Handle GPU optimizations (CUDA platforms only for now).
+    if (platform.is_gpu() && opt_level_ir) {
+        // We only support CUDA backends anyway, so this works for now.
+        utils::initialise_nvptx_passes();
+
+        std::string target_asm;
+        utils::optimise_module_for_nvptx(*module, opt_level_ir, target_asm);
+
+        logger->debug("Dumping generated IR...\n" + dump_module());
+        logger->debug("Dumping generated PTX...\n" + target_asm);
+    } else {
+        // Workaround for debug outputs.
+        logger->debug("Dumping generated IR...\n" + dump_module());
+    }
+
     // If the output directory is specified, save the IR to .ll file.
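+    // (the stream-based constructor sets output_dir to ".", so e.g. unit tests that print
+    // to a stream skip writing the .ll file)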
if (output_dir != ".") { utils::save_ir_to_ll_file(*module, output_dir + "/" + mod_filename); } - logger->debug("Dumping generated IR...\n" + dump_module()); // Setup CodegenHelper for C++ wrapper file setup(node); print_wrapper_routines(); diff --git a/src/codegen/llvm/llvm_utils.cpp b/src/codegen/llvm/llvm_utils.cpp index 59967c59c1..cae212aeb2 100644 --- a/src/codegen/llvm/llvm_utils.cpp +++ b/src/codegen/llvm/llvm_utils.cpp @@ -13,8 +13,10 @@ #include "llvm/IR/Verifier.h" #include "llvm/InitializePasses.h" #include "llvm/Support/FileSystem.h" +#include "llvm/Support/TargetSelect.h" #include "llvm/Support/ToolOutputFile.h" #include "llvm/Target/TargetMachine.h" +#include "llvm/Transforms/IPO.h" #include "llvm/Transforms/IPO/PassManagerBuilder.h" namespace nmodl { @@ -61,6 +63,71 @@ static void run_optimisation_passes(llvm::Module& module, /* Optimisation utils */ /****************************************************************************************/ +void initialise_nvptx_passes() { + // Register targets. + LLVMInitializeNVPTXTarget(); + LLVMInitializeNVPTXTargetMC(); + LLVMInitializeNVPTXTargetInfo(); + LLVMInitializeNVPTXAsmPrinter(); + + // Initialize passes. + initialise_optimisation_passes(); +} + +void optimise_module_for_nvptx(llvm::Module& module, int opt_level, std::string& target_asm) { + // CUDA target machine we generating code for. + std::unique_ptr tm; + + // Hardcode target infromation for now. Change if necessary. + llvm::Triple triple("nvptx64-nvidia-cuda"); + std::string subtarget = "sm_20"; + std::string features = "+ptx60"; + + // Find the specified target in registry. + std::string error_msg; + auto* target = llvm::TargetRegistry::lookupTarget("", triple, error_msg); + if (!target) + throw std::runtime_error("Error: " + error_msg + "\n"); + + tm.reset(target->createTargetMachine(triple.str(), subtarget, features, {}, {})); + if (!tm) + throw std::runtime_error("Error: creating target machine failed! Aborting."); + + // Set data layout and target triple information for the module. Note + // that we may want to have a more elaborate layout than the one + // created by `createDataLayout()`. + module.setDataLayout(tm->createDataLayout()); + module.setTargetTriple("nvptx64-nvidia-cuda"); + + llvm::legacy::FunctionPassManager func_pm(&module); + llvm::legacy::PassManager module_pm; + llvm::PassManagerBuilder pm_builder; + pm_builder.OptLevel = opt_level; + pm_builder.SizeLevel = 0; + pm_builder.Inliner = llvm::createFunctionInliningPass(); + + // Do not vectorize! + pm_builder.LoopVectorize = false; + + // Adjusting pass manager adds target-specific IR transformations, e.g. + // inferring address spaces. + tm->adjustPassManager(pm_builder); + pm_builder.populateFunctionPassManager(func_pm); + pm_builder.populateModulePassManager(module_pm); + + // This runs target-indepependent optimizations. + run_optimisation_passes(module, func_pm, module_pm); + + // Now, we want to run target-specific (e.g. NVPTX) passes. In LLVM, this + // is done via `addPassesToEmitFile`. 
+ llvm::raw_string_ostream stream(target_asm); + llvm::buffer_ostream pstream(stream); + llvm::legacy::PassManager codegen_pm; + + tm->addPassesToEmitFile(codegen_pm, pstream, nullptr, llvm::CGFT_AssemblyFile); + codegen_pm.run(module); +} + void initialise_optimisation_passes() { auto& registry = *llvm::PassRegistry::getPassRegistry(); llvm::initializeCore(registry); diff --git a/src/codegen/llvm/llvm_utils.hpp b/src/codegen/llvm/llvm_utils.hpp index 8e1e6e48dc..90b9aed385 100644 --- a/src/codegen/llvm/llvm_utils.hpp +++ b/src/codegen/llvm/llvm_utils.hpp @@ -16,6 +16,12 @@ namespace utils { /// Initialises some LLVM optimisation passes. void initialise_optimisation_passes(); +/// Initialises NVPTX-specific optimisation passes. +void initialise_nvptx_passes(); + +/// Optimises the given LLVM IR module for NVPTX targets. +void optimise_module_for_nvptx(llvm::Module& module, int opt_level, std::string& target_asm); + /// Optimises the given LLVM IR module. void optimise_module(llvm::Module& module, int opt_level, llvm::TargetMachine* tm = nullptr); From a1513f81955b878cf0ee888d26fba34cbc0aa601 Mon Sep 17 00:00:00 2001 From: George Mitenkov Date: Thu, 24 Mar 2022 10:30:11 +0100 Subject: [PATCH 233/331] Added tests --- test/unit/codegen/codegen_llvm_ir.cpp | 43 +++++++++++++++++++++++++++ 1 file changed, 43 insertions(+) diff --git a/test/unit/codegen/codegen_llvm_ir.cpp b/test/unit/codegen/codegen_llvm_ir.cpp index f15e924481..675ceaf170 100644 --- a/test/unit/codegen/codegen_llvm_ir.cpp +++ b/test/unit/codegen/codegen_llvm_ir.cpp @@ -1648,4 +1648,47 @@ SCENARIO("GPU kernel body IR generation", "[visitor][llvm][gpu]") { REQUIRE(std::regex_search(module_string, m, grid_dim)); } } + + GIVEN("When optimizing for GPU platforms") { + std::string nmodl_text = R"( + NEURON { + SUFFIX test + RANGE x, y + } + + ASSIGNED { x y } + + STATE { m } + + BREAKPOINT { + SOLVE states METHOD cnexp + } + + DERIVATIVE states { + m = y + 2 + } + )"; + + THEN("address spaces are inferred and target information added") { + std::string module_string = run_gpu_llvm_visitor(nmodl_text, + /*opt_level=*/3, + /*use_single_precision=*/false); + std::smatch m; + + // Check target information. + // TODO: this may change when more platforms are supported. + std::regex data_layout(R"(target datalayout = \"e-i64:64-i128:128-v16:16-v32:32-n16:32:64\")"); + std::regex triple(R"(nvptx64-nvidia-cuda)"); + REQUIRE(std::regex_search(module_string, m, data_layout)); + REQUIRE(std::regex_search(module_string, m, triple)); + + // Check for address space casts and address spaces in general when loading data. 
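+            // (in the NVPTX backend, addrspace(1) denotes CUDA global memory, which is why
+            // generic instance-struct pointers are addrspacecast before loads)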
+ std::regex as_cast(R"(addrspacecast %.*__instance_var__type\* %.* to %.*__instance_var__type addrspace\(1\)\*)"); + std::regex gep_as1(R"(getelementptr inbounds %.*__instance_var__type, %.*__instance_var__type addrspace\(1\)\* %.*, i64 0, i32 .*)"); + std::regex load_as1(R"(load double\*, double\* addrspace\(1\)\* %.*)"); + REQUIRE(std::regex_search(module_string, m, as_cast)); + REQUIRE(std::regex_search(module_string, m, gep_as1)); + REQUIRE(std::regex_search(module_string, m, load_as1)); + } + } } From af5ed380ad24f3ef390d340325fe09dbb3a0e081 Mon Sep 17 00:00:00 2001 From: Ioannis Magkanaris Date: Thu, 24 Mar 2022 13:14:50 +0100 Subject: [PATCH 234/331] Print LLVM IR to file once --- src/codegen/llvm/codegen_llvm_visitor.cpp | 9 +++------ src/codegen/llvm/codegen_llvm_visitor.hpp | 4 ---- 2 files changed, 3 insertions(+), 10 deletions(-) diff --git a/src/codegen/llvm/codegen_llvm_visitor.cpp b/src/codegen/llvm/codegen_llvm_visitor.cpp index afead0e126..a71caf91fd 100644 --- a/src/codegen/llvm/codegen_llvm_visitor.cpp +++ b/src/codegen/llvm/codegen_llvm_visitor.cpp @@ -921,16 +921,13 @@ void CodegenLLVMVisitor::visit_program(const ast::Program& node) { #endif } - // If the output directory is specified, save the IR to .ll file. - if (output_dir != ".") { - utils::save_ir_to_ll_file(*module, output_dir + "/" + mod_filename); - } - logger->debug("Dumping generated IR...\n" + dump_module()); // Setup CodegenHelper for C++ wrapper file setup(node); + // Print C++ wrapper file print_wrapper_routines(); - print_target_file(); + // Print LLVM IR module to .ll file + utils::save_ir_to_ll_file(*module, output_dir + "/" + mod_filename); } void CodegenLLVMVisitor::print_mechanism_range_var_structure() { diff --git a/src/codegen/llvm/codegen_llvm_visitor.hpp b/src/codegen/llvm/codegen_llvm_visitor.hpp index 683cc7972a..9d005c71c4 100644 --- a/src/codegen/llvm/codegen_llvm_visitor.hpp +++ b/src/codegen/llvm/codegen_llvm_visitor.hpp @@ -141,10 +141,6 @@ class CodegenLLVMVisitor: public CodegenCVisitor { return str; } - void print_target_file() const { - target_printer->add_multi_line(dump_module()); - } - /// Fills the container with the names of kernel functions from the MOD file. void find_kernel_names(std::vector& container); From fba92e20db6f50b2e638068ebed8637f696adbc2 Mon Sep 17 00:00:00 2001 From: Ioannis Magkanaris Date: Thu, 24 Mar 2022 15:05:57 +0100 Subject: [PATCH 235/331] Changes to generate proper code --- src/codegen/llvm/codegen_llvm_visitor.cpp | 2 +- src/codegen/llvm/llvm_utils.cpp | 2 +- test/benchmark/cuda_driver.cpp | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/src/codegen/llvm/codegen_llvm_visitor.cpp b/src/codegen/llvm/codegen_llvm_visitor.cpp index 46ab6b5ff9..6734eceb5d 100644 --- a/src/codegen/llvm/codegen_llvm_visitor.cpp +++ b/src/codegen/llvm/codegen_llvm_visitor.cpp @@ -923,7 +923,7 @@ void CodegenLLVMVisitor::visit_program(const ast::Program& node) { } // Handle GPU optimizations (CUDA platfroms only for now). - if (platform.is_gpu() && opt_level_ir) { + if (platform.is_gpu()) { // We only support CUDA backends anyway, so this works for now. utils::initialise_nvptx_passes(); diff --git a/src/codegen/llvm/llvm_utils.cpp b/src/codegen/llvm/llvm_utils.cpp index cae212aeb2..b652c30aa1 100644 --- a/src/codegen/llvm/llvm_utils.cpp +++ b/src/codegen/llvm/llvm_utils.cpp @@ -80,7 +80,7 @@ void optimise_module_for_nvptx(llvm::Module& module, int opt_level, std::string& // Hardcode target infromation for now. Change if necessary. 
llvm::Triple triple("nvptx64-nvidia-cuda"); - std::string subtarget = "sm_20"; + std::string subtarget = "sm_60"; std::string features = "+ptx60"; // Find the specified target in registry. diff --git a/test/benchmark/cuda_driver.cpp b/test/benchmark/cuda_driver.cpp index b78a7274d3..ee3d4bff6d 100644 --- a/test/benchmark/cuda_driver.cpp +++ b/test/benchmark/cuda_driver.cpp @@ -113,7 +113,7 @@ void CUDADriver::init(const std::string& gpu, BenchmarkInfo* benchmark_info) { throw std::runtime_error("ERROR: Device 0 is not SM 2.0 or greater"); } - set_triple_and_data_layout(*module); + // set_triple_and_data_layout(*module); // Save the LLVM IR module to string std::string kernel_llvm_ir; From b42ebe7b536ce79932b45bd9ac9783db359805d3 Mon Sep 17 00:00:00 2001 From: George Mitenkov Date: Fri, 25 Mar 2022 09:16:11 +0100 Subject: [PATCH 236/331] Addressed comments for GPU code generation - Added CLI options to select 32- or 64-bit targets. - Added CLI option to select target chip (e.g. sm_35). - Changed data layout strings to the right ones. --- src/codegen/llvm/codegen_llvm_visitor.cpp | 7 ++-- src/codegen/llvm/llvm_utils.cpp | 42 +++++++++++++++-------- src/codegen/llvm/llvm_utils.hpp | 7 +++- src/codegen/llvm/target_platform.cpp | 10 ++++++ src/codegen/llvm/target_platform.hpp | 25 +++++++++++++- src/main.cpp | 7 ++-- test/unit/codegen/codegen_llvm_ir.cpp | 4 +-- 7 files changed, 80 insertions(+), 22 deletions(-) diff --git a/src/codegen/llvm/codegen_llvm_visitor.cpp b/src/codegen/llvm/codegen_llvm_visitor.cpp index b2f2f04e90..d906e9bd44 100644 --- a/src/codegen/llvm/codegen_llvm_visitor.cpp +++ b/src/codegen/llvm/codegen_llvm_visitor.cpp @@ -917,12 +917,15 @@ void CodegenLLVMVisitor::visit_program(const ast::Program& node) { } // Handle GPU optimizations (CUDA platfroms only for now). - if (platform.is_gpu() && opt_level_ir) { + if (platform.is_gpu()) { + if (!platform.is_CUDA_gpu()) + throw std::runtime_error("Error: unsupported GPU architecture!\n"); + // We only support CUDA backends anyway, so this works for now. utils::initialise_nvptx_passes(); std::string target_asm; - utils::optimise_module_for_nvptx(*module, opt_level_ir, target_asm); + utils::optimise_module_for_nvptx(platform, *module, opt_level_ir, target_asm); logger->debug("Dumping generated IR...\n" + dump_module()); logger->debug("Dumping generated PTX...\n" + target_asm); diff --git a/src/codegen/llvm/llvm_utils.cpp b/src/codegen/llvm/llvm_utils.cpp index cae212aeb2..2b5092b851 100644 --- a/src/codegen/llvm/llvm_utils.cpp +++ b/src/codegen/llvm/llvm_utils.cpp @@ -74,31 +74,45 @@ void initialise_nvptx_passes() { initialise_optimisation_passes(); } -void optimise_module_for_nvptx(llvm::Module& module, int opt_level, std::string& target_asm) { +void optimise_module_for_nvptx(codegen::Platform& platform, + llvm::Module& module, + int opt_level, + std::string& target_asm) { // CUDA target machine we generating code for. std::unique_ptr tm; - - // Hardcode target infromation for now. Change if necessary. - llvm::Triple triple("nvptx64-nvidia-cuda"); - std::string subtarget = "sm_20"; - std::string features = "+ptx60"; + std::string platform_name = platform.get_name(); + + // Target and layout information. 
+ static const std::map triple_str = { + {"nvptx", "nvptx-nvidia-cuda"}, + {"nvptx64", "nvptx64-nvidia-cuda"}}; + static const std::map data_layout_str = { + {"nvptx", "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32" + "-i64:64:64-f32:32:32-f64:64:64-v16:16:16-v32:32:32" + "-v64:64:64-v128:128:128-n16:32:64"}, + {"nvptx64", "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32" + "-i64:64:64-f32:32:32-f64:64:64-v16:16:16-v32:32:32" + "-v64:64:64-v128:128:128-n16:32:64"}}; + + // Set data layout and target triple information for the module. + auto triple = triple_str.find(platform_name)->second; + module.setDataLayout(data_layout_str.find(platform_name)->second); + module.setTargetTriple(triple); + + std::string subtarget = platform.get_subtarget_name(); + std::string features = "+ptx70"; // Find the specified target in registry. std::string error_msg; - auto* target = llvm::TargetRegistry::lookupTarget("", triple, error_msg); + auto* target = llvm::TargetRegistry::lookupTarget(triple, error_msg); if (!target) throw std::runtime_error("Error: " + error_msg + "\n"); - tm.reset(target->createTargetMachine(triple.str(), subtarget, features, {}, {})); + tm.reset(target->createTargetMachine(triple, subtarget, features, {}, {})); if (!tm) throw std::runtime_error("Error: creating target machine failed! Aborting."); - // Set data layout and target triple information for the module. Note - // that we may want to have a more elaborate layout than the one - // created by `createDataLayout()`. - module.setDataLayout(tm->createDataLayout()); - module.setTargetTriple("nvptx64-nvidia-cuda"); - + // Create pass managers. llvm::legacy::FunctionPassManager func_pm(&module); llvm::legacy::PassManager module_pm; llvm::PassManagerBuilder pm_builder; diff --git a/src/codegen/llvm/llvm_utils.hpp b/src/codegen/llvm/llvm_utils.hpp index 90b9aed385..17be5073e2 100644 --- a/src/codegen/llvm/llvm_utils.hpp +++ b/src/codegen/llvm/llvm_utils.hpp @@ -7,6 +7,8 @@ #pragma once +#include "codegen/llvm/target_platform.hpp" + #include "llvm/IR/Module.h" #include "llvm/Support/TargetRegistry.h" @@ -20,7 +22,10 @@ void initialise_optimisation_passes(); void initialise_nvptx_passes(); /// Optimises the given LLVM IR module for NVPTX targets. -void optimise_module_for_nvptx(llvm::Module& module, int opt_level, std::string& target_asm); +void optimise_module_for_nvptx(codegen::Platform& platform, + llvm::Module& module, + int opt_level, + std::string& target_asm); /// Optimises the given LLVM IR module. 
void optimise_module(llvm::Module& module, int opt_level, llvm::TargetMachine* tm = nullptr); diff --git a/src/codegen/llvm/target_platform.cpp b/src/codegen/llvm/target_platform.cpp index 6cb8c7bb2b..84fb1b9b36 100644 --- a/src/codegen/llvm/target_platform.cpp +++ b/src/codegen/llvm/target_platform.cpp @@ -30,6 +30,10 @@ bool Platform::is_gpu() { return platform_id == PlatformID::GPU; } +bool Platform::is_CUDA_gpu() { + return platform_id == PlatformID::GPU && (name == "nvptx" || name == "nvptx64"); +} + bool Platform::is_single_precision() { return use_single_precision; } @@ -38,6 +42,12 @@ std::string Platform::get_name() const { return name; } +std::string Platform::get_subtarget_name() const { + if (platform_id != PlatformID::GPU) + throw std::runtime_error("Error: platform must be a GPU to query the subtarget!\n"); + return subtarget_name; +} + std::string Platform::get_math_library() const { return math_library; } diff --git a/src/codegen/llvm/target_platform.hpp b/src/codegen/llvm/target_platform.hpp index 2eabbb1a4b..99b7ad33d9 100644 --- a/src/codegen/llvm/target_platform.hpp +++ b/src/codegen/llvm/target_platform.hpp @@ -32,11 +32,16 @@ class Platform { /// Name of the platform. const std::string name = Platform::DEFAULT_PLATFORM_NAME; + /// Target chip for GPUs. + /// TODO: this should only be available to GPUs! If we refactor target + /// classes so that GPUPlatform <: Platform, it will be nicer! + const std::string subtarget_name = "sm_35"; + /// Target-specific id to compare platforms easily. PlatformID platform_id; /// User-provided width that is used to construct LLVM instructions - // and types. + /// and types. int instruction_width = 1; /// Use single-precision floating-point types. @@ -46,6 +51,19 @@ class Platform { std::string math_library = Platform::DEFAULT_MATH_LIBRARY; public: + Platform(PlatformID platform_id, + const std::string& name, + const std::string& subtarget_name, + std::string& math_library, + bool use_single_precision = false, + int instruction_width = 1) + : platform_id(platform_id) + , name(name) + , subtarget_name(subtarget_name) + , math_library(math_library) + , use_single_precision(use_single_precision) + , instruction_width(instruction_width) {} + Platform(PlatformID platform_id, const std::string& name, std::string& math_library, @@ -77,10 +95,15 @@ class Platform { /// Checks if this platform is a GPU. bool is_gpu(); + /// Checks if this platform is CUDA platform. + bool is_CUDA_gpu(); + bool is_single_precision(); std::string get_name() const; + std::string get_subtarget_name() const; + std::string get_math_library() const; int get_instruction_width() const; diff --git a/src/main.cpp b/src/main.cpp index 19dc4e5cbc..c1556687b4 100644 --- a/src/main.cpp +++ b/src/main.cpp @@ -356,9 +356,12 @@ int main(int argc, const char* argv[]) { auto gpu_opt = app.add_subcommand("gpu", "LLVM GPU option")->ignore_case(); gpu_opt->needs(llvm_opt); - gpu_opt->add_option("--name", + auto gpu_target_name = gpu_opt->add_option("--name", llvm_gpu_name, "Name of GPU platform to use")->ignore_case(); + gpu_opt->add_option("--target-chip", + llvm_cpu_name, + "Name of target chip to use")->ignore_case(); auto gpu_math_library_opt = gpu_opt->add_option("--math-library", llvm_math_library, "Math library for GPU code generation ({})"_format(llvm_math_library)); @@ -701,7 +704,7 @@ int main(int argc, const char* argv[]) { : PlatformID::GPU; const std::string name = llvm_gpu_name == "default" ? 
llvm_cpu_name : llvm_gpu_name; - Platform platform(pid, name, llvm_math_library, llvm_float_type, + Platform platform(pid, name, llvm_cpu_name, llvm_math_library, llvm_float_type, llvm_vector_width); logger->info("Running LLVM backend code generator"); diff --git a/test/unit/codegen/codegen_llvm_ir.cpp b/test/unit/codegen/codegen_llvm_ir.cpp index 675ceaf170..1d080f4131 100644 --- a/test/unit/codegen/codegen_llvm_ir.cpp +++ b/test/unit/codegen/codegen_llvm_ir.cpp @@ -49,7 +49,7 @@ std::string run_gpu_llvm_visitor(const std::string& text, NeuronSolveVisitor().visit_program(*ast); SolveBlockVisitor().visit_program(*ast); - codegen::Platform gpu_platform(codegen::PlatformID::GPU, /*name=*/"nvidia", + codegen::Platform gpu_platform(codegen::PlatformID::GPU, /*name=*/"nvptx64", math_library, use_single_precision, 1); codegen::CodegenLLVMVisitor llvm_visitor( /*mod_filename=*/"unknown", @@ -1677,7 +1677,7 @@ SCENARIO("GPU kernel body IR generation", "[visitor][llvm][gpu]") { // Check target information. // TODO: this may change when more platforms are supported. - std::regex data_layout(R"(target datalayout = \"e-i64:64-i128:128-v16:16-v32:32-n16:32:64\")"); + std::regex data_layout(R"(target datalayout = \"e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v16:16:16-v32:32:32-v64:64:64-v128:128:128-n16:32:64\")"); std::regex triple(R"(nvptx64-nvidia-cuda)"); REQUIRE(std::regex_search(module_string, m, data_layout)); REQUIRE(std::regex_search(module_string, m, triple)); From a2ef4a69b7ebe317738dba6abf1c9a7a3afadb47 Mon Sep 17 00:00:00 2001 From: George Mitenkov Date: Fri, 25 Mar 2022 09:37:45 +0100 Subject: [PATCH 237/331] Added missing include --- src/codegen/llvm/target_platform.cpp | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/codegen/llvm/target_platform.cpp b/src/codegen/llvm/target_platform.cpp index 84fb1b9b36..49a0db9a31 100644 --- a/src/codegen/llvm/target_platform.cpp +++ b/src/codegen/llvm/target_platform.cpp @@ -7,6 +7,8 @@ #include "codegen/llvm/target_platform.hpp" +#include + namespace nmodl { namespace codegen { From cb3547a3c0ef4dd0fcb4883c63f79af2dd2bf365 Mon Sep 17 00:00:00 2001 From: George Mitenkov Date: Mon, 28 Mar 2022 11:13:48 +0200 Subject: [PATCH 238/331] Replaced `find()` with `at()` when querying target information Co-authored-by: Ioannis Magkanaris --- src/codegen/llvm/llvm_utils.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/codegen/llvm/llvm_utils.cpp b/src/codegen/llvm/llvm_utils.cpp index 2b5092b851..cadad7bee0 100644 --- a/src/codegen/llvm/llvm_utils.cpp +++ b/src/codegen/llvm/llvm_utils.cpp @@ -95,8 +95,8 @@ void optimise_module_for_nvptx(codegen::Platform& platform, "-v64:64:64-v128:128:128-n16:32:64"}}; // Set data layout and target triple information for the module. 
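     // (at() throws for an unknown platform name instead of silently dereferencing end())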
- auto triple = triple_str.find(platform_name)->second; - module.setDataLayout(data_layout_str.find(platform_name)->second); + auto triple = triple_str.at(platform_name); + module.setDataLayout(data_layout_str.at(platform_name); module.setTargetTriple(triple); std::string subtarget = platform.get_subtarget_name(); From b36d7f9f0e5f8b24ac045ad565a4059a5d048f52 Mon Sep 17 00:00:00 2001 From: Ioannis Magkanaris Date: Mon, 28 Mar 2022 11:27:57 +0200 Subject: [PATCH 239/331] Small changes to select the GPU benchmark --- src/main.cpp | 2 +- test/benchmark/llvm_benchmark.cpp | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/src/main.cpp b/src/main.cpp index d1f5cd490e..e9f959866c 100644 --- a/src/main.cpp +++ b/src/main.cpp @@ -753,7 +753,7 @@ int main(int argc, const char* argv[]) { if (llvm_benchmark) { logger->info("Running LLVM benchmark"); - if (llvm_gpu_name == "cuda") { + if (platform.is_gpu()) { const GPUExecutionParameters gpu_execution_parameters{ llvm_cuda_grid_dim_x, llvm_cuda_grid_dim_y, diff --git a/test/benchmark/llvm_benchmark.cpp b/test/benchmark/llvm_benchmark.cpp index 958189c4a4..9f677eca64 100644 --- a/test/benchmark/llvm_benchmark.cpp +++ b/test/benchmark/llvm_benchmark.cpp @@ -23,7 +23,7 @@ void LLVMBenchmark::run(const std::shared_ptr& node) { // create functions generate_llvm(node); // Finally, run the benchmark and log the measurements. - if (platform.get_name() == "cuda") { + if (platform.is_CUDA_gpu()) { run_benchmark_on_gpu(node); } else { run_benchmark_on_cpu(node); From cab4fbf8657b354b16cedd205dcc1732141ad3bb Mon Sep 17 00:00:00 2001 From: Ioannis Magkanaris Date: Mon, 28 Mar 2022 11:33:19 +0200 Subject: [PATCH 240/331] Removed setting module datalayout and triple in CUDADriver --- test/benchmark/cuda_driver.cpp | 10 ---------- 1 file changed, 10 deletions(-) diff --git a/test/benchmark/cuda_driver.cpp b/test/benchmark/cuda_driver.cpp index ee3d4bff6d..c1786964f8 100644 --- a/test/benchmark/cuda_driver.cpp +++ b/test/benchmark/cuda_driver.cpp @@ -82,14 +82,6 @@ void print_ptx_to_file(const std::string& ptx_compiled_module, const std::string ptx_file.close(); } -/// Sets the target triple and the data layout of the module. 
-void set_triple_and_data_layout(llvm::Module& module) { - module.setDataLayout( - "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v16:16:16-" - "v32:32:32-v64:64:64-v128:128:128-n16:32:64"); - module.setTargetTriple("nvptx64-nvidia-cuda"); -} - void CUDADriver::init(const std::string& gpu, BenchmarkInfo* benchmark_info) { // CUDA initialization checkCudaErrors(cuInit(0)); @@ -113,8 +105,6 @@ void CUDADriver::init(const std::string& gpu, BenchmarkInfo* benchmark_info) { throw std::runtime_error("ERROR: Device 0 is not SM 2.0 or greater"); } - // set_triple_and_data_layout(*module); - // Save the LLVM IR module to string std::string kernel_llvm_ir; llvm::raw_string_ostream os(kernel_llvm_ir); From afa14daab7aa5e2306c63dff70d552845220ba38 Mon Sep 17 00:00:00 2001 From: George Mitenkov Date: Mon, 28 Mar 2022 11:43:01 +0200 Subject: [PATCH 241/331] Added closing parenthesis and made sm_70 default subtarget --- src/codegen/llvm/llvm_utils.cpp | 2 +- src/codegen/llvm/target_platform.hpp | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/src/codegen/llvm/llvm_utils.cpp b/src/codegen/llvm/llvm_utils.cpp index cadad7bee0..7086275557 100644 --- a/src/codegen/llvm/llvm_utils.cpp +++ b/src/codegen/llvm/llvm_utils.cpp @@ -96,7 +96,7 @@ void optimise_module_for_nvptx(codegen::Platform& platform, // Set data layout and target triple information for the module. auto triple = triple_str.at(platform_name); - module.setDataLayout(data_layout_str.at(platform_name); + module.setDataLayout(data_layout_str.at(platform_name)); module.setTargetTriple(triple); std::string subtarget = platform.get_subtarget_name(); diff --git a/src/codegen/llvm/target_platform.hpp b/src/codegen/llvm/target_platform.hpp index 99b7ad33d9..282f6943d7 100644 --- a/src/codegen/llvm/target_platform.hpp +++ b/src/codegen/llvm/target_platform.hpp @@ -35,7 +35,7 @@ class Platform { /// Target chip for GPUs. /// TODO: this should only be available to GPUs! If we refactor target /// classes so that GPUPlatform <: Platform, it will be nicer! - const std::string subtarget_name = "sm_35"; + const std::string subtarget_name = "sm_70"; /// Target-specific id to compare platforms easily. 
 PlatformID platform_id;

From 002e79a62134e0bbb63665d14d3aebd2ecb48933 Mon Sep 17 00:00:00 2001
From: Ioannis Magkanaris
Date: Mon, 28 Mar 2022 11:46:23 +0200
Subject: [PATCH 242/331] Make sure that the benchmark only runs for CUDA
 backends and restrict CLI gpu names to supported ones

---
 src/main.cpp | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/src/main.cpp b/src/main.cpp
index e9f959866c..7061518e5b 100644
--- a/src/main.cpp
+++ b/src/main.cpp
@@ -377,7 +377,8 @@ int main(int argc, const char* argv[]) {
     auto gpu_target_name = gpu_opt->add_option("--name",
                                                llvm_gpu_name,
                                                "Name of GPU platform to use")->ignore_case();
-    gpu_opt->add_option("--target-chip",
+    gpu_target_name->check(CLI::IsMember({"nvptx", "nvptx64"}));
+    gpu_opt->add_option("--target-chip",
                         llvm_cpu_name,
                         "Name of target chip to use")->ignore_case();
     auto gpu_math_library_opt = gpu_opt->add_option("--math-library",
                                                     llvm_math_library,
                                                     "Math library for GPU code generation ({})"_format(llvm_math_library));
@@ -753,6 +754,10 @@
         if (llvm_benchmark) {
             logger->info("Running LLVM benchmark");
+            if (platform.is_gpu() && !platform.is_CUDA_gpu()) {
+                throw std::runtime_error(
+                    "Benchmarking is only supported on CUDA GPUs at the moment");
+            }
             if (platform.is_gpu()) {
                 const GPUExecutionParameters gpu_execution_parameters{
                     llvm_cuda_grid_dim_x,

From 7524375b82f54ed8bc2317c1cd23a02b448c588c Mon Sep 17 00:00:00 2001
From: George Mitenkov
Date: Mon, 28 Mar 2022 13:08:09 +0200
Subject: [PATCH 243/331] [LLVM][GPU] NVPTX specific passes for code generation
 (#833)

* Added NVPTX-specific optimization passes for PTX generation

* Added tests

* Added CLI options to select 32- or 64-bit targets and target compute
  architecture (e.g. sm_70)

Co-authored-by: Ioannis Magkanaris
---
 cmake/LLVMHelper.cmake                    |  3 +
 src/codegen/llvm/codegen_llvm_visitor.cpp | 24 ++++++-
 src/codegen/llvm/llvm_utils.cpp           | 81 +++++++++++++++++++++++
 src/codegen/llvm/llvm_utils.hpp           | 11 +++
 src/codegen/llvm/target_platform.cpp      | 12 ++++
 src/codegen/llvm/target_platform.hpp      | 25 ++++++-
 src/main.cpp                              |  7 +-
 test/unit/codegen/codegen_llvm_ir.cpp     | 45 ++++++++++++-
 8 files changed, 201 insertions(+), 7 deletions(-)

diff --git a/cmake/LLVMHelper.cmake b/cmake/LLVMHelper.cmake
index 9e4af5d503..717a597f95 100644
--- a/cmake/LLVMHelper.cmake
+++ b/cmake/LLVMHelper.cmake
@@ -15,6 +15,9 @@ set(NMODL_LLVM_COMPONENTS
   ipo
   mc
   native
+  nvptxcodegen
+  nvptxdesc
+  nvptxinfo
   orcjit
   target
   transformutils
diff --git a/src/codegen/llvm/codegen_llvm_visitor.cpp b/src/codegen/llvm/codegen_llvm_visitor.cpp
index 589e069ec4..d906e9bd44 100644
--- a/src/codegen/llvm/codegen_llvm_visitor.cpp
+++ b/src/codegen/llvm/codegen_llvm_visitor.cpp
@@ -883,8 +883,9 @@ void CodegenLLVMVisitor::visit_program(const ast::Program& node) {
         throw std::runtime_error("Error: incorrect IR has been generated!\n" + ostream.str());
     }
 
-    if (opt_level_ir) {
-        logger->info("Running LLVM optimisation passes");
+    // Handle optimization passes for GPUs separately.
+    if (platform.is_cpu() && opt_level_ir) {
+        logger->info("Running LLVM optimisation passes for CPU platforms");
         utils::initialise_optimisation_passes();
         utils::optimise_module(*module, opt_level_ir);
     }
@@ -915,12 +916,29 @@
 #endif
     }
 
+    // Handle GPU optimizations (CUDA platforms only for now).
+    if (platform.is_gpu()) {
+        if (!platform.is_CUDA_gpu())
+            throw std::runtime_error("Error: unsupported GPU architecture!\n");
+
+        // We only support CUDA backends anyway, so this works for now.
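+        // (initialise_nvptx_passes() registers the NVPTX target so that the registry
+        // lookup inside optimise_module_for_nvptx can succeed)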
+        utils::initialise_nvptx_passes();
+
+        std::string target_asm;
+        utils::optimise_module_for_nvptx(platform, *module, opt_level_ir, target_asm);
+
+        logger->debug("Dumping generated IR...\n" + dump_module());
+        logger->debug("Dumping generated PTX...\n" + target_asm);
+    } else {
+        // Workaround for debug outputs.
+        logger->debug("Dumping generated IR...\n" + dump_module());
+    }
+
     // If the output directory is specified, save the IR to .ll file.
     if (output_dir != ".") {
         utils::save_ir_to_ll_file(*module, output_dir + "/" + mod_filename);
     }
-    logger->debug("Dumping generated IR...\n" + dump_module());
 
     // Setup CodegenHelper for C++ wrapper file
     setup(node);
     print_wrapper_routines();
diff --git a/src/codegen/llvm/llvm_utils.cpp b/src/codegen/llvm/llvm_utils.cpp
index 59967c59c1..7086275557 100644
--- a/src/codegen/llvm/llvm_utils.cpp
+++ b/src/codegen/llvm/llvm_utils.cpp
@@ -13,8 +13,10 @@
 #include "llvm/IR/Verifier.h"
 #include "llvm/InitializePasses.h"
 #include "llvm/Support/FileSystem.h"
+#include "llvm/Support/TargetSelect.h"
 #include "llvm/Support/ToolOutputFile.h"
 #include "llvm/Target/TargetMachine.h"
+#include "llvm/Transforms/IPO.h"
 #include "llvm/Transforms/IPO/PassManagerBuilder.h"
 
 namespace nmodl {
@@ -61,6 +63,85 @@ static void run_optimisation_passes(llvm::Module& module,
 /* Optimisation utils */
/****************************************************************************************/
 
+void initialise_nvptx_passes() {
+    // Register targets.
+    LLVMInitializeNVPTXTarget();
+    LLVMInitializeNVPTXTargetMC();
+    LLVMInitializeNVPTXTargetInfo();
+    LLVMInitializeNVPTXAsmPrinter();
+
+    // Initialize passes.
+    initialise_optimisation_passes();
+}
+
+void optimise_module_for_nvptx(codegen::Platform& platform,
+                               llvm::Module& module,
+                               int opt_level,
+                               std::string& target_asm) {
+    // CUDA target machine we are generating code for.
+    std::unique_ptr tm;
+    std::string platform_name = platform.get_name();
+
+    // Target and layout information.
+    static const std::map triple_str = {
+        {"nvptx", "nvptx-nvidia-cuda"},
+        {"nvptx64", "nvptx64-nvidia-cuda"}};
+    static const std::map data_layout_str = {
+        {"nvptx", "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32"
+                  "-i64:64:64-f32:32:32-f64:64:64-v16:16:16-v32:32:32"
+                  "-v64:64:64-v128:128:128-n16:32:64"},
+        {"nvptx64", "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32"
+                    "-i64:64:64-f32:32:32-f64:64:64-v16:16:16-v32:32:32"
+                    "-v64:64:64-v128:128:128-n16:32:64"}};
+
+    // Set data layout and target triple information for the module.
+    auto triple = triple_str.at(platform_name);
+    module.setDataLayout(data_layout_str.at(platform_name));
+    module.setTargetTriple(triple);
+
+    std::string subtarget = platform.get_subtarget_name();
+    std::string features = "+ptx70";
+
+    // Find the specified target in registry.
+    std::string error_msg;
+    auto* target = llvm::TargetRegistry::lookupTarget(triple, error_msg);
+    if (!target)
+        throw std::runtime_error("Error: " + error_msg + "\n");
+
+    tm.reset(target->createTargetMachine(triple, subtarget, features, {}, {}));
+    if (!tm)
+        throw std::runtime_error("Error: creating target machine failed! Aborting.");
+
+    // Create pass managers.
+    llvm::legacy::FunctionPassManager func_pm(&module);
+    llvm::legacy::PassManager module_pm;
+    llvm::PassManagerBuilder pm_builder;
+    pm_builder.OptLevel = opt_level;
+    pm_builder.SizeLevel = 0;
+    pm_builder.Inliner = llvm::createFunctionInliningPass();
+
+    // Do not vectorize!
+    pm_builder.LoopVectorize = false;
+
+    // Adjusting the pass manager adds target-specific IR transformations, e.g.
+    // inferring address spaces.
+    tm->adjustPassManager(pm_builder);
+    pm_builder.populateFunctionPassManager(func_pm);
+    pm_builder.populateModulePassManager(module_pm);
+
+    // This runs target-independent optimizations.
+    run_optimisation_passes(module, func_pm, module_pm);
+
+    // Now, we want to run target-specific (e.g. NVPTX) passes. In LLVM, this
+    // is done via `addPassesToEmitFile`.
+    llvm::raw_string_ostream stream(target_asm);
+    llvm::buffer_ostream pstream(stream);
+    llvm::legacy::PassManager codegen_pm;
+
+    tm->addPassesToEmitFile(codegen_pm, pstream, nullptr, llvm::CGFT_AssemblyFile);
+    codegen_pm.run(module);
+}
+
 void initialise_optimisation_passes() {
     auto& registry = *llvm::PassRegistry::getPassRegistry();
     llvm::initializeCore(registry);
diff --git a/src/codegen/llvm/llvm_utils.hpp b/src/codegen/llvm/llvm_utils.hpp
index 8e1e6e48dc..17be5073e2 100644
--- a/src/codegen/llvm/llvm_utils.hpp
+++ b/src/codegen/llvm/llvm_utils.hpp
@@ -7,6 +7,8 @@
 
 #pragma once
 
+#include "codegen/llvm/target_platform.hpp"
+
 #include "llvm/IR/Module.h"
 #include "llvm/Support/TargetRegistry.h"
 
@@ -16,6 +18,15 @@ namespace utils {
 /// Initialises some LLVM optimisation passes.
 void initialise_optimisation_passes();
 
+/// Initialises NVPTX-specific optimisation passes.
+void initialise_nvptx_passes();
+
+/// Optimises the given LLVM IR module for NVPTX targets.
+void optimise_module_for_nvptx(codegen::Platform& platform,
+                               llvm::Module& module,
+                               int opt_level,
+                               std::string& target_asm);
+
 /// Optimises the given LLVM IR module.
 void optimise_module(llvm::Module& module, int opt_level, llvm::TargetMachine* tm = nullptr);
 
diff --git a/src/codegen/llvm/target_platform.cpp b/src/codegen/llvm/target_platform.cpp
index 6cb8c7bb2b..49a0db9a31 100644
--- a/src/codegen/llvm/target_platform.cpp
+++ b/src/codegen/llvm/target_platform.cpp
@@ -7,6 +7,8 @@
 
 #include "codegen/llvm/target_platform.hpp"
 
+#include 
+
 namespace nmodl {
 namespace codegen {
 
@@ -30,6 +32,10 @@ bool Platform::is_gpu() {
     return platform_id == PlatformID::GPU;
 }
 
+bool Platform::is_CUDA_gpu() {
+    return platform_id == PlatformID::GPU && (name == "nvptx" || name == "nvptx64");
+}
+
 bool Platform::is_single_precision() {
     return use_single_precision;
 }
 
@@ -38,6 +44,12 @@ std::string Platform::get_name() const {
     return name;
 }
 
+std::string Platform::get_subtarget_name() const {
+    if (platform_id != PlatformID::GPU)
+        throw std::runtime_error("Error: platform must be a GPU to query the subtarget!\n");
+    return subtarget_name;
+}
+
 std::string Platform::get_math_library() const {
     return math_library;
 }
diff --git a/src/codegen/llvm/target_platform.hpp b/src/codegen/llvm/target_platform.hpp
index 2eabbb1a4b..282f6943d7 100644
--- a/src/codegen/llvm/target_platform.hpp
+++ b/src/codegen/llvm/target_platform.hpp
@@ -32,11 +32,16 @@ class Platform {
     /// Name of the platform.
     const std::string name = Platform::DEFAULT_PLATFORM_NAME;
 
+    /// Target chip for GPUs.
+    /// TODO: this should only be available to GPUs! If we refactor target
+    /// classes so that GPUPlatform <: Platform, it will be nicer!
+    const std::string subtarget_name = "sm_70";
+
     /// Target-specific id to compare platforms easily.
     PlatformID platform_id;
 
     /// User-provided width that is used to construct LLVM instructions
-    // and types.
+    /// and types.
     int instruction_width = 1;
 
     /// Use single-precision floating-point types.
@@ -46,6 +51,19 @@ class Platform { std::string math_library = Platform::DEFAULT_MATH_LIBRARY; public: + Platform(PlatformID platform_id, + const std::string& name, + const std::string& subtarget_name, + std::string& math_library, + bool use_single_precision = false, + int instruction_width = 1) + : platform_id(platform_id) + , name(name) + , subtarget_name(subtarget_name) + , math_library(math_library) + , use_single_precision(use_single_precision) + , instruction_width(instruction_width) {} + Platform(PlatformID platform_id, const std::string& name, std::string& math_library, @@ -77,10 +95,15 @@ class Platform { /// Checks if this platform is a GPU. bool is_gpu(); + /// Checks if this platform is CUDA platform. + bool is_CUDA_gpu(); + bool is_single_precision(); std::string get_name() const; + std::string get_subtarget_name() const; + std::string get_math_library() const; int get_instruction_width() const; diff --git a/src/main.cpp b/src/main.cpp index 19dc4e5cbc..c1556687b4 100644 --- a/src/main.cpp +++ b/src/main.cpp @@ -356,9 +356,12 @@ int main(int argc, const char* argv[]) { auto gpu_opt = app.add_subcommand("gpu", "LLVM GPU option")->ignore_case(); gpu_opt->needs(llvm_opt); - gpu_opt->add_option("--name", + auto gpu_target_name = gpu_opt->add_option("--name", llvm_gpu_name, "Name of GPU platform to use")->ignore_case(); + gpu_opt->add_option("--target-chip", + llvm_cpu_name, + "Name of target chip to use")->ignore_case(); auto gpu_math_library_opt = gpu_opt->add_option("--math-library", llvm_math_library, "Math library for GPU code generation ({})"_format(llvm_math_library)); @@ -701,7 +704,7 @@ int main(int argc, const char* argv[]) { : PlatformID::GPU; const std::string name = llvm_gpu_name == "default" ? llvm_cpu_name : llvm_gpu_name; - Platform platform(pid, name, llvm_math_library, llvm_float_type, + Platform platform(pid, name, llvm_cpu_name, llvm_math_library, llvm_float_type, llvm_vector_width); logger->info("Running LLVM backend code generator"); diff --git a/test/unit/codegen/codegen_llvm_ir.cpp b/test/unit/codegen/codegen_llvm_ir.cpp index f15e924481..1d080f4131 100644 --- a/test/unit/codegen/codegen_llvm_ir.cpp +++ b/test/unit/codegen/codegen_llvm_ir.cpp @@ -49,7 +49,7 @@ std::string run_gpu_llvm_visitor(const std::string& text, NeuronSolveVisitor().visit_program(*ast); SolveBlockVisitor().visit_program(*ast); - codegen::Platform gpu_platform(codegen::PlatformID::GPU, /*name=*/"nvidia", + codegen::Platform gpu_platform(codegen::PlatformID::GPU, /*name=*/"nvptx64", math_library, use_single_precision, 1); codegen::CodegenLLVMVisitor llvm_visitor( /*mod_filename=*/"unknown", @@ -1648,4 +1648,47 @@ SCENARIO("GPU kernel body IR generation", "[visitor][llvm][gpu]") { REQUIRE(std::regex_search(module_string, m, grid_dim)); } } + + GIVEN("When optimizing for GPU platforms") { + std::string nmodl_text = R"( + NEURON { + SUFFIX test + RANGE x, y + } + + ASSIGNED { x y } + + STATE { m } + + BREAKPOINT { + SOLVE states METHOD cnexp + } + + DERIVATIVE states { + m = y + 2 + } + )"; + + THEN("address spaces are inferred and target information added") { + std::string module_string = run_gpu_llvm_visitor(nmodl_text, + /*opt_level=*/3, + /*use_single_precision=*/false); + std::smatch m; + + // Check target information. + // TODO: this may change when more platforms are supported. 
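+            // (this must match the 64-bit layout string hardcoded in optimise_module_for_nvptx)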
+ std::regex data_layout(R"(target datalayout = \"e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v16:16:16-v32:32:32-v64:64:64-v128:128:128-n16:32:64\")"); + std::regex triple(R"(nvptx64-nvidia-cuda)"); + REQUIRE(std::regex_search(module_string, m, data_layout)); + REQUIRE(std::regex_search(module_string, m, triple)); + + // Check for address space casts and address spaces in general when loading data. + std::regex as_cast(R"(addrspacecast %.*__instance_var__type\* %.* to %.*__instance_var__type addrspace\(1\)\*)"); + std::regex gep_as1(R"(getelementptr inbounds %.*__instance_var__type, %.*__instance_var__type addrspace\(1\)\* %.*, i64 0, i32 .*)"); + std::regex load_as1(R"(load double\*, double\* addrspace\(1\)\* %.*)"); + REQUIRE(std::regex_search(module_string, m, as_cast)); + REQUIRE(std::regex_search(module_string, m, gep_as1)); + REQUIRE(std::regex_search(module_string, m, load_as1)); + } + } } From 93eba6e2c93cdbdf552a8a9bfdcd896c0476220b Mon Sep 17 00:00:00 2001 From: Ioannis Magkanaris Date: Mon, 28 Mar 2022 18:35:31 +0200 Subject: [PATCH 244/331] Use kernel bitcode for GPU compilation via NVVM --- test/benchmark/cuda_driver.cpp | 25 ++++++++++++++++--------- 1 file changed, 16 insertions(+), 9 deletions(-) diff --git a/test/benchmark/cuda_driver.cpp b/test/benchmark/cuda_driver.cpp index c1786964f8..dd2cec7efe 100644 --- a/test/benchmark/cuda_driver.cpp +++ b/test/benchmark/cuda_driver.cpp @@ -16,6 +16,8 @@ #include "fmt/format.h" #include "utils/common_utils.hpp" +#include "llvm/Bitcode/BitcodeWriter.h" + using fmt::literals::operator""_format; namespace nmodl { @@ -76,12 +78,20 @@ auto get_compilation_options(int compute_version_major, BenchmarkInfo* benchmark return compilation_options; } -void print_ptx_to_file(const std::string& ptx_compiled_module, const std::string& filename) { +void print_string_to_file(const std::string& ptx_compiled_module, const std::string& filename) { std::ofstream ptx_file(filename); ptx_file << ptx_compiled_module; ptx_file.close(); } +std::string print_bitcode_to_string(const llvm::Module& module) { + std::string bitcode_string; + llvm::raw_string_ostream os(bitcode_string); + WriteBitcodeToFile(module, os); + os.flush(); + return bitcode_string; +} + void CUDADriver::init(const std::string& gpu, BenchmarkInfo* benchmark_info) { // CUDA initialization checkCudaErrors(cuInit(0)); @@ -105,11 +115,8 @@ void CUDADriver::init(const std::string& gpu, BenchmarkInfo* benchmark_info) { throw std::runtime_error("ERROR: Device 0 is not SM 2.0 or greater"); } - // Save the LLVM IR module to string - std::string kernel_llvm_ir; - llvm::raw_string_ostream os(kernel_llvm_ir); - os << *module; - os.flush(); + // Save the LLVM module bitcode to string + std::string kernel_bitcode = print_bitcode_to_string(*module); // Create NVVM program object checkNVVMErrors(nvvmCreateProgram(&prog)); @@ -120,7 +127,7 @@ void CUDADriver::init(const std::string& gpu, BenchmarkInfo* benchmark_info) { // Add custom IR to program checkNVVMErrors(nvvmAddModuleToProgram( - prog, kernel_llvm_ir.c_str(), kernel_llvm_ir.size(), "nmodl_llvm_ir")); + prog, kernel_bitcode.c_str(), kernel_bitcode.size(), "nmodl_kernel")); // Declare compile options auto compilation_options = get_compilation_options(device_info.compute_version_major, @@ -143,8 +150,8 @@ void CUDADriver::init(const std::string& gpu, BenchmarkInfo* benchmark_info) { nvvmGetCompiledResult(prog, compiled_module); ptx_compiled_module = std::string(compiled_module); free(compiled_module); - 
print_ptx_to_file(ptx_compiled_module, - benchmark_info->output_dir + "/" + benchmark_info->filename + ".ptx"); + print_string_to_file(ptx_compiled_module, + benchmark_info->output_dir + "/" + benchmark_info->filename + ".ptx"); // Create driver context checkCudaErrors(cuCtxCreate(&context, 0, device)); From 55ef3c04133bdf64702e4264a8a43ced832df219 Mon Sep 17 00:00:00 2001 From: Ioannis Magkanaris Date: Tue, 29 Mar 2022 14:37:48 +0200 Subject: [PATCH 245/331] Fix cmake way to find nvvm --- CMakeLists.txt | 5 ++- cmake/FindNVVM.cmake | 65 +++++++++++++++++++++++++++++++++++ test/benchmark/CMakeLists.txt | 2 +- 3 files changed, 68 insertions(+), 4 deletions(-) create mode 100644 cmake/FindNVVM.cmake diff --git a/CMakeLists.txt b/CMakeLists.txt index b15448b052..c160d39d09 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -165,8 +165,7 @@ if(NMODL_ENABLE_LLVM) if(NMODL_ENABLE_LLVM_CUDA) enable_language(CUDA) find_package(CUDAToolkit) - set(CUDA_NVVM_INCLUDE_DIR ${CUDAToolkit_LIBRARY_ROOT}/nvvm/include) - set(CUDA_NVVM_LIBRARY_DIR ${CUDAToolkit_LIBRARY_ROOT}/nvvm/lib64) + include(cmake/FindNVVM.cmake) set(NMODL_CUDA_INCLUDE_DIRECTORIES ${CUDAToolkit_INCLUDE_DIRS} ${CUDA_NVVM_INCLUDE_DIR}) include_directories(${NMODL_CUDA_INCLUDE_DIRECTORIES}) add_definitions(-DNMODL_LLVM_CUDA_BACKEND) @@ -281,7 +280,7 @@ message(STATUS "LLVM CUDA Codegen | ${NMODL_ENABLE_LLVM_CUDA}") if(NMODL_ENABLE_LLVM_CUDA) message(STATUS " CUDA VERSION | ${CUDAToolkit_VERSION}") message(STATUS " INCLUDE | ${NMODL_CUDA_INCLUDE_DIRECTORIES}") - message(STATUS " LIBRARY | ${CUDAToolkit_LIBRARY_DIR}") + message(STATUS " LIBRARY | ${CUDAToolkit_LIBRARY_DIR};${CUDA_NVVM_LIBRARIES}") endif() if(NMODL_CLANG_FORMAT) message(STATUS "Clang Format | ${ClangFormat_EXECUTABLE}") diff --git a/cmake/FindNVVM.cmake b/cmake/FindNVVM.cmake new file mode 100644 index 0000000000..a975d9d5b4 --- /dev/null +++ b/cmake/FindNVVM.cmake @@ -0,0 +1,65 @@ +# - Find the NVVM include directory and libraries +# Modified version of the file found here: +# https://raw.githubusercontent.com/nvidia-compiler-sdk/nvvmir-samples/master/CMakeLists.txt +# https://raw.githubusercontent.com/hshindo/arrayfire/master/CMakeModules/FindNVVM.cmake + +# libNVVM +if(NOT DEFINED ENV{LIBNVVM_HOME}) + set(LIBNVVM_HOME "${CUDAToolkit_LIBRARY_ROOT}/nvvm") +else() + set(LIBNVVM_HOME "$ENV{LIBNVVM_HOME}") +endif() +message(STATUS "Using LIBNVVM_HOME: ${LIBNVVM_HOME}") + +if (CMAKE_SIZEOF_VOID_P STREQUAL "8") + if (WIN32) + set (CUDA_LIB_SEARCH_PATH "${CUDAToolkit_LIBRARY_ROOT}/lib/x64") + set (NVVM_DLL_NAME nvvm64_${NVVM_DLL_VERSION}.dll) + else () + set (CUDA_LIB_SEARCH_PATH "") + endif() +else() + if (WIN32) + set (CUDA_LIB_SEARCH_PATH "${CUDAToolkit_LIBRARY_ROOT}/lib/Win32") + set (NVVM_DLL_NAME nvvm32_${NVVM_DLL_VERSION}.dll) + else() + set (CUDA_LIB_SEARCH_PATH "") + endif() +endif() + +### Find libNVVM +# The directory structure for nvvm is a bit complex. 
+# On Windows: +# 32-bit -- nvvm/lib/Win32 +# 64-bit -- nvvm/lib/x64 +# On Linux: +# 32-bit -- nvvm/lib +# 64-bit -- nvvm/lib64 +# On Mac: +# Universal -- nvvm/lib +if (CMAKE_SIZEOF_VOID_P STREQUAL "8") + if (WIN32) + set (LIB_ARCH_SUFFIX "/x64") + elseif (APPLE) + set (LIB_ARCH_SUFFIX "") + else () + set (LIB_ARCH_SUFFIX "64") + endif() +else() + if (WIN32) + set (LIB_ARCH_SUFFIX "/Win32") + else() + set (LIB_ARCH_SUFFIX "") + endif() +endif() + +find_library(NVVM_LIB nvvm PATHS "${LIBNVVM_HOME}/lib${LIB_ARCH_SUFFIX}") +find_file(NVVM_H nvvm.h PATHS "${LIBNVVM_HOME}/include") + +if(NVVM_H) + get_filename_component(CUDA_NVVM_INCLUDE_DIR ${NVVM_H} PATH) +else() + message(FATAL_ERROR "Unable to find nvvm.h") +endif() + +set(CUDA_NVVM_LIBRARIES ${NVVM_LIB}) diff --git a/test/benchmark/CMakeLists.txt b/test/benchmark/CMakeLists.txt index 2c83f2326c..b554bbde91 100644 --- a/test/benchmark/CMakeLists.txt +++ b/test/benchmark/CMakeLists.txt @@ -13,7 +13,7 @@ include_directories(${LLVM_INCLUDE_DIRS}) add_library(llvm_benchmark STATIC ${LLVM_BENCHMARK_SOURCE_FILES}) add_dependencies(llvm_benchmark lexer util visitor) if(NMODL_ENABLE_LLVM_CUDA) - target_link_libraries(llvm_benchmark PRIVATE CUDA::cudart CUDA::nvrtc ${CUDA_NVVM_LIBRARY_DIR}/libnvvm.so) + target_link_libraries(llvm_benchmark PRIVATE CUDA::cudart CUDA::nvrtc ${CUDA_NVVM_LIBRARIES}) endif() if(NMODL_ENABLE_JIT_EVENT_LISTENERS) From 8e189156d52da961e890002bca63ac1036ce9551 Mon Sep 17 00:00:00 2001 From: Ioannis Magkanaris Date: Tue, 29 Mar 2022 14:41:10 +0200 Subject: [PATCH 246/331] Rename target-chip option to target-arch --- src/main.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/main.cpp b/src/main.cpp index 7061518e5b..a56e145322 100644 --- a/src/main.cpp +++ b/src/main.cpp @@ -378,9 +378,9 @@ int main(int argc, const char* argv[]) { llvm_gpu_name, "Name of GPU platform to use")->ignore_case(); gpu_target_name->check(CLI::IsMember({"nvptx", "nvptx64"})); - gpu_opt->add_option("--target-chip", + gpu_opt->add_option("--target-arch", llvm_cpu_name, - "Name of target chip to use")->ignore_case(); + "Name of target architecture to use")->ignore_case(); auto gpu_math_library_opt = gpu_opt->add_option("--math-library", llvm_math_library, "Math library for GPU code generation ({})"_format(llvm_math_library)); From 19aee68e025c6d490c168fde2714ecf2ee6365b2 Mon Sep 17 00:00:00 2001 From: Ioannis Magkanaris Date: Tue, 29 Mar 2022 14:43:04 +0200 Subject: [PATCH 247/331] Handle grid and block dim CLI options --- src/main.cpp | 28 ++-------------------------- 1 file changed, 2 insertions(+), 26 deletions(-) diff --git a/src/main.cpp b/src/main.cpp index a56e145322..ae0e179543 100644 --- a/src/main.cpp +++ b/src/main.cpp @@ -200,21 +200,9 @@ int main(int argc, const char* argv[]) { /// X dimension of grid in blocks for GPU execution int llvm_cuda_grid_dim_x = 1; - /// Y dimension of grid in blocks for GPU execution - int llvm_cuda_grid_dim_y = 1; - - /// Z dimension of grid in blocks for GPU execution - int llvm_cuda_grid_dim_z = 1; - /// X dimension of block in threads for GPU execution int llvm_cuda_block_dim_x = 1; - /// Y dimension of block in threads for GPU execution - int llvm_cuda_block_dim_y = 1; - - /// Z dimension of block in threads for GPU execution - int llvm_cuda_block_dim_z = 1; - /// run llvm benchmark bool llvm_benchmark(false); @@ -408,24 +396,12 @@ int main(int argc, const char* argv[]) { benchmark_opt->add_option("--repeat", num_experiments, "Number of experiments for benchmarking 
({})"_format(num_experiments))->ignore_case(); - benchmark_opt->add_option("--gridDimX", + benchmark_opt->add_option("--grid-dim-x", llvm_cuda_grid_dim_x, "Grid dimension X ({})"_format(llvm_cuda_grid_dim_x))->ignore_case(); - benchmark_opt->add_option("--gridDimY", - llvm_cuda_grid_dim_y, - "Grid dimension Y ({})"_format(llvm_cuda_grid_dim_y))->ignore_case(); - benchmark_opt->add_option("--gridDimZ", - llvm_cuda_grid_dim_z, - "Grid dimension Z ({})"_format(llvm_cuda_grid_dim_z))->ignore_case(); - benchmark_opt->add_option("--blockDimX", + benchmark_opt->add_option("--block-dim-x", llvm_cuda_block_dim_x, "Block dimension X ({})"_format(llvm_cuda_block_dim_x))->ignore_case(); - benchmark_opt->add_option("--blockDimY", - llvm_cuda_block_dim_y, - "Block dimension Y ({})"_format(llvm_cuda_block_dim_y))->ignore_case(); - benchmark_opt->add_option("--blockDimZ", - llvm_cuda_block_dim_z, - "Block dimension Z ({})"_format(llvm_cuda_block_dim_z))->ignore_case(); #endif // clang-format on From f13881f71f015c2ca65afbfcb112a29f9685fbb1 Mon Sep 17 00:00:00 2001 From: Ioannis Magkanaris Date: Tue, 29 Mar 2022 14:49:34 +0200 Subject: [PATCH 248/331] Removed unneeded GPU execution options --- src/main.cpp | 7 +------ test/benchmark/cuda_driver.hpp | 20 ++++++++++---------- test/benchmark/gpu_parameters.hpp | 5 ----- 3 files changed, 11 insertions(+), 21 deletions(-) diff --git a/src/main.cpp b/src/main.cpp index ae0e179543..005c1c8f5b 100644 --- a/src/main.cpp +++ b/src/main.cpp @@ -736,12 +736,7 @@ int main(int argc, const char* argv[]) { } if (platform.is_gpu()) { const GPUExecutionParameters gpu_execution_parameters{ - llvm_cuda_grid_dim_x, - llvm_cuda_grid_dim_y, - llvm_cuda_grid_dim_z, - llvm_cuda_block_dim_x, - llvm_cuda_block_dim_y, - llvm_cuda_block_dim_z}; + llvm_cuda_grid_dim_x, llvm_cuda_block_dim_x}; benchmark::LLVMBenchmark benchmark(visitor, modfile, output_dir, diff --git a/test/benchmark/cuda_driver.hpp b/test/benchmark/cuda_driver.hpp index 2874b046c6..8a4f81cc5e 100644 --- a/test/benchmark/cuda_driver.hpp +++ b/test/benchmark/cuda_driver.hpp @@ -97,12 +97,12 @@ class CUDADriver { void* kernel_parameters[] = {}; checkCudaErrors(cuLaunchKernel(function, gpu_execution_parameters.gridDimX, - gpu_execution_parameters.gridDimY, - gpu_execution_parameters.gridDimY, + 1, + 1, gpu_execution_parameters.blockDimX, - gpu_execution_parameters.blockDimY, - gpu_execution_parameters.blockDimY, - gpu_execution_parameters.sharedMemBytes, + 1, + 1, + 0, nullptr, kernel_parameters, nullptr)); @@ -122,12 +122,12 @@ class CUDADriver { void* kernel_parameters[] = {&arg}; checkCudaErrors(cuLaunchKernel(function, gpu_execution_parameters.gridDimX, - gpu_execution_parameters.gridDimY, - gpu_execution_parameters.gridDimY, + 1, + 1, gpu_execution_parameters.blockDimX, - gpu_execution_parameters.blockDimY, - gpu_execution_parameters.blockDimY, - gpu_execution_parameters.sharedMemBytes, + 1, + 1, + 0, nullptr, kernel_parameters, nullptr)); diff --git a/test/benchmark/gpu_parameters.hpp b/test/benchmark/gpu_parameters.hpp index 7d52b28757..5e72edb147 100644 --- a/test/benchmark/gpu_parameters.hpp +++ b/test/benchmark/gpu_parameters.hpp @@ -20,12 +20,7 @@ namespace cuda_details { struct GPUExecutionParameters { int gridDimX; - int gridDimY; - int gridDimZ; int blockDimX; - int blockDimY; - int blockDimZ; - int sharedMemBytes; }; } // namespace cuda_details From 4df56f0b56e2897e735148cb7253f1448096e091 Mon Sep 17 00:00:00 2001 From: Ioannis Magkanaris Date: Tue, 29 Mar 2022 14:53:14 +0200 Subject: [PATCH 249/331] 
Cleared up benchmark instantiation in main function --- src/main.cpp | 39 +++++++++++++-------------------------- 1 file changed, 13 insertions(+), 26 deletions(-) diff --git a/src/main.cpp b/src/main.cpp index 005c1c8f5b..5e319044d9 100644 --- a/src/main.cpp +++ b/src/main.cpp @@ -734,32 +734,19 @@ int main(int argc, const char* argv[]) { throw std::runtime_error( "Benchmarking is only supported on CUDA GPUs at the moment"); } - if (platform.is_gpu()) { - const GPUExecutionParameters gpu_execution_parameters{ - llvm_cuda_grid_dim_x, llvm_cuda_block_dim_x}; - benchmark::LLVMBenchmark benchmark(visitor, - modfile, - output_dir, - shared_lib_paths, - num_experiments, - instance_size, - platform, - llvm_opt_level_ir, - llvm_opt_level_codegen, - gpu_execution_parameters); - benchmark.run(ast); - } else { - benchmark::LLVMBenchmark benchmark(visitor, - modfile, - output_dir, - shared_lib_paths, - num_experiments, - instance_size, - platform, - llvm_opt_level_ir, - llvm_opt_level_codegen); - benchmark.run(ast); - } + const GPUExecutionParameters gpu_execution_parameters{llvm_cuda_grid_dim_x, + llvm_cuda_block_dim_x}; + benchmark::LLVMBenchmark benchmark(visitor, + modfile, + output_dir, + shared_lib_paths, + num_experiments, + instance_size, + platform, + llvm_opt_level_ir, + llvm_opt_level_codegen, + gpu_execution_parameters); + benchmark.run(ast); } } #endif From ab9906350e7ad6b0690228d34230d5e3cf28b6dc Mon Sep 17 00:00:00 2001 From: Ioannis Magkanaris Date: Tue, 29 Mar 2022 15:20:01 +0200 Subject: [PATCH 250/331] Improved reading logs from nvvm compiler --- test/benchmark/cuda_driver.cpp | 11 ++++------- 1 file changed, 4 insertions(+), 7 deletions(-) diff --git a/test/benchmark/cuda_driver.cpp b/test/benchmark/cuda_driver.cpp index dd2cec7efe..56c44bd14f 100644 --- a/test/benchmark/cuda_driver.cpp +++ b/test/benchmark/cuda_driver.cpp @@ -35,8 +35,8 @@ void CUDADriver::checkNVVMErrors(nvvmResult err) { if (err != NVVM_SUCCESS) { size_t program_log_size; nvvmGetProgramLogSize(prog, &program_log_size); - auto program_log = (char*) malloc(program_log_size); - nvvmGetProgramLog(prog, program_log); + std::string program_log(program_log_size, '\0'); + nvvmGetProgramLog(prog, &program_log.front()); throw std::runtime_error( "Compilation Log:\n {}\nNVVM Error: {}\n"_format(program_log, nvvmGetErrorString(err))); } @@ -143,13 +143,10 @@ void CUDADriver::init(const std::string& gpu, BenchmarkInfo* benchmark_info) { compilation_options_c_str.data())); // Get compiled module - char* compiled_module; size_t compiled_module_size; nvvmGetCompiledResultSize(prog, &compiled_module_size); - compiled_module = (char*) malloc(compiled_module_size); - nvvmGetCompiledResult(prog, compiled_module); - ptx_compiled_module = std::string(compiled_module); - free(compiled_module); + ptx_compiled_module.resize(compiled_module_size); + nvvmGetCompiledResult(prog, &ptx_compiled_module.front()); print_string_to_file(ptx_compiled_module, benchmark_info->output_dir + "/" + benchmark_info->filename + ".ptx"); From 2da743bc394aaedb0a3fd075e0d90d52c8498333 Mon Sep 17 00:00:00 2001 From: Ioannis Magkanaris Date: Tue, 29 Mar 2022 15:55:58 +0200 Subject: [PATCH 251/331] Removed duplicated code from LLVMBenchmark --- src/main.cpp | 7 ++ test/benchmark/llvm_benchmark.cpp | 130 +++++++++++------------------- test/benchmark/llvm_benchmark.hpp | 15 ++-- 3 files changed, 66 insertions(+), 86 deletions(-) diff --git a/src/main.cpp b/src/main.cpp index 5e319044d9..90b6440f7c 100644 --- a/src/main.cpp +++ b/src/main.cpp @@ -734,6 +734,13 @@ 
int main(int argc, const char* argv[]) { throw std::runtime_error( "Benchmarking is only supported on CUDA GPUs at the moment"); } +#ifndef NMODL_LLVM_CUDA_BACKEND + if (platform.is_CUDA_gpu()) { + throw std::runtime_error( + "GPU benchmarking is not supported if NMODL is not built with CUDA " + "backend enabled."); + } +#endif const GPUExecutionParameters gpu_execution_parameters{llvm_cuda_grid_dim_x, llvm_cuda_block_dim_x}; benchmark::LLVMBenchmark benchmark(visitor, diff --git a/test/benchmark/llvm_benchmark.cpp b/test/benchmark/llvm_benchmark.cpp index 9f677eca64..d0a0195fa7 100644 --- a/test/benchmark/llvm_benchmark.cpp +++ b/test/benchmark/llvm_benchmark.cpp @@ -23,11 +23,7 @@ void LLVMBenchmark::run(const std::shared_ptr& node) { // create functions generate_llvm(node); // Finally, run the benchmark and log the measurements. - if (platform.is_CUDA_gpu()) { - run_benchmark_on_gpu(node); - } else { - run_benchmark_on_cpu(node); - } + run_benchmark(node); } void LLVMBenchmark::generate_llvm(const std::shared_ptr& node) { @@ -41,24 +37,55 @@ void LLVMBenchmark::generate_llvm(const std::shared_ptr& node) { logger->info("Created LLVM IR module from NMODL AST in {} sec", diff.count()); } -void LLVMBenchmark::run_benchmark_on_cpu(const std::shared_ptr& node) { +void LLVMBenchmark::run_benchmark(const std::shared_ptr& node) { // Set the codegen data helper and find the kernels. auto codegen_data = codegen::CodegenDataHelper(node, llvm_visitor.get_instance_struct_ptr()); std::vector kernel_names; llvm_visitor.find_kernel_names(kernel_names); // Get feature's string and turn them off depending on the cpu. - const auto cpu_name = platform.get_name(); - logger->info("CPU: {}", cpu_name); + std::string backend_name; +#ifdef NMODL_LLVM_CUDA_BACKEND + if (platform.is_CUDA_gpu()) { + backend_name = platform.get_name(); + } else { +#endif + backend_name = platform.get_name() == "default" ? llvm::sys::getHostCPUName().str() + : platform.get_name(); +#ifdef NMODL_LLVM_CUDA_BACKEND + } +#endif + logger->info("Backend: {}", backend_name); std::unique_ptr m = llvm_visitor.get_module(); // Create the benchmark runner and initialize it. - std::string filename = "v" + std::to_string(llvm_visitor.get_vector_width()) + "_" + - mod_filename; - runner::BenchmarkRunner runner( - std::move(m), filename, output_dir, cpu_name, shared_libs, opt_level_ir, opt_level_codegen); - runner.initialize_driver(); +#ifdef NMODL_LLVM_CUDA_BACKEND + if (platform.is_CUDA_gpu()) { + std::string filename = "cuda_" + mod_filename; + cuda_runner = std::make_unique(std::move(m), + filename, + output_dir, + backend_name, + shared_libs, + opt_level_ir, + opt_level_codegen); + cuda_runner->initialize_driver(); + } else { +#endif + std::string filename = "v" + std::to_string(llvm_visitor.get_vector_width()) + "_" + + mod_filename; + cpu_runner = std::make_unique(std::move(m), + filename, + output_dir, + backend_name, + shared_libs, + opt_level_ir, + opt_level_codegen); + cpu_runner->initialize_driver(); +#ifdef NMODL_LLVM_CUDA_BACKEND + } +#endif // Benchmark every kernel. for (const auto& kernel_name: kernel_names) { @@ -80,70 +107,17 @@ void LLVMBenchmark::run_benchmark_on_cpu(const std::shared_ptr& no // Record the execution time of the kernel. 
std::string wrapper_name = "__" + kernel_name + "_wrapper"; auto start = std::chrono::steady_clock::now(); - runner.run_with_argument(wrapper_name, instance_data.base_ptr); - auto end = std::chrono::steady_clock::now(); - std::chrono::duration diff = end - start; - - // Log the time taken for each run. - logger->info("Experiment {} compute time = {:.6f} sec", i, diff.count()); - - // Update statistics. - time_sum += diff.count(); - time_squared_sum += diff.count() * diff.count(); - time_min = std::min(time_min, diff.count()); - time_max = std::max(time_max, diff.count()); - } - // Log the average time taken for the kernel. - double time_mean = time_sum / num_experiments; - logger->info("Average compute time = {:.6f}", time_mean); - logger->info("Compute time variance = {:g}", - time_squared_sum / num_experiments - time_mean * time_mean); - logger->info("Minimum compute time = {:.6f}", time_min); - logger->info("Maximum compute time = {:.6f}\n", time_max); - } -} - #ifdef NMODL_LLVM_CUDA_BACKEND -void LLVMBenchmark::run_benchmark_on_gpu(const std::shared_ptr& node) { - // Set the codegen data helper and find the kernels. - auto codegen_data = codegen::CodegenDataHelper(node, llvm_visitor.get_instance_struct_ptr()); - std::vector kernel_names; - llvm_visitor.find_kernel_names(kernel_names); - - // Get feature's string and turn them off depending on the cpu. - const auto gpu_name = platform.get_name(); - logger->info("GPU backend: {}", gpu_name); - - std::unique_ptr m = llvm_visitor.get_module(); - - // Create the benchmark runner and initialize it. - std::string filename = "cuda_" + mod_filename; - runner::BenchmarkGPURunner runner( - std::move(m), filename, output_dir, gpu_name, shared_libs, opt_level_ir, opt_level_codegen); - runner.initialize_driver(); - - // Benchmark every kernel. - for (const auto& kernel_name: kernel_names) { - // For every kernel run the benchmark `num_experiments` times. - double time_min = std::numeric_limits::max(); - double time_max = 0.0; - double time_sum = 0.0; - double time_squared_sum = 0.0; - for (int i = 0; i < num_experiments; ++i) { - // Initialise the data. - auto instance_data = codegen_data.create_data(instance_size, /*seed=*/1); - - // Log instance size once. - if (i == 0) { - double size_mbs = instance_data.num_bytes / (1024.0 * 1024.0); - logger->info("Benchmarking kernel '{}' with {} MBs dataset", kernel_name, size_mbs); + if (platform.is_CUDA_gpu()) { + cuda_runner->run_with_argument(kernel_name, + instance_data.base_ptr, + gpu_execution_parameters); + } else { +#endif + cpu_runner->run_with_argument(wrapper_name, instance_data.base_ptr); +#ifdef NMODL_LLVM_CUDA_BACKEND } - - // Record the execution time of the kernel. 
- auto start = std::chrono::steady_clock::now(); - runner.run_with_argument(kernel_name, - instance_data.base_ptr, - gpu_execution_parameters); +#endif auto end = std::chrono::steady_clock::now(); std::chrono::duration diff = end - start; @@ -165,12 +139,6 @@ void LLVMBenchmark::run_benchmark_on_gpu(const std::shared_ptr& no logger->info("Maximum compute time = {:.6f}\n", time_max); } } -#else -void LLVMBenchmark::run_benchmark_on_gpu(const std::shared_ptr& node) { - throw std::runtime_error( - "GPU benchmarking is not supported if NMODL is not built with CUDA backend enabled."); -} -#endif } // namespace benchmark } // namespace nmodl diff --git a/test/benchmark/llvm_benchmark.hpp b/test/benchmark/llvm_benchmark.hpp index 52232f7787..c992704e49 100644 --- a/test/benchmark/llvm_benchmark.hpp +++ b/test/benchmark/llvm_benchmark.hpp @@ -12,6 +12,8 @@ #include "codegen/llvm/codegen_llvm_visitor.hpp" #include "gpu_parameters.hpp" +#include "test/benchmark/cuda_driver.hpp" +#include "test/benchmark/jit_driver.hpp" #include "utils/logger.hpp" using nmodl::codegen::Platform; @@ -60,6 +62,12 @@ class LLVMBenchmark { /// Filestream for dumping logs to the file. std::ofstream ofs; + /// CPU benchmark runner + std::unique_ptr cpu_runner; + + /// CUDA benchmark runner + std::unique_ptr cuda_runner; + public: LLVMBenchmark(codegen::CodegenLLVMVisitor& llvm_visitor, const std::string& mod_filename, @@ -107,11 +115,8 @@ class LLVMBenchmark { /// Visits the AST to construct the LLVM IR module. void generate_llvm(const std::shared_ptr& node); - /// Runs the main body of the benchmark, executing the compute kernels on CPU. - void run_benchmark_on_cpu(const std::shared_ptr& node); - - /// Runs the main body of the benchmark, executing the compute kernels on GPU. - void run_benchmark_on_gpu(const std::shared_ptr& node); + /// Runs the main body of the benchmark, executing the compute kernels on CPU or GPU. + void run_benchmark(const std::shared_ptr& node); /// Sets the log output stream (file or console). void set_log_output(); From eec1dffaa24250722fd3dc6214e7450b553d506c Mon Sep 17 00:00:00 2001 From: Ioannis Magkanaris Date: Tue, 29 Mar 2022 15:59:51 +0200 Subject: [PATCH 252/331] Removed backend name from CUDADriver init --- test/benchmark/cuda_driver.cpp | 2 +- test/benchmark/cuda_driver.hpp | 6 +++--- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/test/benchmark/cuda_driver.cpp b/test/benchmark/cuda_driver.cpp index 56c44bd14f..c0626912d9 100644 --- a/test/benchmark/cuda_driver.cpp +++ b/test/benchmark/cuda_driver.cpp @@ -92,7 +92,7 @@ std::string print_bitcode_to_string(const llvm::Module& module) { return bitcode_string; } -void CUDADriver::init(const std::string& gpu, BenchmarkInfo* benchmark_info) { +void CUDADriver::init(BenchmarkInfo* benchmark_info) { // CUDA initialization checkCudaErrors(cuInit(0)); checkCudaErrors(cuDeviceGetCount(&device_info.count)); diff --git a/test/benchmark/cuda_driver.hpp b/test/benchmark/cuda_driver.hpp index 8a4f81cc5e..ec479d03c5 100644 --- a/test/benchmark/cuda_driver.hpp +++ b/test/benchmark/cuda_driver.hpp @@ -85,7 +85,7 @@ class CUDADriver { : module(std::move(m)) {} /// Initializes the CUDA GPU JIT driver. - void init(const std::string& gpu, BenchmarkInfo* benchmark_info = nullptr); + void init(BenchmarkInfo* benchmark_info = nullptr); /// Lookups the entry-point without arguments in the CUDA module and executes it. 
void execute_without_arguments(const std::string& entry_point, @@ -179,7 +179,7 @@ class TestGPURunner: public BaseGPURunner { : BaseGPURunner(std::move(m)) {} virtual void initialize_driver() { - driver->init(backend); + driver->init(); } }; @@ -209,7 +209,7 @@ class BenchmarkGPURunner: public BaseGPURunner { , benchmark_info{filename, output_dir, lib_paths, opt_level_ir, opt_level_codegen} {} virtual void initialize_driver() { - driver->init(backend, &benchmark_info); + driver->init(&benchmark_info); } }; From 412dc0f5faf6b003ae95bdfef420304de8e38aca Mon Sep 17 00:00:00 2001 From: Ioannis Magkanaris Date: Tue, 29 Mar 2022 16:02:09 +0200 Subject: [PATCH 253/331] More cleanup of gpu name --- test/benchmark/cuda_driver.hpp | 10 +--------- test/benchmark/llvm_benchmark.cpp | 1 - 2 files changed, 1 insertion(+), 10 deletions(-) diff --git a/test/benchmark/cuda_driver.hpp b/test/benchmark/cuda_driver.hpp index ec479d03c5..c9d04317c7 100644 --- a/test/benchmark/cuda_driver.hpp +++ b/test/benchmark/cuda_driver.hpp @@ -171,11 +171,8 @@ class BaseGPURunner { * \brief A simple runner for testing purposes. */ class TestGPURunner: public BaseGPURunner { - /// GPU backend to target. - std::string backend; - public: - explicit TestGPURunner(std::unique_ptr m, std::string backend) + explicit TestGPURunner(std::unique_ptr m) : BaseGPURunner(std::move(m)) {} virtual void initialize_driver() { @@ -193,19 +190,14 @@ class BenchmarkGPURunner: public BaseGPURunner { /// Benchmarking information passed to JIT driver. BenchmarkInfo benchmark_info; - /// Beckend to target. - std::string backend; - public: BenchmarkGPURunner(std::unique_ptr m, std::string filename, std::string output_dir, - std::string backend, std::vector lib_paths = {}, int opt_level_ir = 0, int opt_level_codegen = 0) : BaseGPURunner(std::move(m)) - , backend(backend) , benchmark_info{filename, output_dir, lib_paths, opt_level_ir, opt_level_codegen} {} virtual void initialize_driver() { diff --git a/test/benchmark/llvm_benchmark.cpp b/test/benchmark/llvm_benchmark.cpp index d0a0195fa7..6a2914708a 100644 --- a/test/benchmark/llvm_benchmark.cpp +++ b/test/benchmark/llvm_benchmark.cpp @@ -66,7 +66,6 @@ void LLVMBenchmark::run_benchmark(const std::shared_ptr& node) { cuda_runner = std::make_unique(std::move(m), filename, output_dir, - backend_name, shared_libs, opt_level_ir, opt_level_codegen); From 997a2d037159b9224ab3114a96ac7cd570e695be Mon Sep 17 00:00:00 2001 From: Ioannis Magkanaris Date: Tue, 29 Mar 2022 16:11:01 +0200 Subject: [PATCH 254/331] Refactor cuda kernel execution to avoid code duplication --- test/benchmark/cuda_driver.hpp | 34 ++++++++++--------------------- test/benchmark/llvm_benchmark.cpp | 8 ++------ 2 files changed, 13 insertions(+), 29 deletions(-) diff --git a/test/benchmark/cuda_driver.hpp b/test/benchmark/cuda_driver.hpp index c9d04317c7..399b154b33 100644 --- a/test/benchmark/cuda_driver.hpp +++ b/test/benchmark/cuda_driver.hpp @@ -87,14 +87,13 @@ class CUDADriver { /// Initializes the CUDA GPU JIT driver. void init(BenchmarkInfo* benchmark_info = nullptr); - /// Lookups the entry-point without arguments in the CUDA module and executes it. 
- void execute_without_arguments(const std::string& entry_point, - const GPUExecutionParameters& gpu_execution_parameters) { + void launch_cuda_kernel(const std::string& entry_point, + const GPUExecutionParameters& gpu_execution_parameters, + void* kernel_parameters) { // Get kernel function checkCudaErrors(cuModuleGetFunction(&function, cudaModule, entry_point.c_str())); // Kernel launch - void* kernel_parameters[] = {}; checkCudaErrors(cuLaunchKernel(function, gpu_execution_parameters.gridDimX, 1, @@ -104,34 +103,23 @@ class CUDADriver { 1, 0, nullptr, - kernel_parameters, + &kernel_parameters, nullptr)); cudaDeviceSynchronize(); } + /// Lookups the entry-point without arguments in the CUDA module and executes it. + void execute_without_arguments(const std::string& entry_point, + const GPUExecutionParameters& gpu_execution_parameters) { + launch_cuda_kernel(entry_point, gpu_execution_parameters, {}); + } + /// Lookups the entry-point with arguments in the CUDA module and executes it. template void execute_with_arguments(const std::string& entry_point, ArgType arg, const GPUExecutionParameters& gpu_execution_parameters) { - // Get kernel function - logger->info("Executing kernel {}", entry_point); - checkCudaErrors(cuModuleGetFunction(&function, cudaModule, entry_point.c_str())); - - // Kernel launch - void* kernel_parameters[] = {&arg}; - checkCudaErrors(cuLaunchKernel(function, - gpu_execution_parameters.gridDimX, - 1, - 1, - gpu_execution_parameters.blockDimX, - 1, - 1, - 0, - nullptr, - kernel_parameters, - nullptr)); - cudaDeviceSynchronize(); + launch_cuda_kernel(entry_point, gpu_execution_parameters, {&arg}); } }; diff --git a/test/benchmark/llvm_benchmark.cpp b/test/benchmark/llvm_benchmark.cpp index 6a2914708a..9ae2c2911d 100644 --- a/test/benchmark/llvm_benchmark.cpp +++ b/test/benchmark/llvm_benchmark.cpp @@ -63,12 +63,8 @@ void LLVMBenchmark::run_benchmark(const std::shared_ptr& node) { #ifdef NMODL_LLVM_CUDA_BACKEND if (platform.is_CUDA_gpu()) { std::string filename = "cuda_" + mod_filename; - cuda_runner = std::make_unique(std::move(m), - filename, - output_dir, - shared_libs, - opt_level_ir, - opt_level_codegen); + cuda_runner = std::make_unique( + std::move(m), filename, output_dir, shared_libs, opt_level_ir, opt_level_codegen); cuda_runner->initialize_driver(); } else { #endif From b2ea4231769e7e3e6c23ff315a38822224510be9 Mon Sep 17 00:00:00 2001 From: Ioannis Magkanaris Date: Tue, 29 Mar 2022 16:15:40 +0200 Subject: [PATCH 255/331] Small fixes for non-CUDA backend compilation --- CMakeLists.txt | 1 + test/benchmark/llvm_benchmark.hpp | 2 ++ 2 files changed, 3 insertions(+) diff --git a/CMakeLists.txt b/CMakeLists.txt index c160d39d09..1cc8a2eefb 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -24,6 +24,7 @@ option(NMODL_ENABLE_PYTHON_BINDINGS "Enable pybind11 based python bindings" OFF) option(NMODL_ENABLE_LEGACY_UNITS "Use original faraday, R, etc. 
instead of 2019 nist constants" OFF) option(NMODL_ENABLE_LLVM "Enable LLVM based code generation" ON) option(NMODL_ENABLE_LLVM_GPU "Enable LLVM based GPU code generation" ON) +option(NMODL_ENABLE_LLVM_CUDA "Enable LLVM CUDA backend to run GPU benchmark" OFF) option(NMODL_ENABLE_JIT_EVENT_LISTENERS "Enable JITEventListener for Perf and Vtune" OFF) if(NMODL_ENABLE_LEGACY_UNITS) diff --git a/test/benchmark/llvm_benchmark.hpp b/test/benchmark/llvm_benchmark.hpp index c992704e49..a8d89d985f 100644 --- a/test/benchmark/llvm_benchmark.hpp +++ b/test/benchmark/llvm_benchmark.hpp @@ -65,8 +65,10 @@ class LLVMBenchmark { /// CPU benchmark runner std::unique_ptr cpu_runner; +#ifdef NMODL_LLVM_CUDA_BACKEND /// CUDA benchmark runner std::unique_ptr cuda_runner; +#endif public: LLVMBenchmark(codegen::CodegenLLVMVisitor& llvm_visitor, From a66d1554863e39b10b04e32b5f46f91f9df31c62 Mon Sep 17 00:00:00 2001 From: Ioannis Magkanaris Date: Tue, 29 Mar 2022 17:32:56 +0200 Subject: [PATCH 256/331] Fix wrapper issue on GPU --- src/codegen/llvm/codegen_llvm_visitor.cpp | 30 ++++++++++++++++------- test/benchmark/llvm_benchmark.cpp | 2 +- 2 files changed, 22 insertions(+), 10 deletions(-) diff --git a/src/codegen/llvm/codegen_llvm_visitor.cpp b/src/codegen/llvm/codegen_llvm_visitor.cpp index f123807859..11e45ba300 100644 --- a/src/codegen/llvm/codegen_llvm_visitor.cpp +++ b/src/codegen/llvm/codegen_llvm_visitor.cpp @@ -465,10 +465,6 @@ void CodegenLLVMVisitor::write_to_variable(const ast::VarName& node, llvm::Value } void CodegenLLVMVisitor::wrap_kernel_functions() { - // Wrapper doesn't work on GPU - if (platform.is_gpu()) { - return; - } // First, identify all kernels. std::vector kernel_names; find_kernel_names(kernel_names); @@ -478,10 +474,15 @@ void CodegenLLVMVisitor::wrap_kernel_functions() { auto kernel = module->getFunction(kernel_name); // Create a wrapper void function that takes a void pointer as a single argument. - llvm::Type* i32_type = ir_builder.get_i32_type(); + llvm::Type* return_type; + if (platform.is_gpu()) { + return_type = ir_builder.get_void_type(); + } else { + return_type = ir_builder.get_i32_type(); + } llvm::Type* void_ptr_type = ir_builder.get_i8_ptr_type(); llvm::Function* wrapper_func = llvm::Function::Create( - llvm::FunctionType::get(i32_type, {void_ptr_type}, /*isVarArg=*/false), + llvm::FunctionType::get(return_type, {void_ptr_type}, /*isVarArg=*/false), llvm::Function::ExternalLinkage, "__" + kernel_name + "_wrapper", *module); @@ -501,9 +502,20 @@ void CodegenLLVMVisitor::wrap_kernel_functions() { args.push_back(bitcasted); ir_builder.create_function_call(kernel, args, /*use_result=*/false); - // Create a 0 return value and a return instruction. - ir_builder.create_i32_constant(0); - ir_builder.create_return(ir_builder.pop_last_value()); + // create return instructions and annotate wrapper with certain attributes depending on + // the backend type + if (platform.is_gpu()) { + // return void + ir_builder.create_return(); + annotate_kernel_with_nvvm(wrapper_func); + } else { + // Create a 0 return value and a return instruction. 
+            ir_builder.create_i32_constant(0);
+            ir_builder.create_return(ir_builder.pop_last_value());
+            ir_builder.set_function(wrapper_func);
+            ir_builder.set_kernel_attributes();
+        }
+        ir_builder.clear_function();
     }
 }

diff --git a/test/benchmark/llvm_benchmark.cpp b/test/benchmark/llvm_benchmark.cpp
index 9ae2c2911d..4ff425e3f0 100644
--- a/test/benchmark/llvm_benchmark.cpp
+++ b/test/benchmark/llvm_benchmark.cpp
@@ -104,7 +104,7 @@ void LLVMBenchmark::run_benchmark(const std::shared_ptr& node) {
         auto start = std::chrono::steady_clock::now();
 #ifdef NMODL_LLVM_CUDA_BACKEND
         if (platform.is_CUDA_gpu()) {
-            cuda_runner->run_with_argument(kernel_name,
+            cuda_runner->run_with_argument(wrapper_name,
                                            instance_data.base_ptr,
                                            gpu_execution_parameters);
         } else {

From 4743361eb492cce5286586551b23e1cbcd4ac519 Mon Sep 17 00:00:00 2001
From: Ioannis Magkanaris
Date: Tue, 29 Mar 2022 17:33:30 +0200
Subject: [PATCH 257/331] Print async execution error on GPU

---
 test/benchmark/cuda_driver.hpp | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/test/benchmark/cuda_driver.hpp b/test/benchmark/cuda_driver.hpp
index 399b154b33..15ce26bf07 100644
--- a/test/benchmark/cuda_driver.hpp
+++ b/test/benchmark/cuda_driver.hpp
@@ -105,7 +105,10 @@ class CUDADriver {
                                        nullptr,
                                        &kernel_parameters,
                                        nullptr));
-        cudaDeviceSynchronize();
+        auto asyncErr = cudaDeviceSynchronize();
+        if (asyncErr != cudaSuccess) {
+            throw std::runtime_error("CUDA Execution Error: {}\n"_format(cudaGetErrorString(asyncErr)));
+        }
     }

     /// Lookups the entry-point without arguments in the CUDA module and executes it.

From cd978f3e4a8d325c13320a8d198352e65d85a3c5 Mon Sep 17 00:00:00 2001
From: Ioannis Magkanaris
Date: Tue, 29 Mar 2022 18:22:07 +0200
Subject: [PATCH 258/331] Added documentation for running the GPU benchmark

---
 INSTALL.md | 23 +++++++++++++++++++++++
 1 file changed, 23 insertions(+)

diff --git a/INSTALL.md b/INSTALL.md
index 1b65c1212c..724b158ed3 100644
--- a/INSTALL.md
+++ b/INSTALL.md
@@ -141,6 +141,29 @@ export NMODL_WRAPLIB=/opt/nmodl/lib/libpywrapper.so
 **Note**: In order for all unit tests to function correctly when building without linking against libpython we must set `NMODL_PYLIB` before running cmake!
 
+### Using the CUDA backend to run benchmarks
+
+`NMODL` supports generating code and compiling it for execution on an `NVIDIA` GPU via its benchmark infrastructure using the `LLVM` backend. To enable the `CUDA` backend to compile and execute the GPU code, we need to set the following `CMake` flag during compilation of `NMODL`:
+```
+-DNMODL_ENABLE_LLVM_CUDA=ON
+```
+
+To find the needed `CUDA` libraries (`cudart`, `nvrtc` and `nvvm`) you need to have the CUDA Toolkit installed on your system.
+This can be done by installing the CUDA Toolkit from the [CUDA Toolkit website](https://developer.nvidia.com/cuda-downloads) or by installing the `CUDA` spack package and loading the corresponding module.
+
+Then, given a supported MOD file, you can execute the benchmark on your NVIDIA GPU by running the following command:
+```
+./bin/nmodl <file>.mod llvm --no-debug --ir --opt-level-ir 3 gpu --target-arch "sm_80" --name "nvptx64" --math-library libdevice benchmark --run --libs "${CUDA_ROOT}/nvvm/libdevice/libdevice.10.bc" --opt-level-codegen 3 --instance-size 10000000 --repeat 2 --grid-dim-x 4096 --block-dim-x 256
+```
+The above command executes the benchmark on a GPU with `Compute Architecture` `sm_80` and links the generated code to the `libdevice` optimized math library provided by `NVIDIA`.
+Using the above command you can also select the optimization level of the generated code, the instance size of the generated data, the number of repetitions, and the grid and block dimensions for the GPU execution.
+
+**Note**: For the CUDA backend to compile and execute the generated code on the GPU, the installed CUDA Toolkit version must match the `CUDA` version provided by the NVIDIA driver on the system that will run the benchmark.
+You can find the driver's `CUDA` version by running the following command:
+```
+nvidia-smi
+```
+and noting the `CUDA Version` stated there. For example, if the `CUDA Version` reported by `nvidia-smi` is CUDA 11.4, you need to install `CUDA Toolkit 11.4.*` to be able to compile and execute the GPU code.
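For orientation, the GPU path of `benchmark --run` described above reduces to a standard CUDA driver-API sequence, which these patches implement in the `CUDADriver` class: initialize the driver, JIT-load the generated PTX into a module, look up the kernel wrapper, launch it with the requested grid and block dimensions, and synchronize. Below is a minimal, self-contained sketch of that sequence; the `launch_ptx` helper and the wrapper symbol `__nrn_state_hh_wrapper` are illustrative placeholders, not NMODL API:

```cpp
#include <cuda.h>

#include <stdexcept>
#include <string>

// Minimal error check around CUDA driver-API calls.
static void check(CUresult err) {
    if (err != CUDA_SUCCESS) {
        const char* name = nullptr;
        cuGetErrorName(err, &name);
        throw std::runtime_error(std::string("CUDA error: ") + (name ? name : "unknown"));
    }
}

// Load already-generated PTX and launch one kernel wrapper on device 0.
void launch_ptx(const std::string& ptx, void* instance_data, int grid_x, int block_x) {
    CUdevice device;
    CUcontext context;
    CUmodule module;
    CUfunction kernel;

    check(cuInit(0));
    check(cuDeviceGet(&device, 0));
    check(cuCtxCreate(&context, 0, device));

    // The driver JIT-compiles the PTX for the current device (JIT options omitted).
    check(cuModuleLoadDataEx(&module, ptx.c_str(), 0, nullptr, nullptr));
    check(cuModuleGetFunction(&kernel, module, "__nrn_state_hh_wrapper"));

    void* args[] = {&instance_data};
    check(cuLaunchKernel(kernel,
                         grid_x, 1, 1,   // grid dimensions; only X is exposed by the CLI
                         block_x, 1, 1,  // block dimensions; only X is exposed by the CLI
                         0,              // shared memory bytes
                         nullptr,        // default stream
                         args, nullptr));
    check(cuCtxSynchronize());  // wait for the kernel and surface asynchronous errors
}
```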
 
 ## Testing the Installed Module

From d6adc1ffb914a0caea7e21fa75a904ae50344a3f Mon Sep 17 00:00:00 2001
From: Ioannis Magkanaris
Date: Tue, 29 Mar 2022 18:22:31 +0200
Subject: [PATCH 259/331] Mention CMake 3.18 dependency

---
 INSTALL.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/INSTALL.md b/INSTALL.md
index 724b158ed3..4f61866e82 100644
--- a/INSTALL.md
+++ b/INSTALL.md
@@ -21,7 +21,7 @@ To build the project from source, a modern C++ compiler with C++14 support is ne
 
 - flex (>=2.6)
 - bison (>=3.0)
-- CMake (>=3.15)
+- CMake (>=3.18)
 - Python (>=3.6)
 - Python packages : jinja2 (>=2.10), pyyaml (>=3.13), pytest (>=4.0.0), sympy (>=1.3), textwrap
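The next patch ("Working CUDA JIT") changes how the generated module reaches the GPU: instead of handing LLVM IR to the NVVM library, it emits PTX with LLVM's own NVPTX backend and has the CUDA driver JIT-load the result. The emission helper it introduces, `get_module_ptx`, follows the standard LLVM recipe of running a `legacy::PassManager` populated by `TargetMachine::addPassesToEmitFile`. A condensed sketch of that recipe, under the same API generation used in these patches (`CGFT_AssemblyFile`, legacy pass manager); note that the buffered stream only flushes into the backing string on destruction, hence the explicit scope:

```cpp
#include <stdexcept>
#include <string>

#include "llvm/IR/LegacyPassManager.h"
#include "llvm/IR/Module.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Target/TargetMachine.h"

// Run the target's code-generation passes to turn a module into assembly text
// (for the NVPTX target this is exactly the PTX consumed by cuModuleLoadDataEx).
std::string emit_ptx(llvm::TargetMachine& tm, llvm::Module& module) {
    std::string ptx;
    {
        llvm::raw_string_ostream stream(ptx);
        llvm::buffer_ostream buffered(stream);  // addPassesToEmitFile needs a raw_pwrite_stream

        llvm::legacy::PassManager pm;
        if (tm.addPassesToEmitFile(pm, buffered, nullptr, llvm::CGFT_AssemblyFile))
            throw std::runtime_error("target cannot emit assembly");
        pm.run(module);
    }  // `buffered` flushes into `stream`, and `stream` into `ptx`, here
    return ptx;
}
```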
Aborting."); + return tm; +} + +std::string get_module_ptx(llvm::TargetMachine& tm, llvm::Module& module) { + std::string target_asm; + llvm::raw_string_ostream stream(target_asm); + llvm::buffer_ostream pstream(stream); + llvm::legacy::PassManager codegen_pm; + + tm.addPassesToEmitFile(codegen_pm, pstream, nullptr, llvm::CGFT_AssemblyFile); + codegen_pm.run(module); + return target_asm; +} + +void optimise_module_for_nvptx(const codegen::Platform& platform, + llvm::Module& module, + int opt_level, + std::string& target_asm) { + // Create target machine for CUDA GPU + auto tm = create_CUDA_target_machine(platform, module); // Create pass managers. llvm::legacy::FunctionPassManager func_pm(&module); @@ -134,12 +151,7 @@ void optimise_module_for_nvptx(codegen::Platform& platform, // Now, we want to run target-specific (e.g. NVPTX) passes. In LLVM, this // is done via `addPassesToEmitFile`. - llvm::raw_string_ostream stream(target_asm); - llvm::buffer_ostream pstream(stream); - llvm::legacy::PassManager codegen_pm; - - tm->addPassesToEmitFile(codegen_pm, pstream, nullptr, llvm::CGFT_AssemblyFile); - codegen_pm.run(module); + target_asm = get_module_ptx(*tm, module); } void initialise_optimisation_passes() { diff --git a/src/codegen/llvm/llvm_utils.hpp b/src/codegen/llvm/llvm_utils.hpp index 17be5073e2..d73c3dea21 100644 --- a/src/codegen/llvm/llvm_utils.hpp +++ b/src/codegen/llvm/llvm_utils.hpp @@ -21,8 +21,14 @@ void initialise_optimisation_passes(); /// Initialises NVPTX-specific optimisation passes. void initialise_nvptx_passes(); +//// Initializes a CUDA target machine +std::unique_ptr create_CUDA_target_machine(const codegen::Platform& platform, llvm::Module& module); + +/// Generate PTX code given a CUDA target machine and the module +std::string get_module_ptx(llvm::TargetMachine& tm, llvm::Module& module); + /// Optimises the given LLVM IR module for NVPTX targets. -void optimise_module_for_nvptx(codegen::Platform& platform, +void optimise_module_for_nvptx(const codegen::Platform& platform, llvm::Module& module, int opt_level, std::string& target_asm); diff --git a/src/main.cpp b/src/main.cpp index 90b6440f7c..4975724eed 100644 --- a/src/main.cpp +++ b/src/main.cpp @@ -194,6 +194,9 @@ int main(int argc, const char* argv[]) { /// traget GPU platform name std::string llvm_gpu_name = "default"; + /// GPU target architecture + std::string llvm_gpu_target_architecture = "sm_70"; + /// llvm vector width if generating code for CPUs int llvm_vector_width = 1; @@ -367,7 +370,7 @@ int main(int argc, const char* argv[]) { "Name of GPU platform to use")->ignore_case(); gpu_target_name->check(CLI::IsMember({"nvptx", "nvptx64"})); gpu_opt->add_option("--target-arch", - llvm_cpu_name, + llvm_gpu_target_architecture, "Name of target architecture to use")->ignore_case(); auto gpu_math_library_opt = gpu_opt->add_option("--math-library", llvm_math_library, @@ -715,7 +718,7 @@ int main(int argc, const char* argv[]) { // Create platform abstraction. PlatformID pid = llvm_gpu_name == "default" ? PlatformID::CPU : PlatformID::GPU; const std::string name = llvm_gpu_name == "default" ? 
llvm_cpu_name : llvm_gpu_name; - Platform platform(pid, name, llvm_math_library, llvm_float_type, llvm_vector_width); + Platform platform(pid, name, llvm_gpu_target_architecture, llvm_math_library, llvm_float_type, llvm_vector_width); logger->info("Running LLVM backend code generator"); CodegenLLVMVisitor visitor(modfile, diff --git a/test/benchmark/cuda_driver.cpp b/test/benchmark/cuda_driver.cpp index c0626912d9..a848abf989 100644 --- a/test/benchmark/cuda_driver.cpp +++ b/test/benchmark/cuda_driver.cpp @@ -17,6 +17,10 @@ #include "utils/common_utils.hpp" #include "llvm/Bitcode/BitcodeWriter.h" +#include "llvm/Linker/Linker.h" +#include "llvm/IRReader/IRReader.h" +#include "llvm/Support/SourceMgr.h" +#include "llvm/Target/TargetMachine.h" using fmt::literals::operator""_format; @@ -27,7 +31,8 @@ void CUDADriver::checkCudaErrors(CUresult err) { if (err != CUDA_SUCCESS) { const char* ret = NULL; cuGetErrorName(err, &ret); - throw std::runtime_error("CUDA error: " + std::string(ret)); + // throw std::runtime_error("CUDA error: " + std::string(ret)); + std::cout << "CUDA error: " << ret << std::endl; } } @@ -51,7 +56,8 @@ std::string load_file_to_string(const std::string& filename) { return str; } -void CUDADriver::load_libraries(BenchmarkInfo* benchmark_info) { +void CUDADriver::link_libraries(llvm::Module& module, BenchmarkInfo* benchmark_info) { + llvm::Linker linker(module); for (const auto& lib_path: benchmark_info->shared_lib_paths) { const auto lib_name = lib_path.substr(lib_path.find_last_of("/\\") + 1); std::regex libdevice_bitcode_name{"libdevice.*.bc"}; @@ -59,13 +65,25 @@ void CUDADriver::load_libraries(BenchmarkInfo* benchmark_info) { throw std::runtime_error("Only libdevice is supported for now"); } // Load libdevice module to the NVVM program - const auto libdevice_module = load_file_to_string(lib_path); - const auto libdevice_module_size = libdevice_module.size(); - checkNVVMErrors(nvvmAddModuleToProgram( - prog, libdevice_module.c_str(), libdevice_module_size, "libdevice")); + llvm::SMDiagnostic Error; + + llvm::errs() << lib_name << "\n"; + auto LibDeviceModule = parseIRFile(lib_name, Error, module.getContext()); + if (!LibDeviceModule) { + throw std::runtime_error("Could not find or load libdevice\n"); + } + linker.linkInModule(std::move(LibDeviceModule), llvm::Linker::LinkOnlyNeeded); } } +std::string get_ptx_compiled_module(const llvm::Module& module) { + std::string SPIRAssembly; + llvm::raw_string_ostream IROstream(SPIRAssembly); + IROstream << module; + IROstream.flush(); + return SPIRAssembly; +} + auto get_compilation_options(int compute_version_major, BenchmarkInfo* benchmark_info) { std::vector compilation_options; // Set the correct architecture to generate the PTX for @@ -92,7 +110,7 @@ std::string print_bitcode_to_string(const llvm::Module& module) { return bitcode_string; } -void CUDADriver::init(BenchmarkInfo* benchmark_info) { +void CUDADriver::init(const codegen::Platform& platform, BenchmarkInfo* benchmark_info) { // CUDA initialization checkCudaErrors(cuInit(0)); checkCudaErrors(cuDeviceGetCount(&device_info.count)); @@ -119,15 +137,15 @@ void CUDADriver::init(BenchmarkInfo* benchmark_info) { std::string kernel_bitcode = print_bitcode_to_string(*module); // Create NVVM program object - checkNVVMErrors(nvvmCreateProgram(&prog)); + // checkNVVMErrors(nvvmCreateProgram(&prog)); // Load the external libraries modules to the NVVM program // Currently only libdevice is supported - load_libraries(benchmark_info); + // link_libraries(*module, benchmark_info); // 
Add custom IR to program - checkNVVMErrors(nvvmAddModuleToProgram( - prog, kernel_bitcode.c_str(), kernel_bitcode.size(), "nmodl_kernel")); + // checkNVVMErrors(nvvmAddModuleToProgram( + // prog, kernel_bitcode.c_str(), kernel_bitcode.size(), "nmodl_kernel")); // Declare compile options auto compilation_options = get_compilation_options(device_info.compute_version_major, @@ -138,23 +156,73 @@ void CUDADriver::init(BenchmarkInfo* benchmark_info) { compilation_options_c_str.push_back(option.c_str()); } // Compile the program - checkNVVMErrors(nvvmCompileProgram(prog, - compilation_options_c_str.size(), - compilation_options_c_str.data())); + logger->info("Compiling the LLVM IR to PTX"); + // checkNVVMErrors(nvvmCompileProgram(prog, + // compilation_options_c_str.size(), + // compilation_options_c_str.data())); // Get compiled module size_t compiled_module_size; - nvvmGetCompiledResultSize(prog, &compiled_module_size); - ptx_compiled_module.resize(compiled_module_size); - nvvmGetCompiledResult(prog, &ptx_compiled_module.front()); - print_string_to_file(ptx_compiled_module, - benchmark_info->output_dir + "/" + benchmark_info->filename + ".ptx"); + // nvvmGetCompiledResultSize(prog, &compiled_module_size); + // ptx_compiled_module.resize(compiled_module_size); + // nvvmGetCompiledResult(prog, &ptx_compiled_module.front()); + // print_string_to_file(ptx_compiled_module, + // benchmark_info->output_dir + "/" + benchmark_info->filename + ".ptx"); // Create driver context checkCudaErrors(cuCtxCreate(&context, 0, device)); + // Create target machine for CUDA GPU and generate PTX code + // auto tm = utils::create_CUDA_target_machine(platform, *module); + // ptx_compiled_module = utils::get_module_ptx(*tm, *module); + const auto opt_level_codegen = benchmark_info ? 
benchmark_info->opt_level_codegen : 0; + utils::optimise_module_for_nvptx(platform, *module, opt_level_codegen, ptx_compiled_module); + if (benchmark_info) { + print_string_to_file(ptx_compiled_module, + benchmark_info->output_dir + "/" + benchmark_info->filename + ".ptx"); + } + // Create module for object - checkCudaErrors(cuModuleLoadDataEx(&cudaModule, ptx_compiled_module.c_str(), 0, 0, 0)); + logger->info("Loading PTX to CUDA module"); + // CUjit_option options[] = {CU_JIT_TARGET}; + // void** option_vals = new void*[1]; + // auto target_architecture = CU_TARGET_COMPUTE_86; + // option_vals[0] = (void*)target_architecture; + const unsigned int jitNumOptions = 6; + CUjit_option *jitOptions = new CUjit_option[jitNumOptions]; + void **jitOptVals = new void*[jitNumOptions]; + + // set up size of compilation log buffer + jitOptions[0] = CU_JIT_INFO_LOG_BUFFER_SIZE_BYTES; + size_t jitLogBufferSize = 1024*1024; + jitOptVals[0] = (void*)jitLogBufferSize; + + // set up pointer to the compilation log buffer + jitOptions[1] = CU_JIT_INFO_LOG_BUFFER; + char *jitLogBuffer = new char[jitLogBufferSize]; + jitOptVals[1] = jitLogBuffer; + + // set up size of compilation error log buffer + jitOptions[2] = CU_JIT_ERROR_LOG_BUFFER_SIZE_BYTES; + size_t jitErrorLogBufferSize = 1024*1024; + jitOptVals[2] = (void*)jitErrorLogBufferSize; + + // set up pointer to the compilation error log buffer + jitOptions[3] = CU_JIT_ERROR_LOG_BUFFER; + char *jitErrorLogBuffer = new char[jitErrorLogBufferSize]; + jitOptVals[3] = jitErrorLogBuffer; + + // set up wall clock time + jitOptions[4] = CU_JIT_WALL_TIME; + float jitTime = 0.0; + + jitOptions[5] = CU_JIT_TARGET; + auto target_architecture = CU_TARGET_COMPUTE_86; + jitOptVals[5] = (void*)target_architecture; + checkCudaErrors(cuModuleLoadDataEx(&cudaModule, ptx_compiled_module.c_str(), jitNumOptions, jitOptions, jitOptVals)); + logger->info("CUDA JIT walltime: "_format((double)jitOptions[4])); + logger->info("CUDA JIT INFO LOG: "_format(jitLogBuffer)); + logger->info("CUDA JIT ERROR LOG: "_format(jitErrorLogBuffer)); } } // namespace runner diff --git a/test/benchmark/cuda_driver.hpp b/test/benchmark/cuda_driver.hpp index 15ce26bf07..ed030816e0 100644 --- a/test/benchmark/cuda_driver.hpp +++ b/test/benchmark/cuda_driver.hpp @@ -78,14 +78,14 @@ class CUDADriver { void checkCudaErrors(CUresult err); void checkNVVMErrors(nvvmResult err); - void load_libraries(BenchmarkInfo* benchmark_info); + void link_libraries(llvm::Module& module, BenchmarkInfo* benchmark_info); public: explicit CUDADriver(std::unique_ptr m) : module(std::move(m)) {} /// Initializes the CUDA GPU JIT driver. - void init(BenchmarkInfo* benchmark_info = nullptr); + void init(const codegen::Platform& platform, BenchmarkInfo* benchmark_info = nullptr); void launch_cuda_kernel(const std::string& entry_point, const GPUExecutionParameters& gpu_execution_parameters, @@ -140,7 +140,7 @@ class BaseGPURunner { public: /// Sets up the CUDA driver. - virtual void initialize_driver() = 0; + virtual void initialize_driver(const codegen::Platform& platform) = 0; /// Runs the entry-point function without arguments. 
void run_without_arguments(const std::string& entry_point, @@ -166,8 +166,8 @@ class TestGPURunner: public BaseGPURunner { explicit TestGPURunner(std::unique_ptr m) : BaseGPURunner(std::move(m)) {} - virtual void initialize_driver() { - driver->init(); + virtual void initialize_driver(const codegen::Platform& platform) { + driver->init(platform); } }; @@ -191,8 +191,8 @@ class BenchmarkGPURunner: public BaseGPURunner { : BaseGPURunner(std::move(m)) , benchmark_info{filename, output_dir, lib_paths, opt_level_ir, opt_level_codegen} {} - virtual void initialize_driver() { - driver->init(&benchmark_info); + virtual void initialize_driver(const codegen::Platform& platform) { + driver->init(platform, &benchmark_info); } }; diff --git a/test/benchmark/llvm_benchmark.cpp b/test/benchmark/llvm_benchmark.cpp index 4ff425e3f0..98b3c0ec04 100644 --- a/test/benchmark/llvm_benchmark.cpp +++ b/test/benchmark/llvm_benchmark.cpp @@ -65,7 +65,7 @@ void LLVMBenchmark::run_benchmark(const std::shared_ptr& node) { std::string filename = "cuda_" + mod_filename; cuda_runner = std::make_unique( std::move(m), filename, output_dir, shared_libs, opt_level_ir, opt_level_codegen); - cuda_runner->initialize_driver(); + cuda_runner->initialize_driver(platform); } else { #endif std::string filename = "v" + std::to_string(llvm_visitor.get_vector_width()) + "_" + From 8ff894c991f6c7ee9453437010587e2261adeed2 Mon Sep 17 00:00:00 2001 From: Ioannis Magkanaris Date: Thu, 31 Mar 2022 14:58:15 +0200 Subject: [PATCH 261/331] Cleared up NVVM code --- CMakeLists.txt | 8 +- cmake/FindNVVM.cmake | 65 -------------- test/benchmark/CMakeLists.txt | 2 +- test/benchmark/cuda_driver.cpp | 153 ++++++++++++--------------------- test/benchmark/cuda_driver.hpp | 15 +--- 5 files changed, 59 insertions(+), 184 deletions(-) delete mode 100644 cmake/FindNVVM.cmake diff --git a/CMakeLists.txt b/CMakeLists.txt index 1cc8a2eefb..6f912d8ecd 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -166,9 +166,7 @@ if(NMODL_ENABLE_LLVM) if(NMODL_ENABLE_LLVM_CUDA) enable_language(CUDA) find_package(CUDAToolkit) - include(cmake/FindNVVM.cmake) - set(NMODL_CUDA_INCLUDE_DIRECTORIES ${CUDAToolkit_INCLUDE_DIRS} ${CUDA_NVVM_INCLUDE_DIR}) - include_directories(${NMODL_CUDA_INCLUDE_DIRECTORIES}) + include_directories(${CUDAToolkit_INCLUDE_DIRS}) add_definitions(-DNMODL_LLVM_CUDA_BACKEND) endif() endif() @@ -280,8 +278,8 @@ endif() message(STATUS "LLVM CUDA Codegen | ${NMODL_ENABLE_LLVM_CUDA}") if(NMODL_ENABLE_LLVM_CUDA) message(STATUS " CUDA VERSION | ${CUDAToolkit_VERSION}") - message(STATUS " INCLUDE | ${NMODL_CUDA_INCLUDE_DIRECTORIES}") - message(STATUS " LIBRARY | ${CUDAToolkit_LIBRARY_DIR};${CUDA_NVVM_LIBRARIES}") + message(STATUS " INCLUDE | ${CUDAToolkit_INCLUDE_DIRS}") + message(STATUS " LIBRARY | ${CUDAToolkit_LIBRARY_DIR}") endif() if(NMODL_CLANG_FORMAT) message(STATUS "Clang Format | ${ClangFormat_EXECUTABLE}") diff --git a/cmake/FindNVVM.cmake b/cmake/FindNVVM.cmake deleted file mode 100644 index a975d9d5b4..0000000000 --- a/cmake/FindNVVM.cmake +++ /dev/null @@ -1,65 +0,0 @@ -# - Find the NVVM include directory and libraries -# Modified version of the file found here: -# https://raw.githubusercontent.com/nvidia-compiler-sdk/nvvmir-samples/master/CMakeLists.txt -# https://raw.githubusercontent.com/hshindo/arrayfire/master/CMakeModules/FindNVVM.cmake - -# libNVVM -if(NOT DEFINED ENV{LIBNVVM_HOME}) - set(LIBNVVM_HOME "${CUDAToolkit_LIBRARY_ROOT}/nvvm") -else() - set(LIBNVVM_HOME "$ENV{LIBNVVM_HOME}") -endif() -message(STATUS "Using LIBNVVM_HOME: 
${LIBNVVM_HOME}") - -if (CMAKE_SIZEOF_VOID_P STREQUAL "8") - if (WIN32) - set (CUDA_LIB_SEARCH_PATH "${CUDAToolkit_LIBRARY_ROOT}/lib/x64") - set (NVVM_DLL_NAME nvvm64_${NVVM_DLL_VERSION}.dll) - else () - set (CUDA_LIB_SEARCH_PATH "") - endif() -else() - if (WIN32) - set (CUDA_LIB_SEARCH_PATH "${CUDAToolkit_LIBRARY_ROOT}/lib/Win32") - set (NVVM_DLL_NAME nvvm32_${NVVM_DLL_VERSION}.dll) - else() - set (CUDA_LIB_SEARCH_PATH "") - endif() -endif() - -### Find libNVVM -# The directory structure for nvvm is a bit complex. -# On Windows: -# 32-bit -- nvvm/lib/Win32 -# 64-bit -- nvvm/lib/x64 -# On Linux: -# 32-bit -- nvvm/lib -# 64-bit -- nvvm/lib64 -# On Mac: -# Universal -- nvvm/lib -if (CMAKE_SIZEOF_VOID_P STREQUAL "8") - if (WIN32) - set (LIB_ARCH_SUFFIX "/x64") - elseif (APPLE) - set (LIB_ARCH_SUFFIX "") - else () - set (LIB_ARCH_SUFFIX "64") - endif() -else() - if (WIN32) - set (LIB_ARCH_SUFFIX "/Win32") - else() - set (LIB_ARCH_SUFFIX "") - endif() -endif() - -find_library(NVVM_LIB nvvm PATHS "${LIBNVVM_HOME}/lib${LIB_ARCH_SUFFIX}") -find_file(NVVM_H nvvm.h PATHS "${LIBNVVM_HOME}/include") - -if(NVVM_H) - get_filename_component(CUDA_NVVM_INCLUDE_DIR ${NVVM_H} PATH) -else() - message(FATAL_ERROR "Unable to find nvvm.h") -endif() - -set(CUDA_NVVM_LIBRARIES ${NVVM_LIB}) diff --git a/test/benchmark/CMakeLists.txt b/test/benchmark/CMakeLists.txt index b554bbde91..a0320ae7c4 100644 --- a/test/benchmark/CMakeLists.txt +++ b/test/benchmark/CMakeLists.txt @@ -13,7 +13,7 @@ include_directories(${LLVM_INCLUDE_DIRS}) add_library(llvm_benchmark STATIC ${LLVM_BENCHMARK_SOURCE_FILES}) add_dependencies(llvm_benchmark lexer util visitor) if(NMODL_ENABLE_LLVM_CUDA) - target_link_libraries(llvm_benchmark PRIVATE CUDA::cudart CUDA::nvrtc ${CUDA_NVVM_LIBRARIES}) + target_link_libraries(llvm_benchmark PRIVATE CUDA::cudart CUDA::nvrtc) endif() if(NMODL_ENABLE_JIT_EVENT_LISTENERS) diff --git a/test/benchmark/cuda_driver.cpp b/test/benchmark/cuda_driver.cpp index a848abf989..717049cec6 100644 --- a/test/benchmark/cuda_driver.cpp +++ b/test/benchmark/cuda_driver.cpp @@ -31,31 +31,10 @@ void CUDADriver::checkCudaErrors(CUresult err) { if (err != CUDA_SUCCESS) { const char* ret = NULL; cuGetErrorName(err, &ret); - // throw std::runtime_error("CUDA error: " + std::string(ret)); - std::cout << "CUDA error: " << ret << std::endl; + throw std::runtime_error("CUDA error: " + std::string(ret)); } } -void CUDADriver::checkNVVMErrors(nvvmResult err) { - if (err != NVVM_SUCCESS) { - size_t program_log_size; - nvvmGetProgramLogSize(prog, &program_log_size); - std::string program_log(program_log_size, '\0'); - nvvmGetProgramLog(prog, &program_log.front()); - throw std::runtime_error( - "Compilation Log:\n {}\nNVVM Error: {}\n"_format(program_log, nvvmGetErrorString(err))); - } -} - -std::string load_file_to_string(const std::string& filename) { - std::ifstream t(filename); - if (!t.is_open()) { - throw std::runtime_error("File {} not found"_format(filename)); - } - std::string str((std::istreambuf_iterator(t)), std::istreambuf_iterator()); - return str; -} - void CUDADriver::link_libraries(llvm::Module& module, BenchmarkInfo* benchmark_info) { llvm::Linker linker(module); for (const auto& lib_path: benchmark_info->shared_lib_paths) { @@ -76,38 +55,52 @@ void CUDADriver::link_libraries(llvm::Module& module, BenchmarkInfo* benchmark_i } } -std::string get_ptx_compiled_module(const llvm::Module& module) { - std::string SPIRAssembly; - llvm::raw_string_ostream IROstream(SPIRAssembly); - IROstream << module; - IROstream.flush(); - 
return SPIRAssembly; -} - -auto get_compilation_options(int compute_version_major, BenchmarkInfo* benchmark_info) { - std::vector compilation_options; - // Set the correct architecture to generate the PTX for - // Architectures should be based on the major compute capability of the GPU - const std::string arch_option{"-arch=compute_{}0"_format(compute_version_major)}; - compilation_options.push_back(arch_option); - // Set the correct optimization level - const std::string optimization_option{"-opt={}"_format(benchmark_info->opt_level_codegen)}; - compilation_options.push_back(optimization_option); - return compilation_options; -} - void print_string_to_file(const std::string& ptx_compiled_module, const std::string& filename) { std::ofstream ptx_file(filename); ptx_file << ptx_compiled_module; ptx_file.close(); } -std::string print_bitcode_to_string(const llvm::Module& module) { - std::string bitcode_string; - llvm::raw_string_ostream os(bitcode_string); - WriteBitcodeToFile(module, os); - os.flush(); - return bitcode_string; +CUjit_target get_compute_architecture(const int compute_version_major, const int compute_version_minor) { + auto compute_architecture = compute_version_major*10 + compute_version_minor; + switch(compute_architecture) { + case 20: + return CU_TARGET_COMPUTE_20; + case 21: + return CU_TARGET_COMPUTE_21; + case 30: + return CU_TARGET_COMPUTE_30; + case 32: + return CU_TARGET_COMPUTE_32; + case 35: + return CU_TARGET_COMPUTE_35; + case 37: + return CU_TARGET_COMPUTE_37; + case 50: + return CU_TARGET_COMPUTE_50; + case 52: + return CU_TARGET_COMPUTE_52; + case 53: + return CU_TARGET_COMPUTE_53; + case 60: + return CU_TARGET_COMPUTE_60; + case 61: + return CU_TARGET_COMPUTE_61; + case 62: + return CU_TARGET_COMPUTE_62; + case 70: + return CU_TARGET_COMPUTE_70; + case 72: + return CU_TARGET_COMPUTE_72; + case 75: + return CU_TARGET_COMPUTE_75; + case 80: + return CU_TARGET_COMPUTE_80; + case 86: + return CU_TARGET_COMPUTE_86; + default: + throw std::runtime_error("Unsupported compute architecture"); + } } void CUDADriver::init(const codegen::Platform& platform, BenchmarkInfo* benchmark_info) { @@ -133,48 +126,14 @@ void CUDADriver::init(const codegen::Platform& platform, BenchmarkInfo* benchmar throw std::runtime_error("ERROR: Device 0 is not SM 2.0 or greater"); } - // Save the LLVM module bitcode to string - std::string kernel_bitcode = print_bitcode_to_string(*module); - - // Create NVVM program object - // checkNVVMErrors(nvvmCreateProgram(&prog)); - // Load the external libraries modules to the NVVM program // Currently only libdevice is supported // link_libraries(*module, benchmark_info); - // Add custom IR to program - // checkNVVMErrors(nvvmAddModuleToProgram( - // prog, kernel_bitcode.c_str(), kernel_bitcode.size(), "nmodl_kernel")); - - // Declare compile options - auto compilation_options = get_compilation_options(device_info.compute_version_major, - benchmark_info); - // transform compilation options to vector of const char* - std::vector compilation_options_c_str; - for (const auto& option: compilation_options) { - compilation_options_c_str.push_back(option.c_str()); - } // Compile the program logger->info("Compiling the LLVM IR to PTX"); - // checkNVVMErrors(nvvmCompileProgram(prog, - // compilation_options_c_str.size(), - // compilation_options_c_str.data())); - - // Get compiled module - size_t compiled_module_size; - // nvvmGetCompiledResultSize(prog, &compiled_module_size); - // ptx_compiled_module.resize(compiled_module_size); - // 
nvvmGetCompiledResult(prog, &ptx_compiled_module.front()); - // print_string_to_file(ptx_compiled_module, - // benchmark_info->output_dir + "/" + benchmark_info->filename + ".ptx"); - - // Create driver context - checkCudaErrors(cuCtxCreate(&context, 0, device)); - // Create target machine for CUDA GPU and generate PTX code - // auto tm = utils::create_CUDA_target_machine(platform, *module); - // ptx_compiled_module = utils::get_module_ptx(*tm, *module); + // Optimize code for nvptx including the wrapper functions and generate PTX const auto opt_level_codegen = benchmark_info ? benchmark_info->opt_level_codegen : 0; utils::optimise_module_for_nvptx(platform, *module, opt_level_codegen, ptx_compiled_module); if (benchmark_info) { @@ -182,13 +141,12 @@ void CUDADriver::init(const codegen::Platform& platform, BenchmarkInfo* benchmar benchmark_info->output_dir + "/" + benchmark_info->filename + ".ptx"); } + // Create driver context + checkCudaErrors(cuCtxCreate(&context, 0, device)); + // Create module for object logger->info("Loading PTX to CUDA module"); - // CUjit_option options[] = {CU_JIT_TARGET}; - // void** option_vals = new void*[1]; - // auto target_architecture = CU_TARGET_COMPUTE_86; - // option_vals[0] = (void*)target_architecture; - const unsigned int jitNumOptions = 6; + const unsigned int jitNumOptions = 5; CUjit_option *jitOptions = new CUjit_option[jitNumOptions]; void **jitOptVals = new void*[jitNumOptions]; @@ -212,17 +170,14 @@ void CUDADriver::init(const codegen::Platform& platform, BenchmarkInfo* benchmar char *jitErrorLogBuffer = new char[jitErrorLogBufferSize]; jitOptVals[3] = jitErrorLogBuffer; - // set up wall clock time - jitOptions[4] = CU_JIT_WALL_TIME; - float jitTime = 0.0; - - jitOptions[5] = CU_JIT_TARGET; - auto target_architecture = CU_TARGET_COMPUTE_86; - jitOptVals[5] = (void*)target_architecture; - checkCudaErrors(cuModuleLoadDataEx(&cudaModule, ptx_compiled_module.c_str(), jitNumOptions, jitOptions, jitOptVals)); - logger->info("CUDA JIT walltime: "_format((double)jitOptions[4])); - logger->info("CUDA JIT INFO LOG: "_format(jitLogBuffer)); - logger->info("CUDA JIT ERROR LOG: "_format(jitErrorLogBuffer)); + jitOptions[4] = CU_JIT_TARGET; + auto target_architecture = get_compute_architecture(device_info.compute_version_major, device_info.compute_version_minor); + jitOptVals[4] = (void*)target_architecture; + + auto cuda_jit_ret = cuModuleLoadDataEx(&cudaModule, ptx_compiled_module.c_str(), jitNumOptions, jitOptions, jitOptVals); + logger->info("CUDA JIT INFO LOG: {}"_format(std::string(jitLogBuffer))); + logger->info("CUDA JIT ERROR LOG: {}"_format(std::string(jitErrorLogBuffer))); + checkCudaErrors(cuda_jit_ret); } } // namespace runner diff --git a/test/benchmark/cuda_driver.hpp b/test/benchmark/cuda_driver.hpp index ed030816e0..c44a276d2b 100644 --- a/test/benchmark/cuda_driver.hpp +++ b/test/benchmark/cuda_driver.hpp @@ -26,7 +26,6 @@ #include "cuda.h" #include "cuda_runtime.h" #include "gpu_parameters.hpp" -#include "nvvm.h" using nmodl::cuda_details::GPUExecutionParameters; @@ -50,24 +49,13 @@ struct DeviceInfo { */ void checkCudaErrors(CUresult err); -/** - * @brief Throw meaningful error in case NVVM API call fails - * - * Checks whether a call to the NVVM API was succsful and if not it throws a runntime_error with - * the error message from NVVM. - * - * @param err Return value of the NVVM API call - */ -void checkNVVMErrors(nvvmResult err); - /** * \class CUDADriver - * \brief Driver to execute a MOD file function via the CUDA and NVVM backend. 
+ * \brief Driver to execute a MOD file function via the CUDA JIT backend. */ class CUDADriver { /// LLVM IR module to execute. std::unique_ptr module; - nvvmProgram prog; CUdevice device; CUmodule cudaModule; CUcontext context; @@ -77,7 +65,6 @@ class CUDADriver { std::string ptx_compiled_module; void checkCudaErrors(CUresult err); - void checkNVVMErrors(nvvmResult err); void link_libraries(llvm::Module& module, BenchmarkInfo* benchmark_info); public: From 7b00e258758981cb1497e45d7f0caefdd1dbe0e8 Mon Sep 17 00:00:00 2001 From: Ioannis Magkanaris Date: Thu, 31 Mar 2022 15:01:32 +0200 Subject: [PATCH 262/331] Revert setting kernel attributes on GPU backend code --- src/codegen/llvm/codegen_llvm_visitor.cpp | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/src/codegen/llvm/codegen_llvm_visitor.cpp b/src/codegen/llvm/codegen_llvm_visitor.cpp index 11e45ba300..def6cc2424 100644 --- a/src/codegen/llvm/codegen_llvm_visitor.cpp +++ b/src/codegen/llvm/codegen_llvm_visitor.cpp @@ -708,9 +708,7 @@ void CodegenLLVMVisitor::visit_codegen_function(const ast::CodegenFunction& node // If function is a compute kernel, add a void terminator explicitly, since there is no // `CodegenReturnVar` node. Also, set the necessary attributes. if (is_kernel_function(name)) { - if (!platform.is_gpu()) { - ir_builder.set_kernel_attributes(); - } + ir_builder.set_kernel_attributes(); ir_builder.create_return(); } From 882d33c7b704ed640c7657257c16cfa544ad16a7 Mon Sep 17 00:00:00 2001 From: Ioannis Magkanaris Date: Thu, 31 Mar 2022 15:02:15 +0200 Subject: [PATCH 263/331] Revert "Use cmake 3.18 in the CI" This reverts commit 18df661cb2e6d8896601e7b34466b7c05979ba81. --- azure-pipelines.yml | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/azure-pipelines.yml b/azure-pipelines.yml index ece44244f3..4a490b9cb4 100644 --- a/azure-pipelines.yml +++ b/azure-pipelines.yml @@ -52,8 +52,8 @@ jobs: chmod +x llvm.sh sudo ./llvm.sh 13 env: - CMAKE_VER: 'v3.18.0' - CMAKE_PKG: 'cmake-3.18.0-Linux-x86_64' + CMAKE_VER: 'v3.17.0' + CMAKE_PKG: 'cmake-3.17.0-Linux-x86_64' displayName: 'Install Dependencies' - script: | export PATH=$(pwd)/$CMAKE_PKG/bin:/home/vsts/.local/bin:$PATH @@ -71,7 +71,7 @@ jobs: make install #this is needed for the integration tests env CTEST_OUTPUT_ON_FAILURE=1 make test env: - CMAKE_PKG: 'cmake-3.18.0-Linux-x86_64' + CMAKE_PKG: 'cmake-3.17.0-Linux-x86_64' displayName: 'Build and Run Unit Tests' - script: | export PATH=$(pwd)/$CMAKE_PKG/bin:/home/vsts/.local/bin:$PATH @@ -94,7 +94,7 @@ jobs: fi ./bin/nrnivmodl-core $(Build.Repository.LocalPath)/test/integration/mod env: - CMAKE_PKG: 'cmake-3.18.0-Linux-x86_64' + CMAKE_PKG: 'cmake-3.17.0-Linux-x86_64' SHELL: 'bash' displayName: 'Build Neuron and Run Integration Tests' - script: | @@ -118,7 +118,7 @@ jobs: fi ./bin/nrnivmodl-core $(Build.Repository.LocalPath)/test/integration/mod env: - CMAKE_PKG: 'cmake-3.18.0-Linux-x86_64' + CMAKE_PKG: 'cmake-3.17.0-Linux-x86_64' displayName: 'Build CoreNEURON and Run Integration Tests with ISPC compiler' - job: 'osx1015' pool: From a7954a47a4ee602b47aee6046bf03fe7edee72f5 Mon Sep 17 00:00:00 2001 From: Ioannis Magkanaris Date: Thu, 31 Mar 2022 15:39:47 +0200 Subject: [PATCH 264/331] Roll back to CMake 3.17 --- CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 6f912d8ecd..1489a7f497 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -5,7 +5,7 @@ # See top-level LICENSE file for details. 
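# Note on the version floor just below: CMake 3.17 is the first release that
# ships the FindCUDAToolkit module which the NMODL_ENABLE_LLVM_CUDA path relies
# on, so the requirement cannot be lowered any further. As a hypothetical
# sketch, the same constraint expressed as an explicit guard would be:
#   if(NMODL_ENABLE_LLVM_CUDA AND CMAKE_VERSION VERSION_LESS 3.17)
#     message(FATAL_ERROR "NMODL_ENABLE_LLVM_CUDA needs CMake >= 3.17 (FindCUDAToolkit)")
#   endif()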
# ============================================================================= -cmake_minimum_required(VERSION 3.18 FATAL_ERROR) +cmake_minimum_required(VERSION 3.17 FATAL_ERROR) project(NMODL LANGUAGES CXX) From 512f2cac0caa3f25c459b813ac91b7688adc39ed Mon Sep 17 00:00:00 2001 From: Ioannis Magkanaris Date: Thu, 31 Mar 2022 15:39:55 +0200 Subject: [PATCH 265/331] Improve CUDA JIT logs --- test/benchmark/cuda_driver.cpp | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/test/benchmark/cuda_driver.cpp b/test/benchmark/cuda_driver.cpp index 717049cec6..0337923b92 100644 --- a/test/benchmark/cuda_driver.cpp +++ b/test/benchmark/cuda_driver.cpp @@ -175,8 +175,12 @@ void CUDADriver::init(const codegen::Platform& platform, BenchmarkInfo* benchmar jitOptVals[4] = (void*)target_architecture; auto cuda_jit_ret = cuModuleLoadDataEx(&cudaModule, ptx_compiled_module.c_str(), jitNumOptions, jitOptions, jitOptVals); - logger->info("CUDA JIT INFO LOG: {}"_format(std::string(jitLogBuffer))); - logger->info("CUDA JIT ERROR LOG: {}"_format(std::string(jitErrorLogBuffer))); + if (!std::string(jitLogBuffer).empty()) { + logger->info("CUDA JIT INFO LOG: {}"_format(std::string(jitLogBuffer))); + } + if (!std::string(jitErrorLogBuffer).empty()) { + logger->info("CUDA JIT ERROR LOG: {}"_format(std::string(jitErrorLogBuffer))); + } checkCudaErrors(cuda_jit_ret); } From b39eab3e41eb054337671c5b73eee7cd923daa1e Mon Sep 17 00:00:00 2001 From: Ioannis Magkanaris Date: Thu, 31 Mar 2022 16:05:02 +0200 Subject: [PATCH 266/331] Link with libdevice --- test/benchmark/cuda_driver.cpp | 22 ++++++++++------------ 1 file changed, 10 insertions(+), 12 deletions(-) diff --git a/test/benchmark/cuda_driver.cpp b/test/benchmark/cuda_driver.cpp index 0337923b92..9c3725bcb5 100644 --- a/test/benchmark/cuda_driver.cpp +++ b/test/benchmark/cuda_driver.cpp @@ -16,10 +16,10 @@ #include "fmt/format.h" #include "utils/common_utils.hpp" -#include "llvm/Bitcode/BitcodeWriter.h" +#include "llvm/Bitcode/BitcodeReader.h" #include "llvm/Linker/Linker.h" -#include "llvm/IRReader/IRReader.h" -#include "llvm/Support/SourceMgr.h" +#include "llvm/Support/ErrorOr.h" +#include "llvm/Support/MemoryBuffer.h" #include "llvm/Target/TargetMachine.h" using fmt::literals::operator""_format; @@ -43,15 +43,13 @@ void CUDADriver::link_libraries(llvm::Module& module, BenchmarkInfo* benchmark_i if (!std::regex_match(lib_name, libdevice_bitcode_name)) { throw std::runtime_error("Only libdevice is supported for now"); } - // Load libdevice module to the NVVM program - llvm::SMDiagnostic Error; - - llvm::errs() << lib_name << "\n"; - auto LibDeviceModule = parseIRFile(lib_name, Error, module.getContext()); - if (!LibDeviceModule) { - throw std::runtime_error("Could not find or load libdevice\n"); + // Load libdevice module to the LLVM Module + auto libdevice_file_memory_buffer = llvm::MemoryBuffer::getFile(lib_path); + llvm::Expected> libdevice_expected_module = parseBitcodeFile(libdevice_file_memory_buffer->get()->getMemBufferRef(), module.getContext()); + if (std::error_code error = errorToErrorCode(libdevice_expected_module.takeError())) { + throw std::runtime_error("Error reading bitcode: {}"_format(error.message())); } - linker.linkInModule(std::move(LibDeviceModule), llvm::Linker::LinkOnlyNeeded); + linker.linkInModule(std::move(libdevice_expected_module.get()), llvm::Linker::LinkOnlyNeeded); } } @@ -128,7 +126,7 @@ void CUDADriver::init(const codegen::Platform& platform, BenchmarkInfo* benchmar // Load the external libraries modules to 
the NVVM program // Currently only libdevice is supported - // link_libraries(*module, benchmark_info); + link_libraries(*module, benchmark_info); // Compile the program logger->info("Compiling the LLVM IR to PTX"); From 38e8370345dd126aeb9044baa5245b828e6b857f Mon Sep 17 00:00:00 2001 From: Ioannis Magkanaris Date: Thu, 31 Mar 2022 16:07:03 +0200 Subject: [PATCH 267/331] Mention CMake 3.17 in INSTALL.md --- INSTALL.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/INSTALL.md b/INSTALL.md index 4f61866e82..feac241f50 100644 --- a/INSTALL.md +++ b/INSTALL.md @@ -21,7 +21,7 @@ To build the project from source, a modern C++ compiler with C++14 support is ne - flex (>=2.6) - bison (>=3.0) -- CMake (>=3.18) +- CMake (>=3.17) - Python (>=3.6) - Python packages : jinja2 (>=2.10), pyyaml (>=3.13), pytest (>=4.0.0), sympy (>=1.3), textwrap From 8c46edf8981a86d57476e63ec17370caee192a38 Mon Sep 17 00:00:00 2001 From: Ioannis Magkanaris Date: Thu, 31 Mar 2022 16:10:18 +0200 Subject: [PATCH 268/331] Make clang-format happy --- src/codegen/llvm/llvm_utils.cpp | 23 ++++--- src/codegen/llvm/llvm_utils.hpp | 3 +- src/main.cpp | 7 +- test/benchmark/cuda_driver.cpp | 114 +++++++++++++++++--------------- test/benchmark/cuda_driver.hpp | 3 +- 5 files changed, 83 insertions(+), 67 deletions(-) diff --git a/src/codegen/llvm/llvm_utils.cpp b/src/codegen/llvm/llvm_utils.cpp index 72b0f6fd19..144103661d 100644 --- a/src/codegen/llvm/llvm_utils.cpp +++ b/src/codegen/llvm/llvm_utils.cpp @@ -74,21 +74,24 @@ void initialise_nvptx_passes() { initialise_optimisation_passes(); } -std::unique_ptr create_CUDA_target_machine(const codegen::Platform& platform, llvm::Module& module) { +std::unique_ptr create_CUDA_target_machine(const codegen::Platform& platform, + llvm::Module& module) { // CUDA target machine we generating code for. std::string platform_name = platform.get_name(); // Target and layout information. - static const std::map triple_str = { - {"nvptx", "nvptx-nvidia-cuda"}, - {"nvptx64", "nvptx64-nvidia-cuda"}}; + static const std::map triple_str = {{"nvptx", "nvptx-nvidia-cuda"}, + {"nvptx64", + "nvptx64-nvidia-cuda"}}; static const std::map data_layout_str = { - {"nvptx", "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32" - "-i64:64:64-f32:32:32-f64:64:64-v16:16:16-v32:32:32" - "-v64:64:64-v128:128:128-n16:32:64"}, - {"nvptx64", "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32" - "-i64:64:64-f32:32:32-f64:64:64-v16:16:16-v32:32:32" - "-v64:64:64-v128:128:128-n16:32:64"}}; + {"nvptx", + "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32" + "-i64:64:64-f32:32:32-f64:64:64-v16:16:16-v32:32:32" + "-v64:64:64-v128:128:128-n16:32:64"}, + {"nvptx64", + "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32" + "-i64:64:64-f32:32:32-f64:64:64-v16:16:16-v32:32:32" + "-v64:64:64-v128:128:128-n16:32:64"}}; // Set data layout and target triple information for the module. 
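    // Decoding these data layout strings (per the LLVM LangRef): "e" marks a
    // little-endian target, "p:64:64:64" gives the pointer size and its
    // ABI/preferred alignment in bits, "iN:a:p" / "fN:a:p" / "vN:a:p" give
    // ABI:preferred alignments for N-bit integer/float/vector types, and
    // "n16:32:64" lists the native integer widths of the target.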
auto triple = triple_str.at(platform_name); diff --git a/src/codegen/llvm/llvm_utils.hpp b/src/codegen/llvm/llvm_utils.hpp index d73c3dea21..f498ad2dfa 100644 --- a/src/codegen/llvm/llvm_utils.hpp +++ b/src/codegen/llvm/llvm_utils.hpp @@ -22,7 +22,8 @@ void initialise_optimisation_passes(); void initialise_nvptx_passes(); //// Initializes a CUDA target machine -std::unique_ptr create_CUDA_target_machine(const codegen::Platform& platform, llvm::Module& module); +std::unique_ptr create_CUDA_target_machine(const codegen::Platform& platform, + llvm::Module& module); /// Generate PTX code given a CUDA target machine and the module std::string get_module_ptx(llvm::TargetMachine& tm, llvm::Module& module); diff --git a/src/main.cpp b/src/main.cpp index 4975724eed..93a7ea991f 100644 --- a/src/main.cpp +++ b/src/main.cpp @@ -718,7 +718,12 @@ int main(int argc, const char* argv[]) { // Create platform abstraction. PlatformID pid = llvm_gpu_name == "default" ? PlatformID::CPU : PlatformID::GPU; const std::string name = llvm_gpu_name == "default" ? llvm_cpu_name : llvm_gpu_name; - Platform platform(pid, name, llvm_gpu_target_architecture, llvm_math_library, llvm_float_type, llvm_vector_width); + Platform platform(pid, + name, + llvm_gpu_target_architecture, + llvm_math_library, + llvm_float_type, + llvm_vector_width); logger->info("Running LLVM backend code generator"); CodegenLLVMVisitor visitor(modfile, diff --git a/test/benchmark/cuda_driver.cpp b/test/benchmark/cuda_driver.cpp index 9c3725bcb5..b5b3ee2760 100644 --- a/test/benchmark/cuda_driver.cpp +++ b/test/benchmark/cuda_driver.cpp @@ -45,11 +45,14 @@ void CUDADriver::link_libraries(llvm::Module& module, BenchmarkInfo* benchmark_i } // Load libdevice module to the LLVM Module auto libdevice_file_memory_buffer = llvm::MemoryBuffer::getFile(lib_path); - llvm::Expected> libdevice_expected_module = parseBitcodeFile(libdevice_file_memory_buffer->get()->getMemBufferRef(), module.getContext()); + llvm::Expected> libdevice_expected_module = + parseBitcodeFile(libdevice_file_memory_buffer->get()->getMemBufferRef(), + module.getContext()); if (std::error_code error = errorToErrorCode(libdevice_expected_module.takeError())) { throw std::runtime_error("Error reading bitcode: {}"_format(error.message())); } - linker.linkInModule(std::move(libdevice_expected_module.get()), llvm::Linker::LinkOnlyNeeded); + linker.linkInModule(std::move(libdevice_expected_module.get()), + llvm::Linker::LinkOnlyNeeded); } } @@ -59,45 +62,46 @@ void print_string_to_file(const std::string& ptx_compiled_module, const std::str ptx_file.close(); } -CUjit_target get_compute_architecture(const int compute_version_major, const int compute_version_minor) { - auto compute_architecture = compute_version_major*10 + compute_version_minor; - switch(compute_architecture) { - case 20: - return CU_TARGET_COMPUTE_20; - case 21: - return CU_TARGET_COMPUTE_21; - case 30: - return CU_TARGET_COMPUTE_30; - case 32: - return CU_TARGET_COMPUTE_32; - case 35: - return CU_TARGET_COMPUTE_35; - case 37: - return CU_TARGET_COMPUTE_37; - case 50: - return CU_TARGET_COMPUTE_50; - case 52: - return CU_TARGET_COMPUTE_52; - case 53: - return CU_TARGET_COMPUTE_53; - case 60: - return CU_TARGET_COMPUTE_60; - case 61: - return CU_TARGET_COMPUTE_61; - case 62: - return CU_TARGET_COMPUTE_62; - case 70: - return CU_TARGET_COMPUTE_70; - case 72: - return CU_TARGET_COMPUTE_72; - case 75: - return CU_TARGET_COMPUTE_75; - case 80: - return CU_TARGET_COMPUTE_80; - case 86: - return CU_TARGET_COMPUTE_86; - default: - throw 
std::runtime_error("Unsupported compute architecture"); +CUjit_target get_compute_architecture(const int compute_version_major, + const int compute_version_minor) { + auto compute_architecture = compute_version_major * 10 + compute_version_minor; + switch (compute_architecture) { + case 20: + return CU_TARGET_COMPUTE_20; + case 21: + return CU_TARGET_COMPUTE_21; + case 30: + return CU_TARGET_COMPUTE_30; + case 32: + return CU_TARGET_COMPUTE_32; + case 35: + return CU_TARGET_COMPUTE_35; + case 37: + return CU_TARGET_COMPUTE_37; + case 50: + return CU_TARGET_COMPUTE_50; + case 52: + return CU_TARGET_COMPUTE_52; + case 53: + return CU_TARGET_COMPUTE_53; + case 60: + return CU_TARGET_COMPUTE_60; + case 61: + return CU_TARGET_COMPUTE_61; + case 62: + return CU_TARGET_COMPUTE_62; + case 70: + return CU_TARGET_COMPUTE_70; + case 72: + return CU_TARGET_COMPUTE_72; + case 75: + return CU_TARGET_COMPUTE_75; + case 80: + return CU_TARGET_COMPUTE_80; + case 86: + return CU_TARGET_COMPUTE_86; + default: + throw std::runtime_error("Unsupported compute architecture"); } } @@ -136,7 +140,7 @@ void CUDADriver::init(const codegen::Platform& platform, BenchmarkInfo* benchmar utils::optimise_module_for_nvptx(platform, *module, opt_level_codegen, ptx_compiled_module); if (benchmark_info) { print_string_to_file(ptx_compiled_module, - benchmark_info->output_dir + "/" + benchmark_info->filename + ".ptx"); + benchmark_info->output_dir + "/" + benchmark_info->filename + ".ptx"); } // Create driver context @@ -145,34 +149,36 @@ void CUDADriver::init(const codegen::Platform& platform, BenchmarkInfo* benchmar // Create module for object logger->info("Loading PTX to CUDA module"); const unsigned int jitNumOptions = 5; - CUjit_option *jitOptions = new CUjit_option[jitNumOptions]; - void **jitOptVals = new void*[jitNumOptions]; + CUjit_option* jitOptions = new CUjit_option[jitNumOptions]; + void** jitOptVals = new void*[jitNumOptions]; - // set up size of compilation log buffer + // set up size of compilation log buffer jitOptions[0] = CU_JIT_INFO_LOG_BUFFER_SIZE_BYTES; - size_t jitLogBufferSize = 1024*1024; - jitOptVals[0] = (void*)jitLogBufferSize; + size_t jitLogBufferSize = 1024 * 1024; + jitOptVals[0] = (void*) jitLogBufferSize; // set up pointer to the compilation log buffer jitOptions[1] = CU_JIT_INFO_LOG_BUFFER; - char *jitLogBuffer = new char[jitLogBufferSize]; + char* jitLogBuffer = new char[jitLogBufferSize]; jitOptVals[1] = jitLogBuffer; // set up size of compilation error log buffer jitOptions[2] = CU_JIT_ERROR_LOG_BUFFER_SIZE_BYTES; - size_t jitErrorLogBufferSize = 1024*1024; - jitOptVals[2] = (void*)jitErrorLogBufferSize; + size_t jitErrorLogBufferSize = 1024 * 1024; + jitOptVals[2] = (void*) jitErrorLogBufferSize; // set up pointer to the compilation error log buffer jitOptions[3] = CU_JIT_ERROR_LOG_BUFFER; - char *jitErrorLogBuffer = new char[jitErrorLogBufferSize]; + char* jitErrorLogBuffer = new char[jitErrorLogBufferSize]; jitOptVals[3] = jitErrorLogBuffer; jitOptions[4] = CU_JIT_TARGET; - auto target_architecture = get_compute_architecture(device_info.compute_version_major, device_info.compute_version_minor); - jitOptVals[4] = (void*)target_architecture; + auto target_architecture = get_compute_architecture(device_info.compute_version_major, + device_info.compute_version_minor); + jitOptVals[4] = (void*) target_architecture; - auto cuda_jit_ret = cuModuleLoadDataEx(&cudaModule, ptx_compiled_module.c_str(), jitNumOptions, jitOptions, jitOptVals); + auto cuda_jit_ret = cuModuleLoadDataEx( + 
&cudaModule, ptx_compiled_module.c_str(), jitNumOptions, jitOptions, jitOptVals); if (!std::string(jitLogBuffer).empty()) { logger->info("CUDA JIT INFO LOG: {}"_format(std::string(jitLogBuffer))); } diff --git a/test/benchmark/cuda_driver.hpp b/test/benchmark/cuda_driver.hpp index c44a276d2b..07323526e2 100644 --- a/test/benchmark/cuda_driver.hpp +++ b/test/benchmark/cuda_driver.hpp @@ -94,7 +94,8 @@ class CUDADriver { nullptr)); auto asyncErr = cudaDeviceSynchronize(); if (asyncErr != cudaSuccess) { - throw std::runtime_error("CUDA Execution Error: {}\n"_format(cudaGetErrorString(asyncErr))); + throw std::runtime_error( + "CUDA Execution Error: {}\n"_format(cudaGetErrorString(asyncErr))); } } From bceeba14f949c0f200da53ff2d038dcb77faf882 Mon Sep 17 00:00:00 2001 From: Ioannis Magkanaris Date: Fri, 1 Apr 2022 12:22:44 +0200 Subject: [PATCH 269/331] Free not needed char* and print ll file generated for benchmark --- test/benchmark/cuda_driver.cpp | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/test/benchmark/cuda_driver.cpp b/test/benchmark/cuda_driver.cpp index b5b3ee2760..878688a2cd 100644 --- a/test/benchmark/cuda_driver.cpp +++ b/test/benchmark/cuda_driver.cpp @@ -138,6 +138,7 @@ void CUDADriver::init(const codegen::Platform& platform, BenchmarkInfo* benchmar // Optimize code for nvptx including the wrapper functions and generate PTX const auto opt_level_codegen = benchmark_info ? benchmark_info->opt_level_codegen : 0; utils::optimise_module_for_nvptx(platform, *module, opt_level_codegen, ptx_compiled_module); + utils::save_ir_to_ll_file(*module, benchmark_info->output_dir + "/" + benchmark_info->filename + "_benchmark"); if (benchmark_info) { print_string_to_file(ptx_compiled_module, benchmark_info->output_dir + "/" + benchmark_info->filename + ".ptx"); @@ -185,6 +186,10 @@ void CUDADriver::init(const codegen::Platform& platform, BenchmarkInfo* benchmar if (!std::string(jitErrorLogBuffer).empty()) { logger->info("CUDA JIT ERROR LOG: {}"_format(std::string(jitErrorLogBuffer))); } + free(jitOptions); + free(jitOptVals); + free(jitLogBuffer); + free(jitErrorLogBuffer); checkCudaErrors(cuda_jit_ret); } From 854d2d57c65c4f378e847cc32f3010f9f767aaad Mon Sep 17 00:00:00 2001 From: Ioannis Magkanaris Date: Fri, 1 Apr 2022 12:23:24 +0200 Subject: [PATCH 270/331] Clear nvvm.annotations and update them for the wrapper function so that the original kernel is a device function --- src/codegen/llvm/codegen_llvm_visitor.cpp | 13 ++++++++++--- src/codegen/llvm/codegen_llvm_visitor.hpp | 7 +++++-- 2 files changed, 15 insertions(+), 5 deletions(-) diff --git a/src/codegen/llvm/codegen_llvm_visitor.cpp b/src/codegen/llvm/codegen_llvm_visitor.cpp index def6cc2424..c6265714ff 100644 --- a/src/codegen/llvm/codegen_llvm_visitor.cpp +++ b/src/codegen/llvm/codegen_llvm_visitor.cpp @@ -61,10 +61,10 @@ static bool can_vectorize(const ast::CodegenForStatement& statement, symtab::Sym return unsupported.empty() && supported.size() <= 1; } -void CodegenLLVMVisitor::annotate_kernel_with_nvvm(llvm::Function* kernel) { +void CodegenLLVMVisitor::annotate_kernel_with_nvvm(llvm::Function* kernel, const std::string& annotation = "kernel") { llvm::Metadata* metadata[] = { llvm::ValueAsMetadata::get(kernel), - llvm::MDString::get(*context, "kernel"), + llvm::MDString::get(*context, annotation), llvm::ValueAsMetadata::get( llvm::ConstantInt::get(llvm::Type::getInt32Ty(*context), 1))}; llvm::MDNode* node = llvm::MDNode::get(*context, metadata); @@ -136,6 +136,13 @@ void 
CodegenLLVMVisitor::add_vectorizable_functions_from_vec_lib(llvm::TargetLib } #endif +void CodegenLLVMVisitor::annotate_wrapper_with_nvvm(llvm::Function* kernel, llvm::Function* kernel_wrapper, const std::string& annotation = "kernel") { + auto module_named_metadata = module->getNamedMetadata("nvvm.annotations"); + module->eraseNamedMetadata(module_named_metadata); + annotate_kernel_with_nvvm(kernel, "device"); + annotate_kernel_with_nvvm(kernel_wrapper, annotation); +} + llvm::Value* CodegenLLVMVisitor::accept_and_get(const std::shared_ptr& node) { node->accept(*this); return ir_builder.pop_last_value(); @@ -507,7 +514,7 @@ void CodegenLLVMVisitor::wrap_kernel_functions() { if (platform.is_gpu()) { // return void ir_builder.create_return(); - annotate_kernel_with_nvvm(wrapper_func); + annotate_wrapper_with_nvvm(kernel, wrapper_func, "kernel"); } else { // Create a 0 return value and a return instruction. ir_builder.create_i32_constant(0); diff --git a/src/codegen/llvm/codegen_llvm_visitor.hpp b/src/codegen/llvm/codegen_llvm_visitor.hpp index 9d005c71c4..14999cf923 100644 --- a/src/codegen/llvm/codegen_llvm_visitor.hpp +++ b/src/codegen/llvm/codegen_llvm_visitor.hpp @@ -298,8 +298,11 @@ class CodegenLLVMVisitor: public CodegenCVisitor { void wrap_kernel_functions(); private: - // Annotates kernel function with NVVM metadata. - void annotate_kernel_with_nvvm(llvm::Function* kernel); + /// Annotates kernel function with NVVM metadata. + void annotate_kernel_with_nvvm(llvm::Function* kernel, const std::string& annotation); + + /// Annotates kernel wrapper function with NVVM metadata and sets the kernel NVVM annotation to device function + void annotate_wrapper_with_nvvm(llvm::Function* kernel, llvm::Function* kernel_wrapper, const std::string& annotation); #if LLVM_VERSION_MAJOR >= 13 /// Populates target library info with the vector library definitions. From 9ee3d92d10611bcd2b6e0a3d6d7e2058a7ad5b18 Mon Sep 17 00:00:00 2001 From: Ioannis Magkanaris Date: Fri, 1 Apr 2022 13:00:47 +0200 Subject: [PATCH 271/331] Handle wrapper and kernel nvvm annotations properly --- src/codegen/llvm/codegen_llvm_visitor.cpp | 29 +++++++++++++++++++---- src/codegen/llvm/codegen_llvm_visitor.hpp | 4 ++-- 2 files changed, 26 insertions(+), 7 deletions(-) diff --git a/src/codegen/llvm/codegen_llvm_visitor.cpp b/src/codegen/llvm/codegen_llvm_visitor.cpp index c6265714ff..d375c5cb2a 100644 --- a/src/codegen/llvm/codegen_llvm_visitor.cpp +++ b/src/codegen/llvm/codegen_llvm_visitor.cpp @@ -31,6 +31,10 @@ namespace codegen { /* Helper routines */ /****************************************************************************************/ +static std::string get_wrapper_name(const std::string& kernel_name) { + return "__" + kernel_name + "_wrapper"; +} + /// A utility to check for supported Statement AST nodes. 
static bool is_supported_statement(const ast::Statement& statement) { return statement.is_codegen_atomic_statement() || statement.is_codegen_for_statement() || @@ -136,11 +140,23 @@ void CodegenLLVMVisitor::add_vectorizable_functions_from_vec_lib(llvm::TargetLib } #endif -void CodegenLLVMVisitor::annotate_wrapper_with_nvvm(llvm::Function* kernel, llvm::Function* kernel_wrapper, const std::string& annotation = "kernel") { +void CodegenLLVMVisitor::annotate_wrapper_kernels_with_nvvm() { + // First clear all the nvvm annotations from the module auto module_named_metadata = module->getNamedMetadata("nvvm.annotations"); module->eraseNamedMetadata(module_named_metadata); - annotate_kernel_with_nvvm(kernel, "device"); - annotate_kernel_with_nvvm(kernel_wrapper, annotation); + + // Then each kernel should be annotated as "device" function and wrappers should be annotated as "kernel" functions + std::vector kernel_names; + find_kernel_names(kernel_names); + + for (const auto& kernel_name: kernel_names) { + // Get the kernel function. + auto kernel = module->getFunction(kernel_name); + // Get the kernel wrapper function. + auto kernel_wrapper = module->getFunction(get_wrapper_name(kernel_name)); + annotate_kernel_with_nvvm(kernel, "device"); + annotate_kernel_with_nvvm(kernel_wrapper, "kernel"); + } } llvm::Value* CodegenLLVMVisitor::accept_and_get(const std::shared_ptr& node) { @@ -491,7 +507,7 @@ void CodegenLLVMVisitor::wrap_kernel_functions() { llvm::Function* wrapper_func = llvm::Function::Create( llvm::FunctionType::get(return_type, {void_ptr_type}, /*isVarArg=*/false), llvm::Function::ExternalLinkage, - "__" + kernel_name + "_wrapper", + get_wrapper_name(kernel_name), *module); // Optionally, add debug information for the wrapper function. @@ -514,7 +530,6 @@ void CodegenLLVMVisitor::wrap_kernel_functions() { if (platform.is_gpu()) { // return void ir_builder.create_return(); - annotate_wrapper_with_nvvm(kernel, wrapper_func, "kernel"); } else { // Create a 0 return value and a return instruction. ir_builder.create_i32_constant(0); @@ -524,6 +539,10 @@ void CodegenLLVMVisitor::wrap_kernel_functions() { } ir_builder.clear_function(); } + // for GPU we need to first clear all the annotations and then reapply them + if (platform.is_gpu()) { + annotate_wrapper_kernels_with_nvvm(); + } } diff --git a/src/codegen/llvm/codegen_llvm_visitor.hpp b/src/codegen/llvm/codegen_llvm_visitor.hpp index 14999cf923..81ab272e89 100644 --- a/src/codegen/llvm/codegen_llvm_visitor.hpp +++ b/src/codegen/llvm/codegen_llvm_visitor.hpp @@ -301,8 +301,8 @@ class CodegenLLVMVisitor: public CodegenCVisitor { /// Annotates kernel function with NVVM metadata. void annotate_kernel_with_nvvm(llvm::Function* kernel, const std::string& annotation); - /// Annotates kernel wrapper function with NVVM metadata and sets the kernel NVVM annotation to device function - void annotate_wrapper_with_nvvm(llvm::Function* kernel, llvm::Function* kernel_wrapper, const std::string& annotation); + /// Handles NVVM function annotations when we create the wrapper functions. All original kernels should be "device" functions and wrappers "kernel" functions + void annotate_wrapper_kernels_with_nvvm(); #if LLVM_VERSION_MAJOR >= 13 /// Populates target library info with the vector library definitions. 
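The net effect of the two patches above, for a module whose state kernel is
named nrn_state_test (the name is hypothetical, matching the test.mod added
later in this series): the wrapper carries the "kernel" marker while the
original kernel is demoted to a "device" function. The emitted metadata then
looks roughly like this, with pointer types sketched since they depend on the
generated signatures:

    !nvvm.annotations = !{!0, !1}
    !0 = !{void (i8*)* @__nrn_state_test_wrapper, !"kernel", i32 1}
    !1 = !{void (i8*)* @nrn_state_test, !"device", i32 1}

Erasing the whole nvvm.annotations list and re-adding one node per kernel,
instead of editing nodes in place, keeps the metadata consistent no matter how
many kernels get wrapped.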
From 8cc39cbe59d389704d0e33503c3bbbbd99082735 Mon Sep 17 00:00:00 2001 From: Ioannis Magkanaris Date: Fri, 1 Apr 2022 13:08:43 +0200 Subject: [PATCH 272/331] Update INSTALL.md --- INSTALL.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/INSTALL.md b/INSTALL.md index feac241f50..7ddb21b15c 100644 --- a/INSTALL.md +++ b/INSTALL.md @@ -148,7 +148,7 @@ set `NMODL_PYLIB` before running cmake! -DNMODL_ENABLE_LLVM_CUDA=ON ``` -To find the need `CUDA` libraries (`cudart`, `nvrtc` and `nvvm`) it's needed to have CUDA Toolkit installed on your system. +To find the need `CUDA` libraries (`cudart` and `nvrtc`) it's needed to have CUDA Toolkit installed on your system. This can be done by installing the CUDA Toolkit from the [CUDA Toolkit website](https://developer.nvidia.com/cuda-downloads) or by installing the `CUDA` spack package and loading the corresponding module. Then given a supported MOD file you can execute the benchmark on GPU in you supported NVIDIA GPU by running the following command: From 86b8917b6af7694eb3fdf0fe03d2bb528f156d2c Mon Sep 17 00:00:00 2001 From: Ioannis Magkanaris Date: Fri, 1 Apr 2022 16:36:18 +0200 Subject: [PATCH 273/331] Testing benchmarks in gitlab CI --- .gitlab-ci.yml | 40 ++++++++++++++++++++++++++++++++++- test/integration/mod/test.mod | 16 ++++++++++++++ 2 files changed, 55 insertions(+), 1 deletion(-) create mode 100644 test/integration/mod/test.mod diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index a17a8dea9e..977b31d7e1 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -26,7 +26,6 @@ trigger cvf: variables: SPACK_PACKAGE: nmodl SPACK_PACKAGE_SPEC: ~legacy-unit+python - SPACK_EXTRA_MODULES: llvm SPACK_INSTALL_EXTRA_FLAGS: -v spack_setup: @@ -68,3 +67,42 @@ test:gcc: - .ctest - .nmodl_tests needs: ["build:gcc"] + +.benchmark_config: + variables: + bb5_ntasks: 2 # so we block 16 cores + bb5_cpus_per_task: 8 # ninja -j {this} + bb5_memory: 76G # ~16*384/80 + bb5_constraint: "volta&clx" + +.spack_nmodl_cuda: + variables: + SPACK_PACKAGE: nmodl + SPACK_PACKAGE_SPEC: ~legacy-unit~python+llvm+cuda + SPACK_INSTALL_EXTRA_FLAGS: -v + SPACK_PACKAGE_DEPENDENCIES: ^cuda@11.4.2 # same as CUDA driver + +build_cuda:gcc: + extends: + - .spack_build + - .spack_nmodl_cuda + variables: + SPACK_PACKAGE_COMPILER: gcc + +test_benchmark:cpu: + extends: + - .benchmark_config + script: + - module load unstable nmodl/develop + - module unload cuda/11.6.0 + - nmodl test/integration/mod/test.mod llvm --ir --opt-level-ir 3 benchmark --run --opt-level-codegen 3 --instance-size 10000000 --repeat 2 + needs: ["build_cuda:gcc"] + +test_benchmark:gpu: + extends: + - .benchmark_config + script: + - module load unstable nmodl/develop + - module unload cuda/11.6.0 + - nmodl test/integration/mod/test.mod llvm --no-debug --ir --opt-level-ir 3 gpu --target-arch "sm_70" --name "nvptx64" --math-library libdevice benchmark --run --libs "${CUDA_HOME}/nvvm/libdevice/libdevice.10.bc" --opt-level-codegen 3 --instance-size 10000000 --repeat 2 + needs: ["build_cuda:gcc"] diff --git a/test/integration/mod/test.mod b/test/integration/mod/test.mod new file mode 100644 index 0000000000..1c5292ba6e --- /dev/null +++ b/test/integration/mod/test.mod @@ -0,0 +1,16 @@ +NEURON { + SUFFIX test + RANGE x, y +} + +ASSIGNED { x y } + +STATE { m } + +BREAKPOINT { + SOLVE states METHOD cnexp +} + +DERIVATIVE states { + m = y + 2 +} From 3b0781b72edbf7b3233b47b6b6e72441a06b1047 Mon Sep 17 00:00:00 2001 From: Ioannis Magkanaris Date: Mon, 4 Apr 2022 12:26:21 +0200 Subject: [PATCH 274/331] Use deployed CUDA 
11.6.0 and fix allocation --- .gitlab-ci.yml | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index 977b31d7e1..bbdac57a14 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -25,7 +25,7 @@ trigger cvf: .spack_nmodl: variables: SPACK_PACKAGE: nmodl - SPACK_PACKAGE_SPEC: ~legacy-unit+python + SPACK_PACKAGE_SPEC: ~legacy-unit+python+llvm SPACK_INSTALL_EXTRA_FLAGS: -v spack_setup: @@ -73,14 +73,13 @@ test:gcc: bb5_ntasks: 2 # so we block 16 cores bb5_cpus_per_task: 8 # ninja -j {this} bb5_memory: 76G # ~16*384/80 - bb5_constraint: "volta&clx" + bb5_constraint: clx&volta .spack_nmodl_cuda: variables: SPACK_PACKAGE: nmodl SPACK_PACKAGE_SPEC: ~legacy-unit~python+llvm+cuda SPACK_INSTALL_EXTRA_FLAGS: -v - SPACK_PACKAGE_DEPENDENCIES: ^cuda@11.4.2 # same as CUDA driver build_cuda:gcc: extends: @@ -94,7 +93,6 @@ test_benchmark:cpu: - .benchmark_config script: - module load unstable nmodl/develop - - module unload cuda/11.6.0 - nmodl test/integration/mod/test.mod llvm --ir --opt-level-ir 3 benchmark --run --opt-level-codegen 3 --instance-size 10000000 --repeat 2 needs: ["build_cuda:gcc"] @@ -103,6 +101,5 @@ test_benchmark:gpu: - .benchmark_config script: - module load unstable nmodl/develop - - module unload cuda/11.6.0 - nmodl test/integration/mod/test.mod llvm --no-debug --ir --opt-level-ir 3 gpu --target-arch "sm_70" --name "nvptx64" --math-library libdevice benchmark --run --libs "${CUDA_HOME}/nvvm/libdevice/libdevice.10.bc" --opt-level-codegen 3 --instance-size 10000000 --repeat 2 needs: ["build_cuda:gcc"] From 4af4e27d3a85e283cdd139073f9d1355e218dd9a Mon Sep 17 00:00:00 2001 From: Ioannis Magkanaris Date: Mon, 4 Apr 2022 12:32:49 +0200 Subject: [PATCH 275/331] Small fix in spack variant --- .gitlab-ci.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index bbdac57a14..785064d88e 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -78,7 +78,7 @@ test:gcc: .spack_nmodl_cuda: variables: SPACK_PACKAGE: nmodl - SPACK_PACKAGE_SPEC: ~legacy-unit~python+llvm+cuda + SPACK_PACKAGE_SPEC: ~legacy-unit~python+llvm+llvm_cuda SPACK_INSTALL_EXTRA_FLAGS: -v build_cuda:gcc: From 919742916bf0ddd6c9010fcde777e079075d9ab6 Mon Sep 17 00:00:00 2001 From: Ioannis Magkanaris Date: Mon, 4 Apr 2022 13:19:29 +0200 Subject: [PATCH 276/331] Fix allocation --- .gitlab-ci.yml | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index 785064d88e..7e88a0c21b 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -70,9 +70,8 @@ test:gcc: .benchmark_config: variables: - bb5_ntasks: 2 # so we block 16 cores - bb5_cpus_per_task: 8 # ninja -j {this} - bb5_memory: 76G # ~16*384/80 + bb5_ntasks: 1 + bb5_memory: 16G bb5_constraint: clx&volta .spack_nmodl_cuda: From 7a1a599832f24e6959a87d30022c07f168eb5410 Mon Sep 17 00:00:00 2001 From: Ioannis Magkanaris Date: Mon, 4 Apr 2022 13:49:42 +0200 Subject: [PATCH 277/331] Fix bb5 constraint --- .gitlab-ci.yml | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index 7e88a0c21b..da528130b7 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -71,8 +71,10 @@ test:gcc: .benchmark_config: variables: bb5_ntasks: 1 + bb5_cpus_per_task: 1 bb5_memory: 16G - bb5_constraint: clx&volta + bb5_exclusive: full + bb5_constraint: gpu_32g # CascadeLake GPU node .spack_nmodl_cuda: variables: From dde8b4bf208514d26bad687cac5f91d7fe6566d3 Mon Sep 17 00:00:00 2001 From: Ioannis Magkanaris Date: Mon, 4 Apr 2022 
15:04:27 +0200 Subject: [PATCH 278/331] Load nmodl with spack --- .gitlab-ci.yml | 23 ++++++++++++++--------- 1 file changed, 14 insertions(+), 9 deletions(-) diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index da528130b7..6816b96838 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -76,24 +76,27 @@ test:gcc: bb5_exclusive: full bb5_constraint: gpu_32g # CascadeLake GPU node -.spack_nmodl_cuda: +.build_allocation: variables: - SPACK_PACKAGE: nmodl - SPACK_PACKAGE_SPEC: ~legacy-unit~python+llvm+llvm_cuda - SPACK_INSTALL_EXTRA_FLAGS: -v + bb5_ntasks: 2 # so we block 16 cores + bb5_cpus_per_task: 8 # ninja -j {this} + bb5_memory: 76G # ~16*384/80 build_cuda:gcc: - extends: - - .spack_build - - .spack_nmodl_cuda + extends: [.spack_build, .build_allocation] variables: + SPACK_PACKAGE: nmodl + SPACK_PACKAGE_SPEC: ~legacy-unit~python+llvm+llvm_cuda + SPACK_INSTALL_EXTRA_FLAGS: -v SPACK_PACKAGE_COMPILER: gcc test_benchmark:cpu: extends: - .benchmark_config script: - - module load unstable nmodl/develop + - module load unstable git + - . ${SPACK_ROOT}/share/spack/setup-env.sh + - spack load nmodl/${SPACK_INSTALLED_HASH} - nmodl test/integration/mod/test.mod llvm --ir --opt-level-ir 3 benchmark --run --opt-level-codegen 3 --instance-size 10000000 --repeat 2 needs: ["build_cuda:gcc"] @@ -101,6 +104,8 @@ test_benchmark:gpu: extends: - .benchmark_config script: - - module load unstable nmodl/develop + - module load unstable git + - . ${SPACK_ROOT}/share/spack/setup-env.sh + - spack load nmodl/${SPACK_INSTALLED_HASH} - nmodl test/integration/mod/test.mod llvm --no-debug --ir --opt-level-ir 3 gpu --target-arch "sm_70" --name "nvptx64" --math-library libdevice benchmark --run --libs "${CUDA_HOME}/nvvm/libdevice/libdevice.10.bc" --opt-level-codegen 3 --instance-size 10000000 --repeat 2 needs: ["build_cuda:gcc"] From e9263d8dba0b9db1f7712e873f8deb71fe77f264 Mon Sep 17 00:00:00 2001 From: Ioannis Magkanaris Date: Mon, 4 Apr 2022 15:28:03 +0200 Subject: [PATCH 279/331] Fix GPU execution configuration in the benchmark --- .gitlab-ci.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index 6816b96838..55c52be210 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -74,7 +74,7 @@ test:gcc: bb5_cpus_per_task: 1 bb5_memory: 16G bb5_exclusive: full - bb5_constraint: gpu_32g # CascadeLake GPU node + bb5_constraint: gpu_32g # CascadeLake CPU & V100 GPU node .build_allocation: variables: @@ -107,5 +107,5 @@ test_benchmark:gpu: - module load unstable git - . 
${SPACK_ROOT}/share/spack/setup-env.sh - spack load nmodl/${SPACK_INSTALLED_HASH} - - nmodl test/integration/mod/test.mod llvm --no-debug --ir --opt-level-ir 3 gpu --target-arch "sm_70" --name "nvptx64" --math-library libdevice benchmark --run --libs "${CUDA_HOME}/nvvm/libdevice/libdevice.10.bc" --opt-level-codegen 3 --instance-size 10000000 --repeat 2 + - nmodl test/integration/mod/test.mod llvm --no-debug --ir --opt-level-ir 3 gpu --target-arch "sm_70" --name "nvptx64" --math-library libdevice benchmark --run --libs "${CUDA_HOME}/nvvm/libdevice/libdevice.10.bc" --opt-level-codegen 3 --instance-size 10000000 --repeat 2 --grid-dim-x 4096 --block-dim-x 256 needs: ["build_cuda:gcc"] From de598805307cbea56420181bf99a2e1a6cab5b3b Mon Sep 17 00:00:00 2001 From: Ioannis Magkanaris Date: Mon, 4 Apr 2022 18:40:12 +0200 Subject: [PATCH 280/331] Updated script to execute the benchmarks --- test/benchmark/nmodl-llvm-time.sh | 47 +++++++++++++------------------ 1 file changed, 20 insertions(+), 27 deletions(-) diff --git a/test/benchmark/nmodl-llvm-time.sh b/test/benchmark/nmodl-llvm-time.sh index 2e368a2d38..15168f1169 100755 --- a/test/benchmark/nmodl-llvm-time.sh +++ b/test/benchmark/nmodl-llvm-time.sh @@ -12,8 +12,8 @@ set -e module purge unset MODULEPATH -export MODULEPATH="/gpfs/bbp.cscs.ch/ssd/apps/hpc/jenkins/modules/all" -module load unstable +export MODULEPATH=/gpfs/bbp.cscs.ch/ssd/apps/bsd/modules/_meta +module load unstable gcc cuda # default params inst_size=100000000 @@ -95,17 +95,17 @@ while [[ "$1" != "" ]]; do done #intel paths -intel_library_dir=$(module show intel 2>&1 | grep " LD_LIBRARY_PATH " | awk -F' ' '{print $3}' | head -n 1) -svml_lib=$intel_library_dir/intel64_lin/libsvml.so +intel_library_dir=$(module show intel-oneapi-compilers 2>&1 | grep " LD_LIBRARY_PATH " | grep "intel64_lin" | awk -F' ' '{print $3}' | head -n 1) +svml_lib=$intel_library_dir/libsvml.so intel_exe=$(module show intel 2>&1 | grep " PATH " | awk -F' ' '{print $3}' | head -n 1)/icpc #sleef library sleef_lib=/gpfs/bbp.cscs.ch/apps/hpc/llvm-install/0621/sleef-3.5.1/lib64/libsleefgnuabi.so #llvm path -llvm_path="/gpfs/bbp.cscs.ch/apps/hpc/llvm-install/0621" -clang_exe=${llvm_path}/bin/clang++ -llc_exe=${llvm_path}/bin/llc +llvm_path=$(module show llvm/13.0.0 2>&1 | grep " PATH " | awk -F' ' '{print $3}' | head -n 1) +clang_exe=${llvm_path}/clang++ +llc_exe=${llvm_path}/llc #gcc path gcc_exe=$(module show gcc 2>&1 | grep " PATH " | awk -F' ' '{print $3}' | head -n 1)/g++ @@ -244,8 +244,8 @@ for kernel_target in ${KERNEL_TARGETS}; do ${debug} cd .. 
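# Cheat sheet for the reworked nmodl CLI used below; flag names are taken from
# the invocations in this patch series, placeholder values are illustrative:
#   nmodl <file.mod> llvm --ir [--no-debug] [--fmf <flags>] --opt-level-ir <N> \
#         cpu --name <arch> --vector-width <W> --math-library <SVML|SLEEF> \
#     (or: gpu --name nvptx64 --target-arch sm_70 --math-library libdevice) \
#         benchmark --run --instance-size <N> --repeat <R> --opt-level-codegen <N> [--libs <paths>]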
# add --fmf nnan contract afn here to generate .ll file similar to the fast-math options from external compilers - nmodl_common_args="${kernels_path}/${kernel_target}.mod benchmark --run --instance-size ${kernel_inst_size} --repeat ${num_exp} --opt-level-codegen 3 --cpu ${nmodl_architecture} --libs ${svml_lib} ${sleef_lib} --external" - nmodl_llvm_args="llvm --ir --vector-width ${vec_width} --veclib SVML --opt-level-ir 3 --fmf nnan contract afn" + nmodl_common_args="${kernels_path}/${kernel_target}.mod benchmark --run --instance-size ${kernel_inst_size} --repeat ${num_exp} --opt-level-codegen 3 --libs ${svml_lib} ${sleef_lib} --external" + nmodl_llvm_args="llvm --ir --opt-level-ir 3 --fmf nnan contract afn cpu --name ${nmodl_architecture} --vector-width ${vec_width} --math-library SVML" benchmark_ext_desc=${kernel_target}_${compiler}_${nmodl_architecture}_v${vec_width}_${flags//[[:blank:]]/} benchmark_description+=("${benchmark_ext_desc}") @@ -256,9 +256,10 @@ for kernel_target in ${KERNEL_TARGETS}; do done if [ "$compiler" == "clang" ]; then + module load llvm/13.0.0 for math_lib in SVML SLEEF; do - nmodl_llvm_args="llvm --ir --vector-width ${vec_width} --veclib ${math_lib} --opt-level-ir 3 --fmf nnan contract afn" + nmodl_llvm_args="llvm --ir --opt-level-ir 3 --fmf nnan contract afn cpu --vector-width ${vec_width} --math-library ${math_lib}" rel_ext_path_llvm=${kernel_target}_nmodl_${spec}_llvm_${math_lib} rel_ext_path_llvm=${rel_ext_path_llvm//=/_} rel_ext_path_llvm=${rel_ext_path_llvm//-/_} @@ -275,6 +276,7 @@ for kernel_target in ${KERNEL_TARGETS}; do benchmark_variance+=($(grep "Compute time variance" ${output_dir}/${benchmark_ext_jit_desc}.log | awk '{print $NF}')) done + module unload llvm/13.0.0 fi done @@ -288,23 +290,14 @@ for kernel_target in ${KERNEL_TARGETS}; do fast_math_flag="" fast_math_opt="nonfastmath" fi - for assume_may_alias in true false; do - if $assume_may_alias; then - assume_may_alias_flag="--assume-may-alias" - assume_may_alias_opt="alias" - else - assume_may_alias_flag="" - assume_may_alias_opt="noalias" - fi - echo "| | | options: ${fast_math_flag} ${assume_may_alias_flag}" - nmodl_args="${kernels_path}/${kernel_target}.mod llvm --ir ${fast_math_flag} ${assume_may_alias_flag} --vector-width ${vec_width} --veclib SVML --opt-level-ir 3 benchmark --run --instance-size ${kernel_inst_size} --repeat ${num_exp} --opt-level-codegen 3 --cpu ${nmodl_architecture} --libs ${svml_lib}" - benchmark_nmodl_desc=${kernel_target}_nmodl-jit_${nmodl_architecture}_v${vec_width}_${fast_math_opt}_${assume_may_alias_opt} - benchmark_description+=("${benchmark_nmodl_desc}") - # runs only kernel generated by LLVM IR - ${debug} eval "${nmodl_exe} ${nmodl_args} 2>&1 | tee ${output_dir}/${benchmark_nmodl_desc}.log" - benchmark_time+=($(grep "Average compute time" ${output_dir}/${benchmark_nmodl_desc}.log | awk '{print $NF}')) - benchmark_variance+=($(grep "Compute time variance" ${output_dir}/${benchmark_nmodl_desc}.log | awk '{print $NF}')) - done + echo "| | | options: ${fast_math_flag}" + nmodl_args="${kernels_path}/${kernel_target}.mod llvm --ir ${fast_math_flag} --opt-level-ir 3 cpu --name ${nmodl_architecture} --vector-width ${vec_width} --math-library SVML benchmark --run --instance-size ${kernel_inst_size} --repeat ${num_exp} --opt-level-codegen 3 --libs ${svml_lib}" + benchmark_nmodl_desc=${kernel_target}_nmodl-jit_${nmodl_architecture}_v${vec_width}_${fast_math_opt} + benchmark_description+=("${benchmark_nmodl_desc}") + # runs only kernel generated by LLVM IR + ${debug} eval 
"${nmodl_exe} ${nmodl_args} 2>&1 | tee ${output_dir}/${benchmark_nmodl_desc}.log" + benchmark_time+=($(grep "Average compute time" ${output_dir}/${benchmark_nmodl_desc}.log | awk '{print $NF}')) + benchmark_variance+=($(grep "Compute time variance" ${output_dir}/${benchmark_nmodl_desc}.log | awk '{print $NF}')) done done done From 9bd6315949e1c69dd9c229fdf9b5bc1a25fd3214 Mon Sep 17 00:00:00 2001 From: Ioannis Magkanaris Date: Tue, 5 Apr 2022 13:32:18 +0200 Subject: [PATCH 281/331] [LLVM] Code formatting changes (#838) * Update hpc-coding-convention (#836) * Run clang-format with clang-format-13 * Fix gitlab ci NMODL spack variants Co-authored-by: Nicolas Cornu --- .gitlab-ci.yml | 2 +- CMakeLists.txt | 7 +- cmake/hpc-coding-conventions | 2 +- .../llvm/codegen_llvm_helper_visitor.cpp | 71 +++++++++-------- .../llvm/codegen_llvm_helper_visitor.hpp | 5 +- src/codegen/llvm/codegen_llvm_visitor.cpp | 14 ++-- src/codegen/llvm/llvm_ir_builder.cpp | 11 +-- src/codegen/llvm/llvm_utils.cpp | 20 ++--- src/codegen/llvm/main.cpp | 5 +- src/codegen/llvm/target_platform.cpp | 8 +- src/codegen/llvm/target_platform.hpp | 43 +++++------ src/main.cpp | 77 +++++++++++-------- src/parser/verbatim_driver.hpp | 2 - src/visitors/nmodl_visitor_helper.ipp | 1 - src/visitors/symtab_visitor_helper.hpp | 23 +++--- test/unit/codegen/codegen_llvm_ir.cpp | 34 +++++--- 16 files changed, 174 insertions(+), 151 deletions(-) diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index a17a8dea9e..b44650c555 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -25,7 +25,7 @@ trigger cvf: .spack_nmodl: variables: SPACK_PACKAGE: nmodl - SPACK_PACKAGE_SPEC: ~legacy-unit+python + SPACK_PACKAGE_SPEC: ~legacy-unit+python+llvm SPACK_EXTRA_MODULES: llvm SPACK_INSTALL_EXTRA_FLAGS: -v diff --git a/CMakeLists.txt b/CMakeLists.txt index 6c5aae4a7e..5caf48c684 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -105,14 +105,11 @@ include_directories( # HPC Coding Conventions # ============================================================================= set(NMODL_ClangFormat_EXCLUDES_RE - ".*/ext/.*$$" + "ext/.*$$" "src/language/templates/.*$$" CACHE STRING "list of regular expressions to exclude C/C++ files from formatting" FORCE) set(NMODL_CMakeFormat_EXCLUDES_RE - ".*/ext/.*$$" ".*/src/language/templates/.*$$" + "ext/.*$$" "src/language/templates/.*$$" CACHE STRING "list of regular expressions to exclude CMake files from formatting" FORCE) -set(NMODL_ClangFormat_DEPENDENCIES - pyastgen parser-gen - CACHE STRING "list of CMake targets to build before formatting C++ code" FORCE) # initialize submodule of coding conventions under cmake set(THIRD_PARTY_DIRECTORY "${PROJECT_SOURCE_DIR}/cmake") diff --git a/cmake/hpc-coding-conventions b/cmake/hpc-coding-conventions index 7bca42c14a..7eaad9d932 160000 --- a/cmake/hpc-coding-conventions +++ b/cmake/hpc-coding-conventions @@ -1 +1 @@ -Subproject commit 7bca42c14a93e2eb2858ad4e90514d629aa3df5b +Subproject commit 7eaad9d932f1fdcd7421d943cbf7bc5fcd6c5165 diff --git a/src/codegen/llvm/codegen_llvm_helper_visitor.cpp b/src/codegen/llvm/codegen_llvm_helper_visitor.cpp index 22ce0c3de8..c029bb736f 100644 --- a/src/codegen/llvm/codegen_llvm_helper_visitor.cpp +++ b/src/codegen/llvm/codegen_llvm_helper_visitor.cpp @@ -549,25 +549,27 @@ void CodegenLLVMHelperVisitor::visit_function_block(ast::FunctionBlock& node) { create_function_for_node(node); } -std::shared_ptr -CodegenLLVMHelperVisitor::loop_initialization_expression(const std::string& induction_var, - bool is_remainder_loop) { +std::shared_ptr 
CodegenLLVMHelperVisitor::loop_initialization_expression( + const std::string& induction_var, + bool is_remainder_loop) { if (platform.is_gpu()) { const auto& id = create_varname(induction_var); const auto& tid = new ast::CodegenThreadId(); - return std::make_shared(id, ast::BinaryOperator(ast::BOP_ASSIGN), tid); + return std::make_shared(id, + ast::BinaryOperator(ast::BOP_ASSIGN), + tid); } - // Otherwise, platfrom is CPU. Since the loop can be a remainder loop, check if - // we need to initialize at all. + // Otherwise, platfrom is CPU. Since the loop can be a remainder loop, check if + // we need to initialize at all. if (is_remainder_loop) return nullptr; return int_initialization_expression(induction_var); } -std::shared_ptr -CodegenLLVMHelperVisitor::loop_increment_expression(const std::string& induction_var, - bool is_remainder_loop) { +std::shared_ptr CodegenLLVMHelperVisitor::loop_increment_expression( + const std::string& induction_var, + bool is_remainder_loop) { const auto& id = create_varname(induction_var); // For GPU platforms, increment by grid stride. @@ -576,8 +578,8 @@ CodegenLLVMHelperVisitor::loop_increment_expression(const std::string& induction const auto& inc_expr = new ast::BinaryExpression(id, ast::BinaryOperator(ast::BOP_ADDITION), stride); return std::make_shared(id->clone(), - ast::BinaryOperator(ast::BOP_ASSIGN), - inc_expr); + ast::BinaryOperator(ast::BOP_ASSIGN), + inc_expr); } // Otherwise, proceed with increment for CPU loop. @@ -590,10 +592,10 @@ CodegenLLVMHelperVisitor::loop_increment_expression(const std::string& induction inc_expr); } -std::shared_ptr -CodegenLLVMHelperVisitor::loop_count_expression(const std::string& induction_var, - const std::string& node_count, - bool is_remainder_loop) { +std::shared_ptr CodegenLLVMHelperVisitor::loop_count_expression( + const std::string& induction_var, + const std::string& node_count, + bool is_remainder_loop) { const int width = is_remainder_loop ? 
1 : platform.get_instruction_width(); const auto& id = create_varname(induction_var); const auto& mech_node_count = create_varname(node_count); @@ -642,15 +644,11 @@ void CodegenLLVMHelperVisitor::visit_nrn_state_block(ast::NrnStateBlock& node) { /// access node index and corresponding voltage index_statements.push_back( visitor::create_statement("node_id = node_index[{}]"_format(naming::INDUCTION_VAR))); - body_statements.push_back( - visitor::create_statement("v = {}[node_id]"_format(VOLTAGE_VAR))); + body_statements.push_back(visitor::create_statement("v = {}[node_id]"_format(VOLTAGE_VAR))); /// read ion variables - ion_read_statements(BlockType::State, - int_variables, - double_variables, - index_statements, - body_statements); + ion_read_statements( + BlockType::State, int_variables, double_variables, index_statements, body_statements); /// main compute node : extract solution expressions from the derivative block const auto& solutions = collect_nodes(node, {ast::AstNodeType::SOLUTION_EXPRESSION}); @@ -668,11 +666,8 @@ void CodegenLLVMHelperVisitor::visit_nrn_state_block(ast::NrnStateBlock& node) { } /// write ion statements - ion_write_statements(BlockType::State, - int_variables, - double_variables, - index_statements, - body_statements); + ion_write_statements( + BlockType::State, int_variables, double_variables, index_statements, body_statements); // \todo handle process_shadow_update_statement and wrote_conc_call yet } @@ -685,7 +680,7 @@ void CodegenLLVMHelperVisitor::visit_nrn_state_block(ast::NrnStateBlock& node) { std::vector induction_variables{naming::INDUCTION_VAR}; function_statements.push_back( - create_local_variable_statement(induction_variables, INTEGER_TYPE)); + create_local_variable_statement(induction_variables, INTEGER_TYPE)); if (platform.is_gpu()) { create_gpu_compute_body(compute_body, function_statements, int_variables, double_variables); @@ -734,7 +729,11 @@ void CodegenLLVMHelperVisitor::create_cpu_compute_body(ast::StatementVector& bod auto loop_block = std::make_shared(body); create_compute_body_loop(loop_block, function_statements, int_variables, double_variables); if (platform.is_cpu_with_simd()) - create_compute_body_loop(loop_block, function_statements, int_variables, double_variables, /*is_remainder_loop=*/true); + create_compute_body_loop(loop_block, + function_statements, + int_variables, + double_variables, + /*is_remainder_loop=*/true); } void CodegenLLVMHelperVisitor::create_compute_body_loop(std::shared_ptr& block, @@ -742,15 +741,19 @@ void CodegenLLVMHelperVisitor::create_compute_body_loop(std::shared_ptr& int_variables, std::vector& double_variables, bool is_remainder_loop) { - const auto& initialization = loop_initialization_expression(naming::INDUCTION_VAR, is_remainder_loop); - const auto& condition = loop_count_expression(naming::INDUCTION_VAR, NODECOUNT_VAR, is_remainder_loop); + const auto& initialization = loop_initialization_expression(naming::INDUCTION_VAR, + is_remainder_loop); + const auto& condition = + loop_count_expression(naming::INDUCTION_VAR, NODECOUNT_VAR, is_remainder_loop); const auto& increment = loop_increment_expression(naming::INDUCTION_VAR, is_remainder_loop); // Clone the statement block if needed since it can be used by the remainder loop. - auto loop_block = (is_remainder_loop || !platform.is_cpu_with_simd()) ? block : std::shared_ptr(block->clone()); + auto loop_block = (is_remainder_loop || !platform.is_cpu_with_simd()) + ? 
block
+                          : std::shared_ptr<ast::StatementBlock>(block->clone());
 
-    // Convert local statement to use CodegenVar statements and create a FOR loop node. Also, if creating
-    // a remainder loop then rename variables to avoid conflicts.
+    // Convert local statement to use CodegenVar statements and create a FOR loop node. Also, if
+    // creating a remainder loop then rename variables to avoid conflicts.
     if (is_remainder_loop)
         rename_local_variables(*loop_block);
     convert_local_statement(*loop_block);
diff --git a/src/codegen/llvm/codegen_llvm_helper_visitor.hpp b/src/codegen/llvm/codegen_llvm_helper_visitor.hpp
index c2eb415cb2..aea2f5aea8 100644
--- a/src/codegen/llvm/codegen_llvm_helper_visitor.hpp
+++ b/src/codegen/llvm/codegen_llvm_helper_visitor.hpp
@@ -171,8 +171,9 @@ class CodegenLLVMHelperVisitor: public visitor::AstVisitor {
   private:
     /// Methods to create target-specific loop constructs.
-    std::shared_ptr<ast::Expression> loop_initialization_expression(const std::string& induction_var,
-                                                                    bool is_remainder_loop);
+    std::shared_ptr<ast::Expression> loop_initialization_expression(
+        const std::string& induction_var,
+        bool is_remainder_loop);
     std::shared_ptr<ast::Expression> loop_count_expression(const std::string& induction_var,
                                                            const std::string& node_count,
                                                            bool is_remainder_loop);
diff --git a/src/codegen/llvm/codegen_llvm_visitor.cpp b/src/codegen/llvm/codegen_llvm_visitor.cpp
index d906e9bd44..42ddc04b64 100644
--- a/src/codegen/llvm/codegen_llvm_visitor.cpp
+++ b/src/codegen/llvm/codegen_llvm_visitor.cpp
@@ -62,11 +62,10 @@ static bool can_vectorize(const ast::CodegenForStatement& statement, symtab::Sym
 void CodegenLLVMVisitor::annotate_kernel_with_nvvm(llvm::Function* kernel) {
-    llvm::Metadata* metadata[] = {
-        llvm::ValueAsMetadata::get(kernel),
-        llvm::MDString::get(*context, "kernel"),
-        llvm::ValueAsMetadata::get(
-            llvm::ConstantInt::get(llvm::Type::getInt32Ty(*context), 1))};
+    llvm::Metadata* metadata[] = {llvm::ValueAsMetadata::get(kernel),
+                                  llvm::MDString::get(*context, "kernel"),
+                                  llvm::ValueAsMetadata::get(
+                                      llvm::ConstantInt::get(llvm::Type::getInt32Ty(*context), 1))};
     llvm::MDNode* node = llvm::MDNode::get(*context, metadata);
     module->getOrInsertNamedMetadata("nvvm.annotations")->addOperand(node);
 }
@@ -121,7 +120,8 @@ void CodegenLLVMVisitor::add_vectorizable_functions_from_vec_lib(llvm::TargetLib
         {"SVML", VecLib::SVML}};
     const auto& library = llvm_supported_vector_libraries.find(platform.get_math_library());
     if (library == llvm_supported_vector_libraries.end())
-        throw std::runtime_error("Error: unknown vector library - " + platform.get_math_library() + "\n");
+        throw std::runtime_error("Error: unknown vector library - " +
+                                 platform.get_math_library() + "\n");
 
     // Add vectorizable functions to the target library info.
     switch (library->second) {
@@ -682,7 +682,7 @@ void CodegenLLVMVisitor::visit_codegen_function(const ast::CodegenFunction& node
         } else if (platform.is_gpu()) {
             block->accept(*this);
             annotate_kernel_with_nvvm(func);
-        } else { // scalar
+        } else {  // scalar
             block->accept(*this);
         }
     } else {
diff --git a/src/codegen/llvm/llvm_ir_builder.cpp b/src/codegen/llvm/llvm_ir_builder.cpp
index c851f02970..b99cc81817 100644
--- a/src/codegen/llvm/llvm_ir_builder.cpp
+++ b/src/codegen/llvm/llvm_ir_builder.cpp
@@ -451,7 +451,8 @@ void IRBuilder::create_scalar_or_vector_alloca(const std::string& name,
     // Even if generating vectorised code, some variables still need to be scalar. Particularly, the
     // induction variable "id" and remainder loop variables (that start with "epilogue" prefix).
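As context for the "epilogue" naming above, the loop structure that create_compute_body_loop builds for a SIMD CPU target is roughly the following. This is a hand-written C++ sketch under assumed names and an illustrative instruction width of 8, not generated output:

    #include <cstdio>

    // Sketch: the main loop steps by the vector width; the remainder
    // ("epilogue") loop reuses the induction variable without
    // re-initialising it (loop_initialization_expression returns
    // nullptr for remainder loops) and runs with width 1.
    void nrn_state_sketch(int node_count) {
        int id;
        for (id = 0; id < node_count - 7; id += 8) {
            // vectorised body: each statement operates on 8 lanes
        }
        for (; id < node_count; id++) {
            // scalar body; locals carry an "epilogue" prefix to avoid clashes
        }
        std::printf("processed %d nodes\n", node_count);
    }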
llvm::Type* type; - if (platform.is_cpu_with_simd() && vectorize && name != kernel_id && name.rfind("epilogue", 0)) { + if (platform.is_cpu_with_simd() && vectorize && name != kernel_id && + name.rfind("epilogue", 0)) { int vector_width = platform.get_instruction_width(); type = llvm::FixedVectorType::get(element_or_scalar_type, vector_width); } else { @@ -558,8 +559,8 @@ void IRBuilder::maybe_replicate_value(llvm::Value* value) { void IRBuilder::create_grid_stride() { llvm::Module* m = builder.GetInsertBlock()->getParent()->getParent(); auto create_call = [&](llvm::Intrinsic::ID id) { - llvm::Function* intrinsic = llvm::Intrinsic::getDeclaration(m, id); - return builder.CreateCall(intrinsic, {}); + llvm::Function* intrinsic = llvm::Intrinsic::getDeclaration(m, id); + return builder.CreateCall(intrinsic, {}); }; llvm::Value* block_dim = create_call(llvm::Intrinsic::nvvm_read_ptx_sreg_ntid_x); @@ -572,8 +573,8 @@ void IRBuilder::create_grid_stride() { void IRBuilder::create_thread_id() { llvm::Module* m = builder.GetInsertBlock()->getParent()->getParent(); auto create_call = [&](llvm::Intrinsic::ID id) { - llvm::Function* intrinsic = llvm::Intrinsic::getDeclaration(m, id); - return builder.CreateCall(intrinsic, {}); + llvm::Function* intrinsic = llvm::Intrinsic::getDeclaration(m, id); + return builder.CreateCall(intrinsic, {}); }; // For now, this function only supports NVPTX backend, however it can be easily diff --git a/src/codegen/llvm/llvm_utils.cpp b/src/codegen/llvm/llvm_utils.cpp index 7086275557..4168612790 100644 --- a/src/codegen/llvm/llvm_utils.cpp +++ b/src/codegen/llvm/llvm_utils.cpp @@ -83,16 +83,18 @@ void optimise_module_for_nvptx(codegen::Platform& platform, std::string platform_name = platform.get_name(); // Target and layout information. - static const std::map triple_str = { - {"nvptx", "nvptx-nvidia-cuda"}, - {"nvptx64", "nvptx64-nvidia-cuda"}}; + static const std::map triple_str = {{"nvptx", "nvptx-nvidia-cuda"}, + {"nvptx64", + "nvptx64-nvidia-cuda"}}; static const std::map data_layout_str = { - {"nvptx", "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32" - "-i64:64:64-f32:32:32-f64:64:64-v16:16:16-v32:32:32" - "-v64:64:64-v128:128:128-n16:32:64"}, - {"nvptx64", "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32" - "-i64:64:64-f32:32:32-f64:64:64-v16:16:16-v32:32:32" - "-v64:64:64-v128:128:128-n16:32:64"}}; + {"nvptx", + "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32" + "-i64:64:64-f32:32:32-f64:64:64-v16:16:16-v32:32:32" + "-v64:64:64-v128:128:128-n16:32:64"}, + {"nvptx64", + "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32" + "-i64:64:64-f32:32:32-f64:64:64-v16:16:16-v32:32:32" + "-v64:64:64-v128:128:128-n16:32:64"}}; // Set data layout and target triple information for the module. 
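For reference, applying the selected triple and data-layout strings to a module goes through the standard LLVM C++ API; a minimal sketch follows (the helper name set_nvptx_target_info is hypothetical):

    #include "llvm/IR/Module.h"

    #include <string>

    // Hypothetical helper mirroring how the strings above are applied.
    void set_nvptx_target_info(llvm::Module& module,
                               const std::string& triple,
                               const std::string& layout) {
        module.setTargetTriple(triple);  // e.g. "nvptx64-nvidia-cuda"
        module.setDataLayout(layout);    // matching data-layout string
    }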
auto triple = triple_str.at(platform_name); diff --git a/src/codegen/llvm/main.cpp b/src/codegen/llvm/main.cpp index 92d8a486c1..cd2ec2cb12 100644 --- a/src/codegen/llvm/main.cpp +++ b/src/codegen/llvm/main.cpp @@ -51,7 +51,10 @@ int main(int argc, const char* argv[]) { codegen::Platform platform; logger->info("Running LLVM Visitor"); - codegen::CodegenLLVMVisitor llvm_visitor(filename, /*output_dir=*/".", platform, /*opt_level_ir=*/0); + codegen::CodegenLLVMVisitor llvm_visitor(filename, + /*output_dir=*/".", + platform, + /*opt_level_ir=*/0); llvm_visitor.visit_program(*ast); std::unique_ptr module = llvm_visitor.get_module(); diff --git a/src/codegen/llvm/target_platform.cpp b/src/codegen/llvm/target_platform.cpp index 49a0db9a31..fff195d6b8 100644 --- a/src/codegen/llvm/target_platform.cpp +++ b/src/codegen/llvm/target_platform.cpp @@ -17,7 +17,7 @@ const std::string Platform::DEFAULT_MATH_LIBRARY = "none"; bool Platform::is_default_platform() { // Default platform is a CPU. - return platform_id == PlatformID::CPU && name == Platform::DEFAULT_PLATFORM_NAME; + return platform_id == PlatformID::CPU && name == Platform::DEFAULT_PLATFORM_NAME; } bool Platform::is_cpu() { @@ -33,11 +33,11 @@ bool Platform::is_gpu() { } bool Platform::is_CUDA_gpu() { - return platform_id == PlatformID::GPU && (name == "nvptx" || name == "nvptx64"); + return platform_id == PlatformID::GPU && (name == "nvptx" || name == "nvptx64"); } bool Platform::is_single_precision() { - return use_single_precision; + return use_single_precision; } std::string Platform::get_name() const { @@ -59,7 +59,7 @@ int Platform::get_instruction_width() const { } int Platform::get_precision() const { - return use_single_precision? 32 : 64; + return use_single_precision ? 32 : 64; } } // namespace codegen diff --git a/src/codegen/llvm/target_platform.hpp b/src/codegen/llvm/target_platform.hpp index 282f6943d7..bed9e8923f 100644 --- a/src/codegen/llvm/target_platform.hpp +++ b/src/codegen/llvm/target_platform.hpp @@ -12,10 +12,7 @@ namespace nmodl { namespace codegen { -enum PlatformID { - CPU, - GPU -}; +enum PlatformID { CPU, GPU }; /** * \class Platform @@ -57,31 +54,31 @@ class Platform { std::string& math_library, bool use_single_precision = false, int instruction_width = 1) - : platform_id(platform_id) - , name(name) - , subtarget_name(subtarget_name) - , math_library(math_library) - , use_single_precision(use_single_precision) - , instruction_width(instruction_width) {} + : platform_id(platform_id) + , name(name) + , subtarget_name(subtarget_name) + , math_library(math_library) + , use_single_precision(use_single_precision) + , instruction_width(instruction_width) {} Platform(PlatformID platform_id, const std::string& name, std::string& math_library, bool use_single_precision = false, int instruction_width = 1) - : platform_id(platform_id) - , name(name) - , math_library(math_library) - , use_single_precision(use_single_precision) - , instruction_width(instruction_width) {} - - Platform(bool use_single_precision, - int instruction_width) - : platform_id(PlatformID::CPU) - , use_single_precision(use_single_precision) - , instruction_width(instruction_width) {} - - Platform() : platform_id(PlatformID::CPU) {} + : platform_id(platform_id) + , name(name) + , math_library(math_library) + , use_single_precision(use_single_precision) + , instruction_width(instruction_width) {} + + Platform(bool use_single_precision, int instruction_width) + : platform_id(PlatformID::CPU) + , use_single_precision(use_single_precision) + , 
instruction_width(instruction_width) {} + + Platform() + : platform_id(PlatformID::CPU) {} /// Checks if this platform is a default platform. bool is_default_platform(); diff --git a/src/main.cpp b/src/main.cpp index c1556687b4..a011cf818a 100644 --- a/src/main.cpp +++ b/src/main.cpp @@ -695,40 +695,51 @@ int main(int argc, const char* argv[]) { #ifdef NMODL_LLVM_BACKEND if (llvm_ir || llvm_benchmark) { - // If benchmarking, we want to optimize the IR with target - // information and not in LLVM visitor. - int llvm_opt_level = llvm_benchmark ? 0 : llvm_opt_level_ir; - - // Create platform abstraction. - PlatformID pid = llvm_gpu_name == "default" ? PlatformID::CPU - : PlatformID::GPU; - const std::string name = - llvm_gpu_name == "default" ? llvm_cpu_name : llvm_gpu_name; - Platform platform(pid, name, llvm_cpu_name, llvm_math_library, llvm_float_type, - llvm_vector_width); - - logger->info("Running LLVM backend code generator"); - CodegenLLVMVisitor visitor(modfile, output_dir, platform, - llvm_opt_level, !llvm_no_debug, - llvm_fast_math_flags); - visitor.visit_program(*ast); - ast_to_nmodl(*ast, filepath("llvm", "mod")); - ast_to_json(*ast, filepath("llvm", "json")); - - if (llvm_benchmark) { - // \todo integrate Platform class here - if (llvm_gpu_name != "default") { - logger->warn("GPU benchmarking is not supported, targeting " - "CPU instead"); + // If benchmarking, we want to optimize the IR with target + // information and not in LLVM visitor. + int llvm_opt_level = llvm_benchmark ? 0 : llvm_opt_level_ir; + + // Create platform abstraction. + PlatformID pid = llvm_gpu_name == "default" ? PlatformID::CPU : PlatformID::GPU; + const std::string name = llvm_gpu_name == "default" ? llvm_cpu_name : llvm_gpu_name; + Platform platform(pid, + name, + llvm_cpu_name, + llvm_math_library, + llvm_float_type, + llvm_vector_width); + + logger->info("Running LLVM backend code generator"); + CodegenLLVMVisitor visitor(modfile, + output_dir, + platform, + llvm_opt_level, + !llvm_no_debug, + llvm_fast_math_flags); + visitor.visit_program(*ast); + ast_to_nmodl(*ast, filepath("llvm", "mod")); + ast_to_json(*ast, filepath("llvm", "json")); + + if (llvm_benchmark) { + // \todo integrate Platform class here + if (llvm_gpu_name != "default") { + logger->warn( + "GPU benchmarking is not supported, targeting " + "CPU instead"); + } + + logger->info("Running LLVM benchmark"); + benchmark::LLVMBenchmark benchmark(visitor, + modfile, + output_dir, + shared_lib_paths, + num_experiments, + instance_size, + llvm_cpu_name, + llvm_opt_level_ir, + llvm_opt_level_codegen); + benchmark.run(ast); } - - logger->info("Running LLVM benchmark"); - benchmark::LLVMBenchmark benchmark( - visitor, modfile, output_dir, shared_lib_paths, - num_experiments, instance_size, llvm_cpu_name, - llvm_opt_level_ir, llvm_opt_level_codegen); - benchmark.run(ast); - } } #endif } diff --git a/src/parser/verbatim_driver.hpp b/src/parser/verbatim_driver.hpp index 7ff63e3bb3..ea8c3e050e 100644 --- a/src/parser/verbatim_driver.hpp +++ b/src/parser/verbatim_driver.hpp @@ -23,7 +23,6 @@ namespace parser { * \brief Class that binds lexer and parser together for parsing VERBATIM block */ class VerbatimDriver { - protected: void init_scanner(); void destroy_scanner(); @@ -53,4 +52,3 @@ class VerbatimDriver { int Verbatim_parse(nmodl::parser::VerbatimDriver*); - diff --git a/src/visitors/nmodl_visitor_helper.ipp b/src/visitors/nmodl_visitor_helper.ipp index d68c180366..8ec90eb6e1 100644 --- a/src/visitors/nmodl_visitor_helper.ipp +++ 
b/src/visitors/nmodl_visitor_helper.ipp @@ -69,4 +69,3 @@ void NmodlPrintVisitor::visit_element(const std::vector& elements, } // namespace visitor } // namespace nmodl - diff --git a/src/visitors/symtab_visitor_helper.hpp b/src/visitors/symtab_visitor_helper.hpp index f7b073f2dd..35e7aa831a 100644 --- a/src/visitors/symtab_visitor_helper.hpp +++ b/src/visitors/symtab_visitor_helper.hpp @@ -164,13 +164,13 @@ void SymtabVisitor::add_model_symbol_with_property(ast::Node* node, NmodlType pr static void add_external_symbols(symtab::ModelSymbolTable* symtab) { ModToken tok(true); auto variables = nmodl::get_external_variables(); - for (auto variable : variables) { + for (auto variable: variables) { auto symbol = std::make_shared(variable, nullptr, tok); symbol->add_property(NmodlType::extern_neuron_variable); symtab->insert(symbol); } auto methods = nmodl::get_external_functions(); - for (auto method : methods) { + for (auto method: methods) { auto symbol = std::make_shared(method, nullptr, tok); symbol->add_property(NmodlType::extern_method); symtab->insert(symbol); @@ -241,16 +241,17 @@ void SymtabVisitor::setup_symbol_table_for_scoped_block(ast::Node* node, const s * @todo we assume table statement follows variable declaration */ void SymtabVisitor::visit_table_statement(ast::TableStatement& node) { - auto update_symbol = [this](const ast::NameVector& variables, NmodlType property, int num_values) { - for (auto& var : variables) { - auto name = var->get_node_name(); - auto symbol = modsymtab->lookup(name); - if (symbol) { - symbol->add_property(property); - symbol->set_num_values(num_values); + auto update_symbol = + [this](const ast::NameVector& variables, NmodlType property, int num_values) { + for (auto& var: variables) { + auto name = var->get_node_name(); + auto symbol = modsymtab->lookup(name); + if (symbol) { + symbol->add_property(property); + symbol->set_num_values(num_values); + } } - } - }; + }; int num_values = node.get_with()->eval() + 1; update_symbol(node.get_table_vars(), NmodlType::table_statement_var, num_values); update_symbol(node.get_depend_vars(), NmodlType::table_assigned_var, num_values); diff --git a/test/unit/codegen/codegen_llvm_ir.cpp b/test/unit/codegen/codegen_llvm_ir.cpp index 1d080f4131..bfd22d6fda 100644 --- a/test/unit/codegen/codegen_llvm_ir.cpp +++ b/test/unit/codegen/codegen_llvm_ir.cpp @@ -49,11 +49,13 @@ std::string run_gpu_llvm_visitor(const std::string& text, NeuronSolveVisitor().visit_program(*ast); SolveBlockVisitor().visit_program(*ast); - codegen::Platform gpu_platform(codegen::PlatformID::GPU, /*name=*/"nvptx64", - math_library, use_single_precision, 1); + codegen::Platform gpu_platform( + codegen::PlatformID::GPU, /*name=*/"nvptx64", math_library, use_single_precision, 1); codegen::CodegenLLVMVisitor llvm_visitor( /*mod_filename=*/"unknown", - /*output_dir=*/".", gpu_platform, opt_level, + /*output_dir=*/".", + gpu_platform, + opt_level, /*add_debug_information=*/false); llvm_visitor.visit_program(*ast); @@ -77,12 +79,15 @@ std::string run_llvm_visitor(const std::string& text, NeuronSolveVisitor().visit_program(*ast); SolveBlockVisitor().visit_program(*ast); - codegen::Platform cpu_platform(codegen::PlatformID::CPU, /*name=*/"default", - vec_lib, use_single_precision, vector_width); + codegen::Platform cpu_platform( + codegen::PlatformID::CPU, /*name=*/"default", vec_lib, use_single_precision, vector_width); codegen::CodegenLLVMVisitor llvm_visitor( /*mod_filename=*/"unknown", - /*output_dir=*/".", cpu_platform, opt_level, - 
/*add_debug_information=*/false, fast_math_flags);
+        /*output_dir=*/".",
+        cpu_platform,
+        opt_level,
+        /*add_debug_information=*/false,
+        fast_math_flags);
     llvm_visitor.visit_program(*ast);
 
     return llvm_visitor.dump_module();
@@ -1306,7 +1311,8 @@ SCENARIO("Vectorised derivative block", "[visitor][llvm][derivative]") {
 
         THEN("vector and epilogue scalar loops are constructed") {
-            codegen::Platform simd_platform(/*use_single_precision=*/false, /*instruction_width=*/8);
+            codegen::Platform simd_platform(/*use_single_precision=*/false,
+                                            /*instruction_width=*/8);
             auto result = run_llvm_visitor_helper(nmodl_text,
                                                   simd_platform,
                                                   {ast::AstNodeType::CODEGEN_FOR_STATEMENT});
@@ -1633,7 +1639,8 @@ SCENARIO("GPU kernel body IR generation", "[visitor][llvm][gpu]") {
 
             // Check kernel annotations are correctly created.
             std::regex annotations(R"(!nvvm\.annotations = !\{!0\})");
-            std::regex kernel_data(R"(!0 = !\{void \(%.*__instance_var__type\*\)\* @nrn_state_.*, !\"kernel\", i32 1\})");
+            std::regex kernel_data(
+                R"(!0 = !\{void \(%.*__instance_var__type\*\)\* @nrn_state_.*, !\"kernel\", i32 1\})");
 
             REQUIRE(std::regex_search(module_string, m, annotations));
             REQUIRE(std::regex_search(module_string, m, kernel_data));
@@ -1677,14 +1684,17 @@ SCENARIO("GPU kernel body IR generation", "[visitor][llvm][gpu]") {
 
             // Check target information.
             // TODO: this may change when more platforms are supported.
-            std::regex data_layout(R"(target datalayout = \"e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v16:16:16-v32:32:32-v64:64:64-v128:128:128-n16:32:64\")");
+            std::regex data_layout(
+                R"(target datalayout = \"e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v16:16:16-v32:32:32-v64:64:64-v128:128:128-n16:32:64\")");
             std::regex triple(R"(nvptx64-nvidia-cuda)");
             REQUIRE(std::regex_search(module_string, m, data_layout));
             REQUIRE(std::regex_search(module_string, m, triple));
 
             // Check for address space casts and address spaces in general when loading data.
-            std::regex as_cast(R"(addrspacecast %.*__instance_var__type\* %.* to %.*__instance_var__type addrspace\(1\)\*)");
-            std::regex gep_as1(R"(getelementptr inbounds %.*__instance_var__type, %.*__instance_var__type addrspace\(1\)\* %.*, i64 0, i32 .*)");
+            std::regex as_cast(
+                R"(addrspacecast %.*__instance_var__type\* %.* to %.*__instance_var__type addrspace\(1\)\*)");
+            std::regex gep_as1(
+                R"(getelementptr inbounds %.*__instance_var__type, %.*__instance_var__type addrspace\(1\)\* %.*, i64 0, i32 .*)");
             std::regex load_as1(R"(load double\*, double\* addrspace\(1\)\* %.*)");
             REQUIRE(std::regex_search(module_string, m, as_cast));
             REQUIRE(std::regex_search(module_string, m, gep_as1));

From 6c3fe22fb4d3872c56ea286d9317ad4663cfea41 Mon Sep 17 00:00:00 2001
From: George Mitenkov
Date: Fri, 8 Apr 2022 09:30:56 +0200
Subject: [PATCH 282/331] [LLVM][GPU][+refactoring] Replacement of math
 intrinsics with library calls (#835)

Added an LLVM pass that replaces math intrinsics with calls to a math
library. In particular:

* Functionality of replacement with SIMD functions is factored out into a
separate file and LLVM version dependencies are dropped (LLVM 13 is already
used anyway).

* A pass to replace intrinsics with libdevice calls when targeting CUDA
platforms has been added. So far only `exp` and `pow` are supported (single
and double precision).
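* For illustration, on the libdevice path the intent is to rewrite
double-precision calls such as

      %r = call double @llvm.exp.f64(double %x)

  into

      %r = call double @__nv_exp(double %x)

  (a sketch of the transformation, not verbatim pass output), with the
  definition of `__nv_exp` linked in later from libdevice
  (e.g. `libdevice.10.bc`).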
* Added a test to check the replacement Co-authored-by: Ioannis Magkanaris --- src/codegen/llvm/CMakeLists.txt | 2 + src/codegen/llvm/codegen_llvm_visitor.cpp | 100 +-------- src/codegen/llvm/codegen_llvm_visitor.hpp | 7 - src/codegen/llvm/llvm_utils.cpp | 7 + src/codegen/llvm/llvm_utils.hpp | 5 +- .../llvm/replace_with_lib_functions.cpp | 210 ++++++++++++++++++ .../llvm/replace_with_lib_functions.hpp | 65 ++++++ src/codegen/llvm/target_platform.cpp | 10 +- src/codegen/llvm/target_platform.hpp | 10 +- test/unit/codegen/codegen_llvm_ir.cpp | 45 +++- 10 files changed, 343 insertions(+), 118 deletions(-) create mode 100644 src/codegen/llvm/replace_with_lib_functions.cpp create mode 100644 src/codegen/llvm/replace_with_lib_functions.hpp diff --git a/src/codegen/llvm/CMakeLists.txt b/src/codegen/llvm/CMakeLists.txt index 198d90c1a3..ade95b08a7 100644 --- a/src/codegen/llvm/CMakeLists.txt +++ b/src/codegen/llvm/CMakeLists.txt @@ -12,6 +12,8 @@ set(LLVM_CODEGEN_SOURCE_FILES ${CMAKE_CURRENT_SOURCE_DIR}/llvm_ir_builder.hpp ${CMAKE_CURRENT_SOURCE_DIR}/llvm_utils.cpp ${CMAKE_CURRENT_SOURCE_DIR}/llvm_utils.hpp + ${CMAKE_CURRENT_SOURCE_DIR}/replace_with_lib_functions.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/replace_with_lib_functions.hpp ${CMAKE_CURRENT_SOURCE_DIR}/target_platform.cpp ${CMAKE_CURRENT_SOURCE_DIR}/target_platform.hpp) diff --git a/src/codegen/llvm/codegen_llvm_visitor.cpp b/src/codegen/llvm/codegen_llvm_visitor.cpp index 42ddc04b64..ca3b405be3 100644 --- a/src/codegen/llvm/codegen_llvm_visitor.cpp +++ b/src/codegen/llvm/codegen_llvm_visitor.cpp @@ -12,16 +12,9 @@ #include "visitors/rename_visitor.hpp" #include "visitors/visitor_utils.hpp" -#include "llvm/Analysis/TargetLibraryInfo.h" #include "llvm/IR/BasicBlock.h" #include "llvm/IR/Function.h" -#include "llvm/IR/LegacyPassManager.h" #include "llvm/IR/Type.h" -#include "llvm/Support/Host.h" - -#if LLVM_VERSION_MAJOR >= 13 -#include "llvm/CodeGen/ReplaceWithVeclib.h" -#endif namespace nmodl { namespace codegen { @@ -70,72 +63,6 @@ void CodegenLLVMVisitor::annotate_kernel_with_nvvm(llvm::Function* kernel) { module->getOrInsertNamedMetadata("nvvm.annotations")->addOperand(node); } -#if LLVM_VERSION_MAJOR >= 13 -void CodegenLLVMVisitor::add_vectorizable_functions_from_vec_lib(llvm::TargetLibraryInfoImpl& tli, - llvm::Triple& triple) { - // Since LLVM does not support SLEEF as a vector library yet, process it separately. 
- if (platform.get_math_library() == "SLEEF") { -// clang-format off -#define FIXED(w) llvm::ElementCount::getFixed(w) -// clang-format on -#define DISPATCH(func, vec_func, width) {func, vec_func, width}, - - // Populate function definitions of only exp and pow (for now) - const llvm::VecDesc aarch64_functions[] = { - // clang-format off - DISPATCH("llvm.exp.f32", "_ZGVnN4v_expf", FIXED(4)) - DISPATCH("llvm.exp.f64", "_ZGVnN2v_exp", FIXED(2)) - DISPATCH("llvm.pow.f32", "_ZGVnN4vv_powf", FIXED(4)) - DISPATCH("llvm.pow.f64", "_ZGVnN2vv_pow", FIXED(2)) - // clang-format on - }; - const llvm::VecDesc x86_functions[] = { - // clang-format off - DISPATCH("llvm.exp.f64", "_ZGVbN2v_exp", FIXED(2)) - DISPATCH("llvm.exp.f64", "_ZGVdN4v_exp", FIXED(4)) - DISPATCH("llvm.exp.f64", "_ZGVeN8v_exp", FIXED(8)) - DISPATCH("llvm.pow.f64", "_ZGVbN2vv_pow", FIXED(2)) - DISPATCH("llvm.pow.f64", "_ZGVdN4vv_pow", FIXED(4)) - DISPATCH("llvm.pow.f64", "_ZGVeN8vv_pow", FIXED(8)) - // clang-format on - }; -#undef DISPATCH - - if (triple.isAArch64()) { - tli.addVectorizableFunctions(aarch64_functions); - } - if (triple.isX86() && triple.isArch64Bit()) { - tli.addVectorizableFunctions(x86_functions); - } - - } else { - // A map to query vector library by its string value. - using VecLib = llvm::TargetLibraryInfoImpl::VectorLibrary; - static const std::map llvm_supported_vector_libraries = { - {"Accelerate", VecLib::Accelerate}, - {"libmvec", VecLib::LIBMVEC_X86}, - {"libsystem_m", VecLib ::DarwinLibSystemM}, - {"MASSV", VecLib::MASSV}, - {"none", VecLib::NoLibrary}, - {"SVML", VecLib::SVML}}; - const auto& library = llvm_supported_vector_libraries.find(platform.get_math_library()); - if (library == llvm_supported_vector_libraries.end()) - throw std::runtime_error("Error: unknown vector library - " + - platform.get_math_library() + "\n"); - - // Add vectorizable functions to the target library info. - switch (library->second) { - case VecLib::LIBMVEC_X86: - if (!triple.isX86() || !triple.isArch64Bit()) - break; - default: - tli.addVectorizableFunctionsFromVecLib(library->second); - break; - } - } -} -#endif - llvm::Value* CodegenLLVMVisitor::accept_and_get(const std::shared_ptr& node) { node->accept(*this); return ir_builder.pop_last_value(); @@ -890,31 +817,8 @@ void CodegenLLVMVisitor::visit_program(const ast::Program& node) { utils::optimise_module(*module, opt_level_ir); } - // Optionally, replace LLVM math intrinsics with vector library calls. - if (platform.is_cpu_with_simd()) { -#if LLVM_VERSION_MAJOR < 13 - logger->warn( - "This version of LLVM does not support replacement of LLVM intrinsics with vector " - "library calls"); -#else - // First, get the target library information and add vectorizable functions for the - // specified vector library. - llvm::Triple triple(llvm::sys::getDefaultTargetTriple()); - llvm::TargetLibraryInfoImpl target_lib_info = llvm::TargetLibraryInfoImpl(triple); - add_vectorizable_functions_from_vec_lib(target_lib_info, triple); - - // Run passes that replace math intrinsics. - llvm::legacy::FunctionPassManager fpm(module.get()); - fpm.add(new llvm::TargetLibraryInfoWrapperPass(target_lib_info)); - fpm.add(new llvm::ReplaceWithVeclibLegacy); - fpm.doInitialization(); - for (auto& function: module->getFunctionList()) { - if (!function.isDeclaration()) - fpm.run(function); - } - fpm.doFinalization(); -#endif - } + // Optionally, replace LLVM math intrinsics with library calls. 
+    utils::replace_with_lib_functions(platform, *module);
 
     // Handle GPU optimizations (CUDA platforms only for now).
     if (platform.is_gpu()) {
diff --git a/src/codegen/llvm/codegen_llvm_visitor.hpp b/src/codegen/llvm/codegen_llvm_visitor.hpp
index 683cc7972a..299071ae80 100644
--- a/src/codegen/llvm/codegen_llvm_visitor.hpp
+++ b/src/codegen/llvm/codegen_llvm_visitor.hpp
@@ -26,7 +26,6 @@
 #include "utils/logger.hpp"
 #include "visitors/ast_visitor.hpp"
 
-#include "llvm/Analysis/TargetLibraryInfo.h"
 #include "llvm/IR/DIBuilder.h"
 #include "llvm/IR/LLVMContext.h"
 #include "llvm/IR/Module.h"
@@ -305,12 +304,6 @@ class CodegenLLVMVisitor: public CodegenCVisitor {
     // Annotates kernel function with NVVM metadata.
     void annotate_kernel_with_nvvm(llvm::Function* kernel);
 
-#if LLVM_VERSION_MAJOR >= 13
-    /// Populates target library info with the vector library definitions.
-    void add_vectorizable_functions_from_vec_lib(llvm::TargetLibraryInfoImpl& tli,
-                                                 llvm::Triple& triple);
-#endif
-
     /// Accepts the given AST node and returns the processed value.
     llvm::Value* accept_and_get(const std::shared_ptr<ast::Node>& node);
 
diff --git a/src/codegen/llvm/llvm_utils.cpp b/src/codegen/llvm/llvm_utils.cpp
index 4168612790..bd4feee32f 100644
--- a/src/codegen/llvm/llvm_utils.cpp
+++ b/src/codegen/llvm/llvm_utils.cpp
@@ -6,6 +6,7 @@
  *************************************************************************/
 
 #include "codegen/llvm/llvm_utils.hpp"
+#include "codegen/llvm/replace_with_lib_functions.hpp"
 
 #include "llvm/Analysis/TargetTransformInfo.h"
 #include "llvm/IR/AssemblyAnnotationWriter.h"
@@ -162,6 +163,12 @@ void optimise_module(llvm::Module& module, int opt_level, llvm::TargetMachine* t
     run_optimisation_passes(module, func_pm, module_pm);
 }
 
+void replace_with_lib_functions(codegen::Platform& platform, llvm::Module& module) {
+    llvm::legacy::PassManager pm;
+    pm.add(new llvm::ReplaceMathFunctions(platform));
+    pm.run(module);
+}
+
 /****************************************************************************************/
 /*                                       File utils                                     */
 /****************************************************************************************/
diff --git a/src/codegen/llvm/llvm_utils.hpp b/src/codegen/llvm/llvm_utils.hpp
index 17be5073e2..3394463317 100644
--- a/src/codegen/llvm/llvm_utils.hpp
+++ b/src/codegen/llvm/llvm_utils.hpp
@@ -21,6 +21,9 @@ void initialise_optimisation_passes();
 /// Initialises NVPTX-specific optimisation passes.
 void initialise_nvptx_passes();
 
+/// Replaces calls to LLVM intrinsics with appropriate library calls.
+void replace_with_lib_functions(codegen::Platform& platform, llvm::Module& module);
+
 /// Optimises the given LLVM IR module for NVPTX targets.
 void optimise_module_for_nvptx(codegen::Platform& platform,
                                llvm::Module& module,
@@ -30,7 +33,7 @@ void optimise_module_for_nvptx(codegen::Platform& platform,
 /// Optimises the given LLVM IR module.
 void optimise_module(llvm::Module& module, int opt_level, llvm::TargetMachine* tm = nullptr);
 
-///
+/// Saves generated IR module to .ll file.
void save_ir_to_ll_file(llvm::Module& module, const std::string& filename); } // namespace utils diff --git a/src/codegen/llvm/replace_with_lib_functions.cpp b/src/codegen/llvm/replace_with_lib_functions.cpp new file mode 100644 index 0000000000..6d98dd3eb0 --- /dev/null +++ b/src/codegen/llvm/replace_with_lib_functions.cpp @@ -0,0 +1,210 @@ +/************************************************************************* + * Copyright (C) 2018-2020 Blue Brain Project + * + * This file is part of NMODL distributed under the terms of the GNU + * Lesser General Public License. See top-level LICENSE file for details. + *************************************************************************/ + +#include "codegen/llvm/replace_with_lib_functions.hpp" + +#include "llvm/Analysis/DemandedBits.h" +#include "llvm/Analysis/GlobalsModRef.h" +#include "llvm/Analysis/LoopAccessAnalysis.h" +#include "llvm/Analysis/OptimizationRemarkEmitter.h" +#include "llvm/Analysis/ScalarEvolution.h" +#include "llvm/CodeGen/ReplaceWithVeclib.h" +#include "llvm/IR/IRBuilder.h" +#include "llvm/IR/InstIterator.h" +#include "llvm/IR/IntrinsicsNVPTX.h" +#include "llvm/IR/LegacyPassManager.h" + +namespace llvm { + +char ReplaceMathFunctions::ID = 0; + +bool ReplaceMathFunctions::runOnModule(Module& module) { + legacy::FunctionPassManager fpm(&module); + bool modified = false; + + // If the platform supports SIMD, replace math intrinsics with library + // functions. + if (platform->is_cpu_with_simd()) { + // First, get the target library information and add vectorizable functions for the + // specified vector library. + Triple triple(sys::getDefaultTargetTriple()); + TargetLibraryInfoImpl tli = TargetLibraryInfoImpl(triple); + add_vectorizable_functions_from_vec_lib(tli, triple); + + // Add passes that replace math intrinsics with calls. + fpm.add(new TargetLibraryInfoWrapperPass(tli)); + fpm.add(new ReplaceWithVeclibLegacy); + } + + // For CUDA GPUs, replace with calls to libdevice. + if (platform->is_CUDA_gpu()) { + fpm.add(new ReplaceWithLibdevice); + } + + // Run passes. + fpm.doInitialization(); + for (auto& function: module.getFunctionList()) { + if (!function.isDeclaration()) + modified |= fpm.run(function); + } + fpm.doFinalization(); + + return modified; +} + +void ReplaceMathFunctions::add_vectorizable_functions_from_vec_lib(TargetLibraryInfoImpl& tli, + Triple& triple) { + // Since LLVM does not support SLEEF as a vector library yet, process it separately. + if (platform->get_math_library() == "SLEEF") { +// clang-format off +#define FIXED(w) ElementCount::getFixed(w) +// clang-format on +#define DISPATCH(func, vec_func, width) {func, vec_func, width}, + + // Populate function definitions of only exp and pow (for now). 
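A note on the mangled names registered below: they follow the vector function ABI, `_ZGV` + ISA tag (`n` for AArch64 Advanced SIMD; `b`, `d` and `e` for x86 SSE, AVX2 and AVX-512 respectively) + `N` for the unmasked variant + the lane count + one `v` per vector argument. For example, `_ZGVnN4v_expf` reads as the NEON, unmasked, 4-lane, one-vector-argument variant of `expf`.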
+        const VecDesc aarch64_functions[] = {
+            // clang-format off
+            DISPATCH("llvm.exp.f32", "_ZGVnN4v_expf", FIXED(4))
+            DISPATCH("llvm.exp.f64", "_ZGVnN2v_exp", FIXED(2))
+            DISPATCH("llvm.pow.f32", "_ZGVnN4vv_powf", FIXED(4))
+            DISPATCH("llvm.pow.f64", "_ZGVnN2vv_pow", FIXED(2))
+            // clang-format on
+        };
+        const VecDesc x86_functions[] = {
+            // clang-format off
+            DISPATCH("llvm.exp.f64", "_ZGVbN2v_exp", FIXED(2))
+            DISPATCH("llvm.exp.f64", "_ZGVdN4v_exp", FIXED(4))
+            DISPATCH("llvm.exp.f64", "_ZGVeN8v_exp", FIXED(8))
+            DISPATCH("llvm.pow.f64", "_ZGVbN2vv_pow", FIXED(2))
+            DISPATCH("llvm.pow.f64", "_ZGVdN4vv_pow", FIXED(4))
+            DISPATCH("llvm.pow.f64", "_ZGVeN8vv_pow", FIXED(8))
+            // clang-format on
+        };
+#undef DISPATCH
+#undef FIXED
+
+        if (triple.isAArch64()) {
+            tli.addVectorizableFunctions(aarch64_functions);
+        }
+        if (triple.isX86() && triple.isArch64Bit()) {
+            tli.addVectorizableFunctions(x86_functions);
+        }
+
+    } else {
+        // A map to query vector library by its string value.
+        using VecLib = TargetLibraryInfoImpl::VectorLibrary;
+        static const std::map<std::string, VecLib> llvm_supported_vector_libraries = {
+            {"Accelerate", VecLib::Accelerate},
+            {"libmvec", VecLib::LIBMVEC_X86},
+            {"libsystem_m", VecLib::DarwinLibSystemM},
+            {"MASSV", VecLib::MASSV},
+            {"none", VecLib::NoLibrary},
+            {"SVML", VecLib::SVML}};
+
+        const auto& library = llvm_supported_vector_libraries.find(platform->get_math_library());
+        if (library == llvm_supported_vector_libraries.end())
+            throw std::runtime_error("Error: unknown vector library - " +
+                                     platform->get_math_library() + "\n");
+
+        // Add vectorizable functions to the target library info.
+        if (library->second != VecLib::LIBMVEC_X86 || (triple.isX86() && triple.isArch64Bit())) {
+            tli.addVectorizableFunctionsFromVecLib(library->second);
+        }
+    }
+}
+
+void ReplaceWithLibdevice::getAnalysisUsage(AnalysisUsage& au) const {
+    au.setPreservesCFG();
+    au.addPreserved<ScalarEvolutionWrapperPass>();
+    au.addPreserved<AAResultsWrapperPass>();
+    au.addPreserved<LoopAccessLegacyAnalysis>();
+    au.addPreserved<DemandedBitsWrapperPass>();
+    au.addPreserved<OptimizationRemarkEmitterWrapperPass>();
+    au.addPreserved<GlobalsAAWrapperPass>();
+}
+
+bool ReplaceWithLibdevice::runOnFunction(Function& function) {
+    bool modified = false;
+
+    // Try to replace math intrinsics.
+    std::vector<CallInst*> replaced_calls;
+    for (auto& instruction: instructions(function)) {
+        if (auto* call_inst = dyn_cast<CallInst>(&instruction)) {
+            if (replace_call(*call_inst)) {
+                replaced_calls.push_back(call_inst);
+                modified = true;
+            }
+        }
+    }
+
+    // Remove calls to replaced intrinsics.
+    for (auto* call_inst: replaced_calls) {
+        call_inst->eraseFromParent();
+    }
+
+    return modified;
+}
+
+bool ReplaceWithLibdevice::replace_call(CallInst& call_inst) {
+    Module* m = call_inst.getModule();
+    Function* function = call_inst.getCalledFunction();
+
+    // Replace math intrinsics only!
+    auto id = function->getIntrinsicID();
+    bool is_nvvm_intrinsic = id == Intrinsic::nvvm_read_ptx_sreg_ntid_x ||
+                             id == Intrinsic::nvvm_read_ptx_sreg_nctaid_x ||
+                             id == Intrinsic::nvvm_read_ptx_sreg_ctaid_x ||
+                             id == Intrinsic::nvvm_read_ptx_sreg_tid_x;
+    if (id == Intrinsic::not_intrinsic || is_nvvm_intrinsic)
+        return false;
+
+    // Map of supported replacements. For now it is only exp and pow.
+    static const std::map<std::string, std::string> libdevice_name = {{"llvm.exp.f32", "__nv_expf"},
+                                                                      {"llvm.exp.f64", "__nv_exp"},
+                                                                      {"llvm.pow.f32", "__nv_powf"},
+                                                                      {"llvm.pow.f64", "__nv_pow"}};
+
+    // If replacement is not supported, abort.
+    std::string old_name = function->getName().str();
+    auto it = libdevice_name.find(old_name);
+    if (it == libdevice_name.end())
+        throw std::runtime_error("Error: replacements for " + old_name + " are not supported!\n");
+
+    // Get (or create) libdevice function.
+    Function* libdevice_func = m->getFunction(it->second);
+    if (!libdevice_func) {
+        libdevice_func = Function::Create(function->getFunctionType(),
+                                          Function::ExternalLinkage,
+                                          it->second,
+                                          *m);
+        libdevice_func->copyAttributesFrom(function);
+    }
+
+    // Create a call to libdevice function with the same operands.
+    IRBuilder<> builder(&call_inst);
+    std::vector<Value*> args(call_inst.arg_operands().begin(), call_inst.arg_operands().end());
+    SmallVector<OperandBundleDef, 1> op_bundles;
+    call_inst.getOperandBundlesAsDefs(op_bundles);
+    CallInst* new_call = builder.CreateCall(libdevice_func, args, op_bundles);
+
+    // Replace all uses of old instruction with the new one. Also, copy
+    // fast math flags if necessary.
+    call_inst.replaceAllUsesWith(new_call);
+    if (isa<FPMathOperator>(new_call)) {
+        new_call->copyFastMathFlags(&call_inst);
+    }
+
+    return true;
+}
+
+char ReplaceWithLibdevice::ID = 0;
+static RegisterPass<ReplaceWithLibdevice> X("libdevice-replacement",
+                                            "Pass replacing math functions with calls to libdevice",
+                                            false,
+                                            false);
+
+}  // namespace llvm
diff --git a/src/codegen/llvm/replace_with_lib_functions.hpp b/src/codegen/llvm/replace_with_lib_functions.hpp
new file mode 100644
index 0000000000..5bf38ba85f
--- /dev/null
+++ b/src/codegen/llvm/replace_with_lib_functions.hpp
@@ -0,0 +1,65 @@
+/*************************************************************************
+ * Copyright (C) 2018-2020 Blue Brain Project
+ *
+ * This file is part of NMODL distributed under the terms of the GNU
+ * Lesser General Public License. See top-level LICENSE file for details.
+ *************************************************************************/
+
+#pragma once
+
+#include "codegen/llvm/target_platform.hpp"
+
+#include "llvm/ADT/Triple.h"
+#include "llvm/Analysis/TargetLibraryInfo.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/Host.h"
+
+using nmodl::codegen::Platform;
+
+namespace llvm {
+
+/**
+ * \class ReplaceMathFunctions
+ * \brief A module LLVM pass that replaces math intrinsics with
+ * SIMD or libdevice library calls.
+ */
+class ReplaceMathFunctions: public ModulePass {
+  private:
+    const Platform* platform;
+
+  public:
+    static char ID;
+
+    ReplaceMathFunctions(const Platform& platform)
+        : ModulePass(ID)
+        , platform(&platform) {}
+
+    bool runOnModule(Module& module) override;
+
+  private:
+    /// Populates `tli` with vectorizable function definitions.
+    void add_vectorizable_functions_from_vec_lib(TargetLibraryInfoImpl& tli, Triple& triple);
+};
+
+/**
+ * \class ReplaceWithLibdevice
+ * \brief A function LLVM pass that replaces math intrinsics with
+ * libdevice library calls.
+ */
+class ReplaceWithLibdevice: public FunctionPass {
+  public:
+    static char ID;
+
+    ReplaceWithLibdevice()
+        : llvm::FunctionPass(ID) {}
+
+    void getAnalysisUsage(AnalysisUsage& au) const override;
+
+    bool runOnFunction(Function& function) override;
+
+  private:
+    /// Replaces call instruction to intrinsic with libdevice call.
+ bool replace_call(CallInst& call_inst); +}; + +} // namespace llvm diff --git a/src/codegen/llvm/target_platform.cpp b/src/codegen/llvm/target_platform.cpp index fff195d6b8..bcab739fb3 100644 --- a/src/codegen/llvm/target_platform.cpp +++ b/src/codegen/llvm/target_platform.cpp @@ -15,24 +15,24 @@ namespace codegen { const std::string Platform::DEFAULT_PLATFORM_NAME = "default"; const std::string Platform::DEFAULT_MATH_LIBRARY = "none"; -bool Platform::is_default_platform() { +bool Platform::is_default_platform() const { // Default platform is a CPU. return platform_id == PlatformID::CPU && name == Platform::DEFAULT_PLATFORM_NAME; } -bool Platform::is_cpu() { +bool Platform::is_cpu() const { return platform_id == PlatformID::CPU; } -bool Platform::is_cpu_with_simd() { +bool Platform::is_cpu_with_simd() const { return platform_id == PlatformID::CPU && instruction_width > 1; } -bool Platform::is_gpu() { +bool Platform::is_gpu() const { return platform_id == PlatformID::GPU; } -bool Platform::is_CUDA_gpu() { +bool Platform::is_CUDA_gpu() const { return platform_id == PlatformID::GPU && (name == "nvptx" || name == "nvptx64"); } diff --git a/src/codegen/llvm/target_platform.hpp b/src/codegen/llvm/target_platform.hpp index bed9e8923f..8676f176b4 100644 --- a/src/codegen/llvm/target_platform.hpp +++ b/src/codegen/llvm/target_platform.hpp @@ -81,19 +81,19 @@ class Platform { : platform_id(PlatformID::CPU) {} /// Checks if this platform is a default platform. - bool is_default_platform(); + bool is_default_platform() const; /// Checks if this platform is a CPU. - bool is_cpu(); + bool is_cpu() const; /// Checks if this platform is a CPU with SIMD support. - bool is_cpu_with_simd(); + bool is_cpu_with_simd() const; /// Checks if this platform is a GPU. - bool is_gpu(); + bool is_gpu() const; /// Checks if this platform is CUDA platform. - bool is_CUDA_gpu(); + bool is_CUDA_gpu() const; bool is_single_precision(); diff --git a/test/unit/codegen/codegen_llvm_ir.cpp b/test/unit/codegen/codegen_llvm_ir.cpp index bfd22d6fda..a680206271 100644 --- a/test/unit/codegen/codegen_llvm_ir.cpp +++ b/test/unit/codegen/codegen_llvm_ir.cpp @@ -1366,7 +1366,6 @@ SCENARIO("Vector library calls", "[visitor][llvm][vector_lib]") { REQUIRE(std::regex_search(no_library_module_str, m, exp_decl)); REQUIRE(std::regex_search(no_library_module_str, m, exp_call)); -#if LLVM_VERSION_MAJOR >= 13 // Check exponential calls are replaced with calls to SVML library. 
std::string svml_library_module_str = run_llvm_visitor(nmodl_text, /*opt_level=*/0, @@ -1444,7 +1443,6 @@ SCENARIO("Vector library calls", "[visitor][llvm][vector_lib]") { REQUIRE(std::regex_search(libsystem_m_library_module_str, m, libsystem_m_exp_decl)); REQUIRE(std::regex_search(libsystem_m_library_module_str, m, libsystem_m_exp_call)); REQUIRE(!std::regex_search(libsystem_m_library_module_str, m, fexp_call)); -#endif } } } @@ -1701,4 +1699,47 @@ SCENARIO("GPU kernel body IR generation", "[visitor][llvm][gpu]") { REQUIRE(std::regex_search(module_string, m, load_as1)); } } + + GIVEN("When using math functions") { + std::string nmodl_text = R"( + NEURON { + SUFFIX test + RANGE x, y + } + + ASSIGNED { x y } + + STATE { m } + + BREAKPOINT { + SOLVE states METHOD cnexp + } + + DERIVATIVE states { + m = exp(y) + x ^ 2 + } + )"; + + THEN("calls to libdevice are created") { + std::string module_string = run_gpu_llvm_visitor(nmodl_text, + /*opt_level=*/3, + /*use_single_precision=*/false, + /*math_library=*/"libdevice"); + std::smatch m; + + // Check if exp and pow intrinsics have been replaced. + std::regex exp_declaration(R"(declare double @__nv_exp\(double\))"); + std::regex exp_new_call(R"(call double @__nv_exp\(double %.*\))"); + std::regex exp_old_call(R"(call double @llvm\.exp\.f64\(double %.*\))"); + std::regex pow_declaration(R"(declare double @__nv_pow\(double, double\))"); + std::regex pow_new_call(R"(call double @__nv_pow\(double %.*, double .*\))"); + std::regex pow_old_call(R"(call double @llvm\.pow\.f64\(double %.*, double .*\))"); + REQUIRE(std::regex_search(module_string, m, exp_declaration)); + REQUIRE(std::regex_search(module_string, m, exp_new_call)); + REQUIRE(!std::regex_search(module_string, m, exp_old_call)); + REQUIRE(std::regex_search(module_string, m, pow_declaration)); + REQUIRE(std::regex_search(module_string, m, pow_new_call)); + REQUIRE(!std::regex_search(module_string, m, pow_old_call)); + } + } } From dba0cd4316790bf99cce22e7bc8203e993196c34 Mon Sep 17 00:00:00 2001 From: Ioannis Magkanaris Date: Fri, 8 Apr 2022 12:07:49 +0200 Subject: [PATCH 283/331] Replace simple integration test for the math libraries in LLVM benchmark --- .gitlab-ci.yml | 4 ++-- test/integration/mod/test.mod | 16 ---------------- test/integration/mod/test_math.mod | 16 ++++++++++++++++ 3 files changed, 18 insertions(+), 18 deletions(-) delete mode 100644 test/integration/mod/test.mod create mode 100644 test/integration/mod/test_math.mod diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index 55c52be210..6c23942204 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -97,7 +97,7 @@ test_benchmark:cpu: - module load unstable git - . ${SPACK_ROOT}/share/spack/setup-env.sh - spack load nmodl/${SPACK_INSTALLED_HASH} - - nmodl test/integration/mod/test.mod llvm --ir --opt-level-ir 3 benchmark --run --opt-level-codegen 3 --instance-size 10000000 --repeat 2 + - nmodl test/integration/mod/test_math.mod llvm --ir --opt-level-ir 3 benchmark --run --opt-level-codegen 3 --instance-size 10000000 --repeat 2 needs: ["build_cuda:gcc"] test_benchmark:gpu: @@ -107,5 +107,5 @@ test_benchmark:gpu: - module load unstable git - . 
${SPACK_ROOT}/share/spack/setup-env.sh - spack load nmodl/${SPACK_INSTALLED_HASH} - - nmodl test/integration/mod/test.mod llvm --no-debug --ir --opt-level-ir 3 gpu --target-arch "sm_70" --name "nvptx64" --math-library libdevice benchmark --run --libs "${CUDA_HOME}/nvvm/libdevice/libdevice.10.bc" --opt-level-codegen 3 --instance-size 10000000 --repeat 2 --grid-dim-x 4096 --block-dim-x 256 + - nmodl test/integration/mod/test_math.mod llvm --no-debug --ir --opt-level-ir 3 gpu --target-arch "sm_70" --name "nvptx64" --math-library libdevice benchmark --run --libs "${CUDA_HOME}/nvvm/libdevice/libdevice.10.bc" --opt-level-codegen 3 --instance-size 10000000 --repeat 2 --grid-dim-x 4096 --block-dim-x 256 needs: ["build_cuda:gcc"] diff --git a/test/integration/mod/test.mod b/test/integration/mod/test.mod deleted file mode 100644 index 1c5292ba6e..0000000000 --- a/test/integration/mod/test.mod +++ /dev/null @@ -1,16 +0,0 @@ -NEURON { - SUFFIX test - RANGE x, y -} - -ASSIGNED { x y } - -STATE { m } - -BREAKPOINT { - SOLVE states METHOD cnexp -} - -DERIVATIVE states { - m = y + 2 -} diff --git a/test/integration/mod/test_math.mod b/test/integration/mod/test_math.mod new file mode 100644 index 0000000000..43b6a7380b --- /dev/null +++ b/test/integration/mod/test_math.mod @@ -0,0 +1,16 @@ +NEURON { + SUFFIX test + RANGE x, y +} + +ASSIGNED { x y } + +STATE { m } + +BREAKPOINT { + SOLVE states METHOD cnexp +} + +DERIVATIVE states { + m = exp(y) + x ^ 2 +} From 951c8145e83561b660c85718ae7402f5ea4339fc Mon Sep 17 00:00:00 2001 From: Ioannis Magkanaris Date: Fri, 8 Apr 2022 13:46:04 +0200 Subject: [PATCH 284/331] Added log function and test --- src/codegen/llvm/codegen_llvm_visitor.cpp | 19 +++++++++++++++++++ .../llvm/replace_with_lib_functions.cpp | 9 ++++++++- test/unit/codegen/codegen_llvm_ir.cpp | 8 +++++++- 3 files changed, 34 insertions(+), 2 deletions(-) diff --git a/src/codegen/llvm/codegen_llvm_visitor.cpp b/src/codegen/llvm/codegen_llvm_visitor.cpp index 7a492c6576..467fa57e65 100644 --- a/src/codegen/llvm/codegen_llvm_visitor.cpp +++ b/src/codegen/llvm/codegen_llvm_visitor.cpp @@ -68,6 +68,25 @@ void CodegenLLVMVisitor::annotate_kernel_with_nvvm(llvm::Function* kernel, const module->getOrInsertNamedMetadata("nvvm.annotations")->addOperand(node); } +void CodegenLLVMVisitor::annotate_wrapper_kernels_with_nvvm() { + // First clear all the nvvm annotations from the module + auto module_named_metadata = module->getNamedMetadata("nvvm.annotations"); + module->eraseNamedMetadata(module_named_metadata); + + // Then each kernel should be annotated as "device" function and wrappers should be annotated as "kernel" functions + std::vector kernel_names; + find_kernel_names(kernel_names); + + for (const auto& kernel_name: kernel_names) { + // Get the kernel function. + auto kernel = module->getFunction(kernel_name); + // Get the kernel wrapper function. 
+ auto kernel_wrapper = module->getFunction(get_wrapper_name(kernel_name)); + annotate_kernel_with_nvvm(kernel, "device"); + annotate_kernel_with_nvvm(kernel_wrapper, "kernel"); + } +} + llvm::Value* CodegenLLVMVisitor::accept_and_get(const std::shared_ptr& node) { node->accept(*this); return ir_builder.pop_last_value(); diff --git a/src/codegen/llvm/replace_with_lib_functions.cpp b/src/codegen/llvm/replace_with_lib_functions.cpp index 6d98dd3eb0..cae6814ad2 100644 --- a/src/codegen/llvm/replace_with_lib_functions.cpp +++ b/src/codegen/llvm/replace_with_lib_functions.cpp @@ -72,6 +72,8 @@ void ReplaceMathFunctions::add_vectorizable_functions_from_vec_lib(TargetLibrary DISPATCH("llvm.exp.f64", "_ZGVnN2v_exp", FIXED(2)) DISPATCH("llvm.pow.f32", "_ZGVnN4vv_powf", FIXED(4)) DISPATCH("llvm.pow.f64", "_ZGVnN2vv_pow", FIXED(2)) + DISPATCH("llvm.log.f32", "_ZGVnN4v_logf", FIXED(4)) + DISPATCH("llvm.log.f64", "_ZGVnN2v_log", FIXED(2)) // clang-format on }; const VecDesc x86_functions[] = { @@ -82,6 +84,9 @@ void ReplaceMathFunctions::add_vectorizable_functions_from_vec_lib(TargetLibrary DISPATCH("llvm.pow.f64", "_ZGVbN2vv_pow", FIXED(2)) DISPATCH("llvm.pow.f64", "_ZGVdN4vv_pow", FIXED(4)) DISPATCH("llvm.pow.f64", "_ZGVeN8vv_pow", FIXED(8)) + DISPATCH("llvm.log.f64", "_ZGVbN2v_log", FIXED(2)) + DISPATCH("llvm.log.f64", "_ZGVdN4v_log", FIXED(4)) + DISPATCH("llvm.log.f64", "_ZGVeN8vv_pow", FIXED(8)) // clang-format on }; #undef DISPATCH @@ -166,7 +171,9 @@ bool ReplaceWithLibdevice::replace_call(CallInst& call_inst) { static const std::map libdevice_name = {{"llvm.exp.f32", "__nv_expf"}, {"llvm.exp.f64", "__nv_exp"}, {"llvm.pow.f32", "__nv_powf"}, - {"llvm.pow.f64", "__nv_pow"}}; + {"llvm.pow.f64", "__nv_pow"}, + {"llvm.log.f32", "__nv_logf"}, + {"llvm.log.f64", "__nv_log"}}; // If replacement is not supported, abort. 
std::string old_name = function->getName().str(); diff --git a/test/unit/codegen/codegen_llvm_ir.cpp b/test/unit/codegen/codegen_llvm_ir.cpp index a680206271..a2b8574cc7 100644 --- a/test/unit/codegen/codegen_llvm_ir.cpp +++ b/test/unit/codegen/codegen_llvm_ir.cpp @@ -1716,7 +1716,7 @@ SCENARIO("GPU kernel body IR generation", "[visitor][llvm][gpu]") { } DERIVATIVE states { - m = exp(y) + x ^ 2 + m = exp(y) + x ^ 2 + log(x) } )"; @@ -1734,12 +1734,18 @@ SCENARIO("GPU kernel body IR generation", "[visitor][llvm][gpu]") { std::regex pow_declaration(R"(declare double @__nv_pow\(double, double\))"); std::regex pow_new_call(R"(call double @__nv_pow\(double %.*, double .*\))"); std::regex pow_old_call(R"(call double @llvm\.pow\.f64\(double %.*, double .*\))"); + std::regex log_declaration(R"(declare double @__nv_log\(double\))"); + std::regex log_new_call(R"(call double @__nv_log\(double %.*\))"); + std::regex log_old_call(R"(call double @llvm\.log\.f64\(double %.*\))"); REQUIRE(std::regex_search(module_string, m, exp_declaration)); REQUIRE(std::regex_search(module_string, m, exp_new_call)); REQUIRE(!std::regex_search(module_string, m, exp_old_call)); REQUIRE(std::regex_search(module_string, m, pow_declaration)); REQUIRE(std::regex_search(module_string, m, pow_new_call)); REQUIRE(!std::regex_search(module_string, m, pow_old_call)); + REQUIRE(std::regex_search(module_string, m, log_declaration)); + REQUIRE(std::regex_search(module_string, m, log_new_call)); + REQUIRE(!std::regex_search(module_string, m, log_old_call)); } } } From 142a4c01746f318972bb1aef11bb49721cfb5da2 Mon Sep 17 00:00:00 2001 From: Ioannis Magkanaris Date: Fri, 8 Apr 2022 13:56:30 +0200 Subject: [PATCH 285/331] Small fix in log replacement funcs --- src/codegen/llvm/replace_with_lib_functions.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/codegen/llvm/replace_with_lib_functions.cpp b/src/codegen/llvm/replace_with_lib_functions.cpp index cae6814ad2..24d53c4312 100644 --- a/src/codegen/llvm/replace_with_lib_functions.cpp +++ b/src/codegen/llvm/replace_with_lib_functions.cpp @@ -86,7 +86,7 @@ void ReplaceMathFunctions::add_vectorizable_functions_from_vec_lib(TargetLibrary DISPATCH("llvm.pow.f64", "_ZGVeN8vv_pow", FIXED(8)) DISPATCH("llvm.log.f64", "_ZGVbN2v_log", FIXED(2)) DISPATCH("llvm.log.f64", "_ZGVdN4v_log", FIXED(4)) - DISPATCH("llvm.log.f64", "_ZGVeN8vv_pow", FIXED(8)) + DISPATCH("llvm.log.f64", "_ZGVeN8v_log", FIXED(8)) // clang-format on }; #undef DISPATCH From 9f34e75e9e752b4cf84e8c8e32f7372aaf40d433 Mon Sep 17 00:00:00 2001 From: Ioannis Magkanaris Date: Fri, 8 Apr 2022 16:17:48 +0200 Subject: [PATCH 286/331] Add log in the test mod file --- test/integration/mod/test_math.mod | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/integration/mod/test_math.mod b/test/integration/mod/test_math.mod index 43b6a7380b..6e3174a846 100644 --- a/test/integration/mod/test_math.mod +++ b/test/integration/mod/test_math.mod @@ -12,5 +12,5 @@ BREAKPOINT { } DERIVATIVE states { - m = exp(y) + x ^ 2 + m = exp(y) + x ^ 107 + log(x) } From 3118f78d42476f3357fb79922fda9cc587500d17 Mon Sep 17 00:00:00 2001 From: Ioannis Magkanaris Date: Fri, 8 Apr 2022 16:38:12 +0200 Subject: [PATCH 287/331] Update hpc-coding-conventions --- cmake/hpc-coding-conventions | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cmake/hpc-coding-conventions b/cmake/hpc-coding-conventions index 7bca42c14a..7eaad9d932 160000 --- a/cmake/hpc-coding-conventions +++ b/cmake/hpc-coding-conventions @@ -1 +1 @@ 
-Subproject commit 7bca42c14a93e2eb2858ad4e90514d629aa3df5b +Subproject commit 7eaad9d932f1fdcd7421d943cbf7bc5fcd6c5165 From f85b0d163a97d7bc8ebb3187da6e1230c5474cbb Mon Sep 17 00:00:00 2001 From: Ioannis Magkanaris Date: Fri, 8 Apr 2022 16:38:55 +0200 Subject: [PATCH 288/331] Fix clang format --- src/codegen/llvm/codegen_llvm_visitor.cpp | 15 ++++++++------- src/codegen/llvm/codegen_llvm_visitor.hpp | 3 ++- test/benchmark/cuda_driver.cpp | 4 +++- 3 files changed, 13 insertions(+), 9 deletions(-) diff --git a/src/codegen/llvm/codegen_llvm_visitor.cpp b/src/codegen/llvm/codegen_llvm_visitor.cpp index 467fa57e65..8ecfdcc89f 100644 --- a/src/codegen/llvm/codegen_llvm_visitor.cpp +++ b/src/codegen/llvm/codegen_llvm_visitor.cpp @@ -58,12 +58,12 @@ static bool can_vectorize(const ast::CodegenForStatement& statement, symtab::Sym return unsupported.empty() && supported.size() <= 1; } -void CodegenLLVMVisitor::annotate_kernel_with_nvvm(llvm::Function* kernel, const std::string& annotation = "kernel") { - llvm::Metadata* metadata[] = { - llvm::ValueAsMetadata::get(kernel), - llvm::MDString::get(*context, annotation), - llvm::ValueAsMetadata::get( - llvm::ConstantInt::get(llvm::Type::getInt32Ty(*context), 1))}; +void CodegenLLVMVisitor::annotate_kernel_with_nvvm(llvm::Function* kernel, + const std::string& annotation = "kernel") { + llvm::Metadata* metadata[] = {llvm::ValueAsMetadata::get(kernel), + llvm::MDString::get(*context, annotation), + llvm::ValueAsMetadata::get( + llvm::ConstantInt::get(llvm::Type::getInt32Ty(*context), 1))}; llvm::MDNode* node = llvm::MDNode::get(*context, metadata); module->getOrInsertNamedMetadata("nvvm.annotations")->addOperand(node); } @@ -73,7 +73,8 @@ void CodegenLLVMVisitor::annotate_wrapper_kernels_with_nvvm() { auto module_named_metadata = module->getNamedMetadata("nvvm.annotations"); module->eraseNamedMetadata(module_named_metadata); - // Then each kernel should be annotated as "device" function and wrappers should be annotated as "kernel" functions + // Then each kernel should be annotated as "device" function and wrappers should be annotated as + // "kernel" functions std::vector kernel_names; find_kernel_names(kernel_names); diff --git a/src/codegen/llvm/codegen_llvm_visitor.hpp b/src/codegen/llvm/codegen_llvm_visitor.hpp index 700b566a2e..4a70e6503d 100644 --- a/src/codegen/llvm/codegen_llvm_visitor.hpp +++ b/src/codegen/llvm/codegen_llvm_visitor.hpp @@ -300,7 +300,8 @@ class CodegenLLVMVisitor: public CodegenCVisitor { /// Annotates kernel function with NVVM metadata. void annotate_kernel_with_nvvm(llvm::Function* kernel, const std::string& annotation); - /// Handles NVVM function annotations when we create the wrapper functions. All original kernels should be "device" functions and wrappers "kernel" functions + /// Handles NVVM function annotations when we create the wrapper functions. All original kernels + /// should be "device" functions and wrappers "kernel" functions void annotate_wrapper_kernels_with_nvvm(); /// Accepts the given AST node and returns the processed value. diff --git a/test/benchmark/cuda_driver.cpp b/test/benchmark/cuda_driver.cpp index 878688a2cd..acffecd0be 100644 --- a/test/benchmark/cuda_driver.cpp +++ b/test/benchmark/cuda_driver.cpp @@ -138,7 +138,9 @@ void CUDADriver::init(const codegen::Platform& platform, BenchmarkInfo* benchmar // Optimize code for nvptx including the wrapper functions and generate PTX const auto opt_level_codegen = benchmark_info ? 
benchmark_info->opt_level_codegen : 0; utils::optimise_module_for_nvptx(platform, *module, opt_level_codegen, ptx_compiled_module); - utils::save_ir_to_ll_file(*module, benchmark_info->output_dir + "/" + benchmark_info->filename + "_benchmark"); + utils::save_ir_to_ll_file(*module, + benchmark_info->output_dir + "/" + benchmark_info->filename + + "_benchmark"); if (benchmark_info) { print_string_to_file(ptx_compiled_module, benchmark_info->output_dir + "/" + benchmark_info->filename + ".ptx"); From 84304ab2ce76783b0c0f4bcc66b626dfa34081de Mon Sep 17 00:00:00 2001 From: Ioannis Magkanaris Date: Tue, 12 Apr 2022 17:24:24 +0200 Subject: [PATCH 289/331] Disable addition of debug symbols in GPU code --- src/main.cpp | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/src/main.cpp b/src/main.cpp index 93a7ea991f..eb99ead031 100644 --- a/src/main.cpp +++ b/src/main.cpp @@ -725,6 +725,13 @@ int main(int argc, const char* argv[]) { llvm_float_type, llvm_vector_width); + // GPU code generation doesn't support debug information at the moment so disable it + // in case it's enabled + if (!llvm_no_debug && platform.is_gpu()) { + logger->warn("Disabling addition of debug symbols in GPU code."); + llvm_no_debug = true; + } + logger->info("Running LLVM backend code generator"); CodegenLLVMVisitor visitor(modfile, output_dir, From 7bca898c9dea796613df1d595225ec6791f81823 Mon Sep 17 00:00:00 2001 From: Ioannis Magkanaris Date: Wed, 20 Apr 2022 16:01:37 +0200 Subject: [PATCH 290/331] Added sleef and svml math libs in nmodl jit executions --- test/benchmark/nmodl-llvm-time.sh | 23 ++++++++++++++++------- 1 file changed, 16 insertions(+), 7 deletions(-) diff --git a/test/benchmark/nmodl-llvm-time.sh b/test/benchmark/nmodl-llvm-time.sh index 15168f1169..353dea3b92 100755 --- a/test/benchmark/nmodl-llvm-time.sh +++ b/test/benchmark/nmodl-llvm-time.sh @@ -291,13 +291,22 @@ for kernel_target in ${KERNEL_TARGETS}; do fast_math_opt="nonfastmath" fi echo "| | | options: ${fast_math_flag}" - nmodl_args="${kernels_path}/${kernel_target}.mod llvm --ir ${fast_math_flag} --opt-level-ir 3 cpu --name ${nmodl_architecture} --vector-width ${vec_width} --math-library SVML benchmark --run --instance-size ${kernel_inst_size} --repeat ${num_exp} --opt-level-codegen 3 --libs ${svml_lib}" - benchmark_nmodl_desc=${kernel_target}_nmodl-jit_${nmodl_architecture}_v${vec_width}_${fast_math_opt} - benchmark_description+=("${benchmark_nmodl_desc}") - # runs only kernel generated by LLVM IR - ${debug} eval "${nmodl_exe} ${nmodl_args} 2>&1 | tee ${output_dir}/${benchmark_nmodl_desc}.log" - benchmark_time+=($(grep "Average compute time" ${output_dir}/${benchmark_nmodl_desc}.log | awk '{print $NF}')) - benchmark_variance+=($(grep "Compute time variance" ${output_dir}/${benchmark_nmodl_desc}.log | awk '{print $NF}')) + for math_lib in SVML SLEEF; + do + echo "| | | | math library: ${math_lib}" + if [[ $math_lib == "SVML" ]]; then + math_lib_path=${svml_lib} + else + math_lib_path=${sleef_lib} + fi + nmodl_args="${kernels_path}/${kernel_target}.mod llvm --ir ${fast_math_flag} --opt-level-ir 3 cpu --name ${nmodl_architecture} --vector-width ${vec_width} --math-library ${math_lib} benchmark --run --instance-size ${kernel_inst_size} --repeat ${num_exp} --opt-level-codegen 3 --libs ${math_lib_path}" + benchmark_nmodl_desc=${kernel_target}_nmodl-jit_${math_lib}_${nmodl_architecture}_v${vec_width}_${fast_math_opt} + benchmark_description+=("${benchmark_nmodl_desc}") + # runs only kernel generated by LLVM IR + ${debug} eval "${nmodl_exe} 
${nmodl_args} 2>&1 | tee ${output_dir}/${benchmark_nmodl_desc}.log" + benchmark_time+=($(grep "Average compute time" ${output_dir}/${benchmark_nmodl_desc}.log | awk '{print $NF}')) + benchmark_variance+=($(grep "Compute time variance" ${output_dir}/${benchmark_nmodl_desc}.log | awk '{print $NF}')) + done done done done From 1519871b4f297478cec213231e1c7f523f94d170 Mon Sep 17 00:00:00 2001 From: Ioannis Magkanaris Date: Tue, 26 Apr 2022 13:36:49 +0200 Subject: [PATCH 291/331] Added GPU execution in script and ncu option --- test/benchmark/llvm_benchmark.hpp | 2 +- test/benchmark/nmodl-llvm-time.sh | 27 +++++++++++++++++++-------- 2 files changed, 20 insertions(+), 9 deletions(-) diff --git a/test/benchmark/llvm_benchmark.hpp b/test/benchmark/llvm_benchmark.hpp index e8ef56eab7..37992cb86d 100644 --- a/test/benchmark/llvm_benchmark.hpp +++ b/test/benchmark/llvm_benchmark.hpp @@ -103,7 +103,7 @@ class LLVMBenchmark { const Platform& platform, int opt_level_ir, int opt_level_codegen, - bool external_kernel + bool external_kernel, const GPUExecutionParameters& gpu_exec_params) : llvm_visitor(llvm_visitor) , mod_filename(mod_filename) diff --git a/test/benchmark/nmodl-llvm-time.sh b/test/benchmark/nmodl-llvm-time.sh index 0ee25e1602..c16d98370e 100755 --- a/test/benchmark/nmodl-llvm-time.sh +++ b/test/benchmark/nmodl-llvm-time.sh @@ -39,6 +39,7 @@ function showusage { -d, --dry-run Debug run. -h, --help Display this usage information. -V, --version Show version and exit. +-ncu, --nsight-compute Run CUDA ncu for GPU kernels Driver for benchmarking. " } @@ -78,6 +79,11 @@ while [[ "$1" != "" ]]; do debug=echo shift ;; + -ncu|--nsight-compute) + echo "profiling mode for gpu" + ncu_exec=$(which ncu) + shift + ;; -V|--version) echo "$version_string" exit 0 @@ -110,19 +116,21 @@ llc_exe=${llvm_path}/llc #gcc path gcc_exe=$(module show gcc 2>&1 | grep " PATH " | awk -F' ' '{print $3}' | head -n 1)/g++ +#libdevice path +libdevice_lib=${CUDA_HOME}/nvvm/libdevice/libdevice.10.bc + #add ld library path export LD_LIBRARY_PATH=`dirname $svml_lib`:`dirname $sleef_lib`:${llvm_path}/lib:$LD_LIBRARY_PATH # nmodl binary nmodl_src_dir=$(pwd)/../../ -nmodl_exe=${nmodl_src_dir}/build_benchmark/bin/nmodl +nmodl_exe=${nmodl_src_dir}/build_benchmark_gpu/bin/nmodl # external kernel kernels_path=${nmodl_src_dir}/test/benchmark/kernels modfile_directory=${nmodl_src_dir}/test/benchmark/kernels ext_lib="libextkernel.so" - # compiler flags declare -a intel_flags_skylake_avx512=( "-O2 -march=skylake-avx512 -mtune=skylake -prec-div -fimf-use-svml" @@ -177,9 +185,9 @@ declare -a benchmark_variance # Kernels, architectures and compilers loop -KERNEL_TARGETS="compute-bound memory-bound hh" +KERNEL_TARGETS="compute-bound hh" #"compute-bound memory-bound hh" -ARCHITECTURES="skylake_avx512 broadwell nehalem gpu" +ARCHITECTURES="default" #"skylake_avx512 broadwell nehalem nvptx64" COMPILERS="intel clang gcc" @@ -291,7 +299,7 @@ for kernel_target in ${KERNEL_TARGETS}; do fast_math_opt="nonfastmath" fi echo "| | | options: ${fast_math_flag}" - if [[ $nmodl_architecture != "gpu" ]]; then + if [[ $nmodl_architecture != "nvptx64" ]]; then for math_lib in SVML SLEEF; do echo "| | | | math library: ${math_lib}" @@ -309,11 +317,14 @@ for kernel_target in ${KERNEL_TARGETS}; do benchmark_variance+=($(grep "Compute time variance" ${output_dir}/${benchmark_nmodl_desc}.log | awk '{print $NF}')) done else - nmodl_args="${kernels_path}/${kernel_target}.mod llvm --ir ${fast_math_flag} --opt-level-ir 3 gpu --name ${nmodl_architecture} --vector-width 
${vec_width} --math-library libdevice benchmark --run --instance-size ${kernel_inst_size} --repeat ${num_exp} --opt-level-codegen 3 --libs ${math_lib_path}" - benchmark_nmodl_desc=${kernel_target}_nmodl-jit_${math_lib}_${nmodl_architecture}_v${vec_width}_${fast_math_opt} + nmodl_args="${kernels_path}/${kernel_target}.mod llvm --ir ${fast_math_flag} --opt-level-ir 3 gpu --name ${nmodl_architecture} --target-arch \"sm_70\" --math-library libdevice benchmark --run --instance-size ${kernel_inst_size} --repeat ${num_exp} --opt-level-codegen 3 --libs ${libdevice_lib} --grid-dim-x 16384 --block-dim-x 512" + benchmark_nmodl_desc=${kernel_target}_nmodl-cuda-jit_libdevice_${nmodl_architecture}_v${vec_width}_${fast_math_opt} benchmark_description+=("${benchmark_nmodl_desc}") # runs only kernel generated by LLVM IR - ${debug} eval "${nmodl_exe} ${nmodl_args} 2>&1 | tee ${output_dir}/${benchmark_nmodl_desc}.log" + if [[ $ncu_exec != "" ]]; then + ncu="${ncu_exec} --set detailed -f -o ${kernel_target}_${fast_math_opt}_detailed" + fi + ${debug} eval "${ncu} ${nmodl_exe} ${nmodl_args} 2>&1 | tee ${output_dir}/${benchmark_nmodl_desc}.log" benchmark_time+=($(grep "Average compute time" ${output_dir}/${benchmark_nmodl_desc}.log | awk '{print $NF}')) benchmark_variance+=($(grep "Compute time variance" ${output_dir}/${benchmark_nmodl_desc}.log | awk '{print $NF}')) fi From eae47e33c810d927f7408be351cbf0ed9f4ff872 Mon Sep 17 00:00:00 2001 From: Ioannis Magkanaris Date: Wed, 27 Apr 2022 14:12:08 +0300 Subject: [PATCH 292/331] Removed loading git module --- .gitlab-ci.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index 6c23942204..122edd0aa2 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -94,7 +94,7 @@ test_benchmark:cpu: extends: - .benchmark_config script: - - module load unstable git + - module load unstable - . ${SPACK_ROOT}/share/spack/setup-env.sh - spack load nmodl/${SPACK_INSTALLED_HASH} - nmodl test/integration/mod/test_math.mod llvm --ir --opt-level-ir 3 benchmark --run --opt-level-codegen 3 --instance-size 10000000 --repeat 2 @@ -104,7 +104,7 @@ test_benchmark:gpu: extends: - .benchmark_config script: - - module load unstable git + - module load unstable - . 
${SPACK_ROOT}/share/spack/setup-env.sh - spack load nmodl/${SPACK_INSTALLED_HASH} - nmodl test/integration/mod/test_math.mod llvm --no-debug --ir --opt-level-ir 3 gpu --target-arch "sm_70" --name "nvptx64" --math-library libdevice benchmark --run --libs "${CUDA_HOME}/nvvm/libdevice/libdevice.10.bc" --opt-level-codegen 3 --instance-size 10000000 --repeat 2 --grid-dim-x 4096 --block-dim-x 256 From 570c9b7d100718ed4a2c2b5b996e6d586cd35795 Mon Sep 17 00:00:00 2001 From: Ioannis Magkanaris Date: Wed, 27 Apr 2022 14:13:27 +0300 Subject: [PATCH 293/331] Improved condition declaration of cuda driver according to Pramod's comments --- test/benchmark/CMakeLists.txt | 5 ++++- test/benchmark/cuda_driver.cpp | 4 ---- test/benchmark/cuda_driver.hpp | 4 ---- test/benchmark/llvm_benchmark.cpp | 4 +++- test/benchmark/llvm_benchmark.hpp | 5 ++++- 5 files changed, 11 insertions(+), 11 deletions(-) diff --git a/test/benchmark/CMakeLists.txt b/test/benchmark/CMakeLists.txt index a0320ae7c4..c4ec584a8c 100644 --- a/test/benchmark/CMakeLists.txt +++ b/test/benchmark/CMakeLists.txt @@ -3,9 +3,12 @@ # ============================================================================= set(LLVM_BENCHMARK_SOURCE_FILES ${CMAKE_CURRENT_SOURCE_DIR}/jit_driver.cpp ${CMAKE_CURRENT_SOURCE_DIR}/jit_driver.hpp - ${CMAKE_CURRENT_SOURCE_DIR}/cuda_driver.cpp ${CMAKE_CURRENT_SOURCE_DIR}/cuda_driver.hpp ${CMAKE_CURRENT_SOURCE_DIR}/llvm_benchmark.cpp ${CMAKE_CURRENT_SOURCE_DIR}/llvm_benchmark.hpp) +if(NMODL_ENABLE_LLVM_CUDA) + list(APPEND LLVM_BENCHMARK_SOURCE_FILES ${CMAKE_CURRENT_SOURCE_DIR}/cuda_driver.cpp ${CMAKE_CURRENT_SOURCE_DIR}/cuda_driver.hpp) +endif() + # ============================================================================= # LLVM benchmark library # ============================================================================= diff --git a/test/benchmark/cuda_driver.cpp b/test/benchmark/cuda_driver.cpp index acffecd0be..5b14191b69 100644 --- a/test/benchmark/cuda_driver.cpp +++ b/test/benchmark/cuda_driver.cpp @@ -5,8 +5,6 @@ * Lesser General Public License. See top-level LICENSE file for details. 
*************************************************************************/ -#ifdef NMODL_LLVM_CUDA_BACKEND - #include #include @@ -197,5 +195,3 @@ void CUDADriver::init(const codegen::Platform& platform, BenchmarkInfo* benchmar } // namespace runner } // namespace nmodl - -#endif diff --git a/test/benchmark/cuda_driver.hpp b/test/benchmark/cuda_driver.hpp index 07323526e2..6fc8d2d435 100644 --- a/test/benchmark/cuda_driver.hpp +++ b/test/benchmark/cuda_driver.hpp @@ -15,8 +15,6 @@ * \brief \copybrief nmodl::runner::CUDADriver */ -#ifdef NMODL_LLVM_CUDA_BACKEND - #include #include @@ -187,5 +185,3 @@ class BenchmarkGPURunner: public BaseGPURunner { } // namespace runner } // namespace nmodl - -#endif diff --git a/test/benchmark/llvm_benchmark.cpp b/test/benchmark/llvm_benchmark.cpp index 98b3c0ec04..8d0eb58bba 100644 --- a/test/benchmark/llvm_benchmark.cpp +++ b/test/benchmark/llvm_benchmark.cpp @@ -9,12 +9,14 @@ #include "codegen/llvm/codegen_llvm_visitor.hpp" #include "llvm_benchmark.hpp" -#include "test/benchmark/cuda_driver.hpp" #include "test/benchmark/jit_driver.hpp" #include "llvm/Support/Host.h" #include "test/unit/codegen/codegen_data_helper.hpp" +#ifdef NMODL_LLVM_CUDA_BACKEND +#include "test/benchmark/cuda_driver.hpp" +#endif namespace nmodl { namespace benchmark { diff --git a/test/benchmark/llvm_benchmark.hpp b/test/benchmark/llvm_benchmark.hpp index a8d89d985f..12999b7325 100644 --- a/test/benchmark/llvm_benchmark.hpp +++ b/test/benchmark/llvm_benchmark.hpp @@ -12,10 +12,13 @@ #include "codegen/llvm/codegen_llvm_visitor.hpp" #include "gpu_parameters.hpp" -#include "test/benchmark/cuda_driver.hpp" #include "test/benchmark/jit_driver.hpp" #include "utils/logger.hpp" +#ifdef NMODL_LLVM_CUDA_BACKEND +#include "test/benchmark/cuda_driver.hpp" +#endif + using nmodl::codegen::Platform; using nmodl::cuda_details::GPUExecutionParameters; From c154efbaa1d817287e87411c7fb0679687af4ba7 Mon Sep 17 00:00:00 2001 From: Omar Awile Date: Wed, 27 Apr 2022 14:32:20 +0200 Subject: [PATCH 294/331] JIT invocation from python for benchmarks (#832) - Created CodegenDriver class to factor out ast preparation - Created pybind wrappers for Jit and Codegen configuration options - Updated benchmark runner to return runtime stats - Return benchmark results to python - Addressed @iomaganaris' comments. 
- Add a PyJIT integration test * Re-enable python bindings by default --- CMakeLists.txt | 2 +- src/codegen/CMakeLists.txt | 6 +- src/codegen/codegen_c_visitor.hpp | 3 +- src/codegen/codegen_driver.cpp | 261 ++++++++ src/codegen/codegen_driver.hpp | 168 +++++ src/codegen/llvm/codegen_llvm_visitor.cpp | 1 + src/codegen/llvm/codegen_llvm_visitor.hpp | 1 - src/codegen/llvm/target_platform.hpp | 4 +- src/main.cpp | 608 ++++-------------- src/pybind/CMakeLists.txt | 6 + src/pybind/pynmodl.cpp | 114 +++- test/benchmark/CMakeLists.txt | 17 + test/benchmark/benchmark.py | 25 + test/benchmark/llvm_benchmark.cpp | 50 +- test/benchmark/llvm_benchmark.hpp | 14 +- test/unit/CMakeLists.txt | 6 +- test/unit/codegen/codegen_data_helper.hpp | 7 +- test/unit/codegen/codegen_llvm_execution.cpp | 8 +- .../codegen/codegen_llvm_instance_struct.cpp | 2 +- 19 files changed, 788 insertions(+), 515 deletions(-) create mode 100644 src/codegen/codegen_driver.cpp create mode 100644 src/codegen/codegen_driver.hpp create mode 100644 test/benchmark/benchmark.py diff --git a/CMakeLists.txt b/CMakeLists.txt index 5caf48c684..d2e8a84f7a 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -20,7 +20,7 @@ set(CMAKE_RUNTIME_OUTPUT_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}/bin) # ============================================================================= # Build options for NMODL # ============================================================================= -option(NMODL_ENABLE_PYTHON_BINDINGS "Enable pybind11 based python bindings" OFF) +option(NMODL_ENABLE_PYTHON_BINDINGS "Enable pybind11 based python bindings" ON) option(NMODL_ENABLE_LEGACY_UNITS "Use original faraday, R, etc. instead of 2019 nist constants" OFF) option(NMODL_ENABLE_LLVM "Enable LLVM based code generation" ON) option(NMODL_ENABLE_LLVM_GPU "Enable LLVM based GPU code generation" ON) diff --git a/src/codegen/CMakeLists.txt b/src/codegen/CMakeLists.txt index 2d31e1b1d6..027c99ef77 100644 --- a/src/codegen/CMakeLists.txt +++ b/src/codegen/CMakeLists.txt @@ -20,7 +20,11 @@ set(CODEGEN_SOURCE_FILES ${CMAKE_CURRENT_SOURCE_DIR}/codegen_ispc_visitor.hpp ${CMAKE_CURRENT_SOURCE_DIR}/codegen_naming.hpp ${CMAKE_CURRENT_SOURCE_DIR}/codegen_utils.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/codegen_utils.hpp) + ${CMAKE_CURRENT_SOURCE_DIR}/codegen_utils.hpp + ${CMAKE_CURRENT_SOURCE_DIR}/codegen_driver.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/codegen_driver.hpp) + +include_directories(${PYBIND11_INCLUDE_DIR} ${PYTHON_INCLUDE_DIRS}) # ============================================================================= # Codegen library and executable diff --git a/src/codegen/codegen_c_visitor.hpp b/src/codegen/codegen_c_visitor.hpp index b02c2941c7..b5b58412a0 100644 --- a/src/codegen/codegen_c_visitor.hpp +++ b/src/codegen/codegen_c_visitor.hpp @@ -31,12 +31,11 @@ #include "visitors/ast_visitor.hpp" -using namespace fmt::literals; - namespace nmodl { /// encapsulates code generation backend implementations namespace codegen { +using namespace fmt::literals; /** * @defgroup codegen Code Generation Implementation * @brief Implementations of code generation backends diff --git a/src/codegen/codegen_driver.cpp b/src/codegen/codegen_driver.cpp new file mode 100644 index 0000000000..256d970074 --- /dev/null +++ b/src/codegen/codegen_driver.cpp @@ -0,0 +1,261 @@ +/************************************************************************* + * Copyright (C) 2018-2022 Blue Brain Project + * + * This file is part of NMODL distributed under the terms of the GNU + * Lesser General Public License. 
See top-level LICENSE file for details. + *************************************************************************/
+
+#include
+
+#include "codegen/codegen_driver.hpp"
+#include "codegen_compatibility_visitor.hpp"
+#include "utils/logger.hpp"
+#include "visitors/after_cvode_to_cnexp_visitor.hpp"
+#include "visitors/ast_visitor.hpp"
+#include "visitors/constant_folder_visitor.hpp"
+#include "visitors/global_var_visitor.hpp"
+#include "visitors/inline_visitor.hpp"
+#include "visitors/ispc_rename_visitor.hpp"
+#include "visitors/kinetic_block_visitor.hpp"
+#include "visitors/local_to_assigned_visitor.hpp"
+#include "visitors/local_var_rename_visitor.hpp"
+#include "visitors/localize_visitor.hpp"
+#include "visitors/loop_unroll_visitor.hpp"
+#include "visitors/neuron_solve_visitor.hpp"
+#include "visitors/nmodl_visitor.hpp"
+#include "visitors/perf_visitor.hpp"
+#include "visitors/semantic_analysis_visitor.hpp"
+#include "visitors/solve_block_visitor.hpp"
+#include "visitors/steadystate_visitor.hpp"
+#include "visitors/sympy_conductance_visitor.hpp"
+#include "visitors/sympy_solver_visitor.hpp"
+#include "visitors/symtab_visitor.hpp"
+#include "visitors/units_visitor.hpp"
+#include "visitors/verbatim_var_rename_visitor.hpp"
+
+using namespace nmodl;
+using namespace codegen;
+using namespace visitor;
+
+bool CodegenDriver::prepare_mod(std::shared_ptr<ast::Program> node) {
+    /// whether to update existing symbol table or create new
+    /// one whenever we run symtab visitor.
+    bool update_symtab = false;
+
+    std::string modfile;
+    std::string scratch_dir = "tmp";
+    auto filepath = [scratch_dir, modfile](const std::string& suffix, const std::string& ext) {
+        static int count = 0;
+        return "{}/{}.{}.{}.{}"_format(scratch_dir, modfile, std::to_string(count++), suffix, ext);
+    };
+
+    /// just visit the ast
+    AstVisitor().visit_program(*node);
+
+    /// Check some rules that ast should follow
+    {
+        logger->info("Running semantic analysis visitor");
+        if (SemanticAnalysisVisitor().check(*node)) {
+            return false;
+        }
+    }
+
+    /// construct symbol table
+    {
+        logger->info("Running symtab visitor");
+        SymtabVisitor(update_symtab).visit_program(*node);
+    }
+
+    /// use cnexp instead of after_cvode solve method
+    {
+        logger->info("Running CVode to cnexp visitor");
+        AfterCVodeToCnexpVisitor().visit_program(*node);
+        ast_to_nmodl(*node, filepath("after_cvode_to_cnexp", "mod"));
+    }
+
+    /// Rename variables that match ISPC compiler double constants
+    if (cfg.ispc_backend) {
+        logger->info("Running ISPC variables rename visitor");
+        IspcRenameVisitor(node).visit_program(*node);
+        SymtabVisitor(update_symtab).visit_program(*node);
+        ast_to_nmodl(*node, filepath("ispc_double_rename", "mod"));
+    }
+
+    /// GLOBAL to RANGE rename visitor
+    if (cfg.nmodl_global_to_range) {
+        // make sure to run perf visitor because code generator
+        // looks for read/write counts const/non-const declaration
+        PerfVisitor().visit_program(*node);
+        // make sure to run the GlobalToRange visitor after all the
+        // reinitializations of Symtab
+        logger->info("Running GlobalToRange visitor");
+        GlobalToRangeVisitor(*node).visit_program(*node);
+        SymtabVisitor(update_symtab).visit_program(*node);
+        ast_to_nmodl(*node, filepath("global_to_range", "mod"));
+    }
+
+    /// LOCAL to ASSIGNED visitor
+    if (cfg.nmodl_local_to_range) {
+        logger->info("Running LOCAL to ASSIGNED visitor");
+        PerfVisitor().visit_program(*node);
+        LocalToAssignedVisitor().visit_program(*node);
+        SymtabVisitor(update_symtab).visit_program(*node);
+        ast_to_nmodl(*node,
filepath("local_to_assigned", "mod"));
+    }
+
+    {
+        // Compatibility Checking
+        logger->info("Running code compatibility checker");
+        // run perfvisitor to update read/write counts
+        PerfVisitor().visit_program(*node);
+
+        auto ast_has_unhandled_nodes = CodegenCompatibilityVisitor().find_unhandled_ast_nodes(
+            *node);
+        // If we want to just check compatibility we return the result
+        if (cfg.only_check_compatibility) {
+            return !ast_has_unhandled_nodes;  // negate since this function returns false on failure
+        }
+
+        // If there is an incompatible construct and code generation is not forced exit NMODL
+        if (ast_has_unhandled_nodes && !cfg.force_codegen) {
+            return false;
+        }
+    }
+
+    ast_to_nmodl(*node, filepath("ast", "mod"));
+    ast_to_json(*node, filepath("ast", "json"));
+
+    if (cfg.verbatim_rename) {
+        logger->info("Running verbatim rename visitor");
+        VerbatimVarRenameVisitor().visit_program(*node);
+        ast_to_nmodl(*node, filepath("verbatim_rename", "mod"));
+    }
+
+    if (cfg.nmodl_const_folding) {
+        logger->info("Running nmodl constant folding visitor");
+        ConstantFolderVisitor().visit_program(*node);
+        ast_to_nmodl(*node, filepath("constfold", "mod"));
+    }
+
+    if (cfg.nmodl_unroll) {
+        logger->info("Running nmodl loop unroll visitor");
+        LoopUnrollVisitor().visit_program(*node);
+        ConstantFolderVisitor().visit_program(*node);
+        ast_to_nmodl(*node, filepath("unroll", "mod"));
+        SymtabVisitor(update_symtab).visit_program(*node);
+    }
+
+    /// note that we can not run symtab visitor in update mode as we
+    /// replace kinetic block with derivative block of same name
+    /// in global scope
+    {
+        logger->info("Running KINETIC block visitor");
+        auto kineticBlockVisitor = KineticBlockVisitor();
+        kineticBlockVisitor.visit_program(*node);
+        SymtabVisitor(update_symtab).visit_program(*node);
+        const auto filename = filepath("kinetic", "mod");
+        ast_to_nmodl(*node, filename);
+        if (cfg.nmodl_ast && kineticBlockVisitor.get_conserve_statement_count()) {
+            logger->warn(
+                "{} presents non-standard CONSERVE statements in DERIVATIVE blocks. Use it only for debugging/developing"_format(
+                    filename));
+        }
+    }
+
+    {
+        logger->info("Running STEADYSTATE visitor");
+        SteadystateVisitor().visit_program(*node);
+        SymtabVisitor(update_symtab).visit_program(*node);
+        ast_to_nmodl(*node, filepath("steadystate", "mod"));
+    }
+
+    /// Parsing units from "nrnunits.lib" and mod files
+    {
+        logger->info("Parsing Units");
+        UnitsVisitor(cfg.units_dir).visit_program(*node);
+    }
+
+    /// once we start modifying (especially removing) older constructs
+    /// from ast then we should run symtab visitor in update mode so
+    /// that old symbols (e.g.
prime variables) are not lost
+    update_symtab = true;
+
+    if (cfg.nmodl_inline) {
+        logger->info("Running nmodl inline visitor");
+        InlineVisitor().visit_program(*node);
+        ast_to_nmodl(*node, filepath("inline", "mod"));
+    }
+
+    if (cfg.local_rename) {
+        logger->info("Running local variable rename visitor");
+        LocalVarRenameVisitor().visit_program(*node);
+        SymtabVisitor(update_symtab).visit_program(*node);
+        ast_to_nmodl(*node, filepath("local_rename", "mod"));
+    }
+
+    if (cfg.nmodl_localize) {
+        // localize pass must follow rename pass to avoid conflict
+        logger->info("Running localize visitor");
+        LocalizeVisitor(cfg.localize_verbatim).visit_program(*node);
+        LocalVarRenameVisitor().visit_program(*node);
+        SymtabVisitor(update_symtab).visit_program(*node);
+        ast_to_nmodl(*node, filepath("localize", "mod"));
+    }
+
+    if (cfg.sympy_conductance) {
+        logger->info("Running sympy conductance visitor");
+        SympyConductanceVisitor().visit_program(*node);
+        SymtabVisitor(update_symtab).visit_program(*node);
+        ast_to_nmodl(*node, filepath("sympy_conductance", "mod"));
+    }
+
+    if (cfg.sympy_analytic || sparse_solver_exists(*node)) {
+        if (!cfg.sympy_analytic) {
+            logger->info(
+                "Automatically enable sympy_analytic because a solver of type sparse exists");
+        }
+        logger->info("Running sympy solve visitor");
+        SympySolverVisitor(cfg.sympy_pade, cfg.sympy_cse).visit_program(*node);
+        SymtabVisitor(update_symtab).visit_program(*node);
+        ast_to_nmodl(*node, filepath("sympy_solve", "mod"));
+    }
+
+    {
+        logger->info("Running cnexp visitor");
+        NeuronSolveVisitor().visit_program(*node);
+        ast_to_nmodl(*node, filepath("cnexp", "mod"));
+    }
+
+    {
+        SolveBlockVisitor().visit_program(*node);
+        SymtabVisitor(update_symtab).visit_program(*node);
+        ast_to_nmodl(*node, filepath("solveblock", "mod"));
+    }
+
+    if (cfg.json_perfstat) {
+        auto file = scratch_dir + "/" + modfile + ".perf.json";
+        logger->info("Writing performance statistics to {}", file);
+        PerfVisitor(file).visit_program(*node);
+    }
+
+    {
+        // make sure to run perf visitor because code generator
+        // looks for read/write counts const/non-const declaration
+        PerfVisitor().visit_program(*node);
+    }
+    return true;
+}
+
+void CodegenDriver::ast_to_nmodl(ast::Program& ast, const std::string& filepath) const {
+    if (cfg.nmodl_ast) {
+        NmodlPrintVisitor(filepath).visit_program(ast);
+        logger->info("AST to NMODL transformation written to {}", filepath);
+    }
+};
+
+void CodegenDriver::ast_to_json(ast::Program& ast, const std::string& filepath) const {
+    if (cfg.json_ast) {
+        JSONVisitor(filepath).write(ast);
+        logger->info("AST to JSON transformation written to {}", filepath);
+    }
+};
diff --git a/src/codegen/codegen_driver.hpp b/src/codegen/codegen_driver.hpp
new file mode 100644
index 0000000000..aa2f9921d0
--- /dev/null
+++ b/src/codegen/codegen_driver.hpp
@@ -0,0 +1,168 @@
+/*************************************************************************
+ * Copyright (C) 2018-2022 Blue Brain Project
+ *
+ * This file is part of NMODL distributed under the terms of the GNU
+ * Lesser General Public License. See top-level LICENSE file for details.
+ *************************************************************************/
+
+
+#pragma once
+
+#include
+#include
+
+#include "ast/program.hpp"
+#include "config/config.h"
+
+namespace nmodl {
+namespace codegen {
+
+struct CodeGenConfig {
+    /// true if serial c code to be generated
+    bool c_backend = true;
+
+    /// true if c code with openmp to be generated
+    bool omp_backend = false;
+
+    /// true if ispc code to be generated
+    bool ispc_backend = false;
+
+    /// true if c code with openacc to be generated
+    bool oacc_backend = false;
+
+    /// true if cuda code to be generated
+    bool cuda_backend = false;
+
+    /// true if llvm code to be generated
+    bool llvm_backend = false;
+
+    /// true if sympy should be used for solving ODEs analytically
+    bool sympy_analytic = false;
+
+    /// true if Pade approximation to be used
+    bool sympy_pade = false;
+
+    /// true if CSE (temp variables) to be used
+    bool sympy_cse = false;
+
+    /// true if conductance keyword can be added to breakpoint
+    bool sympy_conductance = false;
+
+    /// true if inlining at nmodl level to be done
+    bool nmodl_inline = false;
+
+    /// true if unroll at nmodl level to be done
+    bool nmodl_unroll = false;
+
+    /// true if perform constant folding at nmodl level to be done
+    bool nmodl_const_folding = false;
+
+    /// true if range variables to be converted to local
+    bool nmodl_localize = false;
+
+    /// true if global variables to be converted to range
+    bool nmodl_global_to_range = false;
+
+    /// true if top level local variables to be converted to range
+    bool nmodl_local_to_range = false;
+
+    /// true if localize variables even if verbatim block is used
+    bool localize_verbatim = false;
+
+    /// true if local variables to be renamed
+    bool local_rename = true;
+
+    /// true if inline even if verbatim block exist
+    bool verbatim_inline = false;
+
+    /// true if variables in verbatim blocks to be renamed
+    bool verbatim_rename = true;
+
+    /// true if code generation is forced to happen even if there
+    /// is any incompatibility
+    bool force_codegen = false;
+
+    /// true if we want to only check compatibility without generating code
+    bool only_check_compatibility = false;
+
+    /// true if ion variable copies should be avoided
+    bool optimize_ionvar_copies_codegen = false;
+
+    /// directory where code will be generated
+    std::string output_dir = ".";
+
+    /// directory where intermediate file will be generated
+    std::string scratch_dir = "tmp";
+
+    /// directory where units lib file is located
+    std::string units_dir = NrnUnitsLib::get_path();
+
+    /// floating point data type
+    std::string data_type = "double";
+
+    /// true if ast should be converted to nmodl
+    bool nmodl_ast = false;
+
+    /// true if ast should be converted to json
+    bool json_ast = false;
+
+    /// true if performance stats should be converted to json
+    bool json_perfstat = false;
+
+#ifdef NMODL_LLVM_BACKEND
+    /// generate llvm IR
+    bool llvm_ir = false;
+
+    /// use single precision floating-point types
+    bool llvm_float_type = false;
+
+    /// optimisation level for IR generation
+    int llvm_opt_level_ir = 0;
+
+    /// math library name
+    std::string llvm_math_library = "none";
+
+    /// disable debug information generation for the IR
+    bool llvm_no_debug = false;
+
+    /// fast math flags for LLVM backend
+    std::vector<std::string> llvm_fast_math_flags;
+
+    /// target CPU platform name
+    std::string llvm_cpu_name = "default";
+
+    /// target GPU platform name
+    std::string llvm_gpu_name = "default";
+
+    /// GPU target architecture
+    std::string llvm_gpu_target_architecture = "sm_70";
+
+    /// llvm vector width if
generating code for CPUs + int llvm_vector_width = 1; + + /// optimisation level for machine code generation + int llvm_opt_level_codegen = 0; + + /// list of shared libraries to link against in JIT + std::vector shared_lib_paths; +#endif +}; + +class CodegenDriver { + public: + explicit CodegenDriver(CodeGenConfig _cfg) + : cfg(std::move(_cfg)) {} + + bool prepare_mod(std::shared_ptr node); + + private: + CodeGenConfig cfg; + + + /// write ast to nmodl + void ast_to_nmodl(ast::Program& ast, const std::string& filepath) const; + void ast_to_json(ast::Program& ast, const std::string& filepath) const; +}; + +} // namespace codegen +} // namespace nmodl diff --git a/src/codegen/llvm/codegen_llvm_visitor.cpp b/src/codegen/llvm/codegen_llvm_visitor.cpp index ca3b405be3..8dfb66e9e0 100644 --- a/src/codegen/llvm/codegen_llvm_visitor.cpp +++ b/src/codegen/llvm/codegen_llvm_visitor.cpp @@ -9,6 +9,7 @@ #include "codegen/llvm/llvm_utils.hpp" #include "ast/all.hpp" +#include "utils/logger.hpp" #include "visitors/rename_visitor.hpp" #include "visitors/visitor_utils.hpp" diff --git a/src/codegen/llvm/codegen_llvm_visitor.hpp b/src/codegen/llvm/codegen_llvm_visitor.hpp index 299071ae80..a7af83721c 100644 --- a/src/codegen/llvm/codegen_llvm_visitor.hpp +++ b/src/codegen/llvm/codegen_llvm_visitor.hpp @@ -23,7 +23,6 @@ #include "codegen/llvm/llvm_debug_builder.hpp" #include "codegen/llvm/llvm_ir_builder.hpp" #include "symtab/symbol_table.hpp" -#include "utils/logger.hpp" #include "visitors/ast_visitor.hpp" #include "llvm/IR/DIBuilder.h" diff --git a/src/codegen/llvm/target_platform.hpp b/src/codegen/llvm/target_platform.hpp index 8676f176b4..d15f220d1c 100644 --- a/src/codegen/llvm/target_platform.hpp +++ b/src/codegen/llvm/target_platform.hpp @@ -27,12 +27,12 @@ class Platform { private: /// Name of the platform. - const std::string name = Platform::DEFAULT_PLATFORM_NAME; + std::string name = Platform::DEFAULT_PLATFORM_NAME; /// Target chip for GPUs. /// TODO: this should only be available to GPUs! If we refactor target /// classes so that GPUPlatform <: Platform, it will be nicer! - const std::string subtarget_name = "sm_70"; + std::string subtarget_name = "sm_70"; /// Target-specific id to compare platforms easily. 
PlatformID platform_id; diff --git a/src/main.cpp b/src/main.cpp index a011cf818a..d93869e4e7 100644 --- a/src/main.cpp +++ b/src/main.cpp @@ -10,10 +10,8 @@ #include -#include "ast/program.hpp" #include "codegen/codegen_acc_visitor.hpp" #include "codegen/codegen_c_visitor.hpp" -#include "codegen/codegen_compatibility_visitor.hpp" #include "codegen/codegen_cuda_visitor.hpp" #include "codegen/codegen_ispc_visitor.hpp" #include "codegen/codegen_omp_visitor.hpp" @@ -23,36 +21,14 @@ #include "test/benchmark/llvm_benchmark.hpp" #endif +#include "codegen/codegen_driver.hpp" #include "config/config.h" #include "parser/nmodl_driver.hpp" #include "pybind/pyembed.hpp" #include "utils/common_utils.hpp" #include "utils/logger.hpp" -#include "visitors/after_cvode_to_cnexp_visitor.hpp" -#include "visitors/ast_visitor.hpp" -#include "visitors/constant_folder_visitor.hpp" -#include "visitors/global_var_visitor.hpp" -#include "visitors/indexedname_visitor.hpp" -#include "visitors/inline_visitor.hpp" -#include "visitors/ispc_rename_visitor.hpp" #include "visitors/json_visitor.hpp" -#include "visitors/kinetic_block_visitor.hpp" -#include "visitors/local_to_assigned_visitor.hpp" -#include "visitors/local_var_rename_visitor.hpp" -#include "visitors/localize_visitor.hpp" -#include "visitors/loop_unroll_visitor.hpp" -#include "visitors/neuron_solve_visitor.hpp" #include "visitors/nmodl_visitor.hpp" -#include "visitors/perf_visitor.hpp" -#include "visitors/semantic_analysis_visitor.hpp" -#include "visitors/solve_block_visitor.hpp" -#include "visitors/steadystate_visitor.hpp" -#include "visitors/sympy_conductance_visitor.hpp" -#include "visitors/sympy_solver_visitor.hpp" -#include "visitors/symtab_visitor.hpp" -#include "visitors/units_visitor.hpp" -#include "visitors/verbatim_var_rename_visitor.hpp" -#include "visitors/verbatim_visitor.hpp" /** * \dir @@ -75,94 +51,6 @@ int main(int argc, const char* argv[]) { /// true if debug logger statements should be shown std::string verbose("info"); - /// true if serial c code to be generated - bool c_backend(true); - - /// true if c code with openmp to be generated - bool omp_backend(false); - - /// true if ispc code to be generated - bool ispc_backend(false); - - /// true if c code with openacc to be generated - bool oacc_backend(false); - - /// true if cuda code to be generated - bool cuda_backend(false); - - /// true if llvm code to be generated - bool llvm_backend(false); - - /// true if sympy should be used for solving ODEs analytically - bool sympy_analytic(false); - - /// true if Pade approximation to be used - bool sympy_pade(false); - - /// true if CSE (temp variables) to be used - bool sympy_cse(false); - - /// true if conductance keyword can be added to breakpoint - bool sympy_conductance(false); - - /// true if inlining at nmodl level to be done - bool nmodl_inline(false); - - /// true if unroll at nmodl level to be done - bool nmodl_unroll(false); - - /// true if perform constant folding at nmodl level to be done - bool nmodl_const_folding(false); - - /// true if range variables to be converted to local - bool nmodl_localize(false); - - /// true if global variables to be converted to range - bool nmodl_global_to_range(false); - - /// true if top level local variables to be converted to range - bool nmodl_local_to_range(false); - - /// true if localize variables even if verbatim block is used - bool localize_verbatim(false); - - /// true if local variables to be renamed - bool local_rename(true); - - /// true if inline even if verbatim block exist - bool 
verbatim_inline(false); - - /// true if verbatim blocks - bool verbatim_rename(true); - - /// true if code generation is forced to happen even if there - /// is any incompatibility - bool force_codegen(false); - - /// true if we want to only check compatibility without generating code - bool only_check_compatibility(false); - - /// true if ion variable copies should be avoided - bool optimize_ionvar_copies_codegen(false); - - /// directory where code will be generated - std::string output_dir("."); - - /// directory where intermediate file will be generated - std::string scratch_dir("tmp"); - - /// directory where units lib file is located - std::string units_dir(NrnUnitsLib::get_path()); - - /// true if ast should be converted to json - bool json_ast(false); - - /// true if ast should be converted to nmodl - bool nmodl_ast(false); - - /// true if performance stats should be converted to json - bool json_perfstat(false); - /// true if symbol table should be printed bool show_symtab(false); @@ -170,42 +58,9 @@ int main(int argc, const char* argv[]) { std::string data_type("double"); #ifdef NMODL_LLVM_BACKEND - /// generate llvm IR - bool llvm_ir(false); - - /// use single precision floating-point types - bool llvm_float_type(false); - - /// optimisation level for IR generation - int llvm_opt_level_ir = 0; - - /// math library name - std::string llvm_math_library("none"); - - /// disable debug information generation for the IR - bool llvm_no_debug(false); - - /// fast math flags for LLVM backend - std::vector llvm_fast_math_flags; - - /// traget CPU platform name - std::string llvm_cpu_name = "default"; - - /// traget GPU platform name - std::string llvm_gpu_name = "default"; - - /// llvm vector width if generating code for CPUs - int llvm_vector_width = 1; - /// run llvm benchmark bool llvm_benchmark(false); - /// optimisation level for machine code generation - int llvm_opt_level_codegen = 0; - - /// list of shared libraries to link against in JIT - std::vector shared_lib_paths; - /// the size of the instance struct for the benchmark int instance_size = 10000; @@ -213,6 +68,8 @@ int main(int argc, const char* argv[]) { int num_experiments = 100; #endif + CodeGenConfig cfg; + app.get_formatter()->column_width(40); app.set_help_all_flag("-H,--help-all", "Print this help message including all sub-commands"); @@ -225,146 +82,158 @@ int main(int argc, const char* argv[]) { ->required() ->check(CLI::ExistingFile); - app.add_option("-o,--output", output_dir, "Directory for backend code output", true) + app.add_option("-o,--output", cfg.output_dir, "Directory for backend code output", true) ->ignore_case(); - app.add_option("--scratch", scratch_dir, "Directory for intermediate code output", true) + app.add_option("--scratch", cfg.scratch_dir, "Directory for intermediate code output", true) ->ignore_case(); - app.add_option("--units", units_dir, "Directory of units lib file", true)->ignore_case(); + app.add_option("--units", cfg.units_dir, "Directory of units lib file", true)->ignore_case(); auto host_opt = app.add_subcommand("host", "HOST/CPU code backends")->ignore_case(); - host_opt->add_flag("--c", c_backend, "C/C++ backend ({})"_format(c_backend))->ignore_case(); - host_opt->add_flag("--omp", omp_backend, "C/C++ backend with OpenMP ({})"_format(omp_backend)) + host_opt->add_flag("--c", cfg.c_backend, "C/C++ backend ({})"_format(cfg.c_backend)) ->ignore_case(); - host_opt->add_flag("--ispc", ispc_backend, "C/C++ backend with ISPC ({})"_format(ispc_backend)) + host_opt + ->add_flag("--omp", + 
cfg.omp_backend, + "C/C++ backend with OpenMP ({})"_format(cfg.omp_backend)) + ->ignore_case(); + host_opt + ->add_flag("--ispc", + cfg.ispc_backend, + "C/C++ backend with ISPC ({})"_format(cfg.ispc_backend)) ->ignore_case(); auto acc_opt = app.add_subcommand("acc", "Accelerator code backends")->ignore_case(); acc_opt - ->add_flag("--oacc", oacc_backend, "C/C++ backend with OpenACC ({})"_format(oacc_backend)) + ->add_flag("--oacc", + cfg.oacc_backend, + "C/C++ backend with OpenACC ({})"_format(cfg.oacc_backend)) ->ignore_case(); - acc_opt->add_flag("--cuda", cuda_backend, "C/C++ backend with CUDA ({})"_format(cuda_backend)) + acc_opt + ->add_flag("--cuda", + cfg.cuda_backend, + "C/C++ backend with CUDA ({})"_format(cfg.cuda_backend)) ->ignore_case(); // clang-format off auto sympy_opt = app.add_subcommand("sympy", "SymPy based analysis and optimizations")->ignore_case(); sympy_opt->add_flag("--analytic", - sympy_analytic, - "Solve ODEs using SymPy analytic integration ({})"_format(sympy_analytic))->ignore_case(); + cfg.sympy_analytic, + "Solve ODEs using SymPy analytic integration ({})"_format(cfg.sympy_analytic))->ignore_case(); sympy_opt->add_flag("--pade", - sympy_pade, - "Pade approximation in SymPy analytic integration ({})"_format(sympy_pade))->ignore_case(); + cfg.sympy_pade, + "Pade approximation in SymPy analytic integration ({})"_format(cfg.sympy_pade))->ignore_case(); sympy_opt->add_flag("--cse", - sympy_cse, - "CSE (Common Subexpression Elimination) in SymPy analytic integration ({})"_format(sympy_cse))->ignore_case(); + cfg.sympy_cse, + "CSE (Common Subexpression Elimination) in SymPy analytic integration ({})"_format(cfg.sympy_cse))->ignore_case(); sympy_opt->add_flag("--conductance", - sympy_conductance, - "Add CONDUCTANCE keyword in BREAKPOINT ({})"_format(sympy_conductance))->ignore_case(); + cfg.sympy_conductance, + "Add CONDUCTANCE keyword in BREAKPOINT ({})"_format(cfg.sympy_conductance))->ignore_case(); auto passes_opt = app.add_subcommand("passes", "Analyse/Optimization passes")->ignore_case(); passes_opt->add_flag("--inline", - nmodl_inline, - "Perform inlining at NMODL level ({})"_format(nmodl_inline))->ignore_case(); + cfg.nmodl_inline, + "Perform inlining at NMODL level ({})"_format(cfg.nmodl_inline))->ignore_case(); passes_opt->add_flag("--unroll", - nmodl_unroll, - "Perform loop unroll at NMODL level ({})"_format(nmodl_unroll))->ignore_case(); + cfg.nmodl_unroll, + "Perform loop unroll at NMODL level ({})"_format(cfg.nmodl_unroll))->ignore_case(); passes_opt->add_flag("--const-folding", - nmodl_const_folding, - "Perform constant folding at NMODL level ({})"_format(nmodl_const_folding))->ignore_case(); + cfg.nmodl_const_folding, + "Perform constant folding at NMODL level ({})"_format(cfg.nmodl_const_folding))->ignore_case(); passes_opt->add_flag("--localize", - nmodl_localize, - "Convert RANGE variables to LOCAL ({})"_format(nmodl_localize))->ignore_case(); + cfg.nmodl_localize, + "Convert RANGE variables to LOCAL ({})"_format(cfg.nmodl_localize))->ignore_case(); passes_opt->add_flag("--global-to-range", - nmodl_global_to_range, - "Convert GLOBAL variables to RANGE ({})"_format(nmodl_global_to_range))->ignore_case(); + cfg.nmodl_global_to_range, + "Convert GLOBAL variables to RANGE ({})"_format(cfg.nmodl_global_to_range))->ignore_case(); passes_opt->add_flag("--local-to-range", - nmodl_local_to_range, - "Convert top level LOCAL variables to RANGE ({})"_format(nmodl_local_to_range))->ignore_case(); + cfg.nmodl_local_to_range, + "Convert top level LOCAL variables 
to RANGE ({})"_format(cfg.nmodl_local_to_range))->ignore_case(); passes_opt->add_flag("--localize-verbatim", - localize_verbatim, - "Convert RANGE variables to LOCAL even if verbatim block exist ({})"_format(localize_verbatim))->ignore_case(); + cfg.localize_verbatim, + "Convert RANGE variables to LOCAL even if verbatim block exist ({})"_format(cfg.localize_verbatim))->ignore_case(); passes_opt->add_flag("--local-rename", - local_rename, - "Rename LOCAL variable if variable of same name exist in global scope ({})"_format(local_rename))->ignore_case(); + cfg.local_rename, + "Rename LOCAL variable if variable of same name exist in global scope ({})"_format(cfg.local_rename))->ignore_case(); passes_opt->add_flag("--verbatim-inline", - verbatim_inline, - "Inline even if verbatim block exist ({})"_format(verbatim_inline))->ignore_case(); + cfg.verbatim_inline, + "Inline even if verbatim block exist ({})"_format(cfg.verbatim_inline))->ignore_case(); passes_opt->add_flag("--verbatim-rename", - verbatim_rename, - "Rename variables in verbatim block ({})"_format(verbatim_rename))->ignore_case(); + cfg.verbatim_rename, + "Rename variables in verbatim block ({})"_format(cfg.verbatim_rename))->ignore_case(); passes_opt->add_flag("--json-ast", - json_ast, - "Write AST to JSON file ({})"_format(json_ast))->ignore_case(); + cfg.json_ast, + "Write AST to JSON file ({})"_format(cfg.json_ast))->ignore_case(); passes_opt->add_flag("--nmodl-ast", - nmodl_ast, - "Write AST to NMODL file ({})"_format(nmodl_ast))->ignore_case(); + cfg.nmodl_ast, + "Write AST to NMODL file ({})"_format(cfg.nmodl_ast))->ignore_case(); passes_opt->add_flag("--json-perf", - json_perfstat, - "Write performance statistics to JSON file ({})"_format(json_perfstat))->ignore_case(); + cfg.json_perfstat, + "Write performance statistics to JSON file ({})"_format(cfg.json_perfstat))->ignore_case(); passes_opt->add_flag("--show-symtab", show_symtab, "Write symbol table to stdout ({})"_format(show_symtab))->ignore_case(); auto codegen_opt = app.add_subcommand("codegen", "Code generation options")->ignore_case(); codegen_opt->add_option("--datatype", - data_type, + cfg.data_type, "Data type for floating point variables", true)->ignore_case()->check(CLI::IsMember({"float", "double"})); codegen_opt->add_flag("--force", - force_codegen, + cfg.force_codegen, "Force code generation even if there is any incompatibility"); codegen_opt->add_flag("--only-check-compatibility", - only_check_compatibility, + cfg.only_check_compatibility, "Check compatibility and return without generating code"); codegen_opt->add_flag("--opt-ionvar-copy", - optimize_ionvar_copies_codegen, - "Optimize copies of ion variables ({})"_format(optimize_ionvar_copies_codegen))->ignore_case(); + cfg.optimize_ionvar_copies_codegen, + "Optimize copies of ion variables ({})"_format(cfg.optimize_ionvar_copies_codegen))->ignore_case(); #ifdef NMODL_LLVM_BACKEND // LLVM IR code generation options. 
auto llvm_opt = app.add_subcommand("llvm", "LLVM code generation option")->ignore_case(); auto llvm_ir_opt = llvm_opt->add_flag("--ir", - llvm_ir, - "Generate LLVM IR ({})"_format(llvm_ir))->ignore_case(); + cfg.llvm_ir, + "Generate LLVM IR ({})"_format(cfg.llvm_ir))->ignore_case(); llvm_ir_opt->required(true); llvm_opt->add_flag("--no-debug", - llvm_no_debug, - "Disable debug information ({})"_format(llvm_no_debug))->ignore_case(); + cfg.llvm_no_debug, + "Disable debug information ({})"_format(cfg.llvm_no_debug))->ignore_case(); llvm_opt->add_option("--opt-level-ir", - llvm_opt_level_ir, - "LLVM IR optimisation level (O{})"_format(llvm_opt_level_ir))->ignore_case()->check(CLI::IsMember({"0", "1", "2", "3"})); + cfg.llvm_opt_level_ir, + "LLVM IR optimisation level (O{})"_format(cfg.llvm_opt_level_ir))->ignore_case()->check(CLI::IsMember({"0", "1", "2", "3"})); llvm_opt->add_flag("--single-precision", - llvm_float_type, - "Use single precision floating-point types ({})"_format(llvm_float_type))->ignore_case(); + cfg.llvm_float_type, + "Use single precision floating-point types ({})"_format(cfg.llvm_float_type))->ignore_case(); llvm_opt->add_option("--fmf", - llvm_fast_math_flags, + cfg.llvm_fast_math_flags, "Fast math flags for floating-point optimizations (none)")->check(CLI::IsMember({"afn", "arcp", "contract", "ninf", "nnan", "nsz", "reassoc", "fast"})); // Platform options for LLVM code generation. auto cpu_opt = app.add_subcommand("cpu", "LLVM CPU option")->ignore_case(); cpu_opt->needs(llvm_opt); cpu_opt->add_option("--name", - llvm_cpu_name, + cfg.llvm_cpu_name, "Name of CPU platform to use")->ignore_case(); auto simd_math_library_opt = cpu_opt->add_option("--math-library", - llvm_math_library, - "Math library for SIMD code generation ({})"_format(llvm_math_library)); + cfg.llvm_math_library, + "Math library for SIMD code generation ({})"_format(cfg.llvm_math_library)); simd_math_library_opt->check(CLI::IsMember({"Accelerate", "libmvec", "libsystem_m", "MASSV", "SLEEF", "SVML", "none"})); cpu_opt->add_option("--vector-width", - llvm_vector_width, - "Explicit vectorization width for IR generation ({})"_format(llvm_vector_width))->ignore_case(); + cfg.llvm_vector_width, + "Explicit vectorization width for IR generation ({})"_format(cfg.llvm_vector_width))->ignore_case(); auto gpu_opt = app.add_subcommand("gpu", "LLVM GPU option")->ignore_case(); gpu_opt->needs(llvm_opt); auto gpu_target_name = gpu_opt->add_option("--name", - llvm_gpu_name, + cfg.llvm_gpu_name, "Name of GPU platform to use")->ignore_case(); gpu_opt->add_option("--target-chip", - llvm_cpu_name, + cfg.llvm_gpu_target_architecture, "Name of target chip to use")->ignore_case(); auto gpu_math_library_opt = gpu_opt->add_option("--math-library", - llvm_math_library, - "Math library for GPU code generation ({})"_format(llvm_math_library)); + cfg.llvm_math_library, + "Math library for GPU code generation ({})"_format(cfg.llvm_math_library)); gpu_math_library_opt->check(CLI::IsMember({"libdevice"})); // Allow only one platform at a time. 
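The "llvm", "cpu", and "gpu" subcommand options above map one-to-one onto CodeGenConfig fields, which the pybind changes later in this patch expose to Python. A minimal sketch of the intended Python-side equivalent follows (a sketch only: the import path, the driver's parse method name, and the Jit class name are assumptions; CodeGenConfig, its fields, and the run() argument order are taken from the diffs in this patch):

    from nmodl import _nmodl                    # assumed import path

    cfg = _nmodl.CodeGenConfig()                # llvm_backend defaults to True in the binding
    cfg.llvm_ir = True                          # equivalent of the required "llvm --ir" flag
    cfg.llvm_opt_level_ir = 3                   # "llvm --opt-level-ir 3"
    cfg.llvm_math_library = "SVML"              # "cpu --math-library SVML"

    driver = _nmodl.NmodlDriver()
    ast = driver.parse_file("hh.mod")           # parse method name is an assumption

    jit = _nmodl.Jit(cfg)                       # assumed binding name wrapping JitDriver
    results = jit.run(ast, "hh", 100, 10000)    # mod name, experiments, instance size

Here results corresponds to the BenchmarkResults object that the benchmark runner now returns to Python.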
@@ -378,9 +247,9 @@ int main(int argc, const char* argv[]) { llvm_benchmark, "Run LLVM benchmark ({})"_format(llvm_benchmark))->ignore_case(); benchmark_opt->add_option("--opt-level-codegen", - llvm_opt_level_codegen, - "Machine code optimisation level (O{})"_format(llvm_opt_level_codegen))->ignore_case()->check(CLI::IsMember({"0", "1", "2", "3"})); - benchmark_opt->add_option("--libs", shared_lib_paths, "Shared libraries to link IR against") + cfg.llvm_opt_level_codegen, + "Machine code optimisation level (O{})"_format(cfg.llvm_opt_level_codegen))->ignore_case()->check(CLI::IsMember({"0", "1", "2", "3"})); + benchmark_opt->add_option("--libs", cfg.shared_lib_paths, "Shared libraries to link IR against") ->ignore_case() ->check(CLI::ExistingFile); benchmark_opt->add_option("--instance-size", @@ -395,12 +264,12 @@ int main(int argc, const char* argv[]) { CLI11_PARSE(app, argc, argv); // if any of the other backends is used we force the C backend to be off. - if (omp_backend || ispc_backend) { - c_backend = false; + if (cfg.omp_backend || cfg.ispc_backend) { + cfg.c_backend = false; } - utils::make_path(output_dir); - utils::make_path(scratch_dir); + utils::make_path(cfg.output_dir); + utils::make_path(cfg.scratch_dir); if (sympy_opt) { nmodl::pybind_wrappers::EmbeddedPythonLoader::get_instance() @@ -410,21 +279,6 @@ int main(int argc, const char* argv[]) { logger->set_level(spdlog::level::from_str(verbose)); - /// write ast to nmodl - const auto ast_to_nmodl = [nmodl_ast](ast::Program& ast, const std::string& filepath) { - if (nmodl_ast) { - NmodlPrintVisitor(filepath).visit_program(ast); - logger->info("AST to NMODL transformation written to {}", filepath); - } - }; - - /// write ast to nmodl - const auto ast_to_json = [json_ast](ast::Program& ast, const std::string& filepath) { - if (json_ast) { - JSONVisitor(filepath).write(ast); - logger->info("AST to JSON transformation written to {}", filepath); - } - }; for (const auto& file: mod_files) { logger->info("Processing {}", file); @@ -432,92 +286,20 @@ int main(int argc, const char* argv[]) { const auto modfile = utils::remove_extension(utils::base_name(file)); /// create file path for nmodl file - auto filepath = [scratch_dir, modfile](const std::string& suffix, const std::string& ext) { + auto filepath = [cfg, modfile](const std::string& suffix, const std::string& ext) { static int count = 0; return "{}/{}.{}.{}.{}"_format( - scratch_dir, modfile, std::to_string(count++), suffix, ext); + cfg.scratch_dir, modfile, std::to_string(count++), suffix, ext); }; - /// driver object creates lexer and parser, just call parser method - NmodlDriver driver; + /// nmodl_driver object creates lexer and parser, just call parser method + NmodlDriver nmodl_driver; /// parse mod file and construct ast - const auto& ast = driver.parse_file(file); - - /// whether to update existing symbol table or create new - /// one whenever we run symtab visitor. 
- bool update_symtab = false; - - /// just visit the ast - AstVisitor().visit_program(*ast); - - /// Check some rules that ast should follow - { - logger->info("Running semantic analysis visitor"); - if (SemanticAnalysisVisitor().check(*ast)) { - return 1; - } - } - - /// construct symbol table - { - logger->info("Running symtab visitor"); - SymtabVisitor(update_symtab).visit_program(*ast); - } - - /// use cnexp instead of after_cvode solve method - { - logger->info("Running CVode to cnexp visitor"); - AfterCVodeToCnexpVisitor().visit_program(*ast); - ast_to_nmodl(*ast, filepath("after_cvode_to_cnexp", "mod")); - } - - /// Rename variables that match ISPC compiler double constants - if (ispc_backend) { - logger->info("Running ISPC variables rename visitor"); - IspcRenameVisitor(ast).visit_program(*ast); - SymtabVisitor(update_symtab).visit_program(*ast); - ast_to_nmodl(*ast, filepath("ispc_double_rename", "mod")); - } - - /// GLOBAL to RANGE rename visitor - if (nmodl_global_to_range) { - // make sure to run perf visitor because code generator - // looks for read/write counts const/non-const declaration - PerfVisitor().visit_program(*ast); - // make sure to run the GlobalToRange visitor after all the - // reinitializations of Symtab - logger->info("Running GlobalToRange visitor"); - GlobalToRangeVisitor(*ast).visit_program(*ast); - SymtabVisitor(update_symtab).visit_program(*ast); - ast_to_nmodl(*ast, filepath("global_to_range", "mod")); - } + const auto& ast = nmodl_driver.parse_file(file); - /// LOCAL to ASSIGNED visitor - if (nmodl_local_to_range) { - logger->info("Running LOCAL to ASSIGNED visitor"); - PerfVisitor().visit_program(*ast); - LocalToAssignedVisitor().visit_program(*ast); - SymtabVisitor(update_symtab).visit_program(*ast); - ast_to_nmodl(*ast, filepath("local_to_assigned", "mod")); - } - - { - // Compatibility Checking - logger->info("Running code compatibility checker"); - // run perfvisitor to update read/write counts - PerfVisitor().visit_program(*ast); - - // If we want to just check compatibility we return the result - if (only_check_compatibility) { - return CodegenCompatibilityVisitor().find_unhandled_ast_nodes(*ast); - } - - // If there is an incompatible construct and code generation is not forced exit NMODL - if (CodegenCompatibilityVisitor().find_unhandled_ast_nodes(*ast) && !force_codegen) { - return 1; - } - } + auto cg_driver = CodegenDriver(cfg); + auto success = cg_driver.prepare_mod(ast); if (show_symtab) { logger->info("Printing symbol table"); @@ -525,204 +307,98 @@ int main(int argc, const char* argv[]) { symtab->print(std::cout); } - ast_to_nmodl(*ast, filepath("ast", "mod")); - ast_to_json(*ast, filepath("ast", "json")); - - if (verbatim_rename) { - logger->info("Running verbatim rename visitor"); - VerbatimVarRenameVisitor().visit_program(*ast); - ast_to_nmodl(*ast, filepath("verbatim_rename", "mod")); - } - - if (nmodl_const_folding) { - logger->info("Running nmodl constant folding visitor"); - ConstantFolderVisitor().visit_program(*ast); - ast_to_nmodl(*ast, filepath("constfold", "mod")); - } - - if (nmodl_unroll) { - logger->info("Running nmodl loop unroll visitor"); - LoopUnrollVisitor().visit_program(*ast); - ConstantFolderVisitor().visit_program(*ast); - ast_to_nmodl(*ast, filepath("unroll", "mod")); - SymtabVisitor(update_symtab).visit_program(*ast); - } - - /// note that we can not symtab visitor in update mode as we - /// replace kinetic block with derivative block of same name - /// in global scope - { - logger->info("Running KINETIC block 
visitor"); - auto kineticBlockVisitor = KineticBlockVisitor(); - kineticBlockVisitor.visit_program(*ast); - SymtabVisitor(update_symtab).visit_program(*ast); - const auto filename = filepath("kinetic", "mod"); - ast_to_nmodl(*ast, filename); - if (nmodl_ast && kineticBlockVisitor.get_conserve_statement_count()) { - logger->warn( - "{} presents non-standard CONSERVE statements in DERIVATIVE blocks. Use it only for debugging/developing"_format( - filename)); - } - } - - { - logger->info("Running STEADYSTATE visitor"); - SteadystateVisitor().visit_program(*ast); - SymtabVisitor(update_symtab).visit_program(*ast); - ast_to_nmodl(*ast, filepath("steadystate", "mod")); - } - - /// Parsing units fron "nrnunits.lib" and mod files - { - logger->info("Parsing Units"); - UnitsVisitor(units_dir).visit_program(*ast); - } - - /// once we start modifying (especially removing) older constructs - /// from ast then we should run symtab visitor in update mode so - /// that old symbols (e.g. prime variables) are not lost - update_symtab = true; - - if (nmodl_inline) { - logger->info("Running nmodl inline visitor"); - InlineVisitor().visit_program(*ast); - ast_to_nmodl(*ast, filepath("inline", "mod")); - } - - if (local_rename) { - logger->info("Running local variable rename visitor"); - LocalVarRenameVisitor().visit_program(*ast); - SymtabVisitor(update_symtab).visit_program(*ast); - ast_to_nmodl(*ast, filepath("local_rename", "mod")); - } - - if (nmodl_localize) { - // localize pass must follow rename pass to avoid conflict - logger->info("Running localize visitor"); - LocalizeVisitor(localize_verbatim).visit_program(*ast); - LocalVarRenameVisitor().visit_program(*ast); - SymtabVisitor(update_symtab).visit_program(*ast); - ast_to_nmodl(*ast, filepath("localize", "mod")); + if (cfg.only_check_compatibility) { + return !success; } - - if (sympy_conductance) { - logger->info("Running sympy conductance visitor"); - SympyConductanceVisitor().visit_program(*ast); - SymtabVisitor(update_symtab).visit_program(*ast); - ast_to_nmodl(*ast, filepath("sympy_conductance", "mod")); - } - - if (sympy_analytic || sparse_solver_exists(*ast)) { - if (!sympy_analytic) { - logger->info( - "Automatically enable sympy_analytic because it exists solver of type sparse"); - } - logger->info("Running sympy solve visitor"); - SympySolverVisitor(sympy_pade, sympy_cse).visit_program(*ast); - SymtabVisitor(update_symtab).visit_program(*ast); - ast_to_nmodl(*ast, filepath("sympy_solve", "mod")); + if (!success && !cfg.force_codegen) { + return 1; } { - logger->info("Running cnexp visitor"); - NeuronSolveVisitor().visit_program(*ast); - ast_to_nmodl(*ast, filepath("cnexp", "mod")); - } - - { - SolveBlockVisitor().visit_program(*ast); - SymtabVisitor(update_symtab).visit_program(*ast); - ast_to_nmodl(*ast, filepath("solveblock", "mod")); - } - - if (json_perfstat) { - auto file = scratch_dir + "/" + modfile + ".perf.json"; - logger->info("Writing performance statistics to {}", file); - PerfVisitor(file).visit_program(*ast); - } - - { - // make sure to run perf visitor because code generator - // looks for read/write counts const/non-const declaration - PerfVisitor().visit_program(*ast); - } - - { - if (ispc_backend) { + if (cfg.ispc_backend) { logger->info("Running ISPC backend code generator"); CodegenIspcVisitor visitor(modfile, - output_dir, + cfg.output_dir, data_type, - optimize_ionvar_copies_codegen); + cfg.optimize_ionvar_copies_codegen); visitor.visit_program(*ast); } - else if (oacc_backend) { + else if (cfg.oacc_backend) { 
logger->info("Running OpenACC backend code generator"); CodegenAccVisitor visitor(modfile, - output_dir, + cfg.output_dir, data_type, - optimize_ionvar_copies_codegen); + cfg.optimize_ionvar_copies_codegen); visitor.visit_program(*ast); } - else if (omp_backend) { + else if (cfg.omp_backend) { logger->info("Running OpenMP backend code generator"); CodegenOmpVisitor visitor(modfile, - output_dir, + cfg.output_dir, data_type, - optimize_ionvar_copies_codegen); + cfg.optimize_ionvar_copies_codegen); visitor.visit_program(*ast); } - else if (c_backend) { + else if (cfg.c_backend) { logger->info("Running C backend code generator"); CodegenCVisitor visitor(modfile, - output_dir, + cfg.output_dir, data_type, - optimize_ionvar_copies_codegen); + cfg.optimize_ionvar_copies_codegen); visitor.visit_program(*ast); } - if (cuda_backend) { + if (cfg.cuda_backend) { logger->info("Running CUDA backend code generator"); CodegenCudaVisitor visitor(modfile, - output_dir, + cfg.output_dir, data_type, - optimize_ionvar_copies_codegen); + cfg.optimize_ionvar_copies_codegen); visitor.visit_program(*ast); } #ifdef NMODL_LLVM_BACKEND - if (llvm_ir || llvm_benchmark) { + if (cfg.llvm_ir || llvm_benchmark) { // If benchmarking, we want to optimize the IR with target // information and not in LLVM visitor. - int llvm_opt_level = llvm_benchmark ? 0 : llvm_opt_level_ir; + int llvm_opt_level = llvm_benchmark ? 0 : cfg.llvm_opt_level_ir; // Create platform abstraction. - PlatformID pid = llvm_gpu_name == "default" ? PlatformID::CPU : PlatformID::GPU; - const std::string name = llvm_gpu_name == "default" ? llvm_cpu_name : llvm_gpu_name; + PlatformID pid = cfg.llvm_gpu_name == "default" ? PlatformID::CPU : PlatformID::GPU; + const std::string name = cfg.llvm_gpu_name == "default" ? 
cfg.llvm_cpu_name + : cfg.llvm_gpu_name; Platform platform(pid, name, - llvm_cpu_name, - llvm_math_library, - llvm_float_type, - llvm_vector_width); + cfg.llvm_cpu_name, + cfg.llvm_math_library, + cfg.llvm_float_type, + cfg.llvm_vector_width); logger->info("Running LLVM backend code generator"); CodegenLLVMVisitor visitor(modfile, - output_dir, + cfg.output_dir, platform, llvm_opt_level, - !llvm_no_debug, - llvm_fast_math_flags); + !cfg.llvm_no_debug, + cfg.llvm_fast_math_flags); visitor.visit_program(*ast); - ast_to_nmodl(*ast, filepath("llvm", "mod")); - ast_to_json(*ast, filepath("llvm", "json")); + if (cfg.nmodl_ast) { + NmodlPrintVisitor(filepath("llvm", "mod")).visit_program(*ast); + logger->info("AST to NMODL transformation written to {}", + filepath("llvm", "mod")); + } + if (cfg.json_ast) { + JSONVisitor(filepath("llvm", "json")).write(*ast); + logger->info("AST to JSON transformation written to {}", + filepath("llvm", "json")); + } if (llvm_benchmark) { // \todo integrate Platform class here - if (llvm_gpu_name != "default") { + if (cfg.llvm_gpu_name != "default") { logger->warn( "GPU benchmarking is not supported, targeting " "CPU instead"); @@ -731,14 +407,14 @@ int main(int argc, const char* argv[]) { logger->info("Running LLVM benchmark"); benchmark::LLVMBenchmark benchmark(visitor, modfile, - output_dir, - shared_lib_paths, + cfg.output_dir, + cfg.shared_lib_paths, num_experiments, instance_size, - llvm_cpu_name, - llvm_opt_level_ir, - llvm_opt_level_codegen); - benchmark.run(ast); + cfg.llvm_cpu_name, + cfg.llvm_opt_level_ir, + cfg.llvm_opt_level_codegen); + benchmark.run(); } } #endif diff --git a/src/pybind/CMakeLists.txt b/src/pybind/CMakeLists.txt index e24861af62..16f4a586cc 100644 --- a/src/pybind/CMakeLists.txt +++ b/src/pybind/CMakeLists.txt @@ -71,6 +71,12 @@ if(NMODL_ENABLE_PYTHON_BINDINGS) add_dependencies(_nmodl pyastgen lexer_obj util_obj) target_link_libraries(_nmodl PRIVATE fmt::fmt pyembed) + # Additional options are needed when LLVM JIT functionality is built + if(NMODL_ENABLE_LLVM) + set_property(TARGET codegen llvm_codegen llvm_benchmark benchmark_data PROPERTY POSITION_INDEPENDENT_CODE ON) + target_link_libraries(_nmodl PRIVATE codegen llvm_codegen llvm_benchmark benchmark_data + ${LLVM_LIBS_TO_LINK}) + endif() # in case of wheel, python module shouldn't link to wrapper library if(LINK_AGAINST_PYTHON) target_link_libraries(_nmodl PRIVATE pywrapper) diff --git a/src/pybind/pynmodl.cpp b/src/pybind/pynmodl.cpp index 20a2df7b19..bb27f7b3d0 100644 --- a/src/pybind/pynmodl.cpp +++ b/src/pybind/pynmodl.cpp @@ -8,17 +8,17 @@ #include #include -#include #include -#include #include "ast/program.hpp" +#include "codegen/codegen_driver.hpp" +#include "codegen/llvm/codegen_llvm_visitor.hpp" #include "config/config.h" #include "parser/nmodl_driver.hpp" #include "pybind/pybind_utils.hpp" +#include "test/benchmark/llvm_benchmark.hpp" #include "visitors/visitor_utils.hpp" - /** * \dir * \brief Python Interface Implementation @@ -110,6 +110,10 @@ static const char* to_json = R"( '{"Program":[{"NeuronBlock":[{"StatementBlock":[]}]}]}' )"; +static const char* jit = R"( + Jit driver to compile NMODL AST to LLVM IR and benchmark the generated compute kernels +)"; + } // namespace docstring @@ -133,6 +137,57 @@ class PyNmodlDriver: public nmodl::parser::NmodlDriver { } }; +class JitDriver { + private: + nmodl::codegen::Platform platform; + + nmodl::codegen::CodeGenConfig cfg; + nmodl::codegen::CodegenDriver cg_driver; + + void init_platform() { + // Create platform abstraction.
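+ // The "default" GPU name means no GPU was requested, so the host CPU platform is used.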
+ nmodl::codegen::PlatformID pid = cfg.llvm_gpu_name == "default" + ? nmodl::codegen::PlatformID::CPU + : nmodl::codegen::PlatformID::GPU; + const std::string name = cfg.llvm_gpu_name == "default" ? cfg.llvm_cpu_name + : cfg.llvm_gpu_name; + platform = nmodl::codegen::Platform( + pid, name, cfg.llvm_math_library, cfg.llvm_float_type, cfg.llvm_vector_width); + } + + public: + JitDriver() + : cg_driver(cfg) { + init_platform(); + } + + explicit JitDriver(const nmodl::codegen::CodeGenConfig& cfg) + : cfg(cfg) + , cg_driver(cfg) { + init_platform(); + } + + + benchmark::BenchmarkResults run(std::shared_ptr<nmodl::ast::Program> node, + std::string& modname, + int num_experiments, + int instance_size) { + cg_driver.prepare_mod(node); + nmodl::codegen::CodegenLLVMVisitor visitor(modname, cfg.output_dir, platform, 0); + visitor.visit_program(*node); + nmodl::benchmark::LLVMBenchmark benchmark(visitor, + modname, + cfg.output_dir, + cfg.shared_lib_paths, + num_experiments, + instance_size, + cfg.llvm_cpu_name, + cfg.llvm_opt_level_ir, + cfg.llvm_opt_level_codegen); + return benchmark.run(); + } +}; + } // namespace nmodl // forward declaration of submodule init functions @@ -164,6 +219,59 @@ PYBIND11_MODULE(_nmodl, m_nmodl) { nmodl::docstring::driver_parse_stream) .def("get_ast", &nmodl::PyNmodlDriver::get_ast, nmodl::docstring::driver_ast); + py::class_<nmodl::codegen::CodeGenConfig> cfg(m_nmodl, "CodeGenConfig"); + cfg.def(py::init([]() { + auto cfg = std::make_unique<nmodl::codegen::CodeGenConfig>(); + // set to more sensible defaults for python binding + cfg->llvm_backend = true; + return cfg; + })) + .def_readwrite("sympy_analytic", &nmodl::codegen::CodeGenConfig::sympy_analytic) + .def_readwrite("sympy_pade", &nmodl::codegen::CodeGenConfig::sympy_pade) + .def_readwrite("sympy_cse", &nmodl::codegen::CodeGenConfig::sympy_cse) + .def_readwrite("sympy_conductance", &nmodl::codegen::CodeGenConfig::sympy_conductance) + .def_readwrite("nmodl_inline", &nmodl::codegen::CodeGenConfig::nmodl_inline) + .def_readwrite("nmodl_unroll", &nmodl::codegen::CodeGenConfig::nmodl_unroll) + .def_readwrite("nmodl_const_folding", &nmodl::codegen::CodeGenConfig::nmodl_const_folding) + .def_readwrite("nmodl_localize", &nmodl::codegen::CodeGenConfig::nmodl_localize) + .def_readwrite("nmodl_global_to_range", + &nmodl::codegen::CodeGenConfig::nmodl_global_to_range) + .def_readwrite("nmodl_local_to_range", &nmodl::codegen::CodeGenConfig::nmodl_local_to_range) + .def_readwrite("localize_verbatim", &nmodl::codegen::CodeGenConfig::localize_verbatim) + .def_readwrite("local_rename", &nmodl::codegen::CodeGenConfig::local_rename) + .def_readwrite("verbatim_inline", &nmodl::codegen::CodeGenConfig::verbatim_inline) + .def_readwrite("verbatim_rename", &nmodl::codegen::CodeGenConfig::verbatim_rename) + .def_readwrite("force_codegen", &nmodl::codegen::CodeGenConfig::force_codegen) + .def_readwrite("only_check_compatibility", + &nmodl::codegen::CodeGenConfig::only_check_compatibility) + .def_readwrite("optimize_ionvar_copies_codegen", + &nmodl::codegen::CodeGenConfig::optimize_ionvar_copies_codegen) + .def_readwrite("output_dir", &nmodl::codegen::CodeGenConfig::output_dir) + .def_readwrite("scratch_dir", &nmodl::codegen::CodeGenConfig::scratch_dir) + .def_readwrite("data_type", &nmodl::codegen::CodeGenConfig::data_type) + .def_readwrite("llvm_ir", &nmodl::codegen::CodeGenConfig::llvm_ir) + .def_readwrite("llvm_float_type", &nmodl::codegen::CodeGenConfig::llvm_float_type) + .def_readwrite("llvm_opt_level_ir", &nmodl::codegen::CodeGenConfig::llvm_opt_level_ir) + .def_readwrite("llvm_math_library",
&nmodl::codegen::CodeGenConfig::llvm_math_library) + .def_readwrite("llvm_no_debug", &nmodl::codegen::CodeGenConfig::llvm_no_debug) + .def_readwrite("llvm_fast_math_flags", &nmodl::codegen::CodeGenConfig::llvm_fast_math_flags) + .def_readwrite("llvm_cpu_name", &nmodl::codegen::CodeGenConfig::llvm_cpu_name) + .def_readwrite("llvm_gpu_name", &nmodl::codegen::CodeGenConfig::llvm_gpu_name) + .def_readwrite("llvm_vector_width", &nmodl::codegen::CodeGenConfig::llvm_vector_width) + .def_readwrite("llvm_opt_level_codegen", + &nmodl::codegen::CodeGenConfig::llvm_opt_level_codegen) + .def_readwrite("shared_lib_paths", &nmodl::codegen::CodeGenConfig::shared_lib_paths); + + py::class_ jit_driver(m_nmodl, "Jit", nmodl::docstring::jit); + jit_driver.def(py::init<>()) + .def(py::init()) + .def("run", + &nmodl::JitDriver::run, + "node"_a, + "modname"_a, + "num_experiments"_a, + "instance_size"_a); + m_nmodl.def("to_nmodl", static_cast&)>( diff --git a/test/benchmark/CMakeLists.txt b/test/benchmark/CMakeLists.txt index 4441d53251..cc3e26bb35 100644 --- a/test/benchmark/CMakeLists.txt +++ b/test/benchmark/CMakeLists.txt @@ -15,3 +15,20 @@ add_dependencies(llvm_benchmark lexer util visitor) if(NMODL_ENABLE_JIT_EVENT_LISTENERS) target_compile_definitions(llvm_benchmark PUBLIC NMODL_HAVE_JIT_EVENT_LISTENERS) endif() + +# ============================================================================= +# LLVM pyjit +# ============================================================================= + +if(NMODL_ENABLE_PYTHON_BINDINGS) + file(GLOB modfiles "${NMODL_PROJECT_SOURCE_DIR}/test/benchmark/kernels/*.mod") + foreach(modfile ${modfiles}) + get_filename_component(modfile_name "${modfile}" NAME) + add_test(NAME "PyJIT/${modfile_name}" + COMMAND ${PYTHON_EXECUTABLE} ${NMODL_PROJECT_SOURCE_DIR}/test/benchmark/benchmark.py + ${modfile}) + set_tests_properties( + "PyJIT/${modfile_name}" PROPERTIES ENVIRONMENT + PYTHONPATH=${PROJECT_BINARY_DIR}/lib:$ENV{PYTHONPATH}) + endforeach() +endif() diff --git a/test/benchmark/benchmark.py b/test/benchmark/benchmark.py new file mode 100644 index 0000000000..c39f97002a --- /dev/null +++ b/test/benchmark/benchmark.py @@ -0,0 +1,25 @@ +import sys + +import nmodl.dsl as nmodl +from nmodl import ast, visitor + +def main(): + driver = nmodl.NmodlDriver() + lookup_visitor = visitor.AstLookupVisitor() + + cfg = nmodl.CodeGenConfig() + cfg.llvm_vector_width = 4 + cfg.llvm_opt_level_ir = 2 + fname = sys.argv[1] + with open(fname) as f: + hh = f.read() + modast = driver.parse_string(hh) + modname = lookup_visitor.lookup(modast, ast.AstNodeType.SUFFIX)[0].get_node_name() + jit = nmodl.Jit(cfg) + + res = jit.run(modast, modname, 1000, 1000) + print(res) + + +if __name__ == "__main__": + main() \ No newline at end of file diff --git a/test/benchmark/llvm_benchmark.cpp b/test/benchmark/llvm_benchmark.cpp index 0e94ae231b..87d7e34512 100644 --- a/test/benchmark/llvm_benchmark.cpp +++ b/test/benchmark/llvm_benchmark.cpp @@ -6,10 +6,12 @@ *************************************************************************/ #include +#include +#include -#include "codegen/llvm/codegen_llvm_visitor.hpp" #include "llvm_benchmark.hpp" #include "test/benchmark/jit_driver.hpp" +#include "utils/logger.hpp" #include "llvm/Support/Host.h" #include "test/unit/codegen/codegen_data_helper.hpp" @@ -18,14 +20,14 @@ namespace nmodl { namespace benchmark { -void LLVMBenchmark::run(const std::shared_ptr& node) { +BenchmarkResults LLVMBenchmark::run() { // create functions - generate_llvm(node); + generate_llvm(); // Finally, 
run the benchmark and log the measurements. - run_benchmark(node); + return run_benchmark(); } -void LLVMBenchmark::generate_llvm(const std::shared_ptr<ast::Program>& node) { +void LLVMBenchmark::generate_llvm() { // First, visit the AST to build the LLVM IR module and wrap the kernel function calls. auto start = std::chrono::steady_clock::now(); llvm_visitor.wrap_kernel_functions(); @@ -36,9 +38,9 @@ void LLVMBenchmark::generate_llvm(const std::shared_ptr<ast::Program>& node) { logger->info("Created LLVM IR module from NMODL AST in {} sec", diff.count()); } -void LLVMBenchmark::run_benchmark(const std::shared_ptr<ast::Program>& node) { +BenchmarkResults LLVMBenchmark::run_benchmark() { // Set the codegen data helper and find the kernels. - auto codegen_data = codegen::CodegenDataHelper(node, llvm_visitor.get_instance_struct_ptr()); + auto codegen_data = codegen::CodegenDataHelper(llvm_visitor.get_instance_struct_ptr()); std::vector<std::string> kernel_names; llvm_visitor.find_kernel_names(kernel_names); @@ -55,13 +57,11 @@ void LLVMBenchmark::run_benchmark(const std::shared_ptr<ast::Program>& node) { std::move(m), filename, output_dir, cpu_name, shared_libs, opt_level_ir, opt_level_codegen); runner.initialize_driver(); + BenchmarkResults results{}; // Benchmark every kernel. for (const auto& kernel_name: kernel_names) { - // For every kernel run the benchmark `num_experiments` times. - double time_min = std::numeric_limits<double>::max(); - double time_max = 0.0; - double time_sum = 0.0; - double time_squared_sum = 0.0; + // For every kernel run the benchmark `num_experiments` times and collect runtimes. + auto times = std::vector<double>(num_experiments, 0.0); for (int i = 0; i < num_experiments; ++i) { // Initialise the data. auto instance_data = codegen_data.create_data(instance_size, /*seed=*/1); @@ -80,22 +80,30 @@ void LLVMBenchmark::run_benchmark(const std::shared_ptr<ast::Program>& node) { std::chrono::duration<double> diff = end - start; // Log the time taken for each run. - logger->info("Experiment {} compute time = {:.6f} sec", i, diff.count()); + logger->debug("Experiment {} compute time = {:.6f} sec", i, diff.count()); - // Update statistics. - time_sum += diff.count(); - time_squared_sum += diff.count() * diff.count(); - time_min = std::min(time_min, diff.count()); - time_max = std::max(time_max, diff.count()); + times[i] = diff.count(); } + // Calculate statistics + double time_mean = std::accumulate(times.begin(), times.end(), 0.0) / num_experiments; + double time_var = std::accumulate(times.begin(), + times.end(), + 0.0, + [time_mean](const double& pres, const double& e) { + return pres + (e - time_mean) * (e - time_mean); + }) / + num_experiments; + double time_stdev = std::sqrt(time_var); + double time_min = *std::min_element(times.begin(), times.end()); + double time_max = *std::max_element(times.begin(), times.end()); // Log the average time taken for the kernel.
- double time_mean = time_sum / num_experiments; logger->info("Average compute time = {:.6f}", time_mean); - logger->info("Compute time variance = {:g}", - time_squared_sum / num_experiments - time_mean * time_mean); + logger->info("Compute time standard deviation = {:.8f}", time_stdev); logger->info("Minimum compute time = {:.6f}", time_min); logger->info("Maximum compute time = {:.6f}\n", time_max); + results[kernel_name] = {time_mean, time_stdev, time_min, time_max}; } + return results; } } // namespace benchmark diff --git a/test/benchmark/llvm_benchmark.hpp b/test/benchmark/llvm_benchmark.hpp index cc9dd3bcf0..f79cad62e5 100644 --- a/test/benchmark/llvm_benchmark.hpp +++ b/test/benchmark/llvm_benchmark.hpp @@ -8,14 +8,20 @@ #pragma once #include <chrono> +#include <map> #include <string> +#include <tuple> #include "codegen/llvm/codegen_llvm_visitor.hpp" -#include "utils/logger.hpp" namespace nmodl { namespace benchmark { +/** + * map of {name: [mean, stdev, min, max]} + */ +using BenchmarkResults = std::map<std::string, std::tuple<double, double, double, double>>; + /** * \class LLVMBenchmark * \brief A wrapper to execute MOD file kernels via LLVM IR backend, and @@ -74,14 +80,14 @@ class LLVMBenchmark { , opt_level_codegen(opt_level_codegen) {} /// Runs the benchmark. - void run(const std::shared_ptr<ast::Program>& node); + BenchmarkResults run(); private: /// Visits the AST to construct the LLVM IR module. - void generate_llvm(const std::shared_ptr<ast::Program>& node); + void generate_llvm(); /// Runs the main body of the benchmark, executing the compute kernels. - void run_benchmark(const std::shared_ptr<ast::Program>& node); + BenchmarkResults run_benchmark(); /// Sets the log output stream (file or console). void set_log_output(); diff --git a/test/unit/CMakeLists.txt b/test/unit/CMakeLists.txt index 8174215729..107d856d74 100644 --- a/test/unit/CMakeLists.txt +++ b/test/unit/CMakeLists.txt @@ -157,10 +157,8 @@ endif() set(testvisitor_env "PYTHONPATH=${PROJECT_BINARY_DIR}/lib:$ENV{PYTHONPATH}") if(NOT LINK_AGAINST_PYTHON) list(APPEND testvisitor_env "NMODL_PYLIB=$ENV{NMODL_PYLIB}") - list( - APPEND - testvisitor_env - "NMODL_WRAPLIB=${PROJECT_BINARY_DIR}/lib/nmodl/libpywrapper${CMAKE_SHARED_LIBRARY_SUFFIX}") + list(APPEND testvisitor_env + "NMODL_WRAPLIB=${PROJECT_BINARY_DIR}/lib/nmodl/libpywrapper${CMAKE_SHARED_LIBRARY_SUFFIX}") endif() foreach( diff --git a/test/unit/codegen/codegen_data_helper.hpp b/test/unit/codegen/codegen_data_helper.hpp index 76c4f422d9..c356a898ce 100644 --- a/test/unit/codegen/codegen_data_helper.hpp +++ b/test/unit/codegen/codegen_data_helper.hpp @@ -96,15 +96,12 @@ std::vector<double> generate_dummy_data(size_t initial_value, size_t num_elements) { * to the MOD file. */ class CodegenDataHelper { - std::shared_ptr<ast::Program> program; std::shared_ptr<ast::InstanceStruct> instance; public: CodegenDataHelper() = delete; - CodegenDataHelper(const std::shared_ptr<ast::Program>& program, - const std::shared_ptr<ast::InstanceStruct>& instance) - : program(program) - , instance(instance) {} + CodegenDataHelper(const std::shared_ptr<ast::InstanceStruct>& instance) + : instance(instance) {} CodegenInstanceData create_data(size_t num_elements, size_t seed); }; diff --git a/test/unit/codegen/codegen_llvm_execution.cpp b/test/unit/codegen/codegen_llvm_execution.cpp index 4c9515f814..b5502cab51 100644 --- a/test/unit/codegen/codegen_llvm_execution.cpp +++ b/test/unit/codegen/codegen_llvm_execution.cpp @@ -320,7 +320,7 @@ SCENARIO("Simple scalar kernel", "[llvm][runner]") { // Create the instance struct data.
int num_elements = 4; const auto& generated_instance_struct = llvm_visitor.get_instance_struct_ptr(); - auto codegen_data = codegen::CodegenDataHelper(ast, generated_instance_struct); + auto codegen_data = codegen::CodegenDataHelper(generated_instance_struct); auto instance_data = codegen_data.create_data(num_elements, /*seed=*/1); // Fill the instance struct data with some values. @@ -404,7 +404,7 @@ SCENARIO("Simple vectorised kernel", "[llvm][runner]") { // Create the instance struct data. int num_elements = 10; const auto& generated_instance_struct = llvm_visitor.get_instance_struct_ptr(); - auto codegen_data = codegen::CodegenDataHelper(ast, generated_instance_struct); + auto codegen_data = codegen::CodegenDataHelper(generated_instance_struct); auto instance_data = codegen_data.create_data(num_elements, /*seed=*/1); // Fill the instance struct data with some values for unit testing. @@ -488,7 +488,7 @@ SCENARIO("Vectorised kernel with scatter instruction", "[llvm][runner]") { // Create the instance struct data. int num_elements = 5; const auto& generated_instance_struct = llvm_visitor.get_instance_struct_ptr(); - auto codegen_data = codegen::CodegenDataHelper(ast, generated_instance_struct); + auto codegen_data = codegen::CodegenDataHelper(generated_instance_struct); auto instance_data = codegen_data.create_data(num_elements, /*seed=*/1); // Fill the instance struct data with some values. @@ -581,7 +581,7 @@ SCENARIO("Vectorised kernel with simple control flow", "[llvm][runner]") { // Create the instance struct data. int num_elements = 5; const auto& generated_instance_struct = llvm_visitor.get_instance_struct_ptr(); - auto codegen_data = codegen::CodegenDataHelper(ast, generated_instance_struct); + auto codegen_data = codegen::CodegenDataHelper(generated_instance_struct); auto instance_data = codegen_data.create_data(num_elements, /*seed=*/1); // Fill the instance struct data with some values. 
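As a side note on the BenchmarkResults type introduced above: assuming pybind11's STL casters expose the std::map to Python as a dict keyed by kernel name, with (mean, stdev, min, max) tuples as values, the value returned by Jit.run could be unpacked roughly as below (a minimal illustrative sketch; the loop and names are not part of the patch):

res = jit.run(modast, modname, 1000, 1000)
for kernel, (mean, stdev, tmin, tmax) in res.items():
    print(f"{kernel}: mean={mean:.6f}s stdev={stdev:.6f} min={tmin:.6f} max={tmax:.6f}")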
diff --git a/test/unit/codegen/codegen_llvm_instance_struct.cpp b/test/unit/codegen/codegen_llvm_instance_struct.cpp index fbb07dfbcd..401e0a6c63 100644 --- a/test/unit/codegen/codegen_llvm_instance_struct.cpp +++ b/test/unit/codegen/codegen_llvm_instance_struct.cpp @@ -47,7 +47,7 @@ codegen::CodegenInstanceData generate_instance_data(const std::string& text, llvm_visitor.visit_program(*ast); llvm_visitor.dump_module(); const auto& generated_instance_struct = llvm_visitor.get_instance_struct_ptr(); - auto codegen_data = codegen::CodegenDataHelper(ast, generated_instance_struct); + auto codegen_data = codegen::CodegenDataHelper(generated_instance_struct); auto instance_data = codegen_data.create_data(num_elements, seed); return instance_data; } From a1dede62078948f10f47808ef70dda08f2bb4e18 Mon Sep 17 00:00:00 2001 From: Ioannis Magkanaris Date: Wed, 27 Apr 2022 14:51:16 +0200 Subject: [PATCH 295/331] Small fixes for output --- test/benchmark/nmodl-llvm-time.sh | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/test/benchmark/nmodl-llvm-time.sh b/test/benchmark/nmodl-llvm-time.sh index c16d98370e..16d4defd15 100755 --- a/test/benchmark/nmodl-llvm-time.sh +++ b/test/benchmark/nmodl-llvm-time.sh @@ -185,9 +185,9 @@ declare -a benchmark_variance # Kernels, architectures and compilers loop -KERNEL_TARGETS="compute-bound hh" #"compute-bound memory-bound hh" +KERNEL_TARGETS="compute-bound memory-bound hh" -ARCHITECTURES="default" #"skylake_avx512 broadwell nehalem nvptx64" +ARCHITECTURES="skylake_avx512 broadwell nehalem default nvptx64" COMPILERS="intel clang gcc" @@ -308,18 +308,18 @@ for kernel_target in ${KERNEL_TARGETS}; do else math_lib_path=${sleef_lib} fi - nmodl_args="${kernels_path}/${kernel_target}.mod llvm --ir ${fast_math_flag} --opt-level-ir 3 cpu --name ${nmodl_architecture} --vector-width ${vec_width} --math-library ${math_lib} benchmark --run --instance-size ${kernel_inst_size} --repeat ${num_exp} --opt-level-codegen 3 --libs ${math_lib_path}" benchmark_nmodl_desc=${kernel_target}_nmodl-jit_${math_lib}_${nmodl_architecture}_v${vec_width}_${fast_math_opt} benchmark_description+=("${benchmark_nmodl_desc}") + nmodl_args="${kernels_path}/${kernel_target}.mod --output ${output_dir}/${benchmark_nmodl_desc} llvm --ir ${fast_math_flag} --opt-level-ir 3 cpu --name ${nmodl_architecture} --vector-width ${vec_width} --math-library ${math_lib} benchmark --run --instance-size ${kernel_inst_size} --repeat ${num_exp} --opt-level-codegen 3 --libs ${math_lib_path}" # runs only kernel generated by LLVM IR ${debug} eval "${nmodl_exe} ${nmodl_args} 2>&1 | tee ${output_dir}/${benchmark_nmodl_desc}.log" benchmark_time+=($(grep "Average compute time" ${output_dir}/${benchmark_nmodl_desc}.log | awk '{print $NF}')) benchmark_variance+=($(grep "Compute time variance" ${output_dir}/${benchmark_nmodl_desc}.log | awk '{print $NF}')) done else - nmodl_args="${kernels_path}/${kernel_target}.mod llvm --ir ${fast_math_flag} --opt-level-ir 3 gpu --name ${nmodl_architecture} --target-arch \"sm_70\" --math-library libdevice benchmark --run --instance-size ${kernel_inst_size} --repeat ${num_exp} --opt-level-codegen 3 --libs ${libdevice_lib} --grid-dim-x 16384 --block-dim-x 512" benchmark_nmodl_desc=${kernel_target}_nmodl-cuda-jit_libdevice_${nmodl_architecture}_v${vec_width}_${fast_math_opt} benchmark_description+=("${benchmark_nmodl_desc}") + nmodl_args="${kernels_path}/${kernel_target}.mod --output ${output_dir}/${benchmark_nmodl_desc} llvm --ir ${fast_math_flag} --opt-level-ir 3 gpu --name 
${nmodl_architecture} --target-arch \"sm_70\" --math-library libdevice benchmark --run --instance-size ${kernel_inst_size} --repeat ${num_exp} --opt-level-codegen 3 --libs ${libdevice_lib} --grid-dim-x 16384 --block-dim-x 512" # runs only kernel generated by LLVM IR if [[ $ncu_exec != "" ]]; then ncu="${ncu_exec} --set detailed -f -o ${kernel_target}_${fast_math_opt}_detailed" From 5ff83dbe2d04f7408f3a8e9650728c6c1fe0f514 Mon Sep 17 00:00:00 2001 From: Ioannis Magkanaris Date: Wed, 27 Apr 2022 17:25:43 +0200 Subject: [PATCH 296/331] Added ability to run GPU jit from python --- src/pybind/pynmodl.cpp | 25 ++++++++++++++++++++++--- test/benchmark/CMakeLists.txt | 8 ++++++++ test/benchmark/benchmark.py | 8 +++++++- 3 files changed, 37 insertions(+), 4 deletions(-) diff --git a/src/pybind/pynmodl.cpp b/src/pybind/pynmodl.cpp index cdebc4e001..48a47fce16 100644 --- a/src/pybind/pynmodl.cpp +++ b/src/pybind/pynmodl.cpp @@ -153,6 +153,17 @@ class JitDriver { : cfg.llvm_gpu_name; platform = nmodl::codegen::Platform( pid, name, cfg.llvm_math_library, cfg.llvm_float_type, cfg.llvm_vector_width); + if (platform.is_gpu() && !platform.is_CUDA_gpu()) { + throw std::runtime_error( + "Benchmarking is only supported on CUDA GPUs at the moment"); + } +#ifndef NMODL_LLVM_CUDA_BACKEND + if (platform.is_CUDA_gpu()) { + throw std::runtime_error( + "GPU benchmarking is not supported if NMODL is not built with CUDA " + "backend enabled."); + } +#endif } public: @@ -171,10 +182,14 @@ class JitDriver { benchmark::BenchmarkResults run(std::shared_ptr node, std::string& modname, int num_experiments, - int instance_size) { + int instance_size, + int cuda_grid_dim_x, + int cuda_block_dim_x) { cg_driver.prepare_mod(node); nmodl::codegen::CodegenLLVMVisitor visitor(modname, cfg.output_dir, platform, 0); visitor.visit_program(*node); + const GPUExecutionParameters gpu_execution_parameters{cuda_grid_dim_x, + cuda_block_dim_x}; nmodl::benchmark::LLVMBenchmark benchmark(visitor, modname, cfg.output_dir, @@ -183,7 +198,8 @@ class JitDriver { instance_size, platform, cfg.llvm_opt_level_ir, - cfg.llvm_opt_level_codegen); + cfg.llvm_opt_level_codegen, + gpu_execution_parameters); return benchmark.run(); } }; @@ -257,6 +273,7 @@ PYBIND11_MODULE(_nmodl, m_nmodl) { .def_readwrite("llvm_fast_math_flags", &nmodl::codegen::CodeGenConfig::llvm_fast_math_flags) .def_readwrite("llvm_cpu_name", &nmodl::codegen::CodeGenConfig::llvm_cpu_name) .def_readwrite("llvm_gpu_name", &nmodl::codegen::CodeGenConfig::llvm_gpu_name) + .def_readwrite("llvm_gpu_target_architecture", &nmodl::codegen::CodeGenConfig::llvm_gpu_target_architecture) .def_readwrite("llvm_vector_width", &nmodl::codegen::CodeGenConfig::llvm_vector_width) .def_readwrite("llvm_opt_level_codegen", &nmodl::codegen::CodeGenConfig::llvm_opt_level_codegen) @@ -270,7 +287,9 @@ PYBIND11_MODULE(_nmodl, m_nmodl) { "node"_a, "modname"_a, "num_experiments"_a, - "instance_size"_a); + "instance_size"_a, + "cuda_grid_dim_x"_a = 1, + "cuda_block_dim_x"_a = 1); m_nmodl.def("to_nmodl", static_cast 2: # GPU enabled + cfg.llvm_math_library = "libdevice" + cfg.llvm_gpu_name = "nvptx64" + cfg.llvm_gpu_target_architecture = "sm_70" + cfg.shared_lib_paths = [os.getenv("CUDA_HOME") + "/nvvm/libdevice/libdevice.10.bc"] with open(fname) as f: hh = f.read() modast = driver.parse_string(hh) @@ -22,4 +28,4 @@ def main(): if __name__ == "__main__": - main() \ No newline at end of file + main() From 90445f69df9c5ec8b11e61859098445f43f58855 Mon Sep 17 00:00:00 2001 From: Ioannis Magkanaris Date: Wed, 27 Apr 2022 
17:26:04 +0200 Subject: [PATCH 297/331] Added hh.mod file for unit test of pyjit --- test/benchmark/kernels/hh.mod | 114 ++++++++++++++++++++++++++++++++++ 1 file changed, 114 insertions(+) create mode 100644 test/benchmark/kernels/hh.mod diff --git a/test/benchmark/kernels/hh.mod b/test/benchmark/kernels/hh.mod new file mode 100644 index 0000000000..d92a686714 --- /dev/null +++ b/test/benchmark/kernels/hh.mod @@ -0,0 +1,114 @@ +TITLE hh.mod squid sodium, potassium, and leak channels +COMMENT + This is the original Hodgkin-Huxley treatment for the set of sodium, + potassium, and leakage channels found in the squid giant axon membrane. + ("A quantitative description of membrane current and its application + conduction and excitation in nerve" J.Physiol. (Lond.) 117:500-544 (1952).) + Membrane voltage is in absolute mV and has been reversed in polarity + from the original HH convention and shifted to reflect a resting potential + of -65 mV. + Remember to set celsius=6.3 (or whatever) in your HOC file. + See squid.hoc for an example of a simulation using this model. + SW Jaslove 6 March, 1992 +ENDCOMMENT +UNITS { + (mA) = (milliamp) + (mV) = (millivolt) + (S) = (siemens) +} +NEURON { + SUFFIX hh + USEION na READ ena WRITE ina + USEION k READ ek WRITE ik + NONSPECIFIC_CURRENT il + RANGE gnabar, gkbar, gl, el, gna, gk + RANGE minf, hinf, ninf, mtau, htau, ntau + THREADSAFE +} +PARAMETER { + gnabar = .12 (S/cm2) <0,1e9> + gkbar = .036 (S/cm2) <0,1e9> + gl = .0003 (S/cm2) <0,1e9> + el = -54.3 (mV) +} +STATE { + m + h + n +} +ASSIGNED { + v (mV) + celsius (degC) + ena (mV) + ek (mV) + gna (S/cm2) + gk (S/cm2) + ina (mA/cm2) + ik (mA/cm2) + il (mA/cm2) + minf + hinf + ninf + mtau (ms) + htau (ms) + ntau (ms) +} +BREAKPOINT { + SOLVE states METHOD cnexp + gna = gnabar*m*m*m*h + ina = gna*(v-ena) + gk = gkbar*n*n*n*n + ik = gk*(v-ek) + il = gl*(v-el) +} +INITIAL { + { + : inlined rates + LOCAL alpha, beta, sum, q10, vtrap_in_0, v_in_0 + v_in_0 = v + q10 = 3*((celsius-6.3)/10) + alpha = .07*exp(-(v_in_0+65)/20) + beta = 1/(exp(-(v_in_0+35)/10)+1) + sum = alpha+beta + htau = 1/(q10*sum) + hinf = alpha/sum + { + : inlined vtrap + LOCAL x_in_0, y_in_0 + x_in_0 = alpha + y_in_0 = alpha + : no control flow + vtrap_in_0 = y_in_0*(1-x_in_0/y_in_0/2) + } + hinf = vtrap_in_0 + } + m = minf + h = hinf + n = ninf +} +DERIVATIVE states { + { + : inlined rates + LOCAL alpha, beta, sum, q10, vtrap_in_0, v_in_1 + v_in_1 = v + q10 = 3*((celsius-6.3)/10) + alpha = .07*exp(-(v_in_1+65)/20) + beta = 1/(exp(-(v_in_1+35)/10)+1) + sum = alpha+beta + htau = 1/(q10*sum) + hinf = alpha/sum + { + : inlined vtrap + LOCAL x_in_0, y_in_0 + x_in_0 = alpha + y_in_0 = alpha + : no control flow + vtrap_in_0 = y_in_0*(1-x_in_0/y_in_0/2) + } + hinf = vtrap_in_0 + } + m = m+(1.0-exp(dt*((((-1.0)))/mtau)))*(-(((minf))/mtau)/((((-1.0)))/mtau)-m) + h = h+(1.0-exp(dt*((((-1.0)))/htau)))*(-(((hinf))/htau)/((((-1.0)))/htau)-h) + n = n+(1.0-exp(dt*((((-1.0)))/ntau)))*(-(((ninf))/ntau)/((((-1.0)))/ntau)-n) +} +UNITSON From 6ebdf7b66064d9041d9ed92b59a1aa6da7fd5ac2 Mon Sep 17 00:00:00 2001 From: Ioannis Magkanaris Date: Wed, 27 Apr 2022 17:34:36 +0200 Subject: [PATCH 298/331] Use the integrated tests in the gitlab CI to test the GPU jit as well --- .gitlab-ci.yml | 34 +++------------------------------- test/benchmark/CMakeLists.txt | 1 + 2 files changed, 4 insertions(+), 31 deletions(-) diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index 122edd0aa2..4035e2331f 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -43,14 +43,6 @@ build:intel: 
variables: SPACK_PACKAGE_COMPILER: intel -build:gcc: - extends: - - .spack_build - - .spack_nmodl - variables: - SPACK_PACKAGE_COMPILER: gcc - SPACK_PACKAGE_DEPENDENCIES: ^bison%gcc^flex%gcc^py-jinja2%gcc^py-sympy%gcc^py-pyyaml%gcc - .nmodl_tests: variables: # https://github.com/BlueBrain/nmodl/issues/737 @@ -62,12 +54,6 @@ test:intel: - .nmodl_tests needs: ["build:intel"] -test:gcc: - extends: - - .ctest - - .nmodl_tests - needs: ["build:gcc"] - .benchmark_config: variables: bb5_ntasks: 1 @@ -86,26 +72,12 @@ build_cuda:gcc: extends: [.spack_build, .build_allocation] variables: SPACK_PACKAGE: nmodl - SPACK_PACKAGE_SPEC: ~legacy-unit~python+llvm+llvm_cuda + SPACK_PACKAGE_SPEC: ~legacy-unit+python+llvm+llvm_cuda SPACK_INSTALL_EXTRA_FLAGS: -v SPACK_PACKAGE_COMPILER: gcc -test_benchmark:cpu: +test_benchmark: extends: - .benchmark_config - script: - - module load unstable - - . ${SPACK_ROOT}/share/spack/setup-env.sh - - spack load nmodl/${SPACK_INSTALLED_HASH} - - nmodl test/integration/mod/test_math.mod llvm --ir --opt-level-ir 3 benchmark --run --opt-level-codegen 3 --instance-size 10000000 --repeat 2 - needs: ["build_cuda:gcc"] - -test_benchmark:gpu: - extends: - - .benchmark_config - script: - - module load unstable - - . ${SPACK_ROOT}/share/spack/setup-env.sh - - spack load nmodl/${SPACK_INSTALLED_HASH} - - nmodl test/integration/mod/test_math.mod llvm --no-debug --ir --opt-level-ir 3 gpu --target-arch "sm_70" --name "nvptx64" --math-library libdevice benchmark --run --libs "${CUDA_HOME}/nvvm/libdevice/libdevice.10.bc" --opt-level-codegen 3 --instance-size 10000000 --repeat 2 --grid-dim-x 4096 --block-dim-x 256 + - .ctest needs: ["build_cuda:gcc"] diff --git a/test/benchmark/CMakeLists.txt b/test/benchmark/CMakeLists.txt index 36b983b8a7..9bd2c39ad0 100644 --- a/test/benchmark/CMakeLists.txt +++ b/test/benchmark/CMakeLists.txt @@ -29,6 +29,7 @@ endif() if(NMODL_ENABLE_PYTHON_BINDINGS) file(GLOB modfiles "${NMODL_PROJECT_SOURCE_DIR}/test/benchmark/kernels/*.mod") + list(APPEND modfiles "${NMODL_PROJECT_SOURCE_DIR}/test/integration/mod/test_math.mod") foreach(modfile ${modfiles}) get_filename_component(modfile_name "${modfile}" NAME) add_test(NAME "PyJIT/${modfile_name}" From b273e603c987dc32ae49560930f99c133ac61833 Mon Sep 17 00:00:00 2001 From: Ioannis Magkanaris Date: Thu, 28 Apr 2022 17:44:01 +0300 Subject: [PATCH 299/331] Throw better error if CUDA_HOME is not set in benchmarking script --- test/benchmark/benchmark.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/test/benchmark/benchmark.py b/test/benchmark/benchmark.py index aef7daaec6..bb0c879109 100644 --- a/test/benchmark/benchmark.py +++ b/test/benchmark/benchmark.py @@ -16,6 +16,8 @@ def main(): cfg.llvm_math_library = "libdevice" cfg.llvm_gpu_name = "nvptx64" cfg.llvm_gpu_target_architecture = "sm_70" + if not os.environ.get("CUDA_HOME"): + raise RuntimeError("CUDA_HOME environment variable not set") cfg.shared_lib_paths = [os.getenv("CUDA_HOME") + "/nvvm/libdevice/libdevice.10.bc"] with open(fname) as f: hh = f.read() From 690ffabe3a772ebf5ecc47f533f2ae6ea2f4704d Mon Sep 17 00:00:00 2001 From: Ioannis Magkanaris Date: Thu, 28 Apr 2022 17:13:20 +0200 Subject: [PATCH 300/331] Fix CUDA_HOME path and make cmake-format happy --- .gitlab-ci.yml | 2 +- src/pybind/CMakeLists.txt | 3 ++- test/benchmark/CMakeLists.txt | 15 ++++++++++----- 3 files changed, 13 insertions(+), 7 deletions(-) diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index 4035e2331f..13a347b7e0 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -76,7 +76,7 @@ 
build_cuda:gcc: SPACK_INSTALL_EXTRA_FLAGS: -v SPACK_PACKAGE_COMPILER: gcc -test_benchmark: +test_benchmark:gcc: extends: - .benchmark_config - .ctest diff --git a/src/pybind/CMakeLists.txt b/src/pybind/CMakeLists.txt index 16f4a586cc..43be3b01a1 100644 --- a/src/pybind/CMakeLists.txt +++ b/src/pybind/CMakeLists.txt @@ -73,7 +73,8 @@ if(NMODL_ENABLE_PYTHON_BINDINGS) # Additional options are needed when LLVM JIT functionality is built if(NMODL_ENABLE_LLVM) - set_property(TARGET codegen llvm_codegen llvm_benchmark benchmark_data PROPERTY POSITION_INDEPENDENT_CODE ON) + set_property(TARGET codegen llvm_codegen llvm_benchmark benchmark_data + PROPERTY POSITION_INDEPENDENT_CODE ON) target_link_libraries(_nmodl PRIVATE codegen llvm_codegen llvm_benchmark benchmark_data ${LLVM_LIBS_TO_LINK}) endif() diff --git a/test/benchmark/CMakeLists.txt b/test/benchmark/CMakeLists.txt index 9bd2c39ad0..de3362a07f 100644 --- a/test/benchmark/CMakeLists.txt +++ b/test/benchmark/CMakeLists.txt @@ -6,7 +6,8 @@ set(LLVM_BENCHMARK_SOURCE_FILES ${CMAKE_CURRENT_SOURCE_DIR}/llvm_benchmark.cpp ${CMAKE_CURRENT_SOURCE_DIR}/llvm_benchmark.hpp) if(NMODL_ENABLE_LLVM_CUDA) - list(APPEND LLVM_BENCHMARK_SOURCE_FILES ${CMAKE_CURRENT_SOURCE_DIR}/cuda_driver.cpp ${CMAKE_CURRENT_SOURCE_DIR}/cuda_driver.hpp) + list(APPEND LLVM_BENCHMARK_SOURCE_FILES ${CMAKE_CURRENT_SOURCE_DIR}/cuda_driver.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/cuda_driver.hpp) endif() # ============================================================================= @@ -40,11 +41,15 @@ if(NMODL_ENABLE_PYTHON_BINDINGS) PYTHONPATH=${PROJECT_BINARY_DIR}/lib:$ENV{PYTHONPATH}) if(NMODL_ENABLE_LLVM_CUDA) add_test(NAME "PyJIT/${modfile_name}_gpu" - COMMAND ${PYTHON_EXECUTABLE} ${NMODL_PROJECT_SOURCE_DIR}/test/benchmark/benchmark.py - ${modfile} gpu) + COMMAND ${PYTHON_EXECUTABLE} ${NMODL_PROJECT_SOURCE_DIR}/test/benchmark/benchmark.py + ${modfile} gpu) + message(STATUS "CUDA_HOME is ${CUDAToolkit_TARGET_DIR}") set_tests_properties( - "PyJIT/${modfile_name}_gpu" PROPERTIES ENVIRONMENT - PYTHONPATH=${PROJECT_BINARY_DIR}/lib:$ENV{PYTHONPATH}) + "PyJIT/${modfile_name}_gpu" + PROPERTIES + ENVIRONMENT + "PYTHONPATH=${PROJECT_BINARY_DIR}/lib:$ENV{PYTHONPATH};CUDA_HOME=${CUDAToolkit_TARGET_DIR}" + ) endif() endforeach() endif() From 64e8cee558fd1d0e592b5369ea5a38da25982dc3 Mon Sep 17 00:00:00 2001 From: Ioannis Magkanaris Date: Fri, 29 Apr 2022 22:19:11 +0300 Subject: [PATCH 301/331] Fixes issue with debug printing of visitors (#854) * Fixes issue with debug printing of the various stages of code generation in files * Small fix and comment addition --- src/codegen/codegen_driver.cpp | 5 ++--- src/codegen/codegen_driver.hpp | 2 +- src/main.cpp | 2 +- src/pybind/pynmodl.cpp | 10 +++++++++- test/benchmark/benchmark.py | 3 ++- test/benchmark/jit_driver.hpp | 4 ++-- 6 files changed, 17 insertions(+), 9 deletions(-) diff --git a/src/codegen/codegen_driver.cpp b/src/codegen/codegen_driver.cpp index 256d970074..b2e90e1a75 100644 --- a/src/codegen/codegen_driver.cpp +++ b/src/codegen/codegen_driver.cpp @@ -37,13 +37,12 @@ using namespace nmodl; using namespace codegen; using namespace visitor; -bool CodegenDriver::prepare_mod(std::shared_ptr node) { +bool CodegenDriver::prepare_mod(std::shared_ptr node, const std::string& modfile) { /// whether to update existing symbol table or create new /// one whenever we run symtab visitor. 
bool update_symtab = false; - std::string modfile; - std::string scratch_dir = "tmp"; + const auto scratch_dir = cfg.scratch_dir; auto filepath = [scratch_dir, modfile](const std::string& suffix, const std::string& ext) { static int count = 0; return "{}/{}.{}.{}.{}"_format(scratch_dir, modfile, std::to_string(count++), suffix, ext); diff --git a/src/codegen/codegen_driver.hpp b/src/codegen/codegen_driver.hpp index aa2f9921d0..78c95421da 100644 --- a/src/codegen/codegen_driver.hpp +++ b/src/codegen/codegen_driver.hpp @@ -153,7 +153,7 @@ class CodegenDriver { explicit CodegenDriver(CodeGenConfig _cfg) : cfg(std::move(_cfg)) {} - bool prepare_mod(std::shared_ptr<ast::Program> node); + bool prepare_mod(std::shared_ptr<ast::Program> node, const std::string& modfile); private: CodeGenConfig cfg; diff --git a/src/main.cpp b/src/main.cpp index d93869e4e7..c394a160f9 100644 --- a/src/main.cpp +++ b/src/main.cpp @@ -299,7 +299,7 @@ int main(int argc, const char* argv[]) { const auto& ast = nmodl_driver.parse_file(file); auto cg_driver = CodegenDriver(cfg); - auto success = cg_driver.prepare_mod(ast); + auto success = cg_driver.prepare_mod(ast, modfile); if (show_symtab) { logger->info("Printing symbol table"); diff --git a/src/pybind/pynmodl.cpp b/src/pybind/pynmodl.cpp index bb27f7b3d0..3b75a7f30a 100644 --- a/src/pybind/pynmodl.cpp +++ b/src/pybind/pynmodl.cpp @@ -172,7 +172,12 @@ class JitDriver { std::string& modname, int num_experiments, int instance_size) { - cg_driver.prepare_mod(node); + // The scratch directory must be created explicitly here: unlike the CLI, + // it cannot be created automatically when running through python + if (cfg.nmodl_ast || cfg.json_ast || cfg.json_perfstat) { + utils::make_path(cfg.scratch_dir); + } + cg_driver.prepare_mod(node, modname); nmodl::codegen::CodegenLLVMVisitor visitor(modname, cfg.output_dir, platform, 0); visitor.visit_program(*node); nmodl::benchmark::LLVMBenchmark benchmark(visitor, @@ -249,6 +254,9 @@ PYBIND11_MODULE(_nmodl, m_nmodl) { .def_readwrite("output_dir", &nmodl::codegen::CodeGenConfig::output_dir) .def_readwrite("scratch_dir", &nmodl::codegen::CodeGenConfig::scratch_dir) .def_readwrite("data_type", &nmodl::codegen::CodeGenConfig::data_type) + .def_readwrite("nmodl_ast", &nmodl::codegen::CodeGenConfig::nmodl_ast) + .def_readwrite("json_ast", &nmodl::codegen::CodeGenConfig::json_ast) + .def_readwrite("json_perfstat", &nmodl::codegen::CodeGenConfig::json_perfstat) .def_readwrite("llvm_ir", &nmodl::codegen::CodeGenConfig::llvm_ir) .def_readwrite("llvm_float_type", &nmodl::codegen::CodeGenConfig::llvm_float_type) .def_readwrite("llvm_opt_level_ir", &nmodl::codegen::CodeGenConfig::llvm_opt_level_ir) diff --git a/test/benchmark/benchmark.py b/test/benchmark/benchmark.py index c39f97002a..c133f8d59c 100644 --- a/test/benchmark/benchmark.py +++ b/test/benchmark/benchmark.py @@ -10,6 +10,7 @@ def main(): cfg = nmodl.CodeGenConfig() cfg.llvm_vector_width = 4 cfg.llvm_opt_level_ir = 2 + cfg.nmodl_ast = True fname = sys.argv[1] with open(fname) as f: hh = f.read() @@ -22,4 +23,4 @@ def main(): if __name__ == "__main__": - main() \ No newline at end of file + main() diff --git a/test/benchmark/jit_driver.hpp b/test/benchmark/jit_driver.hpp index 7106311523..ed86684f76 100644 --- a/test/benchmark/jit_driver.hpp +++ b/test/benchmark/jit_driver.hpp @@ -76,7 +76,7 @@ class JITDriver { if (!expected_symbol) throw std::runtime_error("Error: entry-point symbol not found in JIT\n"); - auto (*res)() = (ReturnType(*)())(intptr_t)
expected_symbol->getAddress(); ReturnType result = res(); return result; } @@ -88,7 +88,7 @@ class JITDriver { if (!expected_symbol) throw std::runtime_error("Error: entry-point symbol not found in JIT\n"); - auto (*res)(ArgType) = (ReturnType(*)(ArgType))(intptr_t) expected_symbol->getAddress(); + auto(*res)(ArgType) = (ReturnType(*)(ArgType))(intptr_t) expected_symbol->getAddress(); ReturnType result = res(arg); return result; } From 5952a87542ea58804109982c518d96037006ac6d Mon Sep 17 00:00:00 2001 From: Pramod Kumbhar Date: Mon, 2 May 2022 11:51:00 +0200 Subject: [PATCH 302/331] Support for Breakpoint block (nrn_cur) for code generation (#645) * Support for Breakpoint block (nrn_cur) for code generation * similar to DERIVATIVE (nrn_state), handle BREAKPOINT (nrn_cur) blocks with AST level transformation * Move common code from CodegenCVisitor to CodegenInfo * Add tests fixes #644 Co-authored-by: George Mitenkov --- src/codegen/codegen_acc_visitor.cpp | 8 +- src/codegen/codegen_c_visitor.cpp | 45 +- src/codegen/codegen_c_visitor.hpp | 17 - src/codegen/codegen_cuda_visitor.cpp | 8 +- src/codegen/codegen_driver.cpp | 2 +- src/codegen/codegen_info.cpp | 29 ++ src/codegen/codegen_info.hpp | 30 ++ src/codegen/codegen_ispc_visitor.cpp | 4 +- src/codegen/codegen_naming.hpp | 6 + .../llvm/codegen_llvm_helper_visitor.cpp | 410 ++++++++++++++++-- .../llvm/codegen_llvm_helper_visitor.hpp | 21 + src/codegen/llvm/codegen_llvm_visitor.cpp | 6 + src/codegen/llvm/codegen_llvm_visitor.hpp | 3 + src/codegen/llvm/llvm_ir_builder.cpp | 11 +- src/language/nmodl.yaml | 21 +- test/unit/codegen/codegen_llvm_execution.cpp | 2 + test/unit/codegen/codegen_llvm_ir.cpp | 89 +++- test/unit/codegen/codegen_llvm_visitor.cpp | 404 ++++++++++++++++- 18 files changed, 987 insertions(+), 129 deletions(-) diff --git a/src/codegen/codegen_acc_visitor.cpp b/src/codegen/codegen_acc_visitor.cpp index 93b04ef480..544b252a41 100644 --- a/src/codegen/codegen_acc_visitor.cpp +++ b/src/codegen/codegen_acc_visitor.cpp @@ -200,8 +200,8 @@ void CodegenAccVisitor::print_net_init_acc_serial_annotation_block_end() { } void CodegenAccVisitor::print_nrn_cur_matrix_shadow_update() { - auto rhs_op = operator_for_rhs(); - auto d_op = operator_for_d(); + const auto& rhs_op = info.operator_for_rhs(); + const auto& d_op = info.operator_for_d(); print_atomic_reduction_pragma(); printer->add_line("vec_rhs[node_id] {} rhs;"_format(rhs_op)); print_atomic_reduction_pragma(); @@ -213,8 +213,8 @@ void CodegenAccVisitor::print_fast_imem_calculation() { return; } - auto rhs_op = operator_for_rhs(); - auto d_op = operator_for_d(); + const auto& rhs_op = info.operator_for_rhs(); + const auto& d_op = info.operator_for_d(); printer->start_block("if (nt->nrn_fast_imem)"); print_atomic_reduction_pragma(); printer->add_line("nt->nrn_fast_imem->nrn_sav_rhs[node_id] {} rhs;"_format(rhs_op)); diff --git a/src/codegen/codegen_c_visitor.cpp b/src/codegen/codegen_c_visitor.cpp index 2dfdedec51..e7a28fa80f 100644 --- a/src/codegen/codegen_c_visitor.cpp +++ b/src/codegen/codegen_c_visitor.cpp @@ -325,37 +325,6 @@ void CodegenCVisitor::visit_update_dt(const ast::UpdateDt& node) { /* Common helper routines */ /****************************************************************************************/ - -/** - * \details Certain statements like unit, comment, solve can/need to be skipped - * during code generation. Note that solve block is wrapped in expression - * statement and hence we have to check inner expression. 
It's also true - * for the initial block defined inside net receive block. - */ -bool CodegenCVisitor::statement_to_skip(const Statement& node) const { - // clang-format off - if (node.is_unit_state() - || node.is_line_comment() - || node.is_block_comment() - || node.is_solve_block() - || node.is_conductance_hint() - || node.is_table_statement()) { - return true; - } - // clang-format on - if (node.is_expression_statement()) { - auto expression = dynamic_cast(&node)->get_expression(); - if (expression->is_solve_block()) { - return true; - } - if (expression->is_initial_block()) { - return true; - } - } - return false; -} - - /** * \details When floating point data type is not default (i.e. double) then we * have to copy old array to new type (for range variables). @@ -974,8 +943,8 @@ void CodegenCVisitor::print_nrn_cur_matrix_shadow_update() { printer->add_line("shadow_rhs[id] = rhs;"); printer->add_line("shadow_d[id] = g;"); } else { - auto rhs_op = operator_for_rhs(); - auto d_op = operator_for_d(); + const auto& rhs_op = info.operator_for_rhs(); + const auto& d_op = info.operator_for_d(); print_atomic_reduction_pragma(); printer->add_line("vec_rhs[node_id] {} rhs;"_format(rhs_op)); print_atomic_reduction_pragma(); @@ -986,8 +955,8 @@ void CodegenCVisitor::print_nrn_cur_matrix_shadow_update() { void CodegenCVisitor::print_nrn_cur_matrix_shadow_reduction() { - auto rhs_op = operator_for_rhs(); - auto d_op = operator_for_d(); + const auto& rhs_op = info.operator_for_rhs(); + const auto& d_op = info.operator_for_d(); if (channel_task_dependency_enabled()) { auto rhs = get_variable_name("ml_rhs"); auto d = get_variable_name("ml_d"); @@ -1167,7 +1136,7 @@ void CodegenCVisitor::print_statement_block(const ast::StatementBlock& node, auto statements = node.get_statements(); for (const auto& statement: statements) { - if (statement_to_skip(*statement)) { + if (info.statement_to_skip(*statement)) { continue; } /// not necessary to add indent for verbatim block (pretty-printing) @@ -4337,8 +4306,8 @@ void CodegenCVisitor::print_fast_imem_calculation() { return; } std::string rhs, d; - auto rhs_op = operator_for_rhs(); - auto d_op = operator_for_d(); + const auto& rhs_op = info.operator_for_rhs(); + const auto& d_op = info.operator_for_d(); if (channel_task_dependency_enabled()) { rhs = get_variable_name("ml_rhs"); d = get_variable_name("ml_d"); diff --git a/src/codegen/codegen_c_visitor.hpp b/src/codegen/codegen_c_visitor.hpp index b5b58412a0..0fe4e1308c 100644 --- a/src/codegen/codegen_c_visitor.hpp +++ b/src/codegen/codegen_c_visitor.hpp @@ -218,23 +218,6 @@ class CodegenCVisitor: public visitor::ConstAstVisitor { return "\"" + text + "\""; } - - /** - * Operator for rhs vector update (matrix update) - */ - std::string operator_for_rhs() const noexcept { - return info.electrode_current ? "+=" : "-="; - } - - - /** - * Operator for diagonal vector update (matrix update) - */ - std::string operator_for_d() const noexcept { - return info.electrode_current ? 
"-=" : "+="; - } - - /** * Data type for the local variables */ diff --git a/src/codegen/codegen_cuda_visitor.cpp b/src/codegen/codegen_cuda_visitor.cpp index 199d9bdd66..92488f3d80 100644 --- a/src/codegen/codegen_cuda_visitor.cpp +++ b/src/codegen/codegen_cuda_visitor.cpp @@ -96,8 +96,8 @@ void CodegenCudaVisitor::print_device_method_annotation() { void CodegenCudaVisitor::print_nrn_cur_matrix_shadow_update() { - auto rhs_op = operator_for_rhs(); - auto d_op = operator_for_d(); + auto rhs_op = info.operator_for_rhs(); + auto d_op = info.operator_for_d(); stringutils::remove_character(rhs_op, '='); stringutils::remove_character(d_op, '='); print_atomic_op("vec_rhs[node_id]", rhs_op, "rhs"); @@ -109,8 +109,8 @@ void CodegenCudaVisitor::print_fast_imem_calculation() { return; } - auto rhs_op = operator_for_rhs(); - auto d_op = operator_for_d(); + auto rhs_op = info.operator_for_rhs(); + auto d_op = info.operator_for_d(); stringutils::remove_character(rhs_op, '='); stringutils::remove_character(d_op, '='); printer->start_block("if (nt->nrn_fast_imem)"); diff --git a/src/codegen/codegen_driver.cpp b/src/codegen/codegen_driver.cpp index b2e90e1a75..0bdf37a29c 100644 --- a/src/codegen/codegen_driver.cpp +++ b/src/codegen/codegen_driver.cpp @@ -179,7 +179,7 @@ bool CodegenDriver::prepare_mod(std::shared_ptr node, const std::s /// that old symbols (e.g. prime variables) are not lost update_symtab = true; - if (cfg.nmodl_inline) { + if (cfg.nmodl_inline || cfg.llvm_ir) { logger->info("Running nmodl inline visitor"); InlineVisitor().visit_program(*node); ast_to_nmodl(*node, filepath("inline", "mod")); diff --git a/src/codegen/codegen_info.cpp b/src/codegen/codegen_info.cpp index 522922552e..16b358ef1a 100644 --- a/src/codegen/codegen_info.cpp +++ b/src/codegen/codegen_info.cpp @@ -404,5 +404,34 @@ void CodegenInfo::get_float_variables() { } } +/** + * \details Certain statements like unit, comment, solve can/need to be skipped + * during code generation. Note that solve block is wrapped in expression + * statement and hence we have to check inner expression. It's also true + * for the initial block defined inside net receive block. + */ +bool CodegenInfo::statement_to_skip(const ast::Statement& node) const { + // clang-format off + if (node.is_unit_state() + || node.is_line_comment() + || node.is_block_comment() + || node.is_solve_block() + || node.is_conductance_hint() + || node.is_table_statement()) { + return true; + } + // clang-format on + if (node.is_expression_statement()) { + auto expression = dynamic_cast(&node)->get_expression(); + if (expression->is_solve_block()) { + return true; + } + if (expression->is_initial_block()) { + return true; + } + } + return false; +} + } // namespace codegen } // namespace nmodl diff --git a/src/codegen/codegen_info.hpp b/src/codegen/codegen_info.hpp index 5cbb8eb2aa..fb506a1344 100644 --- a/src/codegen/codegen_info.hpp +++ b/src/codegen/codegen_info.hpp @@ -590,6 +590,29 @@ struct CodegenInfo { } + /** + * Operator for rhs vector update (matrix update) + * + * Note that we only rely on following two syntax for + * increment and decrement. Code generation backends + * are relying on this convention. + */ + std::string operator_for_rhs() const noexcept { + return electrode_current ? "+=" : "-="; + } + + + /** + * Operator for diagonal vector update (matrix update) + * + * Note that we only rely on following two syntax for + * increment and decrement. Code generation backends + * are relying on this convention. 
+ */ + std::string operator_for_d() const noexcept { + return electrode_current ? "-=" : "+="; + } + /** * Check if net_receive function is required */ @@ -657,6 +680,13 @@ struct CodegenInfo { * \return A \c vector of \c float variables */ void get_float_variables(); + + /** + * Check if statement should be skipped for code generation + * @param node Statement to be checked for code generation + * @return True if statement should be skipped otherwise false + */ + bool statement_to_skip(const ast::Statement& node) const; }; /** @} */ // end of codegen_backends diff --git a/src/codegen/codegen_ispc_visitor.cpp b/src/codegen/codegen_ispc_visitor.cpp index 5d1c2de485..df04a96a41 100644 --- a/src/codegen/codegen_ispc_visitor.cpp +++ b/src/codegen/codegen_ispc_visitor.cpp @@ -248,8 +248,8 @@ void CodegenIspcVisitor::print_atomic_op(const std::string& lhs, void CodegenIspcVisitor::print_nrn_cur_matrix_shadow_reduction() { - auto rhs_op = operator_for_rhs(); - auto d_op = operator_for_d(); + const auto& rhs_op = info.operator_for_rhs(); + const auto& d_op = info.operator_for_d(); if (info.point_process) { printer->add_line("uniform int node_id = node_index[id];"); printer->add_line("vec_rhs[node_id] {} shadow_rhs[id];"_format(rhs_op)); diff --git a/src/codegen/codegen_naming.hpp b/src/codegen/codegen_naming.hpp index b6c8aa9df1..b1e1b82267 100644 --- a/src/codegen/codegen_naming.hpp +++ b/src/codegen/codegen_naming.hpp @@ -92,6 +92,12 @@ static constexpr char NTHREAD_RHS_SHADOW[] = "_shadow_rhs"; /// shadow d variable in neuron thread structure static constexpr char NTHREAD_D_SHADOW[] = "_shadow_d"; +/// rhs variable in neuron thread structure +static constexpr char NTHREAD_RHS[] = "vec_rhs"; + +/// d variable in neuron thread structure +static constexpr char NTHREAD_D[] = "vec_d"; + /// t variable in neuron thread structure static constexpr char NTHREAD_T_VARIABLE[] = "t"; diff --git a/src/codegen/llvm/codegen_llvm_helper_visitor.cpp b/src/codegen/llvm/codegen_llvm_helper_visitor.cpp index c029bb736f..5f8119a4d1 100644 --- a/src/codegen/llvm/codegen_llvm_helper_visitor.cpp +++ b/src/codegen/llvm/codegen_llvm_helper_visitor.cpp @@ -92,18 +92,18 @@ static std::shared_ptr create_statement_as_expression(const std } /** - * \brief Create expression for given NMODL code expression + * \brief Create an expression for a given NMODL expression in string form * @param code NMODL code expression - * @return Expression representing NMODL code + * @return Expression ast node representing NMODL code */ -std::shared_ptr create_expression(const std::string& code) { +static ast::Expression* create_expression(const std::string& code) { /// as provided code is only expression and not a full statement, create /// a temporary assignment statement const auto& wrapped_expr = create_statement_as_expression("some_var = " + code); /// now extract RHS (representing original code) and return it as expression auto expr = std::dynamic_pointer_cast(wrapped_expr)->get_expression(); auto rhs = std::dynamic_pointer_cast(expr)->get_rhs(); - return std::make_shared(rhs->clone()); + return new ast::WrappedExpression(rhs->clone()); } CodegenFunctionVector CodegenLLVMHelperVisitor::get_codegen_functions(const ast::Program& node) { @@ -246,35 +246,72 @@ std::shared_ptr CodegenLLVMHelperVisitor::create_instance_s add_var_with_type(naming::SECOND_ORDER_VARIABLE, INTEGER_TYPE, /*is_pointer=*/0); add_var_with_type(naming::MECH_NODECOUNT_VAR, INTEGER_TYPE, /*is_pointer=*/0); + // As we do not have `NrnThread` object as an argument, we 
store pointers to rhs + // and d in the instance struct as well. We also need their respective shadow variables + // in case of point process mechanisms. + // Note: shadow variables are not used at the moment because the reduction will be taken + // care of by the LLVM backend (even on CPU, via sequential add like ISPC). + add_var_with_type(naming::NTHREAD_RHS, FLOAT_TYPE, /*is_pointer=*/1); + add_var_with_type(naming::NTHREAD_D, FLOAT_TYPE, /*is_pointer=*/1); + add_var_with_type(naming::NTHREAD_RHS_SHADOW, FLOAT_TYPE, /*is_pointer=*/1); + add_var_with_type(naming::NTHREAD_D_SHADOW, FLOAT_TYPE, /*is_pointer=*/1); + return std::make_shared<ast::InstanceStruct>(codegen_vars); } +/** + * Append all code-specific statements from a StatementBlock to the given StatementVector + * @param statements Statement vector to which statements are added + * @param block Statement block from which statements should be appended + * @param info CodegenInfo object with necessary data and helper functions + */ static void append_statements_from_block(ast::StatementVector& statements, - const std::shared_ptr<ast::StatementBlock>& block) { - const auto& block_statements = block->get_statements(); - for (const auto& statement: block_statements) { - const auto& expression_statement = std::dynamic_pointer_cast<ast::ExpressionStatement>( - statement); - if (!expression_statement || !expression_statement->get_expression()->is_solve_block()) - statements.push_back(statement); + const std::shared_ptr<ast::StatementBlock> block, + const codegen::CodegenInfo& info) { + for (const auto& statement: block->get_statements()) { + if (!info.statement_to_skip(*statement)) { + statements.emplace_back(statement->clone()); + } } } +/** + * Create atomic statement for given expression of the form a[i] += expression + * @param var Name of the variable on the LHS (it's an array), e.g. `a` + * @param var_index Name of the index variable to access variable `var` e.g. `i` + * @param op_str Operator like += or -= + * @param rhs_str expression that will be added or subtracted from `var[var_index]` + * @return A statement representing atomic operation using `ast::CodegenAtomicStatement` + */ static std::shared_ptr<ast::Statement> create_atomic_statement( - std::string& ion_varname, - std::string& index_varname, - std::string& op_str, - std::string& rhs_str) { + const std::string& var, + const std::string& var_index, + const std::string& op_str, + const std::string& rhs_str) { // create lhs expression - auto varname = new ast::Name(new ast::String(ion_varname)); - auto index = new ast::Name(new ast::String(index_varname)); - auto lhs = std::make_shared<ast::VarName>(new ast::IndexedName(varname, index), - /*at=*/nullptr, - /*index=*/nullptr); - - auto op = ast::BinaryOperator(ast::string_to_binaryop(op_str)); - auto rhs = create_expression(rhs_str); - return std::make_shared<ast::CodegenAtomicStatement>(lhs, op, rhs); + auto varname = new ast::Name(new ast::String(var)); + auto index = new ast::Name(new ast::String(var_index)); + auto lhs = new ast::VarName(new ast::IndexedName(varname, index), + /*at=*/nullptr, + /*index=*/nullptr); + + // LLVM IR generation currently supports only assignment (=), not += or -=. + // So we need to write an increment operation a += b as an assignment operation + // a = a + b.
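+ // For example, a[i] += b is rewritten as a[i] = a[i] + b below.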
+ // See https://github.com/BlueBrain/nmodl/issues/851 + + std::string op(op_str); + stringutils::remove_character(op, '='); + + // make sure only + or - operator is used + if (op != "-" && op != "+") { + throw std::runtime_error("Unsupported binary operator for atomic statement"); + } + + auto* rhs = create_expression("{}[{}] {} {} "_format(var, var_index, op, rhs_str)); + return std::make_shared<ast::CodegenAtomicStatement>(lhs, + ast::BinaryOperator{ast::BOP_ASSIGN}, + rhs); } /** @@ -289,7 +326,7 @@ static std::shared_ptr<ast::Statement> create_atomic_statement( * @param type The type of code block being generated * @param int_variables Index variables to be created * @param double_variables Floating point variables to be created - * @param index_statements Statements for loading indexes (typically for ions) + * @param index_statements Statements for loading indexes (typically for ions, rhs, d) * @param body_statements main compute/update statements * * \todo After looking into mod2c and neuron implementation, it seems like @@ -377,8 +414,24 @@ void CodegenLLVMHelperVisitor::ion_write_statements(BlockType type, // push index definition, index statement and actual write statement int_variables.push_back(index_varname); index_statements.push_back(visitor::create_statement(index_statement)); + // pass ion variable to write and its index - body_statements.push_back(create_atomic_statement(ion_varname, index_varname, op, rhs)); + + // lhs variable + std::string lhs = "{}[{}] "_format(ion_varname, index_varname); + + // let's turn a += b into a = a + b if applicable + // note that this is done to match the existing implementation in the llvm + // backend, which doesn't support the += or -= operators. + std::string statement; + if (!op.compare("+=")) { + statement = "{} = {} + {}"_format(lhs, lhs, rhs); + } else if (!op.compare("-=")) { + statement = "{} = {} - {}"_format(lhs, lhs, rhs); + } else { + statement = "{} {} {}"_format(lhs, op, rhs); + } + body_statements.push_back(visitor::create_statement(statement)); }; /// iterate over all ions and create write ion statements for given block type @@ -397,7 +450,7 @@ void CodegenLLVMHelperVisitor::ion_write_statements(BlockType type, // for synapse type if (info.point_process) { auto area = codegen::naming::NODE_AREA_VARIABLE; - rhs += "*(1.e2/{})"_format(area); + rhs += "*(1.e2/{0}[{0}_id])"_format(area); } create_write_statements(lhs, op, rhs); } @@ -627,19 +680,17 @@ std::shared_ptr<ast::Expression> CodegenLLVMHelperVisitor::loop_count_expression * create new code generation function.
*/ void CodegenLLVMHelperVisitor::visit_nrn_state_block(ast::NrnStateBlock& node) { - /// statements for new function to be generated - ast::StatementVector function_statements; - - /// create vectors of local variables that would be used in compute part + // create vectors of local variables that would be used in compute part std::vector int_variables{"node_id"}; std::vector double_variables{"v"}; - /// create now main compute part - - /// compute body : initialization + solve blocks - ast::StatementVector def_statements; + // statements to load indexes for gather/scatter like variables ast::StatementVector index_statements; + + // statements for the main body of nrn_state ast::StatementVector body_statements; + + // prepare main body of the compute function { /// access node index and corresponding voltage index_statements.push_back( @@ -656,13 +707,13 @@ void CodegenLLVMHelperVisitor::visit_nrn_state_block(ast::NrnStateBlock& node) { const auto& solution = std::dynamic_pointer_cast(statement); const auto& block = std::dynamic_pointer_cast( solution->get_node_to_solve()); - append_statements_from_block(body_statements, block); + append_statements_from_block(body_statements, block, info); } /// add breakpoint block if no current if (info.currents.empty() && info.breakpoint_node != nullptr) { auto block = info.breakpoint_node->get_statement_block(); - append_statements_from_block(body_statements, block); + append_statements_from_block(body_statements, block, info); } /// write ion statements @@ -674,10 +725,12 @@ void CodegenLLVMHelperVisitor::visit_nrn_state_block(ast::NrnStateBlock& node) { /// create target-specific compute body ast::StatementVector compute_body; - compute_body.insert(compute_body.end(), def_statements.begin(), def_statements.end()); compute_body.insert(compute_body.end(), index_statements.begin(), index_statements.end()); compute_body.insert(compute_body.end(), body_statements.begin(), body_statements.end()); + /// statements for new function to be generated + ast::StatementVector function_statements; + std::vector induction_variables{naming::INDUCTION_VAR}; function_statements.push_back( create_local_variable_statement(induction_variables, INTEGER_TYPE)); @@ -696,9 +749,8 @@ void CodegenLLVMHelperVisitor::visit_nrn_state_block(ast::NrnStateBlock& node) { auto name = new ast::Name(new ast::String(function_name)); auto return_type = new ast::CodegenVarType(ast::AstNodeType::VOID); - /// \todo : currently there are no arguments + // argument to function: currently only instance structure ast::CodegenVarWithTypeVector code_arguments; - auto instance_var_type = new ast::CodegenVarType(ast::AstNodeType::INSTANCE_STRUCT); auto instance_var_name = new ast::Name(new ast::String(naming::MECH_INSTANCE_VAR)); auto instance_var = new ast::CodegenVarWithType(instance_var_type, 1, instance_var_name); @@ -709,7 +761,8 @@ void CodegenLLVMHelperVisitor::visit_nrn_state_block(ast::NrnStateBlock& node) { std::make_shared(return_type, name, code_arguments, function_block); codegen_functions.push_back(function); - std::cout << nmodl::to_nmodl(function) << std::endl; + // todo: remove this, temporary + std::cout << nmodl::to_nmodl(*function) << std::endl; } void CodegenLLVMHelperVisitor::create_gpu_compute_body(ast::StatementVector& body, @@ -802,6 +855,281 @@ void CodegenLLVMHelperVisitor::remove_inlined_nodes(ast::Program& node) { node.erase_node(nodes_to_erase); } +/** + * Print `nrn_cur` kernel with`CONDUCTANCE` statements in the BREAKPOINT block + * @param node Ast node representing 
BREAKPOINT block + * @param int_variables Vector of integer variables in the kernel being generated + * @param double_variables Vector of double variables in the kernel being generated + * @param index_statements Statements for loading indexes (typically for ions, rhs, d) + * @param body_statements Vector of statements representing loop body of the `nrn_cur` kernel + */ +void CodegenLLVMHelperVisitor::print_nrn_cur_conductance_kernel( + const ast::BreakpointBlock& node, + std::vector& int_variables, + std::vector& double_variables, + ast::StatementVector& index_statements, + ast::StatementVector& body_statements) { + // TODO: this is not used by default but only with sympy --conductance option. This should be + // implemented later and hence just throw an error for now. + throw std::runtime_error( + "BREAKPOINT block with CONDUCTANCE statements is not supported in the LLVM backend yet"); +} + +/** + * Print `nrn_current` function that is typically generated as part of `nrn_cur()` + * @param node Ast node representing BREAKPOINT block + * @param body_statements Vector of statements representing loop body of the `nrn_cur` kernel + * @param variable Variable to which computed current will be assigned + */ +void CodegenLLVMHelperVisitor::print_nrn_current_body(const ast::BreakpointBlock& node, + ast::StatementVector& body_statements, + const std::string& variable) { + ast::StatementVector statements; + + // starts with current initialized to 0 + statements.emplace_back(visitor::create_statement("current = 0")); + + // append compatible code statements from the breakpoint block + append_statements_from_block(statements, node.get_statement_block(), info); + + // sum now all currents + for (auto& current: info.currents) { + statements.emplace_back( + visitor::create_statement("current = current + {}"_format(current))); + } + + // assign computed current to the given variable + statements.emplace_back(visitor::create_statement("{} = current"_format(variable))); + + // create StatementBlock for better readability of the generated code and add that to the main + // body statements + body_statements.emplace_back(new ast::ExpressionStatement(new ast::StatementBlock(statements))); +} + +/** + * Print `nrn_cur` kernel without `CONDUCTANCE` statements in the BREAKPOINT block + * @param node Ast node representing BREAKPOINT block + * @param int_variables Vector of integer variables in the kernel being generated + * @param double_variables Vector of double variables in the kernel being generated + * @param index_statements Statements for loading indexes (typically for ions, rhs, d) + * @param body_statements Vector of statements representing loop body of the `nrn_cur` kernel + */ +void CodegenLLVMHelperVisitor::print_nrn_cur_non_conductance_kernel( + const ast::BreakpointBlock& node, + std::vector& int_variables, + std::vector& double_variables, + ast::StatementVector& index_statements, + ast::StatementVector& body_statements) { + // add double variables needed in the local scope + double_variables.emplace_back("g"); + double_variables.emplace_back("rhs"); + double_variables.emplace_back("v_org"); + double_variables.emplace_back("current"); + + // store original voltage value as we are going to calculate current with v + 0.001 + body_statements.emplace_back(visitor::create_statement("v_org = v")); + + // first current calculation with v+0.001 and assign it to variable g + body_statements.emplace_back(visitor::create_statement("v = v + 0.001")); + print_nrn_current_body(node, body_statements, "g"); + + // 
now store all ionic currents to local variable + for (const auto& ion: info.ions) { + for (const auto& var: ion.writes) { + if (ion.is_ionic_current(var)) { + // also create local variable + std::string name{"di{}"_format(ion.name)}; + double_variables.emplace_back(name); + body_statements.emplace_back( + visitor::create_statement("{} = {}"_format(name, var))); + } + } + } + + // now restore original v and calculate current and store it in rhs + body_statements.emplace_back(visitor::create_statement("v = v_org")); + print_nrn_current_body(node, body_statements, "rhs"); + + // calculate g + body_statements.emplace_back(visitor::create_statement("g = (g-rhs)/0.001")); + + // in case of point process we need to load area from another vector. + if (info.point_process) { + // create integer variable for index and then load value from area_index vector + int_variables.emplace_back("{}_id"_format(naming::NODE_AREA_VARIABLE)); + index_statements.emplace_back(visitor::create_statement( + " {0}_id = {0}_index[id]"_format(naming::NODE_AREA_VARIABLE))); + } + + // update all ionic currents now + for (const auto& ion: info.ions) { + for (const auto& var: ion.writes) { + if (ion.is_ionic_current(var)) { + // variable on the lhs + std::string lhs{"{}di{}dv"_format(naming::ION_VARNAME_PREFIX, ion.name)}; + + // expression on the rhs + std::string rhs{"(di{}-{})/0.001"_format(ion.name, var)}; + if (info.point_process) { + rhs += "*1.e2/{0}[{0}_id]"_format(naming::NODE_AREA_VARIABLE); + } + + // load the index for lhs variable + int_variables.emplace_back(lhs + "_id"); + std::string index_statement{"{}_id = {}_index[id]"_format(lhs, lhs)}; + index_statements.emplace_back(visitor::create_statement(index_statement)); + + // add statement that actually updates the + body_statements.emplace_back( + visitor::create_statement("{0}[{0}_id] = {0}[{0}_id] + {1}"_format(lhs, rhs))); + } + } + } +} + +/** + * \brief Convert ast::BreakpointBlock to corresponding code generation function nrn_cur + * @param node AST node representing ast::BreakpointBlock + * + * The BREAKPOINT block from MOD file (ast::NrnStateBlock node in the AST) is converted + * to `nrn_cur` function in the generated CPP code via various transformations. Here we + * perform those transformations and create new codegen node in the AST. 
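+ *
+ * The resulting function has roughly the following shape (cf. the expected
+ * outputs in test/unit/codegen/codegen_llvm_visitor.cpp):
+ *
+ *     VOID nrn_cur_<suffix>(INSTANCE_STRUCT *mech) {
+ *         INTEGER id
+ *         for (id = 0; id < mech->node_count; id = id + 1) {
+ *             ... load indexes and v, read ions, evaluate the currents twice
+ *             ... (at v + 0.001 and at v) to obtain g and rhs, write ion
+ *             ... currents, then update vec_rhs[node_id] and vec_d[node_id]
+ *         }
+ *     }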
+ */
+void CodegenLLVMHelperVisitor::visit_breakpoint_block(ast::BreakpointBlock& node) {
+    // no-op in case there are no currents or the breakpoint block doesn't exist
+    if (!info.nrn_cur_required()) {
+        return;
+    }
+
+    /// local variables in the function scope for integer and double variables
+    std::vector int_variables{"node_id"};
+    std::vector double_variables{"v"};
+
+    /// statements to load indexes for gather/scatter like expressions
+    ast::StatementVector index_statements;
+
+    /// statements for the rest of the compute body
+    ast::StatementVector body_statements;
+
+    /// prepare all function statements
+    {
+        /// access node index and corresponding voltage
+        index_statements.push_back(
+            visitor::create_statement("node_id = node_index[{}]"_format(naming::INDUCTION_VAR)));
+        body_statements.push_back(visitor::create_statement("v = {}[node_id]"_format(VOLTAGE_VAR)));
+
+        /// read ion variables
+        ion_read_statements(BlockType::Equation,
+                            int_variables,
+                            double_variables,
+                            index_statements,
+                            body_statements);
+
+        /// print main current kernel based on whether or not CONDUCTANCE statements exist
+        if (info.conductances.empty()) {
+            print_nrn_cur_non_conductance_kernel(
+                node, int_variables, double_variables, index_statements, body_statements);
+        } else {
+            print_nrn_cur_conductance_kernel(
+                node, int_variables, double_variables, index_statements, body_statements);
+        }
+
+        /// add write ion statements
+        ion_write_statements(BlockType::Equation,
+                             int_variables,
+                             double_variables,
+                             index_statements,
+                             body_statements);
+
+        /// in case of point process, we have to scale values based on the area
+        if (info.point_process) {
+            double_variables.emplace_back("mfactor");
+            body_statements.emplace_back(visitor::create_statement(
+                "mfactor = 1.e2/{0}[{0}_id]"_format(naming::NODE_AREA_VARIABLE)));
+            body_statements.emplace_back(visitor::create_statement("g = g*mfactor"));
+            body_statements.emplace_back(visitor::create_statement("rhs = rhs*mfactor"));
+        }
+
+        /// as multiple point processes can exist at the same node, with simd or gpu execution we
+        /// have to create atomic statements that will be handled by llvm ir generation
+        // \todo note that we are not creating rhs and d updates based on the shadow vectors. This
+        //   is because the llvm backend for cpu as well as gpu is going to take care of
+        //   reductions. if these codegen functions are used for the C backend then we will need
+        //   to implement a separate reduction loop like mod2c or nmodl's c backend.
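+        // e.g. for a density mechanism this emits the plain updates
+        //     vec_rhs[node_id] = vec_rhs[node_id] - rhs
+        //     vec_d[node_id] = vec_d[node_id] + g
+        // while for a point process on SIMD/GPU platforms the same updates are
+        // created as ast::CodegenAtomicStatement nodes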
+    if (info.point_process && (platform.is_gpu() || platform.is_cpu_with_simd())) {
+        body_statements.emplace_back(create_atomic_statement(
+            naming::NTHREAD_RHS, "node_id", info.operator_for_rhs(), "rhs"));
+        body_statements.emplace_back(create_atomic_statement(
+            naming::NTHREAD_D, "node_id", info.operator_for_d(), "g"));
+    } else {
+        auto rhs_op(info.operator_for_rhs());
+        auto d_op(info.operator_for_d());
+
+        // convert a += b to a = a + b, see BlueBrain/nmodl/issues/851,
+        // hence write the updates of rhs and d in the form of assignment statements
+        stringutils::remove_character(rhs_op, '=');
+        stringutils::remove_character(d_op, '=');
+
+        body_statements.emplace_back(visitor::create_statement(
+            "vec_rhs[node_id] = vec_rhs[node_id] {} rhs"_format(rhs_op)));
+        body_statements.emplace_back(
+            visitor::create_statement("vec_d[node_id] = vec_d[node_id] {} g"_format(d_op)));
+    }
+    }
+
+    /// now create codegen function
+    {
+        /// compute body, index loading statements at the beginning and then compute functions
+        ast::StatementVector compute_body;
+        compute_body.insert(compute_body.end(), index_statements.begin(), index_statements.end());
+        compute_body.insert(compute_body.end(), body_statements.begin(), body_statements.end());
+
+        /// statements for new function to be generated
+        ast::StatementVector function_statements;
+
+        std::vector induction_variables{naming::INDUCTION_VAR};
+        function_statements.push_back(
+            create_local_variable_statement(induction_variables, INTEGER_TYPE));
+
+        if (platform.is_gpu()) {
+            create_gpu_compute_body(compute_body,
+                                    function_statements,
+                                    int_variables,
+                                    double_variables);
+        } else {
+            create_cpu_compute_body(compute_body,
+                                    function_statements,
+                                    int_variables,
+                                    double_variables);
+        }
+
+        /// new block for the function
+        auto function_block = new ast::StatementBlock(function_statements);
+
+        /// name of the function and its return type
+        std::string function_name = "nrn_cur_" + stringutils::tolower(info.mod_suffix);
+        auto name = new ast::Name(new ast::String(function_name));
+        auto return_type = new ast::CodegenVarType(ast::AstNodeType::VOID);
+
+        /// only instance struct as an argument for now
+        ast::CodegenVarWithTypeVector code_arguments;
+        auto instance_var_type = new ast::CodegenVarType(ast::AstNodeType::INSTANCE_STRUCT);
+        auto instance_var_name = new ast::Name(new ast::String(naming::MECH_INSTANCE_VAR));
+        auto instance_var = new ast::CodegenVarWithType(instance_var_type, 1, instance_var_name);
+        code_arguments.emplace_back(instance_var);
+
+        /// finally, create new function
+        auto function = std::make_shared(return_type,
+                                         name,
+                                         code_arguments,
+                                         function_block);
+        codegen_functions.push_back(function);
+
+        // todo: remove this, temporary
+        std::cout << nmodl::to_nmodl(*function) << std::endl;
+    }
+}
+
 void CodegenLLVMHelperVisitor::visit_program(ast::Program& node) {
     /// run codegen helper visitor to collect information
     CodegenHelperVisitor v;
diff --git a/src/codegen/llvm/codegen_llvm_helper_visitor.hpp b/src/codegen/llvm/codegen_llvm_helper_visitor.hpp
index aea2f5aea8..ab554521fc 100644
--- a/src/codegen/llvm/codegen_llvm_helper_visitor.hpp
+++ b/src/codegen/llvm/codegen_llvm_helper_visitor.hpp
@@ -167,6 +167,13 @@ class CodegenLLVMHelperVisitor: public visitor::AstVisitor {
     void visit_procedure_block(ast::ProcedureBlock& node) override;
     void visit_function_block(ast::FunctionBlock& node) override;
     void visit_nrn_state_block(ast::NrnStateBlock& node) override;
+
+    /**
+     * \brief Convert ast::BreakpointBlock to corresponding code generation
function nrn_cur + * @param node AST node representing ast::BreakpointBlock + */ + void visit_breakpoint_block(ast::BreakpointBlock& node) override; + void visit_program(ast::Program& node) override; private: @@ -195,6 +202,20 @@ class CodegenLLVMHelperVisitor: public visitor::AstVisitor { std::vector& int_variables, std::vector& double_variables, bool is_remainder_loop = false); + + void print_nrn_current_body(const ast::BreakpointBlock& node, + ast::StatementVector& body_statements, + const std::string& variable); + void print_nrn_cur_non_conductance_kernel(const ast::BreakpointBlock& node, + std::vector& int_variables, + std::vector& double_variables, + ast::StatementVector& index_statements, + ast::StatementVector& body_statements); + void print_nrn_cur_conductance_kernel(const ast::BreakpointBlock& node, + std::vector& int_variables, + std::vector& double_variables, + ast::StatementVector& index_statements, + ast::StatementVector& body_statements); }; /** @} */ // end of llvm_codegen_details diff --git a/src/codegen/llvm/codegen_llvm_visitor.cpp b/src/codegen/llvm/codegen_llvm_visitor.cpp index 8dfb66e9e0..9e159f7aff 100644 --- a/src/codegen/llvm/codegen_llvm_visitor.cpp +++ b/src/codegen/llvm/codegen_llvm_visitor.cpp @@ -1098,5 +1098,11 @@ void CodegenLLVMVisitor::visit_while_statement(const ast::WhileStatement& node) ir_builder.set_insertion_point(exit); } +// for the llvm backend we only support breakpoint and derivative blocks +void CodegenLLVMVisitor::print_compute_functions() { + print_nrn_cur(); + print_nrn_state(); +} + } // namespace codegen } // namespace nmodl diff --git a/src/codegen/llvm/codegen_llvm_visitor.hpp b/src/codegen/llvm/codegen_llvm_visitor.hpp index a7af83721c..0862307337 100644 --- a/src/codegen/llvm/codegen_llvm_visitor.hpp +++ b/src/codegen/llvm/codegen_llvm_visitor.hpp @@ -299,6 +299,9 @@ class CodegenLLVMVisitor: public CodegenCVisitor { /// the kernel. void wrap_kernel_functions(); + /// print compute functions relevant for this backend + void print_compute_functions() override; + private: // Annotates kernel function with NVVM metadata. 
void annotate_kernel_with_nvvm(llvm::Function* kernel);
diff --git a/src/codegen/llvm/llvm_ir_builder.cpp b/src/codegen/llvm/llvm_ir_builder.cpp
index b99cc81817..82cb820049 100644
--- a/src/codegen/llvm/llvm_ir_builder.cpp
+++ b/src/codegen/llvm/llvm_ir_builder.cpp
@@ -59,9 +59,14 @@ llvm::Type* IRBuilder::get_void_type() {

 llvm::Type* IRBuilder::get_struct_ptr_type(const std::string& struct_type_name,
                                            TypeVector& member_types) {
-    llvm::StructType* llvm_struct_type = llvm::StructType::create(builder.getContext(),
-                                                                  struct_type_name);
-    llvm_struct_type->setBody(member_types);
+    llvm::StructType* llvm_struct_type = llvm::StructType::getTypeByName(builder.getContext(),
+                                                                         struct_type_name);
+
+    if (!llvm_struct_type) {
+        llvm_struct_type = llvm::StructType::create(builder.getContext(), struct_type_name);
+        llvm_struct_type->setBody(member_types);
+    }
+
     return llvm::PointerType::get(llvm_struct_type, /*AddressSpace=*/0);
 }

diff --git a/src/language/nmodl.yaml b/src/language/nmodl.yaml
index 2bafd00af5..7c087d7b2f 100644
--- a/src/language/nmodl.yaml
+++ b/src/language/nmodl.yaml
@@ -1579,11 +1579,28 @@
         \sa nmodl::visitor::SympyConductanceVisitor

   - ExpressionStatement:
-      brief: "TODO"
       members:
         - expression:
-            brief: "TODO"
+            brief: "An expression representing a construct in the mod file"
             type: Expression
+      brief: "Represents a statement encapsulated by an underlying expression of AST node type Expression"
+      description: |
+        Certain statements defined in NMODL are more complex than typical "single line" statements.
+        For example, a SOLVE block is often written as:
+
+            SOLVE states METHOD cnexp
+
+        but the language allows it to be more complex:
+
+            SOLVE states METHOD cnexp {
+                statement_1
+                statement_2
+            }
+
+        So this type of construct is not really a "single line" statement. There are other such
+        cases that are categorised as "statement" in the bison specification. Also, there are
+        cases where a binary expression `a = b` is a full statement on its own.
+        In all such cases we wrap the underlying expression as a statement using the
+        ExpressionStatement node.

   - ProtectStatement:
       brief: "TODO"
diff --git a/test/unit/codegen/codegen_llvm_execution.cpp b/test/unit/codegen/codegen_llvm_execution.cpp
index b5502cab51..ce0c644451 100644
--- a/test/unit/codegen/codegen_llvm_execution.cpp
+++ b/test/unit/codegen/codegen_llvm_execution.cpp
@@ -287,6 +287,7 @@ SCENARIO("Simple scalar kernel", "[llvm][runner]") {
             v
             x0
             x1
+            i (mA/cm2)
         }

         BREAKPOINT {
@@ -370,6 +371,7 @@ SCENARIO("Simple vectorised kernel", "[llvm][runner]") {
             v
             x0
             x1
+            i (mA/cm2)
         }

         BREAKPOINT {
diff --git a/test/unit/codegen/codegen_llvm_ir.cpp b/test/unit/codegen/codegen_llvm_ir.cpp
index a680206271..b19ff95066 100644
--- a/test/unit/codegen/codegen_llvm_ir.cpp
+++ b/test/unit/codegen/codegen_llvm_ir.cpp
@@ -951,10 +951,16 @@ SCENARIO("Scalar state kernel", "[visitor][llvm]") {
             m
         }

+        PARAMETER {
+            gl = .0003 (S/cm2) <0,1e9>
+            el = -54.3 (mV)
+        }
+
         ASSIGNED {
             v (mV)
             minf
             mtau (ms)
+            il (mA/cm2)
         }

         BREAKPOINT {
@@ -974,7 +980,8 @@ SCENARIO("Scalar state kernel", "[visitor][llvm]") {

         // Check the struct type with correct attributes and the kernel declaration.
std::regex struct_type( "%.*__instance_var__type = type \\{ double\\*, double\\*, double\\*, double\\*, " - "double\\*, double\\*, double\\*, i32\\*, double, double, double, i32, i32 \\}"); + "double\\*, double\\*, double\\*, double\\*, double\\*, double\\*, i32\\*, double, " + "double, double, i32, i32, double\\*, double\\*, double\\*, double\\* \\}"); std::regex kernel_declaration( R"(define void @nrn_state_hh\(%.*__instance_var__type\* noalias nocapture readonly .*\) #0)"); REQUIRE(std::regex_search(module_string, m, struct_type)); @@ -1063,6 +1070,7 @@ SCENARIO("Vectorised simple kernel", "[visitor][llvm]") { ASSIGNED { v (mV) + i (mA/cm2) } BREAKPOINT { @@ -1251,7 +1259,7 @@ SCENARIO("Scalar derivative block", "[visitor][llvm][derivative]") { } )"; - std::string expected_loop = R"( + std::string expected_state_loop = R"( for(id = 0; idnode_count; id = id+1) { node_id = mech->node_index[id] v = mech->voltage[node_id] @@ -1263,10 +1271,10 @@ SCENARIO("Scalar derivative block", "[visitor][llvm][derivative]") { auto result = run_llvm_visitor_helper(nmodl_text, default_platform, {ast::AstNodeType::CODEGEN_FOR_STATEMENT}); - REQUIRE(result.size() == 1); + REQUIRE(result.size() == 2); - auto main_loop = reindent_text(to_nmodl(result[0])); - REQUIRE(main_loop == reindent_text(expected_loop)); + auto main_state_loop = reindent_text(to_nmodl(result[1])); + REQUIRE(main_state_loop == reindent_text(expected_state_loop)); } } } @@ -1276,39 +1284,86 @@ SCENARIO("Vectorised derivative block", "[visitor][llvm][derivative]") { std::string nmodl_text = R"( NEURON { SUFFIX hh + USEION na READ ena WRITE ina NONSPECIFIC_CURRENT il - RANGE minf, mtau + RANGE minf, mtau, gna, gnabar } STATE { - m + m h + } + PARAMETER { + gnabar = .12 (S/cm2) <0,1e9> } ASSIGNED { v (mV) minf mtau (ms) + ena (mV) + ina (mA/cm2) + gna (S/cm2) } BREAKPOINT { SOLVE states METHOD cnexp - il = 2 + gna = gnabar*m*m*m*h + ina = gna*(v - ena) } DERIVATIVE states { m = (minf-m)/mtau } )"; - std::string expected_main_loop = R"( + std::string expected_state_main_loop = R"( for(id = 0; idnode_count-7; id = id+8) { node_id = mech->node_index[id] + ena_id = mech->ion_ena_index[id] v = mech->voltage[node_id] + mech->ena[id] = mech->ion_ena[ena_id] mech->m[id] = (mech->minf[id]-mech->m[id])/mech->mtau[id] })"; - std::string expected_epilogue_loop = R"( + + std::string expected_state_epilogue_loop = R"( for(; idnode_count; id = id+1) { epilogue_node_id = mech->node_index[id] + epilogue_ena_id = mech->ion_ena_index[id] epilogue_v = mech->voltage[epilogue_node_id] + mech->ena[id] = mech->ion_ena[epilogue_ena_id] mech->m[id] = (mech->minf[id]-mech->m[id])/mech->mtau[id] })"; + std::string expected_cur_main_loop = R"( + for(id = 0; idnode_count-7; id = id+8) { + node_id = mech->node_index[id] + ena_id = mech->ion_ena_index[id] + ion_dinadv_id = mech->ion_dinadv_index[id] + ion_ina_id = mech->ion_ina_index[id] + v = mech->voltage[node_id] + mech->ena[id] = mech->ion_ena[ena_id] + v_org = v + v = v+0.001 + { + current = 0 + mech->gna[id] = mech->gnabar[id]*mech->m[id]*mech->m[id]*mech->m[id]*mech->h[id] + mech->ina[id] = mech->gna[id]*(v-mech->ena[id]) + current = current+il + current = current+mech->ina[id] + g = current + } + dina = mech->ina[id] + v = v_org + { + current = 0 + mech->gna[id] = mech->gnabar[id]*mech->m[id]*mech->m[id]*mech->m[id]*mech->h[id] + mech->ina[id] = mech->gna[id]*(v-mech->ena[id]) + current = current+il + current = current+mech->ina[id] + rhs = current + } + g = (g-rhs)/0.001 + mech->ion_dinadv[ion_dinadv_id] = 
mech->ion_dinadv[ion_dinadv_id]+(dina-mech->ina[id])/0.001 + mech->ion_ina[ion_ina_id] = mech->ion_ina[ion_ina_id]+mech->ina[id] + mech->vec_rhs[node_id] = mech->vec_rhs[node_id]-rhs + mech->vec_d[node_id] = mech->vec_d[node_id]+g + })"; THEN("vector and epilogue scalar loops are constructed") { codegen::Platform simd_platform(/*use_single_precision=*/false, @@ -1316,13 +1371,16 @@ SCENARIO("Vectorised derivative block", "[visitor][llvm][derivative]") { auto result = run_llvm_visitor_helper(nmodl_text, simd_platform, {ast::AstNodeType::CODEGEN_FOR_STATEMENT}); - REQUIRE(result.size() == 2); + REQUIRE(result.size() == 4); + + auto cur_main_loop = reindent_text(to_nmodl(result[0])); + REQUIRE(cur_main_loop == reindent_text(expected_cur_main_loop)); - auto main_loop = reindent_text(to_nmodl(result[0])); - REQUIRE(main_loop == reindent_text(expected_main_loop)); + auto state_main_loop = reindent_text(to_nmodl(result[2])); + REQUIRE(state_main_loop == reindent_text(expected_state_main_loop)); - auto epilogue_loop = reindent_text(to_nmodl(result[1])); - REQUIRE(epilogue_loop == reindent_text(expected_epilogue_loop)); + auto state_epilogue_loop = reindent_text(to_nmodl(result[3])); + REQUIRE(state_epilogue_loop == reindent_text(expected_state_epilogue_loop)); } } } @@ -1343,6 +1401,7 @@ SCENARIO("Vector library calls", "[visitor][llvm][vector_lib]") { } ASSIGNED { v (mV) + il (mA/cm2) } BREAKPOINT { SOLVE states METHOD cnexp diff --git a/test/unit/codegen/codegen_llvm_visitor.cpp b/test/unit/codegen/codegen_llvm_visitor.cpp index d2a058b3c5..1906d0d27c 100644 --- a/test/unit/codegen/codegen_llvm_visitor.cpp +++ b/test/unit/codegen/codegen_llvm_visitor.cpp @@ -13,6 +13,7 @@ #include "config/config.h" #include "parser/nmodl_driver.hpp" #include "test/unit/utils/test_utils.hpp" +#include "visitors/inline_visitor.hpp" #include "visitors/neuron_solve_visitor.hpp" #include "visitors/solve_block_visitor.hpp" #include "visitors/symtab_visitor.hpp" @@ -49,6 +50,23 @@ std::string get_wrapper_instance_struct(const std::string& nmodl_text) { return strbuf.str(); } +// Run LLVM codegen helper visitor with given platform as target +static std::vector> run_llvm_visitor_helper( + const std::string& text, + codegen::Platform& platform, + const std::vector& nodes_to_collect) { + NmodlDriver driver; + const auto& ast = driver.parse_string(text); + + SymtabVisitor().visit_program(*ast); + InlineVisitor().visit_program(*ast); + NeuronSolveVisitor().visit_program(*ast); + SolveBlockVisitor().visit_program(*ast); + CodegenLLVMHelperVisitor(platform).visit_program(*ast); + + return collect_nodes(*ast, nodes_to_collect); +} + SCENARIO("Check instance struct declaration and setup in wrapper", "[codegen][llvm][instance_struct]") { GIVEN("hh: simple mod file") { @@ -158,6 +176,10 @@ SCENARIO("Check instance struct declaration and setup in wrapper", double celsius; int secondorder; int node_count; + double* __restrict__ vec_rhs; + double* __restrict__ vec_d; + double* __restrict__ _shadow_rhs; + double* __restrict__ _shadow_d; }; )"; std::string generated_instance_struct_setup = R"( @@ -213,10 +235,9 @@ SCENARIO("Check instance struct declaration and setup in wrapper", } )"; - THEN("index and nt variables") { + THEN("index and nt variables created correctly") { auto result_instance_struct_declaration_setup = reindent_text( get_wrapper_instance_struct(nmodl_text)); - std::cout << "Result\n" << result_instance_struct_declaration_setup << std::endl; auto expected_instance_struct_declaration = reindent_text( 
generated_instance_struct_declaration); @@ -229,3 +250,382 @@ SCENARIO("Check instance struct declaration and setup in wrapper", } } } + + +SCENARIO("Channel: Derivative and breakpoint block llvm transformations", + "[visitor][llvm_helper][channel]") { + GIVEN("A hh.mod file with derivative and breakpoint block") { + std::string nmodl_text = R"( + TITLE hh.mod squid sodium, potassium, and leak channels + + UNITS { + (mA) = (milliamp) + (mV) = (millivolt) + (S) = (siemens) + } + + NEURON { + SUFFIX hh + USEION na READ ena WRITE ina + USEION k READ ek WRITE ik + NONSPECIFIC_CURRENT il + RANGE gnabar, gkbar, gl, el, gna, gk + RANGE minf, hinf, ninf, mtau, htau, ntau + THREADSAFE + } + + PARAMETER { + gnabar = .12 (S/cm2) <0,1e9> + gkbar = .036 (S/cm2) <0,1e9> + gl = .0003 (S/cm2) <0,1e9> + el = -54.3 (mV) + } + + STATE { + m + h + n + } + + ASSIGNED { + v (mV) + celsius (degC) + ena (mV) + ek (mV) + gna (S/cm2) + gk (S/cm2) + ina (mA/cm2) + ik (mA/cm2) + il (mA/cm2) + minf + hinf + ninf + mtau (ms) + htau (ms) + ntau (ms) + } + + BREAKPOINT { + SOLVE states METHOD cnexp + gna = gnabar*m*m*m*h + ina = gna*(v-ena) + gk = gkbar*n*n*n*n + ik = gk*(v-ek) + il = gl*(v-el) + } + + DERIVATIVE states { + rates(v) + m' = (minf-m)/mtau + h' = (hinf-h)/htau + n' = (ninf-n)/ntau + } + + PROCEDURE rates(v(mV)) { + LOCAL alpha, beta, sum, q10 + UNITSOFF + q10 = 3^((celsius-6.3)/10) + alpha = .1*vtrap(-(v+40), 10) + beta = 4*exp(-(v+65)/18) + sum = alpha+beta + mtau = 1/(q10*sum) + minf = alpha/sum + alpha = .07*exp(-(v+65)/20) + beta = 1/(exp(-(v+35)/10)+1) + sum = alpha+beta + htau = 1/(q10*sum) + hinf = alpha/sum + alpha = .01*vtrap(-(v+55), 10) + beta = .125*exp(-(v+65)/80) + sum = alpha+beta + ntau = 1/(q10*sum) + ninf = alpha/sum + } + + FUNCTION vtrap(x, y) { + IF (fabs(x/y)<1e-6) { + vtrap = y*(1-x/y/2) + } ELSE { + vtrap = x/(exp(x/y)-1) + } + } + )"; + + std::string expected_state_function = R"( + VOID nrn_state_hh(INSTANCE_STRUCT *mech){ + INTEGER id + INTEGER node_id, ena_id, ek_id + DOUBLE v + for(id = 0; idnode_count; id = id+1) { + node_id = mech->node_index[id] + ena_id = mech->ion_ena_index[id] + ek_id = mech->ion_ek_index[id] + v = mech->voltage[node_id] + mech->ena[id] = mech->ion_ena[ena_id] + mech->ek[id] = mech->ion_ek[ek_id] + { + DOUBLE alpha, beta, sum, q10, vtrap_in_0, vtrap_in_1, v_in_0 + v_in_0 = v + UNITSOFF + q10 = 3^((mech->celsius-6.3)/10) + { + DOUBLE x_in_0, y_in_0 + x_in_0 = -(v_in_0+40) + y_in_0 = 10 + IF (fabs(x_in_0/y_in_0)<1e-6) { + vtrap_in_0 = y_in_0*(1-x_in_0/y_in_0/2) + } ELSE { + vtrap_in_0 = x_in_0/(exp(x_in_0/y_in_0)-1) + } + } + alpha = .1*vtrap_in_0 + beta = 4*exp(-(v_in_0+65)/18) + sum = alpha+beta + mech->mtau[id] = 1/(q10*sum) + mech->minf[id] = alpha/sum + alpha = .07*exp(-(v_in_0+65)/20) + beta = 1/(exp(-(v_in_0+35)/10)+1) + sum = alpha+beta + mech->htau[id] = 1/(q10*sum) + mech->hinf[id] = alpha/sum + { + DOUBLE x_in_1, y_in_1 + x_in_1 = -(v_in_0+55) + y_in_1 = 10 + IF (fabs(x_in_1/y_in_1)<1e-6) { + vtrap_in_1 = y_in_1*(1-x_in_1/y_in_1/2) + } ELSE { + vtrap_in_1 = x_in_1/(exp(x_in_1/y_in_1)-1) + } + } + alpha = .01*vtrap_in_1 + beta = .125*exp(-(v_in_0+65)/80) + sum = alpha+beta + mech->ntau[id] = 1/(q10*sum) + mech->ninf[id] = alpha/sum + } + mech->m[id] = mech->m[id]+(1.0-exp(mech->dt*((((-1.0)))/mech->mtau[id])))*(-(((mech->minf[id]))/mech->mtau[id])/((((-1.0)))/mech->mtau[id])-mech->m[id]) + mech->h[id] = mech->h[id]+(1.0-exp(mech->dt*((((-1.0)))/mech->htau[id])))*(-(((mech->hinf[id]))/mech->htau[id])/((((-1.0)))/mech->htau[id])-mech->h[id]) + 
mech->n[id] = mech->n[id]+(1.0-exp(mech->dt*((((-1.0)))/mech->ntau[id])))*(-(((mech->ninf[id]))/mech->ntau[id])/((((-1.0)))/mech->ntau[id])-mech->n[id]) + } + })"; + + std::string expected_cur_function = R"( + VOID nrn_cur_hh(INSTANCE_STRUCT *mech){ + INTEGER id + INTEGER node_id, ena_id, ek_id, ion_dinadv_id, ion_dikdv_id, ion_ina_id, ion_ik_id + DOUBLE v, g, rhs, v_org, current, dina, dik + for(id = 0; idnode_count; id = id+1) { + node_id = mech->node_index[id] + ena_id = mech->ion_ena_index[id] + ek_id = mech->ion_ek_index[id] + ion_dinadv_id = mech->ion_dinadv_index[id] + ion_dikdv_id = mech->ion_dikdv_index[id] + ion_ina_id = mech->ion_ina_index[id] + ion_ik_id = mech->ion_ik_index[id] + v = mech->voltage[node_id] + mech->ena[id] = mech->ion_ena[ena_id] + mech->ek[id] = mech->ion_ek[ek_id] + v_org = v + v = v+0.001 + { + current = 0 + mech->gna[id] = mech->gnabar[id]*mech->m[id]*mech->m[id]*mech->m[id]*mech->h[id] + mech->ina[id] = mech->gna[id]*(v-mech->ena[id]) + mech->gk[id] = mech->gkbar[id]*mech->n[id]*mech->n[id]*mech->n[id]*mech->n[id] + mech->ik[id] = mech->gk[id]*(v-mech->ek[id]) + mech->il[id] = mech->gl[id]*(v-mech->el[id]) + current = current+mech->il[id] + current = current+mech->ina[id] + current = current+mech->ik[id] + g = current + } + dina = mech->ina[id] + dik = mech->ik[id] + v = v_org + { + current = 0 + mech->gna[id] = mech->gnabar[id]*mech->m[id]*mech->m[id]*mech->m[id]*mech->h[id] + mech->ina[id] = mech->gna[id]*(v-mech->ena[id]) + mech->gk[id] = mech->gkbar[id]*mech->n[id]*mech->n[id]*mech->n[id]*mech->n[id] + mech->ik[id] = mech->gk[id]*(v-mech->ek[id]) + mech->il[id] = mech->gl[id]*(v-mech->el[id]) + current = current+mech->il[id] + current = current+mech->ina[id] + current = current+mech->ik[id] + rhs = current + } + g = (g-rhs)/0.001 + mech->ion_dinadv[ion_dinadv_id] = mech->ion_dinadv[ion_dinadv_id]+(dina-mech->ina[id])/0.001 + mech->ion_dikdv[ion_dikdv_id] = mech->ion_dikdv[ion_dikdv_id]+(dik-mech->ik[id])/0.001 + mech->ion_ina[ion_ina_id] = mech->ion_ina[ion_ina_id]+mech->ina[id] + mech->ion_ik[ion_ik_id] = mech->ion_ik[ion_ik_id]+mech->ik[id] + mech->vec_rhs[node_id] = mech->vec_rhs[node_id]-rhs + mech->vec_d[node_id] = mech->vec_d[node_id]+g + } + })"; + + THEN("codegen functions are constructed correctly for density channel") { + codegen::Platform simd_platform(/*use_single_precision=*/false, + /*instruction_width=*/1); + auto result = run_llvm_visitor_helper(nmodl_text, + simd_platform, + {ast::AstNodeType::CODEGEN_FUNCTION}); + REQUIRE(result.size() == 2); + + auto cur_function = reindent_text(to_nmodl(result[0])); + REQUIRE(cur_function == reindent_text(expected_cur_function)); + + auto state_function = reindent_text(to_nmodl(result[1])); + REQUIRE(state_function == reindent_text(expected_state_function)); + } + } +} + +SCENARIO("Synapse: Derivative and breakpoint block llvm transformations", + "[visitor][llvm_helper][derivative]") { + GIVEN("A exp2syn.mod file with derivative and breakpoint block") { + // note that USEION statement is added just for better code coverage (ionic current) + std::string nmodl_text = R"( + NEURON { + POINT_PROCESS Exp2Syn + USEION na READ ena WRITE ina + RANGE tau1, tau2, e, i + NONSPECIFIC_CURRENT i + RANGE g, gna + } + + UNITS { + (nA) = (nanoamp) + (mV) = (millivolt) + (uS) = (microsiemens) + } + + PARAMETER { + tau1 = 0.1 (ms) <1e-9,1e9> + tau2 = 10 (ms) <1e-9,1e9> + e = 0 (mV) + } + + ASSIGNED { + v (mV) + i (nA) + g (uS) + gna (S/cm2) + factor + } + + STATE { + A (uS) + B (uS) + } + + INITIAL { + LOCAL tp + IF 
(tau1/tau2>0.9999) { + tau1 = 0.9999*tau2 + } + IF (tau1/tau2<1e-9) { + tau1 = tau2*1e-9 + } + A = 0 + B = 0 + tp = (tau1*tau2)/(tau2-tau1)*log(tau2/tau1) + factor = -exp(-tp/tau1)+exp(-tp/tau2) + factor = 1/factor + } + + BREAKPOINT { + SOLVE state METHOD cnexp + ina = gna*(v-ena) + g = B-A + i = g*(v-e) + } + + DERIVATIVE state { + A' = -A/tau1 + B' = -B/tau2 + } + + NET_RECEIVE (weight(uS)) { + A = A+weight*factor + B = B+weight*factor + })"; + + std::string expected_cur_function = R"( + VOID nrn_cur_exp2syn(INSTANCE_STRUCT *mech){ + INTEGER id + INTEGER node_id, ena_id, node_area_id, ion_dinadv_id, ion_ina_id + DOUBLE v, g, rhs, v_org, current, dina, mfactor + for(id = 0; idnode_count; id = id+1) { + node_id = mech->node_index[id] + ena_id = mech->ion_ena_index[id] + node_area_id = mech->node_area_index[id] + ion_dinadv_id = mech->ion_dinadv_index[id] + ion_ina_id = mech->ion_ina_index[id] + v = mech->voltage[node_id] + mech->ena[id] = mech->ion_ena[ena_id] + v_org = v + v = v+0.001 + { + current = 0 + mech->ina[id] = mech->gna[id]*(v-mech->ena[id]) + mech->g[id] = mech->B[id]-mech->A[id] + mech->i[id] = mech->g[id]*(v-mech->e[id]) + current = current+mech->i[id] + current = current+mech->ina[id] + mech->g[id] = current + } + dina = mech->ina[id] + v = v_org + { + current = 0 + mech->ina[id] = mech->gna[id]*(v-mech->ena[id]) + mech->g[id] = mech->B[id]-mech->A[id] + mech->i[id] = mech->g[id]*(v-mech->e[id]) + current = current+mech->i[id] + current = current+mech->ina[id] + rhs = current + } + mech->g[id] = (mech->g[id]-rhs)/0.001 + mech->ion_dinadv[ion_dinadv_id] = mech->ion_dinadv[ion_dinadv_id]+(dina-mech->ina[id])/0.001*1.e2/mech->node_area[node_area_id] + mech->ion_ina[ion_ina_id] = mech->ion_ina[ion_ina_id]+mech->ina[id]*(1.e2/mech->node_area[node_area_id]) + mfactor = 1.e2/mech->node_area[node_area_id] + mech->g[id] = mech->g[id]*mfactor + rhs = rhs*mfactor + mech->vec_rhs[node_id] = mech->vec_rhs[node_id]-rhs + mech->vec_d[node_id] = mech->vec_d[node_id]+mech->g[id] + } + })"; + + std::string expected_state_function = R"( + VOID nrn_state_exp2syn(INSTANCE_STRUCT *mech){ + INTEGER id + INTEGER node_id, ena_id + DOUBLE v + for(id = 0; idnode_count; id = id+1) { + node_id = mech->node_index[id] + ena_id = mech->ion_ena_index[id] + v = mech->voltage[node_id] + mech->ena[id] = mech->ion_ena[ena_id] + mech->A[id] = mech->A[id]+(1.0-exp(mech->dt*((-1.0)/mech->tau1[id])))*(-(0.0)/((-1.0)/mech->tau1[id])-mech->A[id]) + mech->B[id] = mech->B[id]+(1.0-exp(mech->dt*((-1.0)/mech->tau2[id])))*(-(0.0)/((-1.0)/mech->tau2[id])-mech->B[id]) + } + })"; + + THEN("codegen functions are constructed correctly for synapse") { + codegen::Platform simd_platform(/*use_single_precision=*/false, + /*instruction_width=*/1); + auto result = run_llvm_visitor_helper(nmodl_text, + simd_platform, + {ast::AstNodeType::CODEGEN_FUNCTION}); + REQUIRE(result.size() == 2); + + auto cur_function = reindent_text(to_nmodl(result[0])); + REQUIRE(cur_function == reindent_text(expected_cur_function)); + + auto state_function = reindent_text(to_nmodl(result[1])); + REQUIRE(state_function == reindent_text(expected_state_function)); + } + } +} From 261677a7fa2ac9262b2dd529809fc649b7574eb7 Mon Sep 17 00:00:00 2001 From: Ioannis Magkanaris Date: Mon, 2 May 2022 15:53:06 +0200 Subject: [PATCH 303/331] Copy memory to the GPU explicitly --- test/benchmark/llvm_benchmark.cpp | 84 ++++++++++++++++++++++- test/benchmark/nmodl-llvm-time.sh | 20 ++++-- test/unit/codegen/codegen_data_helper.cpp | 24 ++++--- 
test/unit/codegen/codegen_data_helper.hpp | 3 + 4 files changed, 115 insertions(+), 16 deletions(-) diff --git a/test/benchmark/llvm_benchmark.cpp b/test/benchmark/llvm_benchmark.cpp index 7930da2300..63b16167f5 100644 --- a/test/benchmark/llvm_benchmark.cpp +++ b/test/benchmark/llvm_benchmark.cpp @@ -38,6 +38,71 @@ void LLVMBenchmark::generate_llvm(const std::shared_ptr& node) { logger->info("Created LLVM IR module from NMODL AST in {} sec", diff.count()); } +void checkCudaErrors(cudaError error) { + if (error != cudaSuccess) { + throw std::runtime_error( + "CUDA Execution Error: {}\n"_format(cudaGetErrorString(error))); + } +} + +void* copy_instance_data_gpu(const codegen::CodegenInstanceData& data) { + // // Copy CodegenInstanceData struct to GPU + // logger->info("Copying struct to GPU"); + // codegen::CodegenInstanceData* data_dev, data_dev_helper; + // // data_dev_helper = (codegen::CodegenInstanceData*)malloc(sizeof(codegen::CodegenInstanceData)); + // checkCudaErrors(cudaMalloc((void**)&data_dev, sizeof(codegen::CodegenInstanceData))); + // checkCudaErrors(cudaMemcpy(data_dev, &data, sizeof(codegen::CodegenInstanceData), cudaMemcpyHostToDevice)); + // // Update internal members of CodegenInstance data to the GPU + // void** dev_ptrs = new (void*)[] + // for (auto i = 0; i < data.num_ptr_members; i++) { + // // void** dev_member_ptr = &(data_dev->members[i]); + // logger->info("Allocating each member in the GPU"); + // checkCudaErrors(cudaMalloc(&(data_dev->members.data()[i]), sizeof(double) * data.num_elements)); + // // Copy data to GPU + // logger->info("Copying {} ({})", data.members[i], sizeof(double) * data.num_elements); + // checkCudaErrors(cudaMemcpy(data_dev->members.data()[i], data.members[i], sizeof(double) * data.num_elements, cudaMemcpyHostToDevice)); + // logger->info("Copied to {}", data_dev->members[i]); + // // logger->info("Copying {} to {} ({})", data.members[i], *member_dev_ptr, data.offsets[i+1] - data.offsets[i]); + // } + // logger->info("Copying base_ptr to GPU"); + // checkCudaErrors(cudaMemcpy(&(data_dev->base_ptr), &(data_dev->members.data()[0]), sizeof(void*), cudaMemcpyDeviceToDevice)); + // const auto scalar_variables = data.members.size() - data.num_ptr_members; + // logger->info("Copying scalar variables to GPU"); + // for (auto i = data.num_ptr_members; i < data.num_ptr_members + scalar_variables; i++) { + // // Copy data to GPU + // checkCudaErrors(cudaMemcpy(data_dev->members[i], data.members[i], sizeof(double), cudaMemcpyHostToDevice)); + // } + // return data_dev; + void* dev_base_ptr; + const auto ptr_vars_size = data.num_ptr_members * sizeof(double*); + auto scalar_vars_size = 0; + const auto num_scalar_vars = data.members.size() - data.num_ptr_members; + for (int i = 0; i < num_scalar_vars; i++) { + scalar_vars_size += data.members_size[i+data.num_ptr_members]; + } + logger->info("Malloc dev_base_ptr for the struct"); + checkCudaErrors(cudaMalloc(&dev_base_ptr, ptr_vars_size + scalar_vars_size)); + logger->info("dev_base_ptr addr: {}", dev_base_ptr); + for (auto i = 0; i < data.num_ptr_members; i++) { + // Allocate a vector with the correct size + void* dev_member_ptr; + auto size_of_var = data.members_size[i]; + logger->info("Malloc member {}", i); + checkCudaErrors(cudaMalloc(&dev_member_ptr, size_of_var*data.num_elements)); + logger->info("Memcpy vector of member {}: {} ({})", i, data.members[i], size_of_var*data.num_elements); + checkCudaErrors(cudaMemcpy(dev_member_ptr, data.members[i], size_of_var*data.num_elements, 
cudaMemcpyHostToDevice)); + // Copy the pointer addresses to the struct + auto offseted_place = (char*)dev_base_ptr+data.offsets[i]; + logger->info("Memcpy pointer to dev_base_ptr {}: {} ({})", i, dev_member_ptr, sizeof(double*)); + checkCudaErrors(cudaMemcpy(offseted_place, &dev_member_ptr, sizeof(double*), cudaMemcpyHostToDevice)); + } + // memcpy the scalar values + auto offseted_place_dev = (char*)dev_base_ptr+data.offsets[data.num_ptr_members]; + auto offseted_place_host = (char*)(data.base_ptr)+data.offsets[data.num_ptr_members]; + checkCudaErrors(cudaMemcpy(offseted_place_dev, offseted_place_host, scalar_vars_size, cudaMemcpyHostToDevice)); + return dev_base_ptr; +} + void LLVMBenchmark::run_benchmark(const std::shared_ptr& node) { // Set the codegen data helper and find the kernels. auto codegen_data = codegen::CodegenDataHelper(node, llvm_visitor.get_instance_struct_ptr()); @@ -133,7 +198,12 @@ void LLVMBenchmark::run_benchmark(const std::shared_ptr& node) { for (int i = 0; i < num_experiments; ++i) { // Initialise the data. auto instance_data = codegen_data.create_data(instance_size, /*seed=*/1); - +#ifdef NMODL_LLVM_CUDA_BACKEND + void* dev_ptr; + if (platform.is_CUDA_gpu()) { + dev_ptr = copy_instance_data_gpu(instance_data); + } +#endif // Log instance size once. if (i == 0) { double size_mbs = instance_data.num_bytes / (1024.0 * 1024.0); @@ -145,8 +215,18 @@ void LLVMBenchmark::run_benchmark(const std::shared_ptr& node) { auto start = std::chrono::steady_clock::now(); #ifdef NMODL_LLVM_CUDA_BACKEND if (platform.is_CUDA_gpu()) { + // int deviceId; + // cudaGetDevice(&deviceId); + // int cudaDevAttrConcurrentManagedAccess_value; + // cudaDeviceGetAttribute(&cudaDevAttrConcurrentManagedAccess_value, cudaDevAttrConcurrentManagedAccess, deviceId); + // logger->info("Using GPU with deviceId {} number of bytes {} cudaDevAttrConcurrentManagedAccess {}", deviceId, instance_data.num_bytes, cudaDevAttrConcurrentManagedAccess_value); + // cudaMemPrefetchAsync(instance_data.base_ptr, instance_data.num_bytes, deviceId); + // void* base_ptr_dev; + // cudaMemcpy(base_ptr_dev, instance_data.base_ptr, instance_data.num_bytes, cudaMemcpyHostToDevice); + // prefetch_gpu_memory(instance_data); + // const auto& dev_ptr = copy_instance_data_gpu(instance_data); cuda_runner->run_with_argument(wrapper_name, - instance_data.base_ptr, + dev_ptr, gpu_execution_parameters); } else { #endif diff --git a/test/benchmark/nmodl-llvm-time.sh b/test/benchmark/nmodl-llvm-time.sh index 16d4defd15..32a52371b3 100755 --- a/test/benchmark/nmodl-llvm-time.sh +++ b/test/benchmark/nmodl-llvm-time.sh @@ -80,10 +80,15 @@ while [[ "$1" != "" ]]; do shift ;; -ncu|--nsight-compute) - echo "profiling mode for gpu" + echo "Nsight Compute mode for gpu" ncu_exec=$(which ncu) shift ;; + -nsys|--nsys-profile) + echo "Nsys profiling mode for gpu" + nsys_exec=$(which nsys) + shift + ;; -V|--version) echo "$version_string" exit 0 @@ -185,9 +190,9 @@ declare -a benchmark_variance # Kernels, architectures and compilers loop -KERNEL_TARGETS="compute-bound memory-bound hh" +KERNEL_TARGETS="hh" #"compute-bound memory-bound hh" -ARCHITECTURES="skylake_avx512 broadwell nehalem default nvptx64" +ARCHITECTURES="nvptx64" #"skylake_avx512 broadwell nehalem default nvptx64" COMPILERS="intel clang gcc" @@ -290,7 +295,7 @@ for kernel_target in ${KERNEL_TARGETS}; do done fi echo "| | NMODL JIT" - for fast_math in true false; do + for fast_math in false; do if $fast_math; then fast_math_flag="--fmf nnan contract afn" fast_math_opt="nnancontractafn" 
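         # note: when -nsys is given, the nmodl invocation below is wrapped as
         #   nsys profile --stats=true --force-overwrite=true -o <report> ${nmodl_exe} ${nmodl_args}
         # (see the nvidia_profile assignment further down)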
@@ -322,9 +327,12 @@ for kernel_target in ${KERNEL_TARGETS}; do nmodl_args="${kernels_path}/${kernel_target}.mod --output ${output_dir}/${benchmark_nmodl_desc} llvm --ir ${fast_math_flag} --opt-level-ir 3 gpu --name ${nmodl_architecture} --target-arch \"sm_70\" --math-library libdevice benchmark --run --instance-size ${kernel_inst_size} --repeat ${num_exp} --opt-level-codegen 3 --libs ${libdevice_lib} --grid-dim-x 16384 --block-dim-x 512" # runs only kernel generated by LLVM IR if [[ $ncu_exec != "" ]]; then - ncu="${ncu_exec} --set detailed -f -o ${kernel_target}_${fast_math_opt}_detailed" + nvidia_profile="${ncu_exec} --set detailed -f -o ${kernel_target}_${fast_math_opt}_detailed" + fi + if [[ $nsys_exec != "" ]]; then + nvidia_profile="${nsys_exec} profile --stats=true --force-overwrite=true -o ${output_dir}/${kernel_target}_${fast_math_opt}_nsys" fi - ${debug} eval "${ncu} ${nmodl_exe} ${nmodl_args} 2>&1 | tee ${output_dir}/${benchmark_nmodl_desc}.log" + ${debug} eval "${nvidia_profile} ${nmodl_exe} ${nmodl_args} 2>&1 | tee ${output_dir}/${benchmark_nmodl_desc}.log" benchmark_time+=($(grep "Average compute time" ${output_dir}/${benchmark_nmodl_desc}.log | awk '{print $NF}')) benchmark_variance+=($(grep "Compute time variance" ${output_dir}/${benchmark_nmodl_desc}.log | awk '{print $NF}')) fi diff --git a/test/unit/codegen/codegen_data_helper.cpp b/test/unit/codegen/codegen_data_helper.cpp index ce54b9041c..b7fdfa25ec 100644 --- a/test/unit/codegen/codegen_data_helper.cpp +++ b/test/unit/codegen/codegen_data_helper.cpp @@ -98,11 +98,11 @@ CodegenInstanceData CodegenDataHelper::create_data(size_t num_elements, size_t s size_t member_size = std::max(sizeof(double), sizeof(double*)); // allocate instance object with memory alignment -#ifdef NMODL_LLVM_CUDA_BACKEND - cudaMallocManaged(&base, member_size * variables.size()); -#else +// #ifdef NMODL_LLVM_CUDA_BACKEND +// cudaMallocManaged(&base, member_size * variables.size()); +// #else posix_memalign(&base, NBYTE_ALIGNMENT, member_size * variables.size()); -#endif +// #endif data.base_ptr = base; data.num_bytes += member_size * variables.size(); @@ -128,14 +128,19 @@ CodegenInstanceData CodegenDataHelper::create_data(size_t num_elements, size_t s } else if (type == ast::AstNodeType::INTEGER) { member_size = sizeof(int); } + data.members_size.push_back(member_size); // allocate memory and setup a pointer void* member; -#ifdef NMODL_LLVM_CUDA_BACKEND - cudaMallocManaged(&member, member_size * num_elements); -#else +// #ifdef NMODL_LLVM_CUDA_BACKEND +// cudaMallocManaged(&member, member_size * num_elements); +// int deviceId; +// cudaGetDevice(&deviceId); +// cudaMemPrefetchAsync(&member, member_size * num_elements, deviceId); +// #else posix_memalign(&member, NBYTE_ALIGNMENT, member_size * num_elements); -#endif +// #endif + logger->info("Allocated {} bytes in {}", member_size * num_elements, member); // integer values are often offsets so they must start from // 0 to num_elements-1 to avoid out of bound accesses. 
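As a host-only sketch (not part of the patch) of the layout that copy_instance_data_gpu above reproduces on the device: the Data struct below is a stand-in for the CodegenInstanceData fields it uses, and plain malloc/memcpy stand in for cudaMalloc/cudaMemcpy.

    #include <cstdlib>
    #include <cstring>
    #include <vector>

    // stand-in for the relevant CodegenInstanceData fields
    struct Data {
        void* base_ptr = nullptr;              // flat struct: pointer slots, then scalars
        std::size_t num_ptr_members = 0;       // number of vector (pointer) members
        std::size_t num_elements = 0;          // elements per vector member
        std::vector<std::size_t> members_size; // element size of each member
        std::vector<std::size_t> offsets;      // byte offset of each member slot
        std::vector<void*> members;            // host storage of each member
    };

    void* copy_instance_data_host(const Data& data) {
        // total bytes of the scalar members that follow the pointer slots
        std::size_t scalar_bytes = 0;
        for (std::size_t i = data.num_ptr_members; i < data.members.size(); ++i)
            scalar_bytes += data.members_size[i];

        char* base = static_cast<char*>(
            std::malloc(data.num_ptr_members * sizeof(double*) + scalar_bytes));

        // deep-copy every vector member, then write its new address into the
        // corresponding pointer slot of the struct
        for (std::size_t i = 0; i < data.num_ptr_members; ++i) {
            const std::size_t bytes = data.members_size[i] * data.num_elements;
            void* vec = std::malloc(bytes);
            std::memcpy(vec, data.members[i], bytes);
            std::memcpy(base + data.offsets[i], &vec, sizeof(void*));
        }

        // the scalars are contiguous after the pointer slots: one copy suffices
        const std::size_t first_scalar = data.offsets[data.num_ptr_members];
        std::memcpy(base + first_scalar,
                    static_cast<char*>(data.base_ptr) + first_scalar,
                    scalar_bytes);
        return base;
    }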
@@ -199,18 +204,21 @@ CodegenInstanceData CodegenDataHelper::create_data(size_t num_elements, size_t s if (type == ast::AstNodeType::DOUBLE) { *((double*) ptr) = value; + data.members_size.push_back(sizeof(double)); data.offsets.push_back(offset); data.members.push_back(ptr); offset += sizeof(double); ptr = (char*) base + offset; } else if (type == ast::AstNodeType::FLOAT) { *((float*) ptr) = float(value); + data.members_size.push_back(sizeof(float)); data.offsets.push_back(offset); data.members.push_back(ptr); offset += sizeof(float); ptr = (char*) base + offset; } else if (type == ast::AstNodeType::INTEGER) { *((int*) ptr) = int(value); + data.members_size.push_back(sizeof(int)); data.offsets.push_back(offset); data.members.push_back(ptr); offset += sizeof(int); diff --git a/test/unit/codegen/codegen_data_helper.hpp b/test/unit/codegen/codegen_data_helper.hpp index 76c4f422d9..101bc8444d 100644 --- a/test/unit/codegen/codegen_data_helper.hpp +++ b/test/unit/codegen/codegen_data_helper.hpp @@ -38,6 +38,9 @@ struct CodegenInstanceData { /// number of pointer members size_t num_ptr_members = 0; + /// size of member type. If member is ptr size of elements of the vector + std::vector members_size; + /// offset relative to base_ptr to locate /// each member variable in instance struct std::vector offsets; From d6d419a53b80f2ede54d514ab7fcee612c614b99 Mon Sep 17 00:00:00 2001 From: Ioannis Magkanaris Date: Tue, 3 May 2022 14:45:28 +0300 Subject: [PATCH 304/331] Rearrange vec_rhs and vec_d to allocate memory properly (#856) * Rearrange vec_rhs and vec_d to allocate memory properly * Setup rhs, d and their shadow vectors * Fix test Co-authored-by: Ioannis Magkanaris --- .../llvm/codegen_llvm_helper_visitor.cpp | 18 +++++++++++------- src/codegen/llvm/codegen_llvm_visitor.cpp | 6 ++++++ .../codegen/codegen_llvm_instance_struct.cpp | 18 +++++++++++++----- test/unit/codegen/codegen_llvm_ir.cpp | 4 ++-- test/unit/codegen/codegen_llvm_visitor.cpp | 12 ++++++++---- 5 files changed, 40 insertions(+), 18 deletions(-) diff --git a/src/codegen/llvm/codegen_llvm_helper_visitor.cpp b/src/codegen/llvm/codegen_llvm_helper_visitor.cpp index 5f8119a4d1..5800beae6b 100644 --- a/src/codegen/llvm/codegen_llvm_helper_visitor.cpp +++ b/src/codegen/llvm/codegen_llvm_helper_visitor.cpp @@ -239,13 +239,6 @@ std::shared_ptr CodegenLLVMHelperVisitor::create_instance_s add_var_with_type(VOLTAGE_VAR, FLOAT_TYPE, /*is_pointer=*/1); add_var_with_type(NODE_INDEX_VAR, INTEGER_TYPE, /*is_pointer=*/1); - // add dt, t, celsius - add_var_with_type(naming::NTHREAD_T_VARIABLE, FLOAT_TYPE, /*is_pointer=*/0); - add_var_with_type(naming::NTHREAD_DT_VARIABLE, FLOAT_TYPE, /*is_pointer=*/0); - add_var_with_type(naming::CELSIUS_VARIABLE, FLOAT_TYPE, /*is_pointer=*/0); - add_var_with_type(naming::SECOND_ORDER_VARIABLE, INTEGER_TYPE, /*is_pointer=*/0); - add_var_with_type(naming::MECH_NODECOUNT_VAR, INTEGER_TYPE, /*is_pointer=*/0); - // As we do not have `NrnThread` object as an argument, we store points to rhs // and d to in the instance struct as well. Also need their respective shadow variables // in case of point process mechanism. 
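A small standalone illustration (hypothetical struct names, not part of the patch) of the constraint recorded in the NOTE added in the next hunk: when both the emitted struct and the data helper assume one fixed-size slot per member, interleaving a 4-byte scalar among the 8-byte pointers introduces padding and shifts every subsequent offset.

    #include <cstddef>

    // Layout the helper visitor now emits: all pointers first, scalars last.
    struct PointersFirst {
        double* vec_rhs;
        double* vec_d;
        double t;
        int node_count;
    };

    // Interleaved layout: the int forces padding before the next pointer, so
    // member offsets no longer advance one uniform slot at a time.
    struct Interleaved {
        double* vec_rhs;
        int node_count;
        double* vec_d;
        double t;
    };

    static_assert(offsetof(PointersFirst, vec_d) == sizeof(double*),
                  "pointers-first: second member sits exactly one slot further");

    int main() {
        return 0;
    }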
@@ -256,6 +249,17 @@ std::shared_ptr CodegenLLVMHelperVisitor::create_instance_s add_var_with_type(naming::NTHREAD_RHS_SHADOW, FLOAT_TYPE, /*is_pointer=*/1); add_var_with_type(naming::NTHREAD_D_SHADOW, FLOAT_TYPE, /*is_pointer=*/1); + // NOTE: All the pointer variables should be declared before the scalar variables otherwise + // the allocation of memory for the variables in the InstanceStruct and their offsets will be + // wrong + + // add dt, t, celsius + add_var_with_type(naming::NTHREAD_T_VARIABLE, FLOAT_TYPE, /*is_pointer=*/0); + add_var_with_type(naming::NTHREAD_DT_VARIABLE, FLOAT_TYPE, /*is_pointer=*/0); + add_var_with_type(naming::CELSIUS_VARIABLE, FLOAT_TYPE, /*is_pointer=*/0); + add_var_with_type(naming::SECOND_ORDER_VARIABLE, INTEGER_TYPE, /*is_pointer=*/0); + add_var_with_type(naming::MECH_NODECOUNT_VAR, INTEGER_TYPE, /*is_pointer=*/0); + return std::make_shared(codegen_vars); } diff --git a/src/codegen/llvm/codegen_llvm_visitor.cpp b/src/codegen/llvm/codegen_llvm_visitor.cpp index f4392048d9..de6c7ad914 100644 --- a/src/codegen/llvm/codegen_llvm_visitor.cpp +++ b/src/codegen/llvm/codegen_llvm_visitor.cpp @@ -998,6 +998,12 @@ void CodegenLLVMVisitor::print_instance_variable_setup() { // Pass ml->nodeindices pointer to node_index printer->add_line("inst->node_index = ml->nodeindices;"); + // Setup rhs, d and their shadow vectors + printer->add_line(fmt::format("inst->{} = nt->_actual_rhs;", naming::NTHREAD_RHS)); + printer->add_line(fmt::format("inst->{} = nt->_actual_d;", naming::NTHREAD_D)); + printer->add_line(fmt::format("inst->{} = nt->_shadow_rhs;", naming::NTHREAD_RHS_SHADOW)); + printer->add_line(fmt::format("inst->{} = nt->_shadow_d;", naming::NTHREAD_D_SHADOW)); + // Setup global variables printer->add_line("inst->{0} = nt->{0};"_format(naming::NTHREAD_T_VARIABLE)); printer->add_line("inst->{0} = nt->{0};"_format(naming::NTHREAD_DT_VARIABLE)); diff --git a/test/unit/codegen/codegen_llvm_instance_struct.cpp b/test/unit/codegen/codegen_llvm_instance_struct.cpp index 401e0a6c63..9c22fdda78 100644 --- a/test/unit/codegen/codegen_llvm_instance_struct.cpp +++ b/test/unit/codegen/codegen_llvm_instance_struct.cpp @@ -120,11 +120,15 @@ SCENARIO("Instance Struct creation", "[visitor][llvm][instance_struct]") { size_t ion_ena_index_index = 8; size_t voltage_index = 9; size_t node_index_index = 10; - size_t t_index = 11; - size_t dt_index = 12; - size_t celsius_index = 13; - size_t secondorder_index = 14; - size_t node_count_index = 15; + size_t rhs_index = 11; + size_t d_index = 12; + size_t rhs_shadow_index = 13; + size_t d_shadow_index = 14; + size_t t_index = 15; + size_t dt_index = 16; + size_t celsius_index = 17; + size_t secondorder_index = 18; + size_t node_count_index = 19; // Check if the various instance struct fields are properly initialized REQUIRE(compare(instance_data.members[minf_index], generate_dummy_data(minf_index, num_elements))); @@ -155,6 +159,10 @@ SCENARIO("Instance Struct creation", "[visitor][llvm][instance_struct]") { int* ion_ena_index; double* voltage; int* node_index; + double* vec_rhs; + double* vec_d; + double* _shadow_rhs; + double* _shadow_d; double t; double dt; double celsius; diff --git a/test/unit/codegen/codegen_llvm_ir.cpp b/test/unit/codegen/codegen_llvm_ir.cpp index 13b4e9068d..ebef71688e 100644 --- a/test/unit/codegen/codegen_llvm_ir.cpp +++ b/test/unit/codegen/codegen_llvm_ir.cpp @@ -980,8 +980,8 @@ SCENARIO("Scalar state kernel", "[visitor][llvm]") { // Check the struct type with correct attributes and the kernel declaration. 
std::regex struct_type( "%.*__instance_var__type = type \\{ double\\*, double\\*, double\\*, double\\*, " - "double\\*, double\\*, double\\*, double\\*, double\\*, double\\*, i32\\*, double, " - "double, double, i32, i32, double\\*, double\\*, double\\*, double\\* \\}"); + "double\\*, double\\*, double\\*, double\\*, double\\*, double\\*, i32\\*, " + "double\\*, double\\*, double\\*, double\\*, double, double, double, i32, i32 \\}"); std::regex kernel_declaration( R"(define void @nrn_state_hh\(%.*__instance_var__type\* noalias nocapture readonly .*\) #0)"); REQUIRE(std::regex_search(module_string, m, struct_type)); diff --git a/test/unit/codegen/codegen_llvm_visitor.cpp b/test/unit/codegen/codegen_llvm_visitor.cpp index 1906d0d27c..af9bed5e7c 100644 --- a/test/unit/codegen/codegen_llvm_visitor.cpp +++ b/test/unit/codegen/codegen_llvm_visitor.cpp @@ -171,15 +171,15 @@ SCENARIO("Check instance struct declaration and setup in wrapper", int* __restrict__ ion_dikdv_index; double* __restrict__ voltage; int* __restrict__ node_index; + double* __restrict__ vec_rhs; + double* __restrict__ vec_d; + double* __restrict__ _shadow_rhs; + double* __restrict__ _shadow_d; double t; double dt; double celsius; int secondorder; int node_count; - double* __restrict__ vec_rhs; - double* __restrict__ vec_d; - double* __restrict__ _shadow_rhs; - double* __restrict__ _shadow_d; }; )"; std::string generated_instance_struct_setup = R"( @@ -226,6 +226,10 @@ SCENARIO("Check instance struct declaration and setup in wrapper", inst->ion_dikdv_index = indexes+5*pnodecount; inst->voltage = nt->_actual_v; inst->node_index = ml->nodeindices; + inst->vec_rhs = nt->_actual_rhs; + inst->vec_d = nt->_actual_d; + inst->_shadow_rhs = nt->_shadow_rhs; + inst->_shadow_d = nt->_shadow_d; inst->t = nt->t; inst->dt = nt->dt; inst->celsius = celsius; From 984933b1e35e56fe3a9d81788d96b0ec22fc6f01 Mon Sep 17 00:00:00 2001 From: Ioannis Magkanaris Date: Tue, 3 May 2022 15:47:52 +0300 Subject: [PATCH 305/331] Use hh.mod from CoreNEURON for the benchmark test and enable inlining if llvm_backend is enabled --- src/codegen/codegen_driver.cpp | 2 +- test/benchmark/kernels/hh.mod | 197 +++++++++++++++++---------------- 2 files changed, 105 insertions(+), 94 deletions(-) diff --git a/src/codegen/codegen_driver.cpp b/src/codegen/codegen_driver.cpp index 0bdf37a29c..231bf48a0e 100644 --- a/src/codegen/codegen_driver.cpp +++ b/src/codegen/codegen_driver.cpp @@ -179,7 +179,7 @@ bool CodegenDriver::prepare_mod(std::shared_ptr node, const std::s /// that old symbols (e.g. prime variables) are not lost update_symtab = true; - if (cfg.nmodl_inline || cfg.llvm_ir) { + if (cfg.nmodl_inline || cfg.llvm_backend) { logger->info("Running nmodl inline visitor"); InlineVisitor().visit_program(*node); ast_to_nmodl(*node, filepath("inline", "mod")); diff --git a/test/benchmark/kernels/hh.mod b/test/benchmark/kernels/hh.mod index d92a686714..053a15f43f 100644 --- a/test/benchmark/kernels/hh.mod +++ b/test/benchmark/kernels/hh.mod @@ -1,114 +1,125 @@ TITLE hh.mod squid sodium, potassium, and leak channels + COMMENT - This is the original Hodgkin-Huxley treatment for the set of sodium, - potassium, and leakage channels found in the squid giant axon membrane. - ("A quantitative description of membrane current and its application - conduction and excitation in nerve" J.Physiol. (Lond.) 117:500-544 (1952).) - Membrane voltage is in absolute mV and has been reversed in polarity - from the original HH convention and shifted to reflect a resting potential - of -65 mV. 
- Remember to set celsius=6.3 (or whatever) in your HOC file. - See squid.hoc for an example of a simulation using this model. - SW Jaslove 6 March, 1992 + This is the original Hodgkin-Huxley treatment for the set of sodium, + potassium, and leakage channels found in the squid giant axon membrane. + ("A quantitative description of membrane current and its application + conduction and excitation in nerve" J.Physiol. (Lond.) 117:500-544 (1952).) + Membrane voltage is in absolute mV and has been reversed in polarity + from the original HH convention and shifted to reflect a resting potential + of -65 mV. + Remember to set celsius=6.3 (or whatever) in your HOC file. + See squid.hoc for an example of a simulation using this model. + SW Jaslove 6 March, 1992 ENDCOMMENT + UNITS { - (mA) = (milliamp) - (mV) = (millivolt) - (S) = (siemens) + (mA) = (milliamp) + (mV) = (millivolt) + (S) = (siemens) } + +? interface NEURON { - SUFFIX hh - USEION na READ ena WRITE ina - USEION k READ ek WRITE ik - NONSPECIFIC_CURRENT il - RANGE gnabar, gkbar, gl, el, gna, gk - RANGE minf, hinf, ninf, mtau, htau, ntau - THREADSAFE + SUFFIX hh + USEION na READ ena WRITE ina + USEION k READ ek WRITE ik + NONSPECIFIC_CURRENT il + RANGE gnabar, gkbar, gl, el, gna, gk + :GLOBAL minf, hinf, ninf, mtau, htau, ntau + RANGE minf, hinf, ninf, mtau, htau, ntau + THREADSAFE : assigned GLOBALs will be per thread } + PARAMETER { - gnabar = .12 (S/cm2) <0,1e9> - gkbar = .036 (S/cm2) <0,1e9> - gl = .0003 (S/cm2) <0,1e9> - el = -54.3 (mV) + gnabar = .12 (S/cm2) <0,1e9> + gkbar = .036 (S/cm2) <0,1e9> + gl = .0003 (S/cm2) <0,1e9> + el = -54.3 (mV) } + STATE { - m - h - n + m h n } + ASSIGNED { - v (mV) - celsius (degC) - ena (mV) - ek (mV) - gna (S/cm2) - gk (S/cm2) - ina (mA/cm2) - ik (mA/cm2) - il (mA/cm2) - minf - hinf - ninf - mtau (ms) - htau (ms) - ntau (ms) + v (mV) + celsius (degC) + ena (mV) + ek (mV) + + gna (S/cm2) + gk (S/cm2) + ina (mA/cm2) + ik (mA/cm2) + il (mA/cm2) + minf hinf ninf + mtau (ms) htau (ms) ntau (ms) } + +? currents BREAKPOINT { - SOLVE states METHOD cnexp - gna = gnabar*m*m*m*h - ina = gna*(v-ena) - gk = gkbar*n*n*n*n - ik = gk*(v-ek) - il = gl*(v-el) + SOLVE states METHOD cnexp + gna = gnabar*m*m*m*h + ina = gna*(v - ena) + gk = gkbar*n*n*n*n + ik = gk*(v - ek) + il = gl*(v - el) } + + INITIAL { - { - : inlined rates - LOCAL alpha, beta, sum, q10, vtrap_in_0, v_in_0 - v_in_0 = v - q10 = 3*((celsius-6.3)/10) - alpha = .07*exp(-(v_in_0+65)/20) - beta = 1/(exp(-(v_in_0+35)/10)+1) - sum = alpha+beta - htau = 1/(q10*sum) - hinf = alpha/sum - { - : inlined vtrap - LOCAL x_in_0, y_in_0 - x_in_0 = alpha - y_in_0 = alpha - : no control flow - vtrap_in_0 = y_in_0*(1-x_in_0/y_in_0/2) - } - hinf = vtrap_in_0 - } - m = minf - h = hinf - n = ninf + rates(v) + m = minf + h = hinf + n = ninf } -DERIVATIVE states { - { - : inlined rates - LOCAL alpha, beta, sum, q10, vtrap_in_0, v_in_1 - v_in_1 = v - q10 = 3*((celsius-6.3)/10) - alpha = .07*exp(-(v_in_1+65)/20) - beta = 1/(exp(-(v_in_1+35)/10)+1) - sum = alpha+beta - htau = 1/(q10*sum) + +? states +DERIVATIVE states { + rates(v) + m' = (minf-m)/mtau + h' = (hinf-h)/htau + n' = (ninf-n)/ntau +} + +:LOCAL q10 + + +? rates +PROCEDURE rates(v(mV)) { :Computes rate and other constants at current v. + :Call once from HOC to initialize inf at resting v. 
+ LOCAL alpha, beta, sum, q10 +: TABLE minf, mtau, hinf, htau, ninf, ntau DEPEND celsius FROM -100 TO 100 WITH 200 + +UNITSOFF + q10 = 3^((celsius - 6.3)/10) + :"m" sodium activation system + alpha = .1 * vtrap(-(v+40),10) + beta = 4 * exp(-(v+65)/18) + sum = alpha + beta + mtau = 1/(q10*sum) + minf = alpha/sum + :"h" sodium inactivation system + alpha = .07 * exp(-(v+65)/20) + beta = 1 / (exp(-(v+35)/10) + 1) + sum = alpha + beta + htau = 1/(q10*sum) hinf = alpha/sum - { - : inlined vtrap - LOCAL x_in_0, y_in_0 - x_in_0 = alpha - y_in_0 = alpha - : no control flow - vtrap_in_0 = y_in_0*(1-x_in_0/y_in_0/2) + :"n" potassium activation system + alpha = .01*vtrap(-(v+55),10) + beta = .125*exp(-(v+65)/80) + sum = alpha + beta + ntau = 1/(q10*sum) + ninf = alpha/sum +} + +FUNCTION vtrap(x,y) { :Traps for 0 in denominator of rate eqns. + if (fabs(x/y) < 1e-6) { + vtrap = y*(1 - x/y/2) + }else{ + vtrap = x/(exp(x/y) - 1) } - hinf = vtrap_in_0 - } - m = m+(1.0-exp(dt*((((-1.0)))/mtau)))*(-(((minf))/mtau)/((((-1.0)))/mtau)-m) - h = h+(1.0-exp(dt*((((-1.0)))/htau)))*(-(((hinf))/htau)/((((-1.0)))/htau)-h) - n = n+(1.0-exp(dt*((((-1.0)))/ntau)))*(-(((ninf))/ntau)/((((-1.0)))/ntau)-n) } + UNITSON From d7615af9ce494da49c98ae9c1ee88331f8e5ac5d Mon Sep 17 00:00:00 2001 From: Ioannis Magkanaris Date: Tue, 3 May 2022 16:18:28 +0300 Subject: [PATCH 306/331] Cleaned up llvm_ir and llvm_backend since there were 2 variables for the same purpose --- src/codegen/codegen_driver.cpp | 2 +- src/codegen/codegen_driver.hpp | 3 --- src/pybind/pynmodl.cpp | 2 +- 3 files changed, 2 insertions(+), 5 deletions(-) diff --git a/src/codegen/codegen_driver.cpp b/src/codegen/codegen_driver.cpp index 231bf48a0e..0bdf37a29c 100644 --- a/src/codegen/codegen_driver.cpp +++ b/src/codegen/codegen_driver.cpp @@ -179,7 +179,7 @@ bool CodegenDriver::prepare_mod(std::shared_ptr node, const std::s /// that old symbols (e.g. 
prime variables) are not lost update_symtab = true; - if (cfg.nmodl_inline || cfg.llvm_backend) { + if (cfg.nmodl_inline || cfg.llvm_ir) { logger->info("Running nmodl inline visitor"); InlineVisitor().visit_program(*node); ast_to_nmodl(*node, filepath("inline", "mod")); diff --git a/src/codegen/codegen_driver.hpp b/src/codegen/codegen_driver.hpp index 78c95421da..14d8ed76ab 100644 --- a/src/codegen/codegen_driver.hpp +++ b/src/codegen/codegen_driver.hpp @@ -33,9 +33,6 @@ struct CodeGenConfig { /// true if cuda code to be generated bool cuda_backend = false; - /// true if llvm code to be generated - bool llvm_backend = false; - /// true if sympy should be used for solving ODEs analytically bool sympy_analytic = false; diff --git a/src/pybind/pynmodl.cpp b/src/pybind/pynmodl.cpp index 30517d8ff0..bb3cb443f0 100644 --- a/src/pybind/pynmodl.cpp +++ b/src/pybind/pynmodl.cpp @@ -244,7 +244,7 @@ PYBIND11_MODULE(_nmodl, m_nmodl) { cfg.def(py::init([]() { auto cfg = std::make_unique(); // set to more sensible defaults for python binding - cfg->llvm_backend = true; + cfg->llvm_ir = true; return cfg; })) .def_readwrite("sympy_analytic", &nmodl::codegen::CodeGenConfig::sympy_analytic) From a8a44ed9f834e9ecc5aaacffb304cbf1dce1eb30 Mon Sep 17 00:00:00 2001 From: Ioannis Magkanaris Date: Tue, 3 May 2022 16:20:48 +0300 Subject: [PATCH 307/331] Added arg parsing in benchmark script and added expsyn.mod test --- test/benchmark/CMakeLists.txt | 9 +++++-- test/benchmark/benchmark.py | 20 ++++++++++++--- test/benchmark/kernels/expsyn.mod | 42 +++++++++++++++++++++++++++++++ 3 files changed, 66 insertions(+), 5 deletions(-) create mode 100644 test/benchmark/kernels/expsyn.mod diff --git a/test/benchmark/CMakeLists.txt b/test/benchmark/CMakeLists.txt index de3362a07f..b95801b0cd 100644 --- a/test/benchmark/CMakeLists.txt +++ b/test/benchmark/CMakeLists.txt @@ -32,17 +32,22 @@ if(NMODL_ENABLE_PYTHON_BINDINGS) file(GLOB modfiles "${NMODL_PROJECT_SOURCE_DIR}/test/benchmark/kernels/*.mod") list(APPEND modfiles "${NMODL_PROJECT_SOURCE_DIR}/test/integration/mod/test_math.mod") foreach(modfile ${modfiles}) + # For expsyn.mod set the vector width to 1 since atomic operations are not supported for vector + # widths > 1. 
See https://github.com/BlueBrain/nmodl/issues/857 + if(${modfile} STREQUAL "${NMODL_PROJECT_SOURCE_DIR}/test/benchmark/kernels/expsyn.mod") + set(extra_args "--vec 1") + endif() get_filename_component(modfile_name "${modfile}" NAME) add_test(NAME "PyJIT/${modfile_name}" COMMAND ${PYTHON_EXECUTABLE} ${NMODL_PROJECT_SOURCE_DIR}/test/benchmark/benchmark.py - ${modfile}) + --file ${modfile} ${extra_args}) set_tests_properties( "PyJIT/${modfile_name}" PROPERTIES ENVIRONMENT PYTHONPATH=${PROJECT_BINARY_DIR}/lib:$ENV{PYTHONPATH}) if(NMODL_ENABLE_LLVM_CUDA) add_test(NAME "PyJIT/${modfile_name}_gpu" COMMAND ${PYTHON_EXECUTABLE} ${NMODL_PROJECT_SOURCE_DIR}/test/benchmark/benchmark.py - ${modfile} gpu) + --file ${modfile} --gpu ${extra_args}) message(STATUS "CUDA_HOME is ${CUDAToolkit_TARGET_DIR}") set_tests_properties( "PyJIT/${modfile_name}_gpu" diff --git a/test/benchmark/benchmark.py b/test/benchmark/benchmark.py index 6cc9ee83ec..9144fa549d 100644 --- a/test/benchmark/benchmark.py +++ b/test/benchmark/benchmark.py @@ -1,19 +1,33 @@ +import argparse import sys import os import nmodl.dsl as nmodl from nmodl import ast, visitor +def parse_arguments(): + parser = argparse.ArgumentParser(description='Benchmark test script for NMODL.') + parser.add_argument('--gpu', action='store_true', default=False, + help='Enable GPU JIT execution') + parser.add_argument('--vec', type=int, default=1, + help='Vector width for CPU execution') + parser.add_argument('--file', type=str, + help='NMODL file to benchmark') + args, _ = parser.parse_known_args() + return args + def main(): + args = parse_arguments() + driver = nmodl.NmodlDriver() lookup_visitor = visitor.AstLookupVisitor() cfg = nmodl.CodeGenConfig() - cfg.llvm_vector_width = 4 + cfg.llvm_vector_width = args.vec cfg.llvm_opt_level_ir = 2 cfg.nmodl_ast = True - fname = sys.argv[1] - if len(sys.argv) > 2: # GPU enabled + fname = args.file + if args.gpu: # GPU enabled cfg.llvm_math_library = "libdevice" cfg.llvm_gpu_name = "nvptx64" cfg.llvm_gpu_target_architecture = "sm_70" diff --git a/test/benchmark/kernels/expsyn.mod b/test/benchmark/kernels/expsyn.mod new file mode 100644 index 0000000000..56ddde3b19 --- /dev/null +++ b/test/benchmark/kernels/expsyn.mod @@ -0,0 +1,42 @@ +NEURON { + POINT_PROCESS ExpSyn + RANGE tau, e, i + NONSPECIFIC_CURRENT i +} + +UNITS { + (nA) = (nanoamp) + (mV) = (millivolt) + (uS) = (microsiemens) +} + +PARAMETER { + tau = 0.1 (ms) <1e-9,1e9> + e = 0 (mV) +} + +ASSIGNED { + v (mV) + i (nA) +} + +STATE { + g (uS) +} + +INITIAL { + g=0 +} + +BREAKPOINT { + SOLVE state METHOD cnexp + i = g*(v - e) +} + +DERIVATIVE state { + g' = -g/tau +} + +NET_RECEIVE(weight (uS)) { + g = g + weight +} From 2594854dc91c2ec566806e1331322709274713d3 Mon Sep 17 00:00:00 2001 From: Ioannis Magkanaris Date: Tue, 3 May 2022 16:09:23 +0200 Subject: [PATCH 308/331] Replace fabs with the libdevice corresponding function --- src/codegen/llvm/replace_with_lib_functions.cpp | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/codegen/llvm/replace_with_lib_functions.cpp b/src/codegen/llvm/replace_with_lib_functions.cpp index 24d53c4312..750e2c2318 100644 --- a/src/codegen/llvm/replace_with_lib_functions.cpp +++ b/src/codegen/llvm/replace_with_lib_functions.cpp @@ -173,7 +173,9 @@ bool ReplaceWithLibdevice::replace_call(CallInst& call_inst) { {"llvm.pow.f32", "__nv_powf"}, {"llvm.pow.f64", "__nv_pow"}, {"llvm.log.f32", "__nv_logf"}, - {"llvm.log.f64", "__nv_log"}}; + {"llvm.log.f64", "__nv_log"}, + {"llvm.fabs.f32", "__nv_fabsf"}, + 
{"llvm.fabs.f64", "__nv_fabs"}}; // If replacement is not supported, abort. std::string old_name = function->getName().str(); From f970ac4674bab6a0d247782190e0eee49589944d Mon Sep 17 00:00:00 2001 From: Ioannis Magkanaris Date: Tue, 3 May 2022 16:25:13 +0200 Subject: [PATCH 309/331] Disable expsyn test on GPU because atomic instructions are not yet supported on GPU --- test/benchmark/CMakeLists.txt | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/test/benchmark/CMakeLists.txt b/test/benchmark/CMakeLists.txt index b95801b0cd..5529b505d2 100644 --- a/test/benchmark/CMakeLists.txt +++ b/test/benchmark/CMakeLists.txt @@ -44,7 +44,9 @@ if(NMODL_ENABLE_PYTHON_BINDINGS) set_tests_properties( "PyJIT/${modfile_name}" PROPERTIES ENVIRONMENT PYTHONPATH=${PROJECT_BINARY_DIR}/lib:$ENV{PYTHONPATH}) - if(NMODL_ENABLE_LLVM_CUDA) + # Disable running the expsyn.mod on GPU because atomic instructions are not supported yet on GPU + # See https://github.com/BlueBrain/nmodl/issues/834 + if(NMODL_ENABLE_LLVM_CUDA AND NOT ${modfile} STREQUAL "${NMODL_PROJECT_SOURCE_DIR}/test/benchmark/kernels/expsyn.mod") add_test(NAME "PyJIT/${modfile_name}_gpu" COMMAND ${PYTHON_EXECUTABLE} ${NMODL_PROJECT_SOURCE_DIR}/test/benchmark/benchmark.py --file ${modfile} --gpu ${extra_args}) From 283c833b8d2eb25a36ba6fcb6021c0c491448a57 Mon Sep 17 00:00:00 2001 From: Ioannis Magkanaris Date: Tue, 3 May 2022 17:17:26 +0200 Subject: [PATCH 310/331] Working test passing explicitly results back and forth of the GPU --- test/benchmark/llvm_benchmark.cpp | 94 ++++++++++++++--------- test/unit/codegen/codegen_data_helper.cpp | 8 +- 2 files changed, 61 insertions(+), 41 deletions(-) diff --git a/test/benchmark/llvm_benchmark.cpp b/test/benchmark/llvm_benchmark.cpp index 63b16167f5..1b00e4771d 100644 --- a/test/benchmark/llvm_benchmark.cpp +++ b/test/benchmark/llvm_benchmark.cpp @@ -46,33 +46,6 @@ void checkCudaErrors(cudaError error) { } void* copy_instance_data_gpu(const codegen::CodegenInstanceData& data) { - // // Copy CodegenInstanceData struct to GPU - // logger->info("Copying struct to GPU"); - // codegen::CodegenInstanceData* data_dev, data_dev_helper; - // // data_dev_helper = (codegen::CodegenInstanceData*)malloc(sizeof(codegen::CodegenInstanceData)); - // checkCudaErrors(cudaMalloc((void**)&data_dev, sizeof(codegen::CodegenInstanceData))); - // checkCudaErrors(cudaMemcpy(data_dev, &data, sizeof(codegen::CodegenInstanceData), cudaMemcpyHostToDevice)); - // // Update internal members of CodegenInstance data to the GPU - // void** dev_ptrs = new (void*)[] - // for (auto i = 0; i < data.num_ptr_members; i++) { - // // void** dev_member_ptr = &(data_dev->members[i]); - // logger->info("Allocating each member in the GPU"); - // checkCudaErrors(cudaMalloc(&(data_dev->members.data()[i]), sizeof(double) * data.num_elements)); - // // Copy data to GPU - // logger->info("Copying {} ({})", data.members[i], sizeof(double) * data.num_elements); - // checkCudaErrors(cudaMemcpy(data_dev->members.data()[i], data.members[i], sizeof(double) * data.num_elements, cudaMemcpyHostToDevice)); - // logger->info("Copied to {}", data_dev->members[i]); - // // logger->info("Copying {} to {} ({})", data.members[i], *member_dev_ptr, data.offsets[i+1] - data.offsets[i]); - // } - // logger->info("Copying base_ptr to GPU"); - // checkCudaErrors(cudaMemcpy(&(data_dev->base_ptr), &(data_dev->members.data()[0]), sizeof(void*), cudaMemcpyDeviceToDevice)); - // const auto scalar_variables = data.members.size() - data.num_ptr_members; - // 
logger->info("Copying scalar variables to GPU"); - // for (auto i = data.num_ptr_members; i < data.num_ptr_members + scalar_variables; i++) { - // // Copy data to GPU - // checkCudaErrors(cudaMemcpy(data_dev->members[i], data.members[i], sizeof(double), cudaMemcpyHostToDevice)); - // } - // return data_dev; void* dev_base_ptr; const auto ptr_vars_size = data.num_ptr_members * sizeof(double*); auto scalar_vars_size = 0; @@ -95,6 +68,7 @@ void* copy_instance_data_gpu(const codegen::CodegenInstanceData& data) { auto offseted_place = (char*)dev_base_ptr+data.offsets[i]; logger->info("Memcpy pointer to dev_base_ptr {}: {} ({})", i, dev_member_ptr, sizeof(double*)); checkCudaErrors(cudaMemcpy(offseted_place, &dev_member_ptr, sizeof(double*), cudaMemcpyHostToDevice)); + logger->info("Pointer saved to {}", (void*)offseted_place); } // memcpy the scalar values auto offseted_place_dev = (char*)dev_base_ptr+data.offsets[data.num_ptr_members]; @@ -103,6 +77,30 @@ void* copy_instance_data_gpu(const codegen::CodegenInstanceData& data) { return dev_base_ptr; } +void copy_instance_data_host(codegen::CodegenInstanceData& data, void* dev_base_ptr) { + const auto ptr_vars_size = data.num_ptr_members * sizeof(double*); + auto scalar_vars_size = 0; + const auto num_scalar_vars = data.members.size() - data.num_ptr_members; + for (int i = 0; i < num_scalar_vars; i++) { + scalar_vars_size += data.members_size[i+data.num_ptr_members]; + } + const auto host_base_ptr = data.base_ptr; + logger->info("dev_base_ptr addr: {} host_base_ptr {}", dev_base_ptr, host_base_ptr); + for (auto i = 0; i < data.num_ptr_members; i++) { + auto size_of_var = data.members_size[i]; + void* offset_dev_ptr = (char*)dev_base_ptr+data.offsets[i]; + logger->info("Copying data {} from GPU back to host {} from {}", i, data.members[i], offset_dev_ptr); + void* gpu_offset_addr; + checkCudaErrors(cudaMemcpy(&gpu_offset_addr, offset_dev_ptr, sizeof(double*), cudaMemcpyDeviceToHost)); + logger->info("GPU mem addr {}", gpu_offset_addr); + checkCudaErrors(cudaMemcpy(data.members[i], gpu_offset_addr, size_of_var*data.num_elements, cudaMemcpyDeviceToHost)); + } + // memcpy the scalar values + void* offseted_place_dev = (char*)dev_base_ptr+data.offsets[data.num_ptr_members]; + void* offseted_place_host = (char*)(data.base_ptr)+data.offsets[data.num_ptr_members]; + checkCudaErrors(cudaMemcpy(offseted_place_host, offseted_place_dev, scalar_vars_size, cudaMemcpyDeviceToHost)); +} + void LLVMBenchmark::run_benchmark(const std::shared_ptr& node) { // Set the codegen data helper and find the kernels. auto codegen_data = codegen::CodegenDataHelper(node, llvm_visitor.get_instance_struct_ptr()); @@ -212,19 +210,30 @@ void LLVMBenchmark::run_benchmark(const std::shared_ptr& node) { // Record the execution time of the kernel. 
std::string wrapper_name = "__" + kernel_name + "_wrapper"; + struct test__instance_var__type { + double* __restrict__ x; + double* __restrict__ y; + double* __restrict__ m; + double* __restrict__ Dm; + double* __restrict__ v_unused; + double* __restrict__ g_unused; + double* __restrict__ voltage; + int* __restrict__ node_index; + double t; + double dt; + double celsius; + int secondorder; + int node_count; + }; + for(int i = 0; i < 5; ++i) { + std::cout << static_cast(instance_data.base_ptr)->y[i] << std::endl;; + } + for(int i = 0; i < 5; ++i) { + std::cout << static_cast(instance_data.base_ptr)->m[i] << std::endl;; + } auto start = std::chrono::steady_clock::now(); #ifdef NMODL_LLVM_CUDA_BACKEND if (platform.is_CUDA_gpu()) { - // int deviceId; - // cudaGetDevice(&deviceId); - // int cudaDevAttrConcurrentManagedAccess_value; - // cudaDeviceGetAttribute(&cudaDevAttrConcurrentManagedAccess_value, cudaDevAttrConcurrentManagedAccess, deviceId); - // logger->info("Using GPU with deviceId {} number of bytes {} cudaDevAttrConcurrentManagedAccess {}", deviceId, instance_data.num_bytes, cudaDevAttrConcurrentManagedAccess_value); - // cudaMemPrefetchAsync(instance_data.base_ptr, instance_data.num_bytes, deviceId); - // void* base_ptr_dev; - // cudaMemcpy(base_ptr_dev, instance_data.base_ptr, instance_data.num_bytes, cudaMemcpyHostToDevice); - // prefetch_gpu_memory(instance_data); - // const auto& dev_ptr = copy_instance_data_gpu(instance_data); cuda_runner->run_with_argument(wrapper_name, dev_ptr, gpu_execution_parameters); @@ -236,6 +245,17 @@ void LLVMBenchmark::run_benchmark(const std::shared_ptr& node) { #endif auto end = std::chrono::steady_clock::now(); std::chrono::duration diff = end - start; +#ifdef NMODL_LLVM_CUDA_BACKEND + if (platform.is_CUDA_gpu()) { + copy_instance_data_host(instance_data, dev_ptr); + } +#endif + for(int i = 0; i < 5; ++i) { + std::cout << static_cast(instance_data.base_ptr)->y[i] << std::endl;; + } + for(int i = 0; i < 5; ++i) { + std::cout << static_cast(instance_data.base_ptr)->m[i] << std::endl;; + } // Log the time taken for each run. 
logger->info("Experiment {} compute time = {:.6f} sec", i, diff.count()); diff --git a/test/unit/codegen/codegen_data_helper.cpp b/test/unit/codegen/codegen_data_helper.cpp index b7fdfa25ec..b7d2b0186a 100644 --- a/test/unit/codegen/codegen_data_helper.cpp +++ b/test/unit/codegen/codegen_data_helper.cpp @@ -22,11 +22,11 @@ const int default_second_order_value = 0; CodegenInstanceData::~CodegenInstanceData() { // first free num_ptr_members members which are pointers for (size_t i = 0; i < num_ptr_members; i++) { -#ifdef NMODL_LLVM_CUDA_BACKEND - cudaFree(members[i]); -#else +// #ifdef NMODL_LLVM_CUDA_BACKEND +// cudaFree(members[i]); +// #else free(members[i]); -#endif +// #endif } // and then pointer to container struct #ifdef NMODL_LLVM_CUDA_BACKEND From c0e8f31dfd646e371566414ae477a3eeb72f0ebd Mon Sep 17 00:00:00 2001 From: Ioannis Magkanaris Date: Tue, 3 May 2022 17:22:08 +0200 Subject: [PATCH 311/331] Clean comments and debug prints --- test/benchmark/llvm_benchmark.cpp | 37 ----------------------- test/unit/codegen/codegen_data_helper.cpp | 23 +------------- 2 files changed, 1 insertion(+), 59 deletions(-) diff --git a/test/benchmark/llvm_benchmark.cpp b/test/benchmark/llvm_benchmark.cpp index 1b00e4771d..a42dfc3e46 100644 --- a/test/benchmark/llvm_benchmark.cpp +++ b/test/benchmark/llvm_benchmark.cpp @@ -53,22 +53,16 @@ void* copy_instance_data_gpu(const codegen::CodegenInstanceData& data) { for (int i = 0; i < num_scalar_vars; i++) { scalar_vars_size += data.members_size[i+data.num_ptr_members]; } - logger->info("Malloc dev_base_ptr for the struct"); checkCudaErrors(cudaMalloc(&dev_base_ptr, ptr_vars_size + scalar_vars_size)); - logger->info("dev_base_ptr addr: {}", dev_base_ptr); for (auto i = 0; i < data.num_ptr_members; i++) { // Allocate a vector with the correct size void* dev_member_ptr; auto size_of_var = data.members_size[i]; - logger->info("Malloc member {}", i); checkCudaErrors(cudaMalloc(&dev_member_ptr, size_of_var*data.num_elements)); - logger->info("Memcpy vector of member {}: {} ({})", i, data.members[i], size_of_var*data.num_elements); checkCudaErrors(cudaMemcpy(dev_member_ptr, data.members[i], size_of_var*data.num_elements, cudaMemcpyHostToDevice)); // Copy the pointer addresses to the struct auto offseted_place = (char*)dev_base_ptr+data.offsets[i]; - logger->info("Memcpy pointer to dev_base_ptr {}: {} ({})", i, dev_member_ptr, sizeof(double*)); checkCudaErrors(cudaMemcpy(offseted_place, &dev_member_ptr, sizeof(double*), cudaMemcpyHostToDevice)); - logger->info("Pointer saved to {}", (void*)offseted_place); } // memcpy the scalar values auto offseted_place_dev = (char*)dev_base_ptr+data.offsets[data.num_ptr_members]; @@ -85,14 +79,11 @@ void copy_instance_data_host(codegen::CodegenInstanceData& data, void* dev_base_ scalar_vars_size += data.members_size[i+data.num_ptr_members]; } const auto host_base_ptr = data.base_ptr; - logger->info("dev_base_ptr addr: {} host_base_ptr {}", dev_base_ptr, host_base_ptr); for (auto i = 0; i < data.num_ptr_members; i++) { auto size_of_var = data.members_size[i]; void* offset_dev_ptr = (char*)dev_base_ptr+data.offsets[i]; - logger->info("Copying data {} from GPU back to host {} from {}", i, data.members[i], offset_dev_ptr); void* gpu_offset_addr; checkCudaErrors(cudaMemcpy(&gpu_offset_addr, offset_dev_ptr, sizeof(double*), cudaMemcpyDeviceToHost)); - logger->info("GPU mem addr {}", gpu_offset_addr); checkCudaErrors(cudaMemcpy(data.members[i], gpu_offset_addr, size_of_var*data.num_elements, cudaMemcpyDeviceToHost)); } // memcpy the 
scalar values @@ -210,27 +201,6 @@ void LLVMBenchmark::run_benchmark(const std::shared_ptr& node) { // Record the execution time of the kernel. std::string wrapper_name = "__" + kernel_name + "_wrapper"; - struct test__instance_var__type { - double* __restrict__ x; - double* __restrict__ y; - double* __restrict__ m; - double* __restrict__ Dm; - double* __restrict__ v_unused; - double* __restrict__ g_unused; - double* __restrict__ voltage; - int* __restrict__ node_index; - double t; - double dt; - double celsius; - int secondorder; - int node_count; - }; - for(int i = 0; i < 5; ++i) { - std::cout << static_cast(instance_data.base_ptr)->y[i] << std::endl;; - } - for(int i = 0; i < 5; ++i) { - std::cout << static_cast(instance_data.base_ptr)->m[i] << std::endl;; - } auto start = std::chrono::steady_clock::now(); #ifdef NMODL_LLVM_CUDA_BACKEND if (platform.is_CUDA_gpu()) { @@ -250,13 +220,6 @@ void LLVMBenchmark::run_benchmark(const std::shared_ptr& node) { copy_instance_data_host(instance_data, dev_ptr); } #endif - for(int i = 0; i < 5; ++i) { - std::cout << static_cast(instance_data.base_ptr)->y[i] << std::endl;; - } - for(int i = 0; i < 5; ++i) { - std::cout << static_cast(instance_data.base_ptr)->m[i] << std::endl;; - } - // Log the time taken for each run. logger->info("Experiment {} compute time = {:.6f} sec", i, diff.count()); diff --git a/test/unit/codegen/codegen_data_helper.cpp b/test/unit/codegen/codegen_data_helper.cpp index b7d2b0186a..6e2a8c46aa 100644 --- a/test/unit/codegen/codegen_data_helper.cpp +++ b/test/unit/codegen/codegen_data_helper.cpp @@ -22,18 +22,10 @@ const int default_second_order_value = 0; CodegenInstanceData::~CodegenInstanceData() { // first free num_ptr_members members which are pointers for (size_t i = 0; i < num_ptr_members; i++) { -// #ifdef NMODL_LLVM_CUDA_BACKEND -// cudaFree(members[i]); -// #else free(members[i]); -// #endif } -// and then pointer to container struct -#ifdef NMODL_LLVM_CUDA_BACKEND - cudaFree(base_ptr); -#else + // and then pointer to container struct free(base_ptr); -#endif } /** @@ -97,12 +89,7 @@ CodegenInstanceData CodegenDataHelper::create_data(size_t num_elements, size_t s // max size of each member : pointer / double has maximum size size_t member_size = std::max(sizeof(double), sizeof(double*)); -// allocate instance object with memory alignment -// #ifdef NMODL_LLVM_CUDA_BACKEND -// cudaMallocManaged(&base, member_size * variables.size()); -// #else posix_memalign(&base, NBYTE_ALIGNMENT, member_size * variables.size()); -// #endif data.base_ptr = base; data.num_bytes += member_size * variables.size(); @@ -132,15 +119,7 @@ CodegenInstanceData CodegenDataHelper::create_data(size_t num_elements, size_t s // allocate memory and setup a pointer void* member; -// #ifdef NMODL_LLVM_CUDA_BACKEND -// cudaMallocManaged(&member, member_size * num_elements); -// int deviceId; -// cudaGetDevice(&deviceId); -// cudaMemPrefetchAsync(&member, member_size * num_elements, deviceId); -// #else posix_memalign(&member, NBYTE_ALIGNMENT, member_size * num_elements); -// #endif - logger->info("Allocated {} bytes in {}", member_size * num_elements, member); // integer values are often offsets so they must start from // 0 to num_elements-1 to avoid out of bound accesses. 
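The two patches above land a deep-copy scheme for the benchmark's instance data: copy_instance_data_gpu allocates one device buffer per pointer member, patches the pointer slots of the device-side struct at their recorded byte offsets, and copies the scalar tail verbatim; copy_instance_data_host inverts the process. Below is a minimal, self-contained sketch of the same pattern on a statically typed struct; the `Instance` type and `copy_to_gpu` helper are hypothetical stand-ins, and the error checking that the real code does via checkCudaErrors is omitted:

```
#include <cuda_runtime.h>
#include <cstddef>

// Hypothetical stand-in for the generated instance struct:
// pointer members first, scalar tail last.
struct Instance {
    double* a;
    double* b;
    double dt;
};

Instance* copy_to_gpu(const Instance& host, std::size_t n) {
    // Stage a host copy whose pointer slots we overwrite with device pointers.
    Instance staged = host;
    cudaMalloc(&staged.a, n * sizeof(double));
    cudaMalloc(&staged.b, n * sizeof(double));
    cudaMemcpy(staged.a, host.a, n * sizeof(double), cudaMemcpyHostToDevice);
    cudaMemcpy(staged.b, host.b, n * sizeof(double), cudaMemcpyHostToDevice);
    // One final copy ships the patched pointers and the scalar tail together.
    Instance* dev = nullptr;
    cudaMalloc(&dev, sizeof(Instance));
    cudaMemcpy(dev, &staged, sizeof(Instance), cudaMemcpyHostToDevice);
    return dev;
}
```

The real code cannot use a static type like `Instance` because the struct is generated per MOD file, which is why it works on a type-erased base pointer plus `data.offsets[i]` instead of named members.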
From a5b7179762974897578a0fc6d940d37a3f666942 Mon Sep 17 00:00:00 2001 From: Ioannis Magkanaris Date: Tue, 3 May 2022 17:29:47 +0200 Subject: [PATCH 312/331] Make clang-format happy --- test/benchmark/llvm_benchmark.cpp | 51 ++++++++++++++--------- test/unit/codegen/codegen_data_helper.cpp | 9 ++-- 2 files changed, 34 insertions(+), 26 deletions(-) diff --git a/test/benchmark/llvm_benchmark.cpp b/test/benchmark/llvm_benchmark.cpp index a42dfc3e46..b96ecfaea5 100644 --- a/test/benchmark/llvm_benchmark.cpp +++ b/test/benchmark/llvm_benchmark.cpp @@ -40,8 +40,7 @@ void LLVMBenchmark::generate_llvm(const std::shared_ptr& node) { void checkCudaErrors(cudaError error) { if (error != cudaSuccess) { - throw std::runtime_error( - "CUDA Execution Error: {}\n"_format(cudaGetErrorString(error))); + throw std::runtime_error("CUDA Execution Error: {}\n"_format(cudaGetErrorString(error))); } } @@ -51,23 +50,28 @@ void* copy_instance_data_gpu(const codegen::CodegenInstanceData& data) { auto scalar_vars_size = 0; const auto num_scalar_vars = data.members.size() - data.num_ptr_members; for (int i = 0; i < num_scalar_vars; i++) { - scalar_vars_size += data.members_size[i+data.num_ptr_members]; + scalar_vars_size += data.members_size[i + data.num_ptr_members]; } checkCudaErrors(cudaMalloc(&dev_base_ptr, ptr_vars_size + scalar_vars_size)); for (auto i = 0; i < data.num_ptr_members; i++) { // Allocate a vector with the correct size void* dev_member_ptr; auto size_of_var = data.members_size[i]; - checkCudaErrors(cudaMalloc(&dev_member_ptr, size_of_var*data.num_elements)); - checkCudaErrors(cudaMemcpy(dev_member_ptr, data.members[i], size_of_var*data.num_elements, cudaMemcpyHostToDevice)); + checkCudaErrors(cudaMalloc(&dev_member_ptr, size_of_var * data.num_elements)); + checkCudaErrors(cudaMemcpy(dev_member_ptr, + data.members[i], + size_of_var * data.num_elements, + cudaMemcpyHostToDevice)); // Copy the pointer addresses to the struct - auto offseted_place = (char*)dev_base_ptr+data.offsets[i]; - checkCudaErrors(cudaMemcpy(offseted_place, &dev_member_ptr, sizeof(double*), cudaMemcpyHostToDevice)); + auto offseted_place = (char*) dev_base_ptr + data.offsets[i]; + checkCudaErrors( + cudaMemcpy(offseted_place, &dev_member_ptr, sizeof(double*), cudaMemcpyHostToDevice)); } // memcpy the scalar values - auto offseted_place_dev = (char*)dev_base_ptr+data.offsets[data.num_ptr_members]; - auto offseted_place_host = (char*)(data.base_ptr)+data.offsets[data.num_ptr_members]; - checkCudaErrors(cudaMemcpy(offseted_place_dev, offseted_place_host, scalar_vars_size, cudaMemcpyHostToDevice)); + auto offseted_place_dev = (char*) dev_base_ptr + data.offsets[data.num_ptr_members]; + auto offseted_place_host = (char*) (data.base_ptr) + data.offsets[data.num_ptr_members]; + checkCudaErrors(cudaMemcpy( + offseted_place_dev, offseted_place_host, scalar_vars_size, cudaMemcpyHostToDevice)); return dev_base_ptr; } @@ -76,20 +80,25 @@ void copy_instance_data_host(codegen::CodegenInstanceData& data, void* dev_base_ auto scalar_vars_size = 0; const auto num_scalar_vars = data.members.size() - data.num_ptr_members; for (int i = 0; i < num_scalar_vars; i++) { - scalar_vars_size += data.members_size[i+data.num_ptr_members]; + scalar_vars_size += data.members_size[i + data.num_ptr_members]; } const auto host_base_ptr = data.base_ptr; for (auto i = 0; i < data.num_ptr_members; i++) { auto size_of_var = data.members_size[i]; - void* offset_dev_ptr = (char*)dev_base_ptr+data.offsets[i]; + void* offset_dev_ptr = (char*) dev_base_ptr + 
data.offsets[i]; void* gpu_offset_addr; - checkCudaErrors(cudaMemcpy(&gpu_offset_addr, offset_dev_ptr, sizeof(double*), cudaMemcpyDeviceToHost)); - checkCudaErrors(cudaMemcpy(data.members[i], gpu_offset_addr, size_of_var*data.num_elements, cudaMemcpyDeviceToHost)); + checkCudaErrors( + cudaMemcpy(&gpu_offset_addr, offset_dev_ptr, sizeof(double*), cudaMemcpyDeviceToHost)); + checkCudaErrors(cudaMemcpy(data.members[i], + gpu_offset_addr, + size_of_var * data.num_elements, + cudaMemcpyDeviceToHost)); } // memcpy the scalar values - void* offseted_place_dev = (char*)dev_base_ptr+data.offsets[data.num_ptr_members]; - void* offseted_place_host = (char*)(data.base_ptr)+data.offsets[data.num_ptr_members]; - checkCudaErrors(cudaMemcpy(offseted_place_host, offseted_place_dev, scalar_vars_size, cudaMemcpyDeviceToHost)); + void* offseted_place_dev = (char*) dev_base_ptr + data.offsets[data.num_ptr_members]; + void* offseted_place_host = (char*) (data.base_ptr) + data.offsets[data.num_ptr_members]; + checkCudaErrors(cudaMemcpy( + offseted_place_host, offseted_place_dev, scalar_vars_size, cudaMemcpyDeviceToHost)); } void LLVMBenchmark::run_benchmark(const std::shared_ptr& node) { @@ -173,7 +182,7 @@ void LLVMBenchmark::run_benchmark(const std::shared_ptr& node) { double time_mean = time_sum / num_experiments; logger->info("Average compute time = {:.6f}", time_mean); logger->info("Compute time variance = {:g}", - time_squared_sum / num_experiments - time_mean * time_mean); + time_squared_sum / num_experiments - time_mean * time_mean); logger->info("Minimum compute time = {:.6f}", time_min); logger->info("Maximum compute time = {:.6f}\n", time_max); } else { @@ -196,7 +205,9 @@ void LLVMBenchmark::run_benchmark(const std::shared_ptr& node) { // Log instance size once. if (i == 0) { double size_mbs = instance_data.num_bytes / (1024.0 * 1024.0); - logger->info("Benchmarking kernel '{}' with {} MBs dataset", kernel_name, size_mbs); + logger->info("Benchmarking kernel '{}' with {} MBs dataset", + kernel_name, + size_mbs); } // Record the execution time of the kernel. @@ -233,7 +244,7 @@ void LLVMBenchmark::run_benchmark(const std::shared_ptr& node) { double time_mean = time_sum / num_experiments; logger->info("Average compute time = {:.6f}", time_mean); logger->info("Compute time variance = {:g}", - time_squared_sum / num_experiments - time_mean * time_mean); + time_squared_sum / num_experiments - time_mean * time_mean); logger->info("Minimum compute time = {:.6f}", time_min); logger->info("Maximum compute time = {:.6f}\n", time_max); } diff --git a/test/unit/codegen/codegen_data_helper.cpp b/test/unit/codegen/codegen_data_helper.cpp index 6e2a8c46aa..9f7b3ab6b3 100644 --- a/test/unit/codegen/codegen_data_helper.cpp +++ b/test/unit/codegen/codegen_data_helper.cpp @@ -147,15 +147,12 @@ CodegenInstanceData CodegenDataHelper::create_data(size_t num_elements, size_t s int cnt{}; for (auto& var: variables) { - // printout vars - std::cout << cnt++ - << ":\t" << to_string(var->get_type()->get_type()) - << '\t' << var->get_is_pointer() - << '\t' << var->get_name()->get_node_name() << '\n'; + // printout vars + std::cout << cnt++ << ":\t" << to_string(var->get_type()->get_type()) << '\t' + << var->get_is_pointer() << '\t' << var->get_name()->get_node_name() << '\n'; } - // we are now switching from pointer type to next member type (e.g. 
double) // ideally we should use padding but switching from double* to double should // already meet alignment requirements From 3c81509df0a7ec8ffe7575d2ca86e00919436b2b Mon Sep 17 00:00:00 2001 From: Ioannis Magkanaris Date: Tue, 3 May 2022 18:25:54 +0200 Subject: [PATCH 313/331] Small changes to handle external kernel and fixes in the script --- src/main.cpp | 7 +---- src/pybind/pynmodl.cpp | 1 + test/benchmark/llvm_benchmark.cpp | 50 +++++++++++++------------------ test/benchmark/nmodl-llvm-time.sh | 12 ++++---- 4 files changed, 29 insertions(+), 41 deletions(-) diff --git a/src/main.cpp b/src/main.cpp index 11c66547b8..dec9ef6ec8 100644 --- a/src/main.cpp +++ b/src/main.cpp @@ -444,14 +444,9 @@ int main(int argc, const char* argv[]) { num_experiments, instance_size, platform, -<<<<<<< HEAD - llvm_opt_level_ir, - llvm_opt_level_codegen, - external_kernel, -======= cfg.llvm_opt_level_ir, cfg.llvm_opt_level_codegen, ->>>>>>> magkanar/gpu-runner + external_kernel, gpu_execution_parameters); benchmark.run(); } diff --git a/src/pybind/pynmodl.cpp b/src/pybind/pynmodl.cpp index bb3cb443f0..c1e82d411e 100644 --- a/src/pybind/pynmodl.cpp +++ b/src/pybind/pynmodl.cpp @@ -204,6 +204,7 @@ class JitDriver { platform, cfg.llvm_opt_level_ir, cfg.llvm_opt_level_codegen, + false, gpu_execution_parameters); return benchmark.run(); } diff --git a/test/benchmark/llvm_benchmark.cpp b/test/benchmark/llvm_benchmark.cpp index d69d89d346..2b63acf53f 100644 --- a/test/benchmark/llvm_benchmark.cpp +++ b/test/benchmark/llvm_benchmark.cpp @@ -44,7 +44,7 @@ void LLVMBenchmark::generate_llvm() { void checkCudaErrors(cudaError error) { if (error != cudaSuccess) { - throw std::runtime_error("CUDA Execution Error: {}\n"_format(cudaGetErrorString(error))); + throw std::runtime_error(fmt::format("CUDA Execution Error: {}\n", cudaGetErrorString(error))); } } @@ -150,21 +150,11 @@ BenchmarkResults LLVMBenchmark::run_benchmark() { } #endif -<<<<<<< HEAD + BenchmarkResults results{}; if (external_kernel) { // benchmark external kernel logger->info("Benchmarking external kernel"); - double time_min = std::numeric_limits::max(); - double time_max = 0.0; - double time_sum = 0.0; - double time_squared_sum = 0.0; -======= - BenchmarkResults results{}; - // Benchmark every kernel. - for (const auto& kernel_name: kernel_names) { - // For every kernel run the benchmark `num_experiments` times and collect runtimes. auto times = std::vector(num_experiments, 0.0); ->>>>>>> magkanar/gpu-runner for (int i = 0; i < num_experiments; ++i) { // Initialise the data. auto instance_data = codegen_data.create_data(instance_size, /*seed=*/1); @@ -203,15 +193,12 @@ BenchmarkResults LLVMBenchmark::run_benchmark() { logger->info("Compute time standard deviation = {:8f}", time_stdev); logger->info("Minimum compute time = {:.6f}", time_min); logger->info("Maximum compute time = {:.6f}\n", time_max); -<<<<<<< HEAD + results["nrn_state_hh_ext"] = {time_mean, time_stdev, time_min, time_max}; } else { // Benchmark every kernel. for (const auto& kernel_name: kernel_names) { - // For every kernel run the benchmark `num_experiments` times. - double time_min = std::numeric_limits::max(); - double time_max = 0.0; - double time_sum = 0.0; - double time_squared_sum = 0.0; + // For every kernel run the benchmark `num_experiments` times and collect runtimes. + auto times = std::vector(num_experiments, 0.0); for (int i = 0; i < num_experiments; ++i) { // Initialise the data. 
auto instance_data = codegen_data.create_data(instance_size, /*seed=*/1); @@ -251,25 +238,30 @@ BenchmarkResults LLVMBenchmark::run_benchmark() { } #endif // Log the time taken for each run. - logger->info("Experiment {} compute time = {:.6f} sec", i, diff.count()); + logger->debug("Experiment {} compute time = {:.6f} sec", i, diff.count()); // Update statistics. - time_sum += diff.count(); - time_squared_sum += diff.count() * diff.count(); - time_min = std::min(time_min, diff.count()); - time_max = std::max(time_max, diff.count()); + times[i] = diff.count(); } + // Calculate statistics + double time_mean = std::accumulate(times.begin(), times.end(), 0.0) / num_experiments; + double time_var = std::accumulate(times.begin(), + times.end(), + 0.0, + [time_mean](const double& pres, const double& e) { + return (e - time_mean) * (e - time_mean); + }) / + num_experiments; + double time_stdev = std::sqrt(time_var); + double time_min = *std::min_element(times.begin(), times.end()); + double time_max = *std::max_element(times.begin(), times.end()); // Log the average time taken for the kernel. - double time_mean = time_sum / num_experiments; logger->info("Average compute time = {:.6f}", time_mean); - logger->info("Compute time variance = {:g}", - time_squared_sum / num_experiments - time_mean * time_mean); + logger->info("Compute time standard deviation = {:8f}", time_stdev); logger->info("Minimum compute time = {:.6f}", time_min); logger->info("Maximum compute time = {:.6f}\n", time_max); + results[kernel_name] = {time_mean, time_stdev, time_min, time_max}; } -======= - results[kernel_name] = {time_mean, time_stdev, time_min, time_max}; ->>>>>>> magkanar/gpu-runner } return results; } diff --git a/test/benchmark/nmodl-llvm-time.sh b/test/benchmark/nmodl-llvm-time.sh index 32a52371b3..aa7d2a64c9 100755 --- a/test/benchmark/nmodl-llvm-time.sh +++ b/test/benchmark/nmodl-llvm-time.sh @@ -190,9 +190,9 @@ declare -a benchmark_variance # Kernels, architectures and compilers loop -KERNEL_TARGETS="hh" #"compute-bound memory-bound hh" +KERNEL_TARGETS="compute-bound memory-bound hh" -ARCHITECTURES="nvptx64" #"skylake_avx512 broadwell nehalem default nvptx64" +ARCHITECTURES="skylake_avx512 broadwell nehalem default nvptx64" COMPILERS="intel clang gcc" @@ -318,8 +318,8 @@ for kernel_target in ${KERNEL_TARGETS}; do nmodl_args="${kernels_path}/${kernel_target}.mod --output ${output_dir}/${benchmark_nmodl_desc} llvm --ir ${fast_math_flag} --opt-level-ir 3 cpu --name ${nmodl_architecture} --vector-width ${vec_width} --math-library ${math_lib} benchmark --run --instance-size ${kernel_inst_size} --repeat ${num_exp} --opt-level-codegen 3 --libs ${math_lib_path}" # runs only kernel generated by LLVM IR ${debug} eval "${nmodl_exe} ${nmodl_args} 2>&1 | tee ${output_dir}/${benchmark_nmodl_desc}.log" - benchmark_time+=($(grep "Average compute time" ${output_dir}/${benchmark_nmodl_desc}.log | awk '{print $NF}')) - benchmark_variance+=($(grep "Compute time variance" ${output_dir}/${benchmark_nmodl_desc}.log | awk '{print $NF}')) + benchmark_time+=($(grep "Average compute time" ${output_dir}/${benchmark_nmodl_desc}.log | awk '{print $NF}' | tail -n 1)) + benchmark_variance+=($(grep "Compute time variance" ${output_dir}/${benchmark_nmodl_desc}.log | awk '{print $NF}' | tail -n 1)) done else benchmark_nmodl_desc=${kernel_target}_nmodl-cuda-jit_libdevice_${nmodl_architecture}_v${vec_width}_${fast_math_opt} @@ -333,8 +333,8 @@ for kernel_target in ${KERNEL_TARGETS}; do nvidia_profile="${nsys_exec} profile --stats=true 
--force-overwrite=true -o ${output_dir}/${kernel_target}_${fast_math_opt}_nsys" fi ${debug} eval "${nvidia_profile} ${nmodl_exe} ${nmodl_args} 2>&1 | tee ${output_dir}/${benchmark_nmodl_desc}.log" - benchmark_time+=($(grep "Average compute time" ${output_dir}/${benchmark_nmodl_desc}.log | awk '{print $NF}')) - benchmark_variance+=($(grep "Compute time variance" ${output_dir}/${benchmark_nmodl_desc}.log | awk '{print $NF}')) + benchmark_time+=($(grep "Average compute time" ${output_dir}/${benchmark_nmodl_desc}.log | awk '{print $NF}' | tail -n 1)) + benchmark_variance+=($(grep "Compute time variance" ${output_dir}/${benchmark_nmodl_desc}.log | awk '{print $NF}' | tail -n 1)) fi done done From 95782bc8f3f775869a884a1dbfb539858c414d68 Mon Sep 17 00:00:00 2001 From: Ioannis Magkanaris Date: Mon, 9 May 2022 14:00:15 +0200 Subject: [PATCH 314/331] [LLVM][GPU] Added CUDADriver to execute benchmark on GPU (#829) - Added CUDADriver to compile LLVM IR string generated from CodegenLLVMVisitor to PTX string and then execute it using CUDA API - Ability to select the compilation GPU architecture and then set the proper GPU architecture based on the GPU that is going to be used - Link `libdevice` math library with GPU LLVM module - Handles kernel and wrapper functions attributes properly for GPU execution (wrapper function is `kernel` and kernel attribute is `device`) - Small fixes in InstanceStruct declaration and setup to allocate the pointer variables properly, including the shadow variables - Adds tests in the CI that run small benchmarks in CPU and GPU on BB5 - Adds replacement of `log` math function for SLEEF and libdevice, `pow` and `fabs` for libdevice - Adds GPU execution ability in PyJIT - Small improvement in PyJIT benchmark python script to handle arguments and GPU execution - Separated benchmark info from benchmark driver - Added hh and expsyn mod files in benchmarking tests --- .gitlab-ci.yml | 37 ++-- CMakeLists.txt | 2 + INSTALL.md | 25 ++- src/codegen/codegen_driver.hpp | 3 - .../llvm/codegen_llvm_helper_visitor.cpp | 18 +- src/codegen/llvm/codegen_llvm_visitor.cpp | 78 +++++-- src/codegen/llvm/codegen_llvm_visitor.hpp | 12 +- src/codegen/llvm/llvm_utils.cpp | 35 ++- src/codegen/llvm/llvm_utils.hpp | 9 +- .../llvm/replace_with_lib_functions.cpp | 11 +- src/main.cpp | 51 +++-- src/pybind/CMakeLists.txt | 3 +- src/pybind/pynmodl.cpp | 36 +++- test/benchmark/CMakeLists.txt | 30 ++- test/benchmark/benchmark.py | 26 ++- test/benchmark/benchmark_info.hpp | 29 +++ test/benchmark/cuda_driver.cpp | 201 ++++++++++++++++++ test/benchmark/cuda_driver.hpp | 187 ++++++++++++++++ test/benchmark/gpu_parameters.hpp | 27 +++ test/benchmark/jit_driver.hpp | 20 +- test/benchmark/kernels/expsyn.mod | 42 ++++ test/benchmark/kernels/hh.mod | 125 +++++++++++ test/benchmark/llvm_benchmark.cpp | 55 ++++- test/benchmark/llvm_benchmark.hpp | 49 ++++- test/integration/mod/test_math.mod | 16 ++ test/unit/CMakeLists.txt | 1 - .../codegen/codegen_llvm_instance_struct.cpp | 18 +- test/unit/codegen/codegen_llvm_ir.cpp | 12 +- test/unit/codegen/codegen_llvm_visitor.cpp | 12 +- 29 files changed, 1045 insertions(+), 125 deletions(-) create mode 100644 test/benchmark/benchmark_info.hpp create mode 100644 test/benchmark/cuda_driver.cpp create mode 100644 test/benchmark/cuda_driver.hpp create mode 100644 test/benchmark/gpu_parameters.hpp create mode 100644 test/benchmark/kernels/expsyn.mod create mode 100644 test/benchmark/kernels/hh.mod create mode 100644 test/integration/mod/test_math.mod diff --git a/.gitlab-ci.yml 
b/.gitlab-ci.yml
index b44650c555..13a347b7e0 100644
--- a/.gitlab-ci.yml
+++ b/.gitlab-ci.yml
@@ -26,7 +26,6 @@ trigger cvf:
   variables:
     SPACK_PACKAGE: nmodl
     SPACK_PACKAGE_SPEC: ~legacy-unit+python+llvm
-    SPACK_EXTRA_MODULES: llvm
     SPACK_INSTALL_EXTRA_FLAGS: -v
 
 spack_setup:
@@ -44,14 +43,6 @@ build:intel:
   variables:
     SPACK_PACKAGE_COMPILER: intel
 
-build:gcc:
-  extends:
-    - .spack_build
-    - .spack_nmodl
-  variables:
-    SPACK_PACKAGE_COMPILER: gcc
-    SPACK_PACKAGE_DEPENDENCIES: ^bison%gcc^flex%gcc^py-jinja2%gcc^py-sympy%gcc^py-pyyaml%gcc
-
 .nmodl_tests:
   variables:
     # https://github.com/BlueBrain/nmodl/issues/737
@@ -63,8 +54,30 @@ test:intel:
     - .nmodl_tests
   needs: ["build:intel"]
 
-test:gcc:
+.benchmark_config:
+  variables:
+    bb5_ntasks: 1
+    bb5_cpus_per_task: 1
+    bb5_memory: 16G
+    bb5_exclusive: full
+    bb5_constraint: gpu_32g # CascadeLake CPU & V100 GPU node
+
+.build_allocation:
+  variables:
+    bb5_ntasks: 2 # so we block 16 cores
+    bb5_cpus_per_task: 8 # ninja -j {this}
+    bb5_memory: 76G # ~16*384/80
+
+build_cuda:gcc:
+  extends: [.spack_build, .build_allocation]
+  variables:
+    SPACK_PACKAGE: nmodl
+    SPACK_PACKAGE_SPEC: ~legacy-unit+python+llvm+llvm_cuda
+    SPACK_INSTALL_EXTRA_FLAGS: -v
+    SPACK_PACKAGE_COMPILER: gcc
+
+test_benchmark:gcc:
   extends:
+    - .benchmark_config
     - .ctest
-    - .nmodl_tests
-  needs: ["build:gcc"]
+  needs: ["build_cuda:gcc"]
diff --git a/CMakeLists.txt b/CMakeLists.txt
index d2e8a84f7a..1e27e8d7ea 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -24,6 +24,7 @@ option(NMODL_ENABLE_PYTHON_BINDINGS "Enable pybind11 based python bindings" ON)
 option(NMODL_ENABLE_LEGACY_UNITS "Use original faraday, R, etc. instead of 2019 nist constants" OFF)
 option(NMODL_ENABLE_LLVM "Enable LLVM based code generation" ON)
 option(NMODL_ENABLE_LLVM_GPU "Enable LLVM based GPU code generation" ON)
+option(NMODL_ENABLE_LLVM_CUDA "Enable LLVM CUDA backend to run GPU benchmark" OFF)
 option(NMODL_ENABLE_JIT_EVENT_LISTENERS "Enable JITEventListener for Perf and Vtune" OFF)
 
 if(NMODL_ENABLE_LEGACY_UNITS)
@@ -162,6 +163,7 @@ if(NMODL_ENABLE_LLVM)
   if(NMODL_ENABLE_LLVM_CUDA)
     enable_language(CUDA)
     find_package(CUDAToolkit)
+    include_directories(${CUDAToolkit_INCLUDE_DIRS})
     add_definitions(-DNMODL_LLVM_CUDA_BACKEND)
   endif()
 endif()
diff --git a/INSTALL.md b/INSTALL.md
index 1b65c1212c..7ddb21b15c 100644
--- a/INSTALL.md
+++ b/INSTALL.md
@@ -21,7 +21,7 @@ To build the project from source, a modern C++ compiler with C++14 support is ne
 
 - flex (>=2.6)
 - bison (>=3.0)
-- CMake (>=3.15)
+- CMake (>=3.17)
 - Python (>=3.6)
 - Python packages : jinja2 (>=2.10), pyyaml (>=3.13), pytest (>=4.0.0), sympy (>=1.3), textwrap
 
@@ -141,6 +141,29 @@ export NMODL_WRAPLIB=/opt/nmodl/lib/libpywrapper.so
 
 **Note**: In order for all unit tests to function correctly when building without linking against libpython we must set `NMODL_PYLIB` before running cmake!
 
+### Using CUDA backend to run benchmarks
+
+`NMODL` supports generating code and compiling it for execution on an `NVIDIA` GPU via its benchmark infrastructure using the `LLVM` backend. To enable the `CUDA` backend to compile and execute the GPU code, we need to set the following `CMake` flag during compilation of `NMODL`:
+```
+-DNMODL_ENABLE_LLVM_CUDA=ON
+```
+
+To find the needed `CUDA` libraries (`cudart` and `nvrtc`), the CUDA Toolkit must be installed on your system.
+This can be done by installing the CUDA Toolkit from the [CUDA Toolkit website](https://developer.nvidia.com/cuda-downloads) or by installing the `CUDA` spack package and loading the corresponding module.
+
+Then, given a supported MOD file, you can execute the benchmark on your NVIDIA GPU by running the following command:
+```
+./bin/nmodl <file>.mod llvm --no-debug --ir --opt-level-ir 3 gpu --target-arch "sm_80" --name "nvptx64" --math-library libdevice benchmark --run --libs "${CUDA_ROOT}/nvvm/libdevice/libdevice.10.bc" --opt-level-codegen 3 --instance-size 10000000 --repeat 2 --grid-dim-x 4096 --block-dim-x 256
+```
+The above command executes the benchmark on a GPU with `Compute Architecture` `sm_80` and links the generated code to the `libdevice` optimized math library provided by `NVIDIA`.
+Using the above command you can also select the optimization level of the generated code, the instance size of the generated data, the number of repetitions, and the grid and block dimensions for the GPU execution.
+
+**Note**: In order for the CUDA backend to compile and execute the generated code on the GPU, the installed CUDA Toolkit version needs to match the `CUDA` version installed by the NVIDIA driver on the system that will run the benchmark.
+You can find the driver's `CUDA` version by running the following command:
+```
+nvidia-smi
+```
+and noting the `CUDA Version` stated there. For example, if the `CUDA Version` reported by `nvidia-smi` is CUDA 11.4, you need to install `CUDA Toolkit 11.4.*` to be able to compile and execute the GPU code.
 
 ## Testing the Installed Module
diff --git a/src/codegen/codegen_driver.hpp b/src/codegen/codegen_driver.hpp
index 78c95421da..14d8ed76ab 100644
--- a/src/codegen/codegen_driver.hpp
+++ b/src/codegen/codegen_driver.hpp
@@ -33,9 +33,6 @@ struct CodeGenConfig {
     /// true if cuda code to be generated
     bool cuda_backend = false;
 
-    /// true if llvm code to be generated
-    bool llvm_backend = false;
-
     /// true if sympy should be used for solving ODEs analytically
     bool sympy_analytic = false;
 
diff --git a/src/codegen/llvm/codegen_llvm_helper_visitor.cpp b/src/codegen/llvm/codegen_llvm_helper_visitor.cpp
index 5f8119a4d1..5800beae6b 100644
--- a/src/codegen/llvm/codegen_llvm_helper_visitor.cpp
+++ b/src/codegen/llvm/codegen_llvm_helper_visitor.cpp
@@ -239,13 +239,6 @@ std::shared_ptr<ast::InstanceStruct> CodegenLLVMHelperVisitor::create_instance_s
     add_var_with_type(VOLTAGE_VAR, FLOAT_TYPE, /*is_pointer=*/1);
     add_var_with_type(NODE_INDEX_VAR, INTEGER_TYPE, /*is_pointer=*/1);
 
-    // add dt, t, celsius
-    add_var_with_type(naming::NTHREAD_T_VARIABLE, FLOAT_TYPE, /*is_pointer=*/0);
-    add_var_with_type(naming::NTHREAD_DT_VARIABLE, FLOAT_TYPE, /*is_pointer=*/0);
-    add_var_with_type(naming::CELSIUS_VARIABLE, FLOAT_TYPE, /*is_pointer=*/0);
-    add_var_with_type(naming::SECOND_ORDER_VARIABLE, INTEGER_TYPE, /*is_pointer=*/0);
-    add_var_with_type(naming::MECH_NODECOUNT_VAR, INTEGER_TYPE, /*is_pointer=*/0);
-
     // As we do not have the `NrnThread` object as an argument, we store pointers to rhs
     // and d in the instance struct as well. Also need their respective shadow variables
     // in the case of point process mechanisms.
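The following hunk moves the scalar members (t, dt, celsius, secondorder, node_count) behind every pointer member, as its NOTE explains: the generated LLVM struct type and the test data helper must compute identical offsets for each member, and keeping the 8-byte pointers in front makes the two layouts agree. A hypothetical, self-contained illustration of how member order changes offsets on a typical LP64 ABI (these struct names are not NMODL's):

```
#include <cstddef>
#include <cstdio>

// A member's offset depends on everything declared before it. With all
// 8-byte pointers first, pointer number i lands at offset 8 * i; once
// 4-byte scalars sit between pointers, two of them pack into a single
// 8-byte slot and every later pointer shifts.
struct PointersFirst { double* p0; double* p1; int a; int b; };  // p1 at 8
struct Interleaved   { double* p0; int a; int b; double* p1; };  // p1 at 16, not 8 * 3

int main() {
    std::printf("%zu %zu\n", offsetof(PointersFirst, p1), offsetof(Interleaved, p1));
    return 0;
}
```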
@@ -256,6 +249,17 @@ std::shared_ptr CodegenLLVMHelperVisitor::create_instance_s add_var_with_type(naming::NTHREAD_RHS_SHADOW, FLOAT_TYPE, /*is_pointer=*/1); add_var_with_type(naming::NTHREAD_D_SHADOW, FLOAT_TYPE, /*is_pointer=*/1); + // NOTE: All the pointer variables should be declared before the scalar variables otherwise + // the allocation of memory for the variables in the InstanceStruct and their offsets will be + // wrong + + // add dt, t, celsius + add_var_with_type(naming::NTHREAD_T_VARIABLE, FLOAT_TYPE, /*is_pointer=*/0); + add_var_with_type(naming::NTHREAD_DT_VARIABLE, FLOAT_TYPE, /*is_pointer=*/0); + add_var_with_type(naming::CELSIUS_VARIABLE, FLOAT_TYPE, /*is_pointer=*/0); + add_var_with_type(naming::SECOND_ORDER_VARIABLE, INTEGER_TYPE, /*is_pointer=*/0); + add_var_with_type(naming::MECH_NODECOUNT_VAR, INTEGER_TYPE, /*is_pointer=*/0); + return std::make_shared(codegen_vars); } diff --git a/src/codegen/llvm/codegen_llvm_visitor.cpp b/src/codegen/llvm/codegen_llvm_visitor.cpp index 9e159f7aff..de6c7ad914 100644 --- a/src/codegen/llvm/codegen_llvm_visitor.cpp +++ b/src/codegen/llvm/codegen_llvm_visitor.cpp @@ -25,6 +25,10 @@ namespace codegen { /* Helper routines */ /****************************************************************************************/ +static std::string get_wrapper_name(const std::string& kernel_name) { + return "__" + kernel_name + "_wrapper"; +} + /// A utility to check for supported Statement AST nodes. static bool is_supported_statement(const ast::Statement& statement) { return statement.is_codegen_atomic_statement() || statement.is_codegen_for_statement() || @@ -55,15 +59,36 @@ static bool can_vectorize(const ast::CodegenForStatement& statement, symtab::Sym return unsupported.empty() && supported.size() <= 1; } -void CodegenLLVMVisitor::annotate_kernel_with_nvvm(llvm::Function* kernel) { +void CodegenLLVMVisitor::annotate_kernel_with_nvvm(llvm::Function* kernel, + const std::string& annotation = "kernel") { llvm::Metadata* metadata[] = {llvm::ValueAsMetadata::get(kernel), - llvm::MDString::get(*context, "kernel"), + llvm::MDString::get(*context, annotation), llvm::ValueAsMetadata::get( llvm::ConstantInt::get(llvm::Type::getInt32Ty(*context), 1))}; llvm::MDNode* node = llvm::MDNode::get(*context, metadata); module->getOrInsertNamedMetadata("nvvm.annotations")->addOperand(node); } +void CodegenLLVMVisitor::annotate_wrapper_kernels_with_nvvm() { + // First clear all the nvvm annotations from the module + auto module_named_metadata = module->getNamedMetadata("nvvm.annotations"); + module->eraseNamedMetadata(module_named_metadata); + + // Then each kernel should be annotated as "device" function and wrappers should be annotated as + // "kernel" functions + std::vector kernel_names; + find_kernel_names(kernel_names); + + for (const auto& kernel_name: kernel_names) { + // Get the kernel function. + auto kernel = module->getFunction(kernel_name); + // Get the kernel wrapper function. + auto kernel_wrapper = module->getFunction(get_wrapper_name(kernel_name)); + annotate_kernel_with_nvvm(kernel, "device"); + annotate_kernel_with_nvvm(kernel_wrapper, "kernel"); + } +} + llvm::Value* CodegenLLVMVisitor::accept_and_get(const std::shared_ptr& node) { node->accept(*this); return ir_builder.pop_last_value(); @@ -402,12 +427,17 @@ void CodegenLLVMVisitor::wrap_kernel_functions() { auto kernel = module->getFunction(kernel_name); // Create a wrapper void function that takes a void pointer as a single argument. 
-        llvm::Type* i32_type = ir_builder.get_i32_type();
+        llvm::Type* return_type;
+        if (platform.is_gpu()) {
+            return_type = ir_builder.get_void_type();
+        } else {
+            return_type = ir_builder.get_i32_type();
+        }
         llvm::Type* void_ptr_type = ir_builder.get_i8_ptr_type();
         llvm::Function* wrapper_func = llvm::Function::Create(
-            llvm::FunctionType::get(i32_type, {void_ptr_type}, /*isVarArg=*/false),
+            llvm::FunctionType::get(return_type, {void_ptr_type}, /*isVarArg=*/false),
             llvm::Function::ExternalLinkage,
-            "__" + kernel_name + "_wrapper",
+            get_wrapper_name(kernel_name),
             *module);
 
         // Optionally, add debug information for the wrapper function.
@@ -425,9 +455,23 @@
         args.push_back(bitcasted);
         ir_builder.create_function_call(kernel, args, /*use_result=*/false);
 
-        // Create a 0 return value and a return instruction.
-        ir_builder.create_i32_constant(0);
-        ir_builder.create_return(ir_builder.pop_last_value());
+        // Create return instructions and annotate the wrapper with certain attributes
+        // depending on the backend type
+        if (platform.is_gpu()) {
+            // return void
+            ir_builder.create_return();
+        } else {
+            // Create a 0 return value and a return instruction.
+            ir_builder.create_i32_constant(0);
+            ir_builder.create_return(ir_builder.pop_last_value());
+            ir_builder.set_function(wrapper_func);
+            ir_builder.set_kernel_attributes();
+        }
+        ir_builder.clear_function();
+    }
+    // For GPU, we need to first clear all the annotations and then reapply them
+    if (platform.is_gpu()) {
+        annotate_wrapper_kernels_with_nvvm();
     }
 }
 
@@ -823,9 +867,6 @@ void CodegenLLVMVisitor::visit_program(const ast::Program& node) {
 
     // Handle GPU optimizations (CUDA platforms only for now).
     if (platform.is_gpu()) {
-        if (!platform.is_CUDA_gpu())
-            throw std::runtime_error("Error: unsupported GPU architecture!\n");
-
         // We only support CUDA backends anyway, so this works for now.
         utils::initialise_nvptx_passes();
 
@@ -839,15 +880,12 @@ void CodegenLLVMVisitor::visit_program(const ast::Program& node) {
         logger->debug("Dumping generated IR...\n" + dump_module());
     }
 
-    // If the output directory is specified, save the IR to .ll file.
- if (output_dir != ".") { - utils::save_ir_to_ll_file(*module, output_dir + "/" + mod_filename); - } - // Setup CodegenHelper for C++ wrapper file setup(node); + // Print C++ wrapper file print_wrapper_routines(); - print_target_file(); + // Print LLVM IR module to .ll file + utils::save_ir_to_ll_file(*module, output_dir + "/" + mod_filename); } void CodegenLLVMVisitor::print_mechanism_range_var_structure() { @@ -960,6 +998,12 @@ void CodegenLLVMVisitor::print_instance_variable_setup() { // Pass ml->nodeindices pointer to node_index printer->add_line("inst->node_index = ml->nodeindices;"); + // Setup rhs, d and their shadow vectors + printer->add_line(fmt::format("inst->{} = nt->_actual_rhs;", naming::NTHREAD_RHS)); + printer->add_line(fmt::format("inst->{} = nt->_actual_d;", naming::NTHREAD_D)); + printer->add_line(fmt::format("inst->{} = nt->_shadow_rhs;", naming::NTHREAD_RHS_SHADOW)); + printer->add_line(fmt::format("inst->{} = nt->_shadow_d;", naming::NTHREAD_D_SHADOW)); + // Setup global variables printer->add_line("inst->{0} = nt->{0};"_format(naming::NTHREAD_T_VARIABLE)); printer->add_line("inst->{0} = nt->{0};"_format(naming::NTHREAD_DT_VARIABLE)); diff --git a/src/codegen/llvm/codegen_llvm_visitor.hpp b/src/codegen/llvm/codegen_llvm_visitor.hpp index 0862307337..a22f698431 100644 --- a/src/codegen/llvm/codegen_llvm_visitor.hpp +++ b/src/codegen/llvm/codegen_llvm_visitor.hpp @@ -139,10 +139,6 @@ class CodegenLLVMVisitor: public CodegenCVisitor { return str; } - void print_target_file() const { - target_printer->add_multi_line(dump_module()); - } - /// Fills the container with the names of kernel functions from the MOD file. void find_kernel_names(std::vector& container); @@ -303,8 +299,12 @@ class CodegenLLVMVisitor: public CodegenCVisitor { void print_compute_functions() override; private: - // Annotates kernel function with NVVM metadata. - void annotate_kernel_with_nvvm(llvm::Function* kernel); + /// Annotates kernel function with NVVM metadata. + void annotate_kernel_with_nvvm(llvm::Function* kernel, const std::string& annotation); + + /// Handles NVVM function annotations when we create the wrapper functions. All original kernels + /// should be "device" functions and wrappers "kernel" functions + void annotate_wrapper_kernels_with_nvvm(); /// Accepts the given AST node and returns the processed value. llvm::Value* accept_and_get(const std::shared_ptr& node); diff --git a/src/codegen/llvm/llvm_utils.cpp b/src/codegen/llvm/llvm_utils.cpp index bd4feee32f..f6590fec5b 100644 --- a/src/codegen/llvm/llvm_utils.cpp +++ b/src/codegen/llvm/llvm_utils.cpp @@ -75,12 +75,9 @@ void initialise_nvptx_passes() { initialise_optimisation_passes(); } -void optimise_module_for_nvptx(codegen::Platform& platform, - llvm::Module& module, - int opt_level, - std::string& target_asm) { +std::unique_ptr create_CUDA_target_machine(const codegen::Platform& platform, + llvm::Module& module) { // CUDA target machine we generating code for. - std::unique_ptr tm; std::string platform_name = platform.get_name(); // Target and layout information. @@ -111,9 +108,30 @@ void optimise_module_for_nvptx(codegen::Platform& platform, if (!target) throw std::runtime_error("Error: " + error_msg + "\n"); + std::unique_ptr tm; tm.reset(target->createTargetMachine(triple, subtarget, features, {}, {})); if (!tm) throw std::runtime_error("Error: creating target machine failed! 
Aborting."); + return tm; +} + +std::string get_module_ptx(llvm::TargetMachine& tm, llvm::Module& module) { + std::string target_asm; + llvm::raw_string_ostream stream(target_asm); + llvm::buffer_ostream pstream(stream); + llvm::legacy::PassManager codegen_pm; + + tm.addPassesToEmitFile(codegen_pm, pstream, nullptr, llvm::CGFT_AssemblyFile); + codegen_pm.run(module); + return target_asm; +} + +void optimise_module_for_nvptx(const codegen::Platform& platform, + llvm::Module& module, + int opt_level, + std::string& target_asm) { + // Create target machine for CUDA GPU + auto tm = create_CUDA_target_machine(platform, module); // Create pass managers. llvm::legacy::FunctionPassManager func_pm(&module); @@ -137,12 +155,7 @@ void optimise_module_for_nvptx(codegen::Platform& platform, // Now, we want to run target-specific (e.g. NVPTX) passes. In LLVM, this // is done via `addPassesToEmitFile`. - llvm::raw_string_ostream stream(target_asm); - llvm::buffer_ostream pstream(stream); - llvm::legacy::PassManager codegen_pm; - - tm->addPassesToEmitFile(codegen_pm, pstream, nullptr, llvm::CGFT_AssemblyFile); - codegen_pm.run(module); + target_asm = get_module_ptx(*tm, module); } void initialise_optimisation_passes() { diff --git a/src/codegen/llvm/llvm_utils.hpp b/src/codegen/llvm/llvm_utils.hpp index 3394463317..9763718ab0 100644 --- a/src/codegen/llvm/llvm_utils.hpp +++ b/src/codegen/llvm/llvm_utils.hpp @@ -21,11 +21,18 @@ void initialise_optimisation_passes(); /// Initialises NVPTX-specific optimisation passes. void initialise_nvptx_passes(); +//// Initializes a CUDA target machine +std::unique_ptr create_CUDA_target_machine(const codegen::Platform& platform, + llvm::Module& module); + +/// Generate PTX code given a CUDA target machine and the module +std::string get_module_ptx(llvm::TargetMachine& tm, llvm::Module& module); + /// Replaces calls to LLVM intrinsics with appropriate library calls. void replace_with_lib_functions(codegen::Platform& platform, llvm::Module& module); /// Optimises the given LLVM IR module for NVPTX targets. 
-void optimise_module_for_nvptx(codegen::Platform& platform, +void optimise_module_for_nvptx(const codegen::Platform& platform, llvm::Module& module, int opt_level, std::string& target_asm); diff --git a/src/codegen/llvm/replace_with_lib_functions.cpp b/src/codegen/llvm/replace_with_lib_functions.cpp index 6d98dd3eb0..750e2c2318 100644 --- a/src/codegen/llvm/replace_with_lib_functions.cpp +++ b/src/codegen/llvm/replace_with_lib_functions.cpp @@ -72,6 +72,8 @@ void ReplaceMathFunctions::add_vectorizable_functions_from_vec_lib(TargetLibrary DISPATCH("llvm.exp.f64", "_ZGVnN2v_exp", FIXED(2)) DISPATCH("llvm.pow.f32", "_ZGVnN4vv_powf", FIXED(4)) DISPATCH("llvm.pow.f64", "_ZGVnN2vv_pow", FIXED(2)) + DISPATCH("llvm.log.f32", "_ZGVnN4v_logf", FIXED(4)) + DISPATCH("llvm.log.f64", "_ZGVnN2v_log", FIXED(2)) // clang-format on }; const VecDesc x86_functions[] = { @@ -82,6 +84,9 @@ void ReplaceMathFunctions::add_vectorizable_functions_from_vec_lib(TargetLibrary DISPATCH("llvm.pow.f64", "_ZGVbN2vv_pow", FIXED(2)) DISPATCH("llvm.pow.f64", "_ZGVdN4vv_pow", FIXED(4)) DISPATCH("llvm.pow.f64", "_ZGVeN8vv_pow", FIXED(8)) + DISPATCH("llvm.log.f64", "_ZGVbN2v_log", FIXED(2)) + DISPATCH("llvm.log.f64", "_ZGVdN4v_log", FIXED(4)) + DISPATCH("llvm.log.f64", "_ZGVeN8v_log", FIXED(8)) // clang-format on }; #undef DISPATCH @@ -166,7 +171,11 @@ bool ReplaceWithLibdevice::replace_call(CallInst& call_inst) { static const std::map libdevice_name = {{"llvm.exp.f32", "__nv_expf"}, {"llvm.exp.f64", "__nv_exp"}, {"llvm.pow.f32", "__nv_powf"}, - {"llvm.pow.f64", "__nv_pow"}}; + {"llvm.pow.f64", "__nv_pow"}, + {"llvm.log.f32", "__nv_logf"}, + {"llvm.log.f64", "__nv_log"}, + {"llvm.fabs.f32", "__nv_fabsf"}, + {"llvm.fabs.f64", "__nv_fabs"}}; // If replacement is not supported, abort. std::string old_name = function->getName().str(); diff --git a/src/main.cpp b/src/main.cpp index c394a160f9..c19c742d3c 100644 --- a/src/main.cpp +++ b/src/main.cpp @@ -66,6 +66,12 @@ int main(int argc, const char* argv[]) { /// the number of repeated experiments for the benchmarking int num_experiments = 100; + + /// X dimension of grid in blocks for GPU execution + int llvm_cuda_grid_dim_x = 1; + + /// X dimension of block in threads for GPU execution + int llvm_cuda_block_dim_x = 1; #endif CodeGenConfig cfg; @@ -228,9 +234,10 @@ int main(int argc, const char* argv[]) { auto gpu_target_name = gpu_opt->add_option("--name", cfg.llvm_gpu_name, "Name of GPU platform to use")->ignore_case(); - gpu_opt->add_option("--target-chip", + gpu_target_name->check(CLI::IsMember({"nvptx", "nvptx64"})); + gpu_opt->add_option("--target-arch", cfg.llvm_gpu_target_architecture, - "Name of target chip to use")->ignore_case(); + "Name of target architecture to use")->ignore_case(); auto gpu_math_library_opt = gpu_opt->add_option("--math-library", cfg.llvm_math_library, "Math library for GPU code generation ({})"_format(cfg.llvm_math_library)); @@ -258,6 +265,12 @@ int main(int argc, const char* argv[]) { benchmark_opt->add_option("--repeat", num_experiments, "Number of experiments for benchmarking ({})"_format(num_experiments))->ignore_case(); + benchmark_opt->add_option("--grid-dim-x", + llvm_cuda_grid_dim_x, + "Grid dimension X ({})"_format(llvm_cuda_grid_dim_x))->ignore_case(); + benchmark_opt->add_option("--block-dim-x", + llvm_cuda_block_dim_x, + "Block dimension X ({})"_format(llvm_cuda_block_dim_x))->ignore_case(); #endif // clang-format on @@ -372,11 +385,18 @@ int main(int argc, const char* argv[]) { : cfg.llvm_gpu_name; Platform platform(pid, name, - 
cfg.llvm_cpu_name, + cfg.llvm_gpu_target_architecture, cfg.llvm_math_library, cfg.llvm_float_type, cfg.llvm_vector_width); + // GPU code generation doesn't support debug information at the moment so disable it + // in case it's enabled + if (!cfg.llvm_no_debug && platform.is_gpu()) { + logger->warn("Disabling addition of debug symbols in GPU code."); + cfg.llvm_no_debug = true; + } + logger->info("Running LLVM backend code generator"); CodegenLLVMVisitor visitor(modfile, cfg.output_dir, @@ -397,23 +417,30 @@ int main(int argc, const char* argv[]) { } if (llvm_benchmark) { - // \todo integrate Platform class here - if (cfg.llvm_gpu_name != "default") { - logger->warn( - "GPU benchmarking is not supported, targeting " - "CPU instead"); - } - logger->info("Running LLVM benchmark"); + if (platform.is_gpu() && !platform.is_CUDA_gpu()) { + throw std::runtime_error( + "Benchmarking is only supported on CUDA GPUs at the moment"); + } +#ifndef NMODL_LLVM_CUDA_BACKEND + if (platform.is_CUDA_gpu()) { + throw std::runtime_error( + "GPU benchmarking is not supported if NMODL is not built with CUDA " + "backend enabled."); + } +#endif + const GPUExecutionParameters gpu_execution_parameters{llvm_cuda_grid_dim_x, + llvm_cuda_block_dim_x}; benchmark::LLVMBenchmark benchmark(visitor, modfile, cfg.output_dir, cfg.shared_lib_paths, num_experiments, instance_size, - cfg.llvm_cpu_name, + platform, cfg.llvm_opt_level_ir, - cfg.llvm_opt_level_codegen); + cfg.llvm_opt_level_codegen, + gpu_execution_parameters); benchmark.run(); } } diff --git a/src/pybind/CMakeLists.txt b/src/pybind/CMakeLists.txt index 16f4a586cc..43be3b01a1 100644 --- a/src/pybind/CMakeLists.txt +++ b/src/pybind/CMakeLists.txt @@ -73,7 +73,8 @@ if(NMODL_ENABLE_PYTHON_BINDINGS) # Additional options are needed when LLVM JIT functionality is built if(NMODL_ENABLE_LLVM) - set_property(TARGET codegen llvm_codegen llvm_benchmark benchmark_data PROPERTY POSITION_INDEPENDENT_CODE ON) + set_property(TARGET codegen llvm_codegen llvm_benchmark benchmark_data + PROPERTY POSITION_INDEPENDENT_CODE ON) target_link_libraries(_nmodl PRIVATE codegen llvm_codegen llvm_benchmark benchmark_data ${LLVM_LIBS_TO_LINK}) endif() diff --git a/src/pybind/pynmodl.cpp b/src/pybind/pynmodl.cpp index 3b75a7f30a..ebaec890d1 100644 --- a/src/pybind/pynmodl.cpp +++ b/src/pybind/pynmodl.cpp @@ -151,8 +151,22 @@ class JitDriver { : nmodl::codegen::PlatformID::GPU; const std::string name = cfg.llvm_gpu_name == "default" ? 
cfg.llvm_cpu_name : cfg.llvm_gpu_name; - platform = nmodl::codegen::Platform( - pid, name, cfg.llvm_math_library, cfg.llvm_float_type, cfg.llvm_vector_width); + platform = nmodl::codegen::Platform(pid, + name, + cfg.llvm_gpu_target_architecture, + cfg.llvm_math_library, + cfg.llvm_float_type, + cfg.llvm_vector_width); + if (platform.is_gpu() && !platform.is_CUDA_gpu()) { + throw std::runtime_error("Benchmarking is only supported on CUDA GPUs at the moment"); + } +#ifndef NMODL_LLVM_CUDA_BACKEND + if (platform.is_CUDA_gpu()) { + throw std::runtime_error( + "GPU benchmarking is not supported if NMODL is not built with CUDA " + "backend enabled."); + } +#endif } public: @@ -171,7 +185,9 @@ class JitDriver { benchmark::BenchmarkResults run(std::shared_ptr node, std::string& modname, int num_experiments, - int instance_size) { + int instance_size, + int cuda_grid_dim_x, + int cuda_block_dim_x) { // New directory is needed to be created otherwise the directory cannot be created // automatically through python if (cfg.nmodl_ast || cfg.json_ast || cfg.json_perfstat) { @@ -180,15 +196,17 @@ class JitDriver { cg_driver.prepare_mod(node, modname); nmodl::codegen::CodegenLLVMVisitor visitor(modname, cfg.output_dir, platform, 0); visitor.visit_program(*node); + const GPUExecutionParameters gpu_execution_parameters{cuda_grid_dim_x, cuda_block_dim_x}; nmodl::benchmark::LLVMBenchmark benchmark(visitor, modname, cfg.output_dir, cfg.shared_lib_paths, num_experiments, instance_size, - cfg.llvm_cpu_name, + platform, cfg.llvm_opt_level_ir, - cfg.llvm_opt_level_codegen); + cfg.llvm_opt_level_codegen, + gpu_execution_parameters); return benchmark.run(); } }; @@ -228,7 +246,7 @@ PYBIND11_MODULE(_nmodl, m_nmodl) { cfg.def(py::init([]() { auto cfg = std::make_unique(); // set to more sensible defaults for python binding - cfg->llvm_backend = true; + cfg->llvm_ir = true; return cfg; })) .def_readwrite("sympy_analytic", &nmodl::codegen::CodeGenConfig::sympy_analytic) @@ -265,6 +283,8 @@ PYBIND11_MODULE(_nmodl, m_nmodl) { .def_readwrite("llvm_fast_math_flags", &nmodl::codegen::CodeGenConfig::llvm_fast_math_flags) .def_readwrite("llvm_cpu_name", &nmodl::codegen::CodeGenConfig::llvm_cpu_name) .def_readwrite("llvm_gpu_name", &nmodl::codegen::CodeGenConfig::llvm_gpu_name) + .def_readwrite("llvm_gpu_target_architecture", + &nmodl::codegen::CodeGenConfig::llvm_gpu_target_architecture) .def_readwrite("llvm_vector_width", &nmodl::codegen::CodeGenConfig::llvm_vector_width) .def_readwrite("llvm_opt_level_codegen", &nmodl::codegen::CodeGenConfig::llvm_opt_level_codegen) @@ -278,7 +298,9 @@ PYBIND11_MODULE(_nmodl, m_nmodl) { "node"_a, "modname"_a, "num_experiments"_a, - "instance_size"_a); + "instance_size"_a, + "cuda_grid_dim_x"_a = 1, + "cuda_block_dim_x"_a = 1); m_nmodl.def("to_nmodl", static_cast 1. 
See https://github.com/BlueBrain/nmodl/issues/857 + if(${modfile} STREQUAL "${NMODL_PROJECT_SOURCE_DIR}/test/benchmark/kernels/expsyn.mod") + set(extra_args "--vec 1") + endif() get_filename_component(modfile_name "${modfile}" NAME) add_test(NAME "PyJIT/${modfile_name}" COMMAND ${PYTHON_EXECUTABLE} ${NMODL_PROJECT_SOURCE_DIR}/test/benchmark/benchmark.py - ${modfile}) + --file ${modfile} ${extra_args}) set_tests_properties( "PyJIT/${modfile_name}" PROPERTIES ENVIRONMENT PYTHONPATH=${PROJECT_BINARY_DIR}/lib:$ENV{PYTHONPATH}) + # Disable running the expsyn.mod on GPU because atomic instructions are not supported yet on GPU + # See https://github.com/BlueBrain/nmodl/issues/834 + if(NMODL_ENABLE_LLVM_CUDA AND NOT ${modfile} STREQUAL "${NMODL_PROJECT_SOURCE_DIR}/test/benchmark/kernels/expsyn.mod") + add_test(NAME "PyJIT/${modfile_name}_gpu" + COMMAND ${PYTHON_EXECUTABLE} ${NMODL_PROJECT_SOURCE_DIR}/test/benchmark/benchmark.py + --file ${modfile} --gpu ${extra_args}) + message(STATUS "CUDA_HOME is ${CUDAToolkit_TARGET_DIR}") + set_tests_properties( + "PyJIT/${modfile_name}_gpu" + PROPERTIES + ENVIRONMENT + "PYTHONPATH=${PROJECT_BINARY_DIR}/lib:$ENV{PYTHONPATH};CUDA_HOME=${CUDAToolkit_TARGET_DIR}" + ) + endif() endforeach() endif() diff --git a/test/benchmark/benchmark.py b/test/benchmark/benchmark.py index c133f8d59c..9144fa549d 100644 --- a/test/benchmark/benchmark.py +++ b/test/benchmark/benchmark.py @@ -1,17 +1,39 @@ +import argparse import sys +import os import nmodl.dsl as nmodl from nmodl import ast, visitor +def parse_arguments(): + parser = argparse.ArgumentParser(description='Benchmark test script for NMODL.') + parser.add_argument('--gpu', action='store_true', default=False, + help='Enable GPU JIT execution') + parser.add_argument('--vec', type=int, default=1, + help='Vector width for CPU execution') + parser.add_argument('--file', type=str, + help='NMODL file to benchmark') + args, _ = parser.parse_known_args() + return args + def main(): + args = parse_arguments() + driver = nmodl.NmodlDriver() lookup_visitor = visitor.AstLookupVisitor() cfg = nmodl.CodeGenConfig() - cfg.llvm_vector_width = 4 + cfg.llvm_vector_width = args.vec cfg.llvm_opt_level_ir = 2 cfg.nmodl_ast = True - fname = sys.argv[1] + fname = args.file + if args.gpu: # GPU enabled + cfg.llvm_math_library = "libdevice" + cfg.llvm_gpu_name = "nvptx64" + cfg.llvm_gpu_target_architecture = "sm_70" + if not os.environ.get("CUDA_HOME"): + raise RuntimeError("CUDA_HOME environment variable not set") + cfg.shared_lib_paths = [os.getenv("CUDA_HOME") + "/nvvm/libdevice/libdevice.10.bc"] with open(fname) as f: hh = f.read() modast = driver.parse_string(hh) diff --git a/test/benchmark/benchmark_info.hpp b/test/benchmark/benchmark_info.hpp new file mode 100644 index 0000000000..d02d33ce2e --- /dev/null +++ b/test/benchmark/benchmark_info.hpp @@ -0,0 +1,29 @@ +/************************************************************************* + * Copyright (C) 2018-2022 Blue Brain Project + * + * This file is part of NMODL distributed under the terms of the GNU + * Lesser General Public License. See top-level LICENSE file for details. + *************************************************************************/ + +#pragma once + +#include +#include + +/// A struct to hold the information for benchmarking. +struct BenchmarkInfo { + /// Object or PTX filename to dump. + std::string filename; + + /// Object file output directory. + std::string output_dir; + + /// Shared libraries' paths to link against. 
+    std::vector<std::string> shared_lib_paths;
+
+    /// Optimisation level for IR.
+    int opt_level_ir;
+
+    /// Optimisation level for machine code generation.
+    int opt_level_codegen;
+};
diff --git a/test/benchmark/cuda_driver.cpp b/test/benchmark/cuda_driver.cpp
new file mode 100644
index 0000000000..cecc97b35d
--- /dev/null
+++ b/test/benchmark/cuda_driver.cpp
@@ -0,0 +1,201 @@
+/*************************************************************************
+ * Copyright (C) 2018-2022 Blue Brain Project
+ *
+ * This file is part of NMODL distributed under the terms of the GNU
+ * Lesser General Public License. See top-level LICENSE file for details.
+ *************************************************************************/
+
+#include <fstream>
+#include <regex>
+
+#include "codegen/llvm/codegen_llvm_visitor.hpp"
+#include "codegen/llvm/llvm_utils.hpp"
+#include "cuda_driver.hpp"
+#include "fmt/format.h"
+#include "utils/common_utils.hpp"
+
+#include "llvm/Bitcode/BitcodeReader.h"
+#include "llvm/Linker/Linker.h"
+#include "llvm/Support/ErrorOr.h"
+#include "llvm/Support/MemoryBuffer.h"
+#include "llvm/Target/TargetMachine.h"
+
+using fmt::literals::operator""_format;
+
+namespace nmodl {
+namespace runner {
+
+void CUDADriver::checkCudaErrors(CUresult err) {
+    if (err != CUDA_SUCCESS) {
+        const char* ret = NULL;
+        cuGetErrorName(err, &ret);
+        throw std::runtime_error("CUDA error: " + std::string(ret));
+    }
+}
+
+void CUDADriver::link_libraries(llvm::Module& module, BenchmarkInfo* benchmark_info) {
+    llvm::Linker linker(module);
+    for (const auto& lib_path: benchmark_info->shared_lib_paths) {
+        const auto lib_name = lib_path.substr(lib_path.find_last_of("/\\") + 1);
+        std::regex libdevice_bitcode_name{"libdevice.*.bc"};
+        if (!std::regex_match(lib_name, libdevice_bitcode_name)) {
+            throw std::runtime_error("Only libdevice is supported for now");
+        }
+        // Load libdevice module to the LLVM Module
+        auto libdevice_file_memory_buffer = llvm::MemoryBuffer::getFile(lib_path);
+        llvm::Expected<std::unique_ptr<llvm::Module>> libdevice_expected_module =
+            parseBitcodeFile(libdevice_file_memory_buffer->get()->getMemBufferRef(),
+                             module.getContext());
+        if (std::error_code error = errorToErrorCode(libdevice_expected_module.takeError())) {
+            throw std::runtime_error("Error reading bitcode: {}"_format(error.message()));
+        }
+        linker.linkInModule(std::move(libdevice_expected_module.get()),
+                            llvm::Linker::LinkOnlyNeeded);
+    }
+}
+
+void print_string_to_file(const std::string& ptx_compiled_module, const std::string& filename) {
+    std::ofstream ptx_file(filename);
+    ptx_file << ptx_compiled_module;
+    ptx_file.close();
+}
+
+// Converts the CUDA compute version to the CUjit_target enum used by the CUDA JIT
+CUjit_target get_CUjit_target(const int compute_version_major, const int compute_version_minor) {
+    auto compute_architecture = compute_version_major * 10 + compute_version_minor;
+    switch (compute_architecture) {
+        case 20:
+            return CU_TARGET_COMPUTE_20;
+        case 21:
+            return CU_TARGET_COMPUTE_21;
+        case 30:
+            return CU_TARGET_COMPUTE_30;
+        case 32:
+            return CU_TARGET_COMPUTE_32;
+        case 35:
+            return CU_TARGET_COMPUTE_35;
+        case 37:
+            return CU_TARGET_COMPUTE_37;
+        case 50:
+            return CU_TARGET_COMPUTE_50;
+        case 52:
+            return CU_TARGET_COMPUTE_52;
+        case 53:
+            return CU_TARGET_COMPUTE_53;
+        case 60:
+            return CU_TARGET_COMPUTE_60;
+        case 61:
+            return CU_TARGET_COMPUTE_61;
+        case 62:
+            return CU_TARGET_COMPUTE_62;
+        case 70:
+            return CU_TARGET_COMPUTE_70;
+        case 72:
+            return CU_TARGET_COMPUTE_72;
+        case 75:
+            return CU_TARGET_COMPUTE_75;
+        case 80:
+            return CU_TARGET_COMPUTE_80;
+        case 86:
+            return CU_TARGET_COMPUTE_86;
+        default:
+            throw std::runtime_error("Unsupported compute architecture");
+    }
+}
+
+void CUDADriver::init(const codegen::Platform& platform, BenchmarkInfo* benchmark_info) {
+    // CUDA initialization
+    checkCudaErrors(cuInit(0));
+    checkCudaErrors(cuDeviceGetCount(&device_info.count));
+    checkCudaErrors(cuDeviceGet(&device, 0));
+
+    char name[128];
+    checkCudaErrors(cuDeviceGetName(name, 128, device));
+    device_info.name = name;
+    logger->info("Using CUDA Device [0]: {}"_format(device_info.name));
+
+    // Get the compute capability of the device that is actually going to be used to run the kernel
+    checkCudaErrors(cuDeviceGetAttribute(&device_info.compute_version_major,
+                                         CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR,
+                                         device));
+    checkCudaErrors(cuDeviceGetAttribute(&device_info.compute_version_minor,
+                                         CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR,
+                                         device));
+    logger->info("Device Compute Capability: {}.{}"_format(device_info.compute_version_major,
+                                                           device_info.compute_version_minor));
+    if (device_info.compute_version_major < 2) {
+        throw std::runtime_error("ERROR: Device 0 is not SM 2.0 or greater");
+    }
+
+    // Load the external libraries modules to the NVVM program
+    // Currently only libdevice is supported
+    link_libraries(*module, benchmark_info);
+
+    // Compile the program
+    logger->info("Compiling the LLVM IR to PTX");
+
+    // Optimize code for nvptx including the wrapper functions and generate PTX
+    const auto opt_level_codegen = benchmark_info ? benchmark_info->opt_level_codegen : 0;
+    utils::optimise_module_for_nvptx(platform, *module, opt_level_codegen, ptx_compiled_module);
+    if (benchmark_info) {
+        // benchmark_info may be nullptr, so the dumps below are guarded by this check
+        utils::save_ir_to_ll_file(*module,
+                                  benchmark_info->output_dir + "/" + benchmark_info->filename +
+                                      "_benchmark");
+        print_string_to_file(ptx_compiled_module,
+                             benchmark_info->output_dir + "/" + benchmark_info->filename + ".ptx");
+    }
+
+    // Create driver context
+    checkCudaErrors(cuCtxCreate(&context, 0, device));
+
+    // Create the CUDA module from the compiled PTX
+    logger->info("Loading PTX to CUDA module");
+    const unsigned int jitNumOptions = 5;
+    CUjit_option* jitOptions = new CUjit_option[jitNumOptions];
+    void** jitOptVals = new void*[jitNumOptions];
+
+    // Set up the size of the compilation log buffer
+    jitOptions[0] = CU_JIT_INFO_LOG_BUFFER_SIZE_BYTES;
+    size_t jitLogBufferSize = 1024 * 1024;
+    jitOptVals[0] = (void*) jitLogBufferSize;
+
+    // Set up the pointer to the compilation log buffer
+    jitOptions[1] = CU_JIT_INFO_LOG_BUFFER;
+    char* jitLogBuffer = new char[jitLogBufferSize];
+    jitOptVals[1] = jitLogBuffer;
+
+    // Set up the size of the compilation error log buffer
+    jitOptions[2] = CU_JIT_ERROR_LOG_BUFFER_SIZE_BYTES;
+    size_t jitErrorLogBufferSize = 1024 * 1024;
+    jitOptVals[2] = (void*) jitErrorLogBufferSize;
+
+    // Set up the pointer to the compilation error log buffer
+    jitOptions[3] = CU_JIT_ERROR_LOG_BUFFER;
+    char* jitErrorLogBuffer = new char[jitErrorLogBufferSize];
+    jitOptVals[3] = jitErrorLogBuffer;
+
+    // Set the exact CUDA compute target architecture based on the GPU that is actually going
+    // to be used
+    jitOptions[4] = CU_JIT_TARGET;
+    auto target_architecture = get_CUjit_target(device_info.compute_version_major,
+                                                device_info.compute_version_minor);
+    jitOptVals[4] = (void*) target_architecture;
+
+    // Load the PTX into the CUDA module (CUDA JIT compilation)
+    auto cuda_jit_ret = cuModuleLoadDataEx(
+        &cudaModule, ptx_compiled_module.c_str(), jitNumOptions, jitOptions, jitOptVals);
+    if (!std::string(jitLogBuffer).empty()) {
+        logger->info("CUDA JIT INFO LOG: {}"_format(std::string(jitLogBuffer)));
+    }
+    if (!std::string(jitErrorLogBuffer).empty()) {
+        logger->info("CUDA JIT ERROR LOG: {}"_format(std::string(jitErrorLogBuffer)));
+    }
+    delete[] jitOptions;
+    delete[] jitOptVals;
+    delete[] jitLogBuffer;
+    delete[] jitErrorLogBuffer;
+    checkCudaErrors(cuda_jit_ret);
+}
+
+}  // namespace runner
+}  // namespace nmodl
diff --git a/test/benchmark/cuda_driver.hpp b/test/benchmark/cuda_driver.hpp
new file mode 100644
index 0000000000..3fd02fd55e
--- /dev/null
+++ b/test/benchmark/cuda_driver.hpp
@@ -0,0 +1,187 @@
+/*************************************************************************
+ * Copyright (C) 2018-2022 Blue Brain Project
+ *
+ * This file is part of NMODL distributed under the terms of the GNU
+ * Lesser General Public License. See top-level LICENSE file for details.
+ *************************************************************************/
+
+#pragma once
+
+/**
+ * \dir
+ * \brief Implementation of CUDA and NVVM-based execution engine to run functions from MOD files
+ *
+ * \file
+ * \brief \copybrief nmodl::runner::CUDADriver
+ */
+
+#include <memory>
+#include <string>
+
+#include "llvm/IR/Module.h"
+
+#include "benchmark_info.hpp"
+#include "cuda.h"
+#include "cuda_runtime.h"
+#include "gpu_parameters.hpp"
+
+using nmodl::cuda_details::GPUExecutionParameters;
+
+namespace nmodl {
+namespace runner {
+
+struct DeviceInfo {
+    int count;
+    std::string name;
+    int compute_version_major;
+    int compute_version_minor;
+};
+
+/**
+ * @brief Throw a meaningful error in case a CUDA API call fails
+ *
+ * Checks whether a call to the CUDA API was successful and, if not, throws a runtime_error with
+ * the error message from CUDA.
+ *
+ * @param err Return value of the CUDA API call
+ */
+void checkCudaErrors(CUresult err);
+
+/**
+ * \class CUDADriver
+ * \brief Driver to execute a MOD file function via the CUDA JIT backend.
+ */
+class CUDADriver {
+    /// LLVM IR module to execute.
+    std::unique_ptr<llvm::Module> module;
+    CUdevice device;
+    CUmodule cudaModule;
+    CUcontext context;
+    CUfunction function;
+    CUlinkState linker;
+    DeviceInfo device_info;
+    std::string ptx_compiled_module;
+
+    void checkCudaErrors(CUresult err);
+    void link_libraries(llvm::Module& module, BenchmarkInfo* benchmark_info);
+
+  public:
+    explicit CUDADriver(std::unique_ptr<llvm::Module> m)
+        : module(std::move(m)) {}
+
+    /// Initializes the CUDA GPU JIT driver.
+    void init(const codegen::Platform& platform, BenchmarkInfo* benchmark_info = nullptr);
+
+    void launch_cuda_kernel(const std::string& entry_point,
+                            const GPUExecutionParameters& gpu_execution_parameters,
+                            void* kernel_parameters) {
+        // Get kernel function
+        checkCudaErrors(cuModuleGetFunction(&function, cudaModule, entry_point.c_str()));
+
+        // Kernel launch
+        checkCudaErrors(cuLaunchKernel(function,
+                                       gpu_execution_parameters.gridDimX,
+                                       1,
+                                       1,
+                                       gpu_execution_parameters.blockDimX,
+                                       1,
+                                       1,
+                                       0,
+                                       nullptr,
+                                       &kernel_parameters,
+                                       nullptr));
+        auto asyncErr = cudaDeviceSynchronize();
+        if (asyncErr != cudaSuccess) {
+            throw std::runtime_error(
+                fmt::format("CUDA Execution Error: {}\n", cudaGetErrorString(asyncErr)));
+        }
+    }
+
+    /// Looks up the entry-point without arguments in the CUDA module and executes it.
+    void execute_without_arguments(const std::string& entry_point,
+                                   const GPUExecutionParameters& gpu_execution_parameters) {
+        launch_cuda_kernel(entry_point, gpu_execution_parameters, {});
+    }
+
+    /// Looks up the entry-point with arguments in the CUDA module and executes it.
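An aside on the get_CUjit_target() switch in cuda_driver.cpp above: the CUjit_target enumerators in cuda.h are defined with values numerically equal to major*10 + minor (CU_TARGET_COMPUTE_70 == 70, and so on), so under that assumption the whole mapping collapses to a cast. The explicit switch kept by the patch has the advantage of rejecting architectures the JIT path has not been tested with:

    // Hedged alternative, assuming CUjit_target enumerators track
    // major*10 + minor as in current cuda.h; performs no validation.
    CUjit_target get_CUjit_target(const int major, const int minor) {
        return static_cast<CUjit_target>(major * 10 + minor);
    }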
+ template + void execute_with_arguments(const std::string& entry_point, + ArgType arg, + const GPUExecutionParameters& gpu_execution_parameters) { + launch_cuda_kernel(entry_point, gpu_execution_parameters, {&arg}); + } +}; + +/** + * \class BaseGPURunner + * \brief A base runner class that provides functionality to execute an + * entry point in the CUDA module. + */ +class BaseGPURunner { + protected: + std::unique_ptr driver; + + explicit BaseGPURunner(std::unique_ptr m) + : driver(std::make_unique(std::move(m))) {} + + public: + /// Sets up the CUDA driver. + virtual void initialize_driver(const codegen::Platform& platform) = 0; + + /// Runs the entry-point function without arguments. + void run_without_arguments(const std::string& entry_point, + const GPUExecutionParameters& gpu_execution_parameters) { + return driver->execute_without_arguments(entry_point, gpu_execution_parameters); + } + + /// Runs the entry-point function with a pointer to the data as an argument. + template + void run_with_argument(const std::string& entry_point, + ArgType arg, + const GPUExecutionParameters& gpu_execution_parameters) { + return driver->template execute_with_arguments(entry_point, arg, gpu_execution_parameters); + } +}; + +/** + * \class TestGPURunner + * \brief A simple runner for testing purposes. + */ +class TestGPURunner: public BaseGPURunner { + public: + explicit TestGPURunner(std::unique_ptr m) + : BaseGPURunner(std::move(m)) {} + + virtual void initialize_driver(const codegen::Platform& platform) { + driver->init(platform); + } +}; + +/** + * \class BenchmarkGPURunner + * \brief A runner with benchmarking functionality. It takes user-specified GPU + * features into account, as well as it can link against shared libraries. + */ +class BenchmarkGPURunner: public BaseGPURunner { + private: + /// Benchmarking information passed to JIT driver. + BenchmarkInfo benchmark_info; + + public: + BenchmarkGPURunner(std::unique_ptr m, + std::string filename, + std::string output_dir, + std::vector lib_paths = {}, + int opt_level_ir = 0, + int opt_level_codegen = 0) + : BaseGPURunner(std::move(m)) + , benchmark_info{filename, output_dir, lib_paths, opt_level_ir, opt_level_codegen} {} + + virtual void initialize_driver(const codegen::Platform& platform) { + driver->init(platform, &benchmark_info); + } +}; + + +} // namespace runner +} // namespace nmodl diff --git a/test/benchmark/gpu_parameters.hpp b/test/benchmark/gpu_parameters.hpp new file mode 100644 index 0000000000..5e72edb147 --- /dev/null +++ b/test/benchmark/gpu_parameters.hpp @@ -0,0 +1,27 @@ +/************************************************************************* + * Copyright (C) 2018-2022 Blue Brain Project + * + * This file is part of NMODL distributed under the terms of the GNU + * Lesser General Public License. See top-level LICENSE file for details. 
+ *************************************************************************/ + +#pragma once + +/** + * \dir + * \brief GPU execution parameters struct + * + * \file + * \brief \copybrief nmodl::cuda_details::GPUExecutionParameters + */ + +namespace nmodl { +namespace cuda_details { + +struct GPUExecutionParameters { + int gridDimX; + int blockDimX; +}; + +} // namespace cuda_details +} // namespace nmodl diff --git a/test/benchmark/jit_driver.hpp b/test/benchmark/jit_driver.hpp index ed86684f76..3569c4bd4f 100644 --- a/test/benchmark/jit_driver.hpp +++ b/test/benchmark/jit_driver.hpp @@ -15,6 +15,8 @@ * \brief \copybrief nmodl::runner::JITDriver */ +#include "benchmark_info.hpp" + #include "llvm/ExecutionEngine/JITEventListener.h" #include "llvm/ExecutionEngine/Orc/LLJIT.h" #include "llvm/Support/Host.h" @@ -22,24 +24,6 @@ namespace nmodl { namespace runner { -/// A struct to hold the information for benchmarking. -struct BenchmarkInfo { - /// Object filename to dump. - std::string filename; - - /// Object file output directory. - std::string output_dir; - - /// Shared libraries' paths to link against. - std::vector shared_lib_paths; - - /// Optimisation level for IT. - int opt_level_ir; - - /// Optimisation level for machine code generation. - int opt_level_codegen; -}; - /** * \class JITDriver * \brief Driver to execute a MOD file function via LLVM IR backend. diff --git a/test/benchmark/kernels/expsyn.mod b/test/benchmark/kernels/expsyn.mod new file mode 100644 index 0000000000..56ddde3b19 --- /dev/null +++ b/test/benchmark/kernels/expsyn.mod @@ -0,0 +1,42 @@ +NEURON { + POINT_PROCESS ExpSyn + RANGE tau, e, i + NONSPECIFIC_CURRENT i +} + +UNITS { + (nA) = (nanoamp) + (mV) = (millivolt) + (uS) = (microsiemens) +} + +PARAMETER { + tau = 0.1 (ms) <1e-9,1e9> + e = 0 (mV) +} + +ASSIGNED { + v (mV) + i (nA) +} + +STATE { + g (uS) +} + +INITIAL { + g=0 +} + +BREAKPOINT { + SOLVE state METHOD cnexp + i = g*(v - e) +} + +DERIVATIVE state { + g' = -g/tau +} + +NET_RECEIVE(weight (uS)) { + g = g + weight +} diff --git a/test/benchmark/kernels/hh.mod b/test/benchmark/kernels/hh.mod new file mode 100644 index 0000000000..053a15f43f --- /dev/null +++ b/test/benchmark/kernels/hh.mod @@ -0,0 +1,125 @@ +TITLE hh.mod squid sodium, potassium, and leak channels + +COMMENT + This is the original Hodgkin-Huxley treatment for the set of sodium, + potassium, and leakage channels found in the squid giant axon membrane. + ("A quantitative description of membrane current and its application + conduction and excitation in nerve" J.Physiol. (Lond.) 117:500-544 (1952).) + Membrane voltage is in absolute mV and has been reversed in polarity + from the original HH convention and shifted to reflect a resting potential + of -65 mV. + Remember to set celsius=6.3 (or whatever) in your HOC file. + See squid.hoc for an example of a simulation using this model. + SW Jaslove 6 March, 1992 +ENDCOMMENT + +UNITS { + (mA) = (milliamp) + (mV) = (millivolt) + (S) = (siemens) +} + +? 
interface +NEURON { + SUFFIX hh + USEION na READ ena WRITE ina + USEION k READ ek WRITE ik + NONSPECIFIC_CURRENT il + RANGE gnabar, gkbar, gl, el, gna, gk + :GLOBAL minf, hinf, ninf, mtau, htau, ntau + RANGE minf, hinf, ninf, mtau, htau, ntau + THREADSAFE : assigned GLOBALs will be per thread +} + +PARAMETER { + gnabar = .12 (S/cm2) <0,1e9> + gkbar = .036 (S/cm2) <0,1e9> + gl = .0003 (S/cm2) <0,1e9> + el = -54.3 (mV) +} + +STATE { + m h n +} + +ASSIGNED { + v (mV) + celsius (degC) + ena (mV) + ek (mV) + + gna (S/cm2) + gk (S/cm2) + ina (mA/cm2) + ik (mA/cm2) + il (mA/cm2) + minf hinf ninf + mtau (ms) htau (ms) ntau (ms) +} + +? currents +BREAKPOINT { + SOLVE states METHOD cnexp + gna = gnabar*m*m*m*h + ina = gna*(v - ena) + gk = gkbar*n*n*n*n + ik = gk*(v - ek) + il = gl*(v - el) +} + + +INITIAL { + rates(v) + m = minf + h = hinf + n = ninf +} + +? states +DERIVATIVE states { + rates(v) + m' = (minf-m)/mtau + h' = (hinf-h)/htau + n' = (ninf-n)/ntau +} + +:LOCAL q10 + + +? rates +PROCEDURE rates(v(mV)) { :Computes rate and other constants at current v. + :Call once from HOC to initialize inf at resting v. + LOCAL alpha, beta, sum, q10 +: TABLE minf, mtau, hinf, htau, ninf, ntau DEPEND celsius FROM -100 TO 100 WITH 200 + +UNITSOFF + q10 = 3^((celsius - 6.3)/10) + :"m" sodium activation system + alpha = .1 * vtrap(-(v+40),10) + beta = 4 * exp(-(v+65)/18) + sum = alpha + beta + mtau = 1/(q10*sum) + minf = alpha/sum + :"h" sodium inactivation system + alpha = .07 * exp(-(v+65)/20) + beta = 1 / (exp(-(v+35)/10) + 1) + sum = alpha + beta + htau = 1/(q10*sum) + hinf = alpha/sum + :"n" potassium activation system + alpha = .01*vtrap(-(v+55),10) + beta = .125*exp(-(v+65)/80) + sum = alpha + beta + ntau = 1/(q10*sum) + ninf = alpha/sum +} + +FUNCTION vtrap(x,y) { :Traps for 0 in denominator of rate eqns. + if (fabs(x/y) < 1e-6) { + vtrap = y*(1 - x/y/2) + }else{ + vtrap = x/(exp(x/y) - 1) + } +} + +UNITSON diff --git a/test/benchmark/llvm_benchmark.cpp b/test/benchmark/llvm_benchmark.cpp index 87d7e34512..010bc2edf3 100644 --- a/test/benchmark/llvm_benchmark.cpp +++ b/test/benchmark/llvm_benchmark.cpp @@ -16,6 +16,9 @@ #include "test/unit/codegen/codegen_data_helper.hpp" +#ifdef NMODL_LLVM_CUDA_BACKEND +#include "test/benchmark/cuda_driver.hpp" +#endif namespace nmodl { namespace benchmark { @@ -45,17 +48,43 @@ BenchmarkResults LLVMBenchmark::run_benchmark() { llvm_visitor.find_kernel_names(kernel_names); // Get feature's string and turn them off depending on the cpu. - std::string cpu_name = cpu == "default" ? llvm::sys::getHostCPUName().str() : cpu; - logger->info("CPU: {}", cpu_name); + std::string backend_name; +#ifdef NMODL_LLVM_CUDA_BACKEND + if (platform.is_CUDA_gpu()) { + backend_name = platform.get_name(); + } else { +#endif + backend_name = platform.get_name() == "default" ? llvm::sys::getHostCPUName().str() + : platform.get_name(); +#ifdef NMODL_LLVM_CUDA_BACKEND + } +#endif + logger->info("Backend: {}", backend_name); std::unique_ptr m = llvm_visitor.get_module(); // Create the benchmark runner and initialize it. 
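A note on the vtrap() function at the end of hh.mod above: the small-|x/y| branch is the first-order series expansion of the exact expression, which removes the removable singularity at x = 0 rather than trapping a true division by zero. Writing u = x/y,

    \frac{x}{e^{x/y} - 1} = y \, \frac{u}{e^{u} - 1}
                          = y \left(1 - \frac{u}{2} + \frac{u^{2}}{12} - \cdots\right)
                          \approx y \left(1 - \frac{x}{2y}\right),

which is exactly the value returned when fabs(x/y) < 1e-6.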
- std::string filename = "v" + std::to_string(llvm_visitor.get_vector_width()) + "_" + - mod_filename; - runner::BenchmarkRunner runner( - std::move(m), filename, output_dir, cpu_name, shared_libs, opt_level_ir, opt_level_codegen); - runner.initialize_driver(); +#ifdef NMODL_LLVM_CUDA_BACKEND + if (platform.is_CUDA_gpu()) { + std::string filename = "cuda_" + mod_filename; + cuda_runner = std::make_unique( + std::move(m), filename, output_dir, shared_libs, opt_level_ir, opt_level_codegen); + cuda_runner->initialize_driver(platform); + } else { +#endif + std::string filename = "v" + std::to_string(llvm_visitor.get_vector_width()) + "_" + + mod_filename; + cpu_runner = std::make_unique(std::move(m), + filename, + output_dir, + backend_name, + shared_libs, + opt_level_ir, + opt_level_codegen); + cpu_runner->initialize_driver(); +#ifdef NMODL_LLVM_CUDA_BACKEND + } +#endif BenchmarkResults results{}; // Benchmark every kernel. @@ -75,7 +104,17 @@ BenchmarkResults LLVMBenchmark::run_benchmark() { // Record the execution time of the kernel. std::string wrapper_name = "__" + kernel_name + "_wrapper"; auto start = std::chrono::steady_clock::now(); - runner.run_with_argument(kernel_name, instance_data.base_ptr); +#ifdef NMODL_LLVM_CUDA_BACKEND + if (platform.is_CUDA_gpu()) { + cuda_runner->run_with_argument(wrapper_name, + instance_data.base_ptr, + gpu_execution_parameters); + } else { +#endif + cpu_runner->run_with_argument(wrapper_name, instance_data.base_ptr); +#ifdef NMODL_LLVM_CUDA_BACKEND + } +#endif auto end = std::chrono::steady_clock::now(); std::chrono::duration diff = end - start; diff --git a/test/benchmark/llvm_benchmark.hpp b/test/benchmark/llvm_benchmark.hpp index f79cad62e5..f03e9ea52d 100644 --- a/test/benchmark/llvm_benchmark.hpp +++ b/test/benchmark/llvm_benchmark.hpp @@ -13,6 +13,16 @@ #include #include "codegen/llvm/codegen_llvm_visitor.hpp" +#include "gpu_parameters.hpp" +#include "test/benchmark/jit_driver.hpp" +#include "utils/logger.hpp" + +#ifdef NMODL_LLVM_CUDA_BACKEND +#include "test/benchmark/cuda_driver.hpp" +#endif + +using nmodl::codegen::Platform; +using nmodl::cuda_details::GPUExecutionParameters; namespace nmodl { namespace benchmark { @@ -47,8 +57,11 @@ class LLVMBenchmark { /// The size of the instance struct for benchmarking. int instance_size; - /// CPU to target. - std::string cpu; + /// Target platform for the code generation. + Platform platform; + + /// The GPU execution parameters needed to configure the kernels' execution. + GPUExecutionParameters gpu_execution_parameters; /// Optimisation level for IR generation. int opt_level_ir; @@ -59,6 +72,14 @@ class LLVMBenchmark { /// Filestream for dumping logs to the file. 
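One note on the gpu_execution_parameters member above: GPUExecutionParameters (introduced in gpu_parameters.hpp earlier in this patch) is a plain aggregate of two ints without default member initializers, so the nine-argument LLVMBenchmark constructor that follows, which never sets it, leaves both dimensions indeterminate. That is harmless only while the CPU-only constructor is never paired with a GPU platform; default member initializers would make the invariant explicit. A sketch of that alternative (a suggestion, not part of the patch):

    // Suggested hardening only; the value 1 matches the defaults of the
    // --grid-dim-x/--block-dim-x CLI options in src/main.cpp.
    struct GPUExecutionParameters {
        int gridDimX = 1;
        int blockDimX = 1;
    };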
std::ofstream ofs; + /// CPU benchmark runner + std::unique_ptr cpu_runner; + +#ifdef NMODL_LLVM_CUDA_BACKEND + /// CUDA benchmark runner + std::unique_ptr cuda_runner; +#endif + public: LLVMBenchmark(codegen::CodegenLLVMVisitor& llvm_visitor, const std::string& mod_filename, @@ -66,7 +87,7 @@ class LLVMBenchmark { std::vector shared_libs, int num_experiments, int instance_size, - const std::string& cpu, + const Platform& platform, int opt_level_ir, int opt_level_codegen) : llvm_visitor(llvm_visitor) @@ -75,9 +96,29 @@ class LLVMBenchmark { , shared_libs(shared_libs) , num_experiments(num_experiments) , instance_size(instance_size) - , cpu(cpu) + , platform(platform) , opt_level_ir(opt_level_ir) , opt_level_codegen(opt_level_codegen) {} + LLVMBenchmark(codegen::CodegenLLVMVisitor& llvm_visitor, + const std::string& mod_filename, + const std::string& output_dir, + std::vector shared_libs, + int num_experiments, + int instance_size, + const Platform& platform, + int opt_level_ir, + int opt_level_codegen, + const GPUExecutionParameters& gpu_exec_params) + : llvm_visitor(llvm_visitor) + , mod_filename(mod_filename) + , output_dir(output_dir) + , shared_libs(shared_libs) + , num_experiments(num_experiments) + , instance_size(instance_size) + , platform(platform) + , opt_level_ir(opt_level_ir) + , opt_level_codegen(opt_level_codegen) + , gpu_execution_parameters(gpu_exec_params) {} /// Runs the benchmark. BenchmarkResults run(); diff --git a/test/integration/mod/test_math.mod b/test/integration/mod/test_math.mod new file mode 100644 index 0000000000..6e3174a846 --- /dev/null +++ b/test/integration/mod/test_math.mod @@ -0,0 +1,16 @@ +NEURON { + SUFFIX test + RANGE x, y +} + +ASSIGNED { x y } + +STATE { m } + +BREAKPOINT { + SOLVE states METHOD cnexp +} + +DERIVATIVE states { + m = exp(y) + x ^ 107 + log(x) +} diff --git a/test/unit/CMakeLists.txt b/test/unit/CMakeLists.txt index 107d856d74..818167859c 100644 --- a/test/unit/CMakeLists.txt +++ b/test/unit/CMakeLists.txt @@ -117,7 +117,6 @@ if(NMODL_ENABLE_LLVM) add_executable(test_llvm_runner visitor/main.cpp codegen/codegen_data_helper.cpp codegen/codegen_llvm_execution.cpp) if(NMODL_ENABLE_LLVM_CUDA) - include_directories(${CUDAToolkit_INCLUDE_DIRS}) target_link_libraries(benchmark_data PRIVATE CUDA::cudart) target_link_libraries(testllvm CUDA::cudart) target_link_libraries(test_llvm_runner CUDA::cudart) diff --git a/test/unit/codegen/codegen_llvm_instance_struct.cpp b/test/unit/codegen/codegen_llvm_instance_struct.cpp index 401e0a6c63..9c22fdda78 100644 --- a/test/unit/codegen/codegen_llvm_instance_struct.cpp +++ b/test/unit/codegen/codegen_llvm_instance_struct.cpp @@ -120,11 +120,15 @@ SCENARIO("Instance Struct creation", "[visitor][llvm][instance_struct]") { size_t ion_ena_index_index = 8; size_t voltage_index = 9; size_t node_index_index = 10; - size_t t_index = 11; - size_t dt_index = 12; - size_t celsius_index = 13; - size_t secondorder_index = 14; - size_t node_count_index = 15; + size_t rhs_index = 11; + size_t d_index = 12; + size_t rhs_shadow_index = 13; + size_t d_shadow_index = 14; + size_t t_index = 15; + size_t dt_index = 16; + size_t celsius_index = 17; + size_t secondorder_index = 18; + size_t node_count_index = 19; // Check if the various instance struct fields are properly initialized REQUIRE(compare(instance_data.members[minf_index], generate_dummy_data(minf_index, num_elements))); @@ -155,6 +159,10 @@ SCENARIO("Instance Struct creation", "[visitor][llvm][instance_struct]") { int* ion_ena_index; double* voltage; int* 
node_index; + double* vec_rhs; + double* vec_d; + double* _shadow_rhs; + double* _shadow_d; double t; double dt; double celsius; diff --git a/test/unit/codegen/codegen_llvm_ir.cpp b/test/unit/codegen/codegen_llvm_ir.cpp index b19ff95066..ebef71688e 100644 --- a/test/unit/codegen/codegen_llvm_ir.cpp +++ b/test/unit/codegen/codegen_llvm_ir.cpp @@ -980,8 +980,8 @@ SCENARIO("Scalar state kernel", "[visitor][llvm]") { // Check the struct type with correct attributes and the kernel declaration. std::regex struct_type( "%.*__instance_var__type = type \\{ double\\*, double\\*, double\\*, double\\*, " - "double\\*, double\\*, double\\*, double\\*, double\\*, double\\*, i32\\*, double, " - "double, double, i32, i32, double\\*, double\\*, double\\*, double\\* \\}"); + "double\\*, double\\*, double\\*, double\\*, double\\*, double\\*, i32\\*, " + "double\\*, double\\*, double\\*, double\\*, double, double, double, i32, i32 \\}"); std::regex kernel_declaration( R"(define void @nrn_state_hh\(%.*__instance_var__type\* noalias nocapture readonly .*\) #0)"); REQUIRE(std::regex_search(module_string, m, struct_type)); @@ -1775,7 +1775,7 @@ SCENARIO("GPU kernel body IR generation", "[visitor][llvm][gpu]") { } DERIVATIVE states { - m = exp(y) + x ^ 2 + m = exp(y) + x ^ 2 + log(x) } )"; @@ -1793,12 +1793,18 @@ SCENARIO("GPU kernel body IR generation", "[visitor][llvm][gpu]") { std::regex pow_declaration(R"(declare double @__nv_pow\(double, double\))"); std::regex pow_new_call(R"(call double @__nv_pow\(double %.*, double .*\))"); std::regex pow_old_call(R"(call double @llvm\.pow\.f64\(double %.*, double .*\))"); + std::regex log_declaration(R"(declare double @__nv_log\(double\))"); + std::regex log_new_call(R"(call double @__nv_log\(double %.*\))"); + std::regex log_old_call(R"(call double @llvm\.log\.f64\(double %.*\))"); REQUIRE(std::regex_search(module_string, m, exp_declaration)); REQUIRE(std::regex_search(module_string, m, exp_new_call)); REQUIRE(!std::regex_search(module_string, m, exp_old_call)); REQUIRE(std::regex_search(module_string, m, pow_declaration)); REQUIRE(std::regex_search(module_string, m, pow_new_call)); REQUIRE(!std::regex_search(module_string, m, pow_old_call)); + REQUIRE(std::regex_search(module_string, m, log_declaration)); + REQUIRE(std::regex_search(module_string, m, log_new_call)); + REQUIRE(!std::regex_search(module_string, m, log_old_call)); } } } diff --git a/test/unit/codegen/codegen_llvm_visitor.cpp b/test/unit/codegen/codegen_llvm_visitor.cpp index 1906d0d27c..af9bed5e7c 100644 --- a/test/unit/codegen/codegen_llvm_visitor.cpp +++ b/test/unit/codegen/codegen_llvm_visitor.cpp @@ -171,15 +171,15 @@ SCENARIO("Check instance struct declaration and setup in wrapper", int* __restrict__ ion_dikdv_index; double* __restrict__ voltage; int* __restrict__ node_index; + double* __restrict__ vec_rhs; + double* __restrict__ vec_d; + double* __restrict__ _shadow_rhs; + double* __restrict__ _shadow_d; double t; double dt; double celsius; int secondorder; int node_count; - double* __restrict__ vec_rhs; - double* __restrict__ vec_d; - double* __restrict__ _shadow_rhs; - double* __restrict__ _shadow_d; }; )"; std::string generated_instance_struct_setup = R"( @@ -226,6 +226,10 @@ SCENARIO("Check instance struct declaration and setup in wrapper", inst->ion_dikdv_index = indexes+5*pnodecount; inst->voltage = nt->_actual_v; inst->node_index = ml->nodeindices; + inst->vec_rhs = nt->_actual_rhs; + inst->vec_d = nt->_actual_d; + inst->_shadow_rhs = nt->_shadow_rhs; + inst->_shadow_d = nt->_shadow_d; inst->t 
= nt->t; inst->dt = nt->dt; inst->celsius = celsius; From ebe560b911610e577524097bf1952db63ec3f4c7 Mon Sep 17 00:00:00 2001 From: Ioannis Magkanaris Date: Tue, 10 May 2022 16:01:49 +0200 Subject: [PATCH 315/331] Fixed compute-bound and memory-bound mod files for PyJIT execution --- test/benchmark/kernels/compute-bound.mod | 6 ++++++ test/benchmark/kernels/memory-bound.mod | 6 ++++++ 2 files changed, 12 insertions(+) diff --git a/test/benchmark/kernels/compute-bound.mod b/test/benchmark/kernels/compute-bound.mod index ded2618cf4..a0fd463ea5 100644 --- a/test/benchmark/kernels/compute-bound.mod +++ b/test/benchmark/kernels/compute-bound.mod @@ -12,6 +12,12 @@ ASSIGNED { v (mV) minf mtau (ms) + il (mA/cm2) +} + +PARAMETER { + gl = .0003 (S/cm2) <0,1e9> + el = -54.3 (mV) } BREAKPOINT { diff --git a/test/benchmark/kernels/memory-bound.mod b/test/benchmark/kernels/memory-bound.mod index 1e3df520a9..463ca9cb6c 100644 --- a/test/benchmark/kernels/memory-bound.mod +++ b/test/benchmark/kernels/memory-bound.mod @@ -13,6 +13,12 @@ ASSIGNED { v (mV) minf mtau (ms) + il (mA/cm2) +} + +PARAMETER { + gl = .0003 (S/cm2) <0,1e9> + el = -54.3 (mV) } BREAKPOINT { From e4e187ac3afe8bf767caffeab6e81d58771a1ed6 Mon Sep 17 00:00:00 2001 From: Ioannis Magkanaris Date: Tue, 10 May 2022 16:17:21 +0200 Subject: [PATCH 316/331] Avoid code duplication for running external kernel --- test/benchmark/llvm_benchmark.cpp | 124 +++++++++++------------------- 1 file changed, 44 insertions(+), 80 deletions(-) diff --git a/test/benchmark/llvm_benchmark.cpp b/test/benchmark/llvm_benchmark.cpp index 2b63acf53f..1c207a4c77 100644 --- a/test/benchmark/llvm_benchmark.cpp +++ b/test/benchmark/llvm_benchmark.cpp @@ -153,38 +153,70 @@ BenchmarkResults LLVMBenchmark::run_benchmark() { BenchmarkResults results{}; if (external_kernel) { // benchmark external kernel - logger->info("Benchmarking external kernel"); + logger->info("Benchmarking external kernels"); + kernel_names = {"nrn_state_hh_ext"}; + } + // Benchmark every kernel. + for (const auto& kernel_name: kernel_names) { + // For every kernel run the benchmark `num_experiments` times and collect runtimes. auto times = std::vector(num_experiments, 0.0); for (int i = 0; i < num_experiments; ++i) { // Initialise the data. auto instance_data = codegen_data.create_data(instance_size, /*seed=*/1); - +#ifdef NMODL_LLVM_CUDA_BACKEND + void* dev_ptr; + if (platform.is_CUDA_gpu()) { + dev_ptr = copy_instance_data_gpu(instance_data); + } +#endif // Log instance size once. if (i == 0) { double size_mbs = instance_data.num_bytes / (1024.0 * 1024.0); - logger->info("Benchmarking kernel nrn_state_hh_ext with {} MBs dataset", size_mbs); + logger->info("Benchmarking kernel '{}' with {} MBs dataset", + kernel_name, + size_mbs); } // Record the execution time of the kernel. 
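The external-kernel path above registers "nrn_state_hh_ext" as the only kernel name and, in the timing code that follows, calls that symbol directly instead of going through a JIT runner. The declaration it compiles against lives in test/benchmark/ext_kernel.hpp (touched later in this series but not shown here); based on the call site passing instance_data.base_ptr, it is presumably along these lines, an assumption rather than the actual header:

    // Assumed shape of the external kernel hook; the real declaration is in
    // test/benchmark/ext_kernel.hpp.
    void nrn_state_hh_ext(void* base_ptr);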
+            std::string wrapper_name = "__" + kernel_name + "_wrapper";
             auto start = std::chrono::steady_clock::now();
-            nrn_state_hh_ext(instance_data.base_ptr);
+            if (external_kernel) {
+                nrn_state_hh_ext(instance_data.base_ptr);
+            } else {
+#ifdef NMODL_LLVM_CUDA_BACKEND
+                if (platform.is_CUDA_gpu()) {
+                    cuda_runner->run_with_argument(wrapper_name,
+                                                   dev_ptr,
+                                                   gpu_execution_parameters);
+                } else {
+#endif
+                    cpu_runner->run_with_argument(wrapper_name, instance_data.base_ptr);
+#ifdef NMODL_LLVM_CUDA_BACKEND
+                }
+#endif
+            }
             auto end = std::chrono::steady_clock::now();
             std::chrono::duration<double> diff = end - start;
-
+#ifdef NMODL_LLVM_CUDA_BACKEND
+            if (platform.is_CUDA_gpu()) {
+                copy_instance_data_host(instance_data, dev_ptr);
+            }
+#endif
             // Log the time taken for each run.
             logger->debug("Experiment {} compute time = {:.6f} sec", i, diff.count());
+
             // Update statistics.
             times[i] = diff.count();
         }
         // Calculate statistics
         double time_mean = std::accumulate(times.begin(), times.end(), 0.0) / num_experiments;
         double time_var = std::accumulate(times.begin(),
-                                          times.end(),
-                                          0.0,
-                                          [time_mean](const double& pres, const double& e) {
-                                              return (e - time_mean) * (e - time_mean);
-                                          }) /
-                          num_experiments;
+                                          times.end(),
+                                          0.0,
+                                          [time_mean](const double& pres, const double& e) {
+                                              return pres + (e - time_mean) * (e - time_mean);
+                                          }) /
+                          num_experiments;
         double time_stdev = std::sqrt(time_var);
         double time_min = *std::min_element(times.begin(), times.end());
         double time_max = *std::max_element(times.begin(), times.end());
@@ -193,75 +225,7 @@ BenchmarkResults LLVMBenchmark::run_benchmark() {
         logger->info("Average compute time = {:.6f}", time_mean);
         logger->info("Compute time standard deviation = {:8f}", time_stdev);
         logger->info("Minimum compute time = {:.6f}", time_min);
         logger->info("Maximum compute time = {:.6f}\n", time_max);
-        results["nrn_state_hh_ext"] = {time_mean, time_stdev, time_min, time_max};
-    } else {
-        // Benchmark every kernel.
-        for (const auto& kernel_name: kernel_names) {
-            // For every kernel run the benchmark `num_experiments` times and collect runtimes.
-            auto times = std::vector<double>(num_experiments, 0.0);
-            for (int i = 0; i < num_experiments; ++i) {
-                // Initialise the data.
-                auto instance_data = codegen_data.create_data(instance_size, /*seed=*/1);
-#ifdef NMODL_LLVM_CUDA_BACKEND
-                void* dev_ptr;
-                if (platform.is_CUDA_gpu()) {
-                    dev_ptr = copy_instance_data_gpu(instance_data);
-                }
-#endif
-                // Log instance size once.
-                if (i == 0) {
-                    double size_mbs = instance_data.num_bytes / (1024.0 * 1024.0);
-                    logger->info("Benchmarking kernel '{}' with {} MBs dataset",
-                                 kernel_name,
-                                 size_mbs);
-                }
-
-                // Record the execution time of the kernel.
-                std::string wrapper_name = "__" + kernel_name + "_wrapper";
-                auto start = std::chrono::steady_clock::now();
-#ifdef NMODL_LLVM_CUDA_BACKEND
-                if (platform.is_CUDA_gpu()) {
-                    cuda_runner->run_with_argument(wrapper_name,
-                                                   dev_ptr,
-                                                   gpu_execution_parameters);
-                } else {
-#endif
-                    cpu_runner->run_with_argument(wrapper_name, instance_data.base_ptr);
-#ifdef NMODL_LLVM_CUDA_BACKEND
-                }
-#endif
-                auto end = std::chrono::steady_clock::now();
-                std::chrono::duration<double> diff = end - start;
-#ifdef NMODL_LLVM_CUDA_BACKEND
-                if (platform.is_CUDA_gpu()) {
-                    copy_instance_data_host(instance_data, dev_ptr);
-                }
-#endif
-                // Log the time taken for each run.
-                logger->debug("Experiment {} compute time = {:.6f} sec", i, diff.count());
-
-                // Update statistics.
- times[i] = diff.count(); - } - // Calculate statistics - double time_mean = std::accumulate(times.begin(), times.end(), 0.0) / num_experiments; - double time_var = std::accumulate(times.begin(), - times.end(), - 0.0, - [time_mean](const double& pres, const double& e) { - return (e - time_mean) * (e - time_mean); - }) / - num_experiments; - double time_stdev = std::sqrt(time_var); - double time_min = *std::min_element(times.begin(), times.end()); - double time_max = *std::max_element(times.begin(), times.end()); - // Log the average time taken for the kernel. - logger->info("Average compute time = {:.6f}", time_mean); - logger->info("Compute time standard deviation = {:8f}", time_stdev); - logger->info("Minimum compute time = {:.6f}", time_min); - logger->info("Maximum compute time = {:.6f}\n", time_max); - results[kernel_name] = {time_mean, time_stdev, time_min, time_max}; - } + results[kernel_name] = {time_mean, time_stdev, time_min, time_max}; } return results; } From 3ad529ebccc6fb11b7b732a868389c3e94dfb623 Mon Sep 17 00:00:00 2001 From: Ioannis Magkanaris Date: Tue, 17 May 2022 18:52:18 +0200 Subject: [PATCH 317/331] Fixed kernels for JIT execution of nrn_cur --- test/benchmark/kernels/compute-bound.mod | 6 ++++++ test/benchmark/kernels/memory-bound.mod | 6 ++++++ 2 files changed, 12 insertions(+) diff --git a/test/benchmark/kernels/compute-bound.mod b/test/benchmark/kernels/compute-bound.mod index ded2618cf4..500524563e 100644 --- a/test/benchmark/kernels/compute-bound.mod +++ b/test/benchmark/kernels/compute-bound.mod @@ -4,6 +4,11 @@ NEURON { RANGE minf, mtau, gl, el } +PARAMETER { + gl = .0003 (S/cm2) <0,1e9> + el = -54.3 (mV) +} + STATE { m } @@ -12,6 +17,7 @@ ASSIGNED { v (mV) minf mtau (ms) + il (mA/cm2) } BREAKPOINT { diff --git a/test/benchmark/kernels/memory-bound.mod b/test/benchmark/kernels/memory-bound.mod index 1e3df520a9..c5b9f3fd04 100644 --- a/test/benchmark/kernels/memory-bound.mod +++ b/test/benchmark/kernels/memory-bound.mod @@ -5,6 +5,11 @@ NEURON { USEION na WRITE nai } +PARAMETER { + gl = .0003 (S/cm2) <0,1e9> + el = -54.3 (mV) +} + STATE { m } @@ -13,6 +18,7 @@ ASSIGNED { v (mV) minf mtau (ms) + il (mA/cm2) } BREAKPOINT { From f2f404339b4dadc2a89671c02ebe0b04cbd7d40e Mon Sep 17 00:00:00 2001 From: Ioannis Magkanaris Date: Tue, 17 May 2022 19:06:31 +0200 Subject: [PATCH 318/331] Small fixes after merge --- CMakeLists.txt | 2 +- src/codegen/codegen_c_visitor.hpp | 1 - src/codegen/codegen_info.hpp | 1 + src/main.cpp | 2 +- 4 files changed, 3 insertions(+), 3 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 7fbd0f7d49..7e10ab72c0 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -82,7 +82,7 @@ add_compile_options(${NMODL_COMPILER_WARNING_SUPPRESSIONS}) # ============================================================================= project( NMODL - VERSION "0.8" + VERSION "1.0" LANGUAGES CXX) # ============================================================================= diff --git a/src/codegen/codegen_c_visitor.hpp b/src/codegen/codegen_c_visitor.hpp index ac1fd190ca..1c4349914e 100644 --- a/src/codegen/codegen_c_visitor.hpp +++ b/src/codegen/codegen_c_visitor.hpp @@ -35,7 +35,6 @@ namespace nmodl { /// encapsulates code generation backend implementations namespace codegen { -using namespace fmt::literals; /** * @defgroup codegen Code Generation Implementation * @brief Implementations of code generation backends diff --git a/src/codegen/codegen_info.hpp b/src/codegen/codegen_info.hpp index fa53eaaea9..0fc88fe15f 100644 --- 
a/src/codegen/codegen_info.hpp +++ b/src/codegen/codegen_info.hpp @@ -224,6 +224,7 @@ enum BlockType { BlockTypeEnd }; + /** * \class ShadowUseStatement * \brief Represents ion write statement during code generation diff --git a/src/main.cpp b/src/main.cpp index 6445ba6354..a820868643 100644 --- a/src/main.cpp +++ b/src/main.cpp @@ -273,7 +273,7 @@ int main(int argc, const char* argv[]) { fmt::format("Number of experiments for benchmarking ({})", num_experiments))->ignore_case(); benchmark_opt->add_flag("--external", external_kernel, - "Benchmark external kernel ({})"_format(external_kernel))->ignore_case(); + fmt::format("Benchmark external kernel ({})", external_kernel))->ignore_case(); benchmark_opt->add_option("--grid-dim-x", llvm_cuda_grid_dim_x, fmt::format("Grid dimension X ({})", llvm_cuda_grid_dim_x))->ignore_case(); From 1f9887fe67033098cc0eb2be7671b102bf0a6f68 Mon Sep 17 00:00:00 2001 From: Ioannis Magkanaris Date: Thu, 19 May 2022 14:06:11 +0200 Subject: [PATCH 319/331] Only compile GPU memory transfers when NMODL_LLVM_CUDA option is enabled --- test/benchmark/llvm_benchmark.cpp | 2 ++ 1 file changed, 2 insertions(+) diff --git a/test/benchmark/llvm_benchmark.cpp b/test/benchmark/llvm_benchmark.cpp index 1c207a4c77..ab269e82ad 100644 --- a/test/benchmark/llvm_benchmark.cpp +++ b/test/benchmark/llvm_benchmark.cpp @@ -48,6 +48,7 @@ void checkCudaErrors(cudaError error) { } } +#ifdef NMODL_LLVM_CUDA_BACKEND void* copy_instance_data_gpu(const codegen::CodegenInstanceData& data) { void* dev_base_ptr; const auto ptr_vars_size = data.num_ptr_members * sizeof(double*); @@ -104,6 +105,7 @@ void copy_instance_data_host(codegen::CodegenInstanceData& data, void* dev_base_ checkCudaErrors(cudaMemcpy( offseted_place_host, offseted_place_dev, scalar_vars_size, cudaMemcpyDeviceToHost)); } +#endif BenchmarkResults LLVMBenchmark::run_benchmark() { // Set the codegen data helper and find the kernels. 
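PATCH 318 above finishes migrating from the "..."_format(...) user-defined literal, which requires `using namespace fmt::literals;`, to plain fmt::format(...). The two spellings are interchangeable, for example:

    // Equivalent calls; the literal form needs the fmt::literals namespace,
    // whose wholesale import PATCH 318 removes from codegen_c_visitor.hpp.
    auto a = fmt::format("Benchmark external kernel ({})", external_kernel);
    auto b = "Benchmark external kernel ({})"_format(external_kernel);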
From 0353c9c2f37c2ddcf3f2ae159a0a0a23b20d874b Mon Sep 17 00:00:00 2001
From: Ioannis Magkanaris
Date: Thu, 19 May 2022 14:34:01 +0200
Subject: [PATCH 320/331] Small fix for cuda related functions

---
 test/benchmark/llvm_benchmark.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/test/benchmark/llvm_benchmark.cpp b/test/benchmark/llvm_benchmark.cpp
index ab269e82ad..cb8ca6bd8c 100644
--- a/test/benchmark/llvm_benchmark.cpp
+++ b/test/benchmark/llvm_benchmark.cpp
@@ -42,13 +42,13 @@ void LLVMBenchmark::generate_llvm() {
     logger->info("Created LLVM IR module from NMODL AST in {} sec", diff.count());
 }

+#ifdef NMODL_LLVM_CUDA_BACKEND
 void checkCudaErrors(cudaError error) {
     if (error != cudaSuccess) {
         throw std::runtime_error(fmt::format("CUDA Execution Error: {}\n", cudaGetErrorString(error)));
     }
 }

-#ifdef NMODL_LLVM_CUDA_BACKEND
 void* copy_instance_data_gpu(const codegen::CodegenInstanceData& data) {
     void* dev_base_ptr;
     const auto ptr_vars_size = data.num_ptr_members * sizeof(double*);

From 1475efc5d950a0824759ffe4fc757b9823a6698d Mon Sep 17 00:00:00 2001
From: Ioannis Magkanaris
Date: Fri, 20 May 2022 15:01:50 +0200
Subject: [PATCH 321/331] Fix missing fast_math option for llvm jit

---
 test/benchmark/nmodl-llvm-time.sh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/test/benchmark/nmodl-llvm-time.sh b/test/benchmark/nmodl-llvm-time.sh
index aa7d2a64c9..06c325f65b 100755
--- a/test/benchmark/nmodl-llvm-time.sh
+++ b/test/benchmark/nmodl-llvm-time.sh
@@ -295,7 +295,7 @@ for kernel_target in ${KERNEL_TARGETS}; do
         done
     fi
     echo "| | NMODL JIT"
-    for fast_math in false; do
+    for fast_math in false true; do
         if $fast_math; then
             fast_math_flag="--fmf nnan contract afn"
             fast_math_opt="nnancontractafn"

From 583911b171e2e22c9f94aa00d7392a7e876d1c17 Mon Sep 17 00:00:00 2001
From: Ioannis Magkanaris
Date: Mon, 26 Dec 2022 20:38:26 +0200
Subject: [PATCH 322/331] [LLVM] Adding python script for benchmarking (#874)

* Added various benchmarking related outputs to .gitignore to avoid too much output in git status
* Initial commit for python benchmarking script
* Load external shared library and execute it
* Add proper RPATH for SVML library to the external shared library so it can be found by NMODL without setting any flags later
* Made clang-format happy with all files
* Work towards running JIT via python
* Running JIT benchmark
* [LLVM][SIMD] Atomic updates support (#864)
* Adds code generation functionality for atomic writes on SIMD platforms
* Adds both execution and IR tests
* Refactors common GPU/SIMD atomics code
Co-authored-by: Ioannis Magkanaris
* Editing scripts to run the JIT benchmarks
* Work towards compiling external libraries in python
* dlclose dlopened handler
* Fix instance struct declaration for .cpp files in kernels
* Overall improvements and additions in the benchmarking scripts
* Update hh state kernel
* Save pdfs to output directory
* Added pragmas to .cpp external kernels to be able to sed them afterwards
* Apply suggestions from Omar's code review
Co-authored-by: Omar Awile
* Various improvements in benchmark script
* flags file
* Cleaned up hh.cpp
* Small fixes in python benchmarking script
* Clear GPU memory, context and module after every invocation of the benchmark
* Added GPU JIT execution and horizontal barplots
* Compile LLVM IR generated from JIT with Clang and then execute it with JIT
* Added external openacc kernel execution in the script
* Small improvements in benchmarking infrastructure in nmodl
* Added current kernel to hh.cpp
* Added changes in nmodl jit invocation for handling nrn_cur kernel
changes in nmodl jit invocation for handling nrn_cur kernel * Handle nrn_state and nrn_cur kernels and plot CPU and GPU results in different pdfs * Added python notebook for graphs * Clearing up jupyter notebook * Changed colour palette * Fixes in benchmark script for compiling the memory and compute bound kernels with clang * Added nrn_cur kernels in memory and compute bound and changed their suffix * Added compute bound kernel plots and legend outside of the plot to show results * Fix clang compilation of llvm ir generated by nmodl with svml and sleef * Updated hh data with svml and sleef mod2ir runs * Updates to the plots * Updated compute bound results * Updated memory data * Updating jupyter notebook * Removed plotting from benchmarking script * Updated compiler flags with only the ones that matter * Added expsyn mod file kernel * Added expsyn figure and hh with more comparisons with/without svml * Pinned compilers and let nmodl replace lib functions for all cpu vec widths (including 1) * Order compilers by their order of appearance in the compiler config json passed to the plotting function * Change plot dimensions * Changes for expsyn file * Fixed hh nrn_cur for openacc * Only print atomic pragmas for updates of rhs and d vectors on GPU if mechanism is POINT_PROCESS * Added expsyn and hh gpu data * Added generation of combined GPU result graph * Updated compiler flags and run script for ncu and intel advisor * Added latest data with -mavx512f and script to calculate average % diff * Added combined cpu results * Updated GPU figure * Updated plots * Added relative plots * Install explicitly LLVM 13.0.1 in MacOS builds in Azure (#898) * Changed names for relative plots * Fix compilation error with Apple Clang with steady_clock * Added more options for labeling the relative figures * Update plots for paper * Improved name of plot function and generated plots * Changes for figures with runtime * Update color for gpu a bit * Added nvhpc compiler for CPU * Added NVHPC results * Added new plot for GPU results * Updated GPU plot * Copy ast in run pynmodl function to be able to reexecute the benchmark and visitors with different configurations from the same script * Added artefact folder * Small improvement in artefact script * Run black on artefact python script * Added log scale plots * Updates in run script and python script * Added hatched plots * Fix CPU and GPU relative plots and added variance plot * Fix variance plots a bit * Added docker file and new run script * Cleaned and added documentation in the Dockerfile * Added Dockerfile for GPU execution * Added plotting script based on jupyter notebook * Added seaborn installation in the Dockerfile * get rid of `docker` directory to avoid confusion * Cleaned up run script * Updated benchmark plotting script * Added preliminary artifact description * Fixed paths to new generated pickle files * Added instructions in the PAD file and separated CPU only and CPU-GPU execution scripts * Fix PAD * Small fixes in plot scripts * Some small changes * Added readme * minor fixes * Added option to save temp files from compilation of clang and gcc * Install vim as well * Fix plotting jupyter notebook * Updated readme with instructions to run docker images from dockerhub * Fix issue with plotting script * Remove unnecessary docs from dockerfile * Use Intel 2021.4.0 and NVHPC 22.3.
Also reduce size of images * Update documentation regarding new way to launch the docker containers * Update compilers and small text addition - Use Intel 2022.2.1 - Add note regarding the gcc results and libmvec - Added sentence that points to the README.md on how to run the benchmarks Co-authored-by: George Mitenkov Co-authored-by: Omar Awile Co-authored-by: Ioannis Magkanaris Co-authored-by: Pramod S Kumbhar --- .gitignore | 16 + azure-pipelines.yml | 4 +- docker/docker-compose.yml | 22 - docker/recipe/Dockerfile | 50 - docker/recipe/entrypoint | 41 - docs/CC2023/PAD.md | 51 + docs/CC2023/README.md | 74 + src/codegen/codegen_acc_visitor.cpp | 2 +- src/codegen/llvm/codegen_llvm_visitor.cpp | 150 +- src/codegen/llvm/llvm_ir_builder.cpp | 79 + src/codegen/llvm/llvm_ir_builder.hpp | 13 + .../llvm/replace_with_lib_functions.cpp | 2 +- src/main.cpp | 10 +- src/pybind/pynmodl.cpp | 16 +- test/benchmark/CMakeLists.txt | 5 - test/benchmark/NMODL_paper_graphs.ipynb | 2518 +++++++++++++++++ test/benchmark/artifacts/artifact_script.py | 34 + test/benchmark/artifacts/expsyn.mod | 42 + test/benchmark/artifacts/hh.mod | 125 + test/benchmark/benchmark_script.py | 541 ++++ test/benchmark/compiler_flags.json | 64 + test/benchmark/cpu_docker/Dockerfile | 83 + test/benchmark/cuda_driver.cpp | 11 + test/benchmark/cuda_driver.hpp | 2 + test/benchmark/ext_kernel.cpp | 6 +- test/benchmark/ext_kernel.hpp | 2 +- test/benchmark/gpu_docker/Dockerfile | 86 + test/benchmark/install_gpu_docker_env.sh | 40 + test/benchmark/jit_driver.hpp | 4 +- test/benchmark/kernels/compute-bound.cpp | 76 +- test/benchmark/kernels/compute-bound.mod | 2 +- test/benchmark/kernels/expsyn.cpp | 81 + test/benchmark/kernels/expsyn.mod | 10 +- test/benchmark/kernels/expsyn_openacc.cpp | 78 + test/benchmark/kernels/hh.cpp | 133 +- test/benchmark/kernels/hh_openacc.cpp | 171 ++ test/benchmark/kernels/memory-bound.cpp | 51 +- test/benchmark/kernels/memory-bound.mod | 2 +- test/benchmark/llvm_benchmark.cpp | 185 +- test/benchmark/llvm_benchmark.hpp | 10 +- test/benchmark/nmodl-llvm-time.sh | 3 +- test/benchmark/plot_benchmarks_cpu_gpu.py | 507 ++++ test/benchmark/plot_benchmarks_cpu_only.py | 321 +++ .../reference_data/compute_bound.pickle | Bin 0 -> 8758 bytes .../reference_data/expsyn_cpu_results.pickle | Bin 0 -> 7739 bytes .../reference_data/expsyn_gpu.pickle | Bin 0 -> 492 bytes .../expsyn_gpu_100mil_1024x128.pickle | Bin 0 -> 349 bytes .../reference_data/expsyn_icc_clang.pickle | Bin 0 -> 6519 bytes test/benchmark/reference_data/hh.pickle | Bin 0 -> 8351 bytes .../reference_data/hh_expsyn_mavx512f.pickle | Bin 0 -> 7423 bytes .../reference_data/hh_expsyn_nvhpc_cpu.pickle | Bin 0 -> 517 bytes test/benchmark/reference_data/hh_gpu.pickle | Bin 0 -> 472 bytes .../hh_gpu_20mil_1024x128.pickle | Bin 0 -> 337 bytes .../hh_ic_clang_gcc_w_wout_svml.pickle | Bin 0 -> 7607 bytes .../reference_data/memory_bound.pickle | Bin 0 -> 8721 bytes test/benchmark/run_benchmark_script.sh | 130 + .../benchmark/run_benchmark_script_cpu_gpu.sh | 63 + .../run_benchmark_script_cpu_only.sh | 41 + test/unit/CMakeLists.txt | 2 +- test/unit/codegen/codegen_data_helper.cpp | 93 +- test/unit/codegen/codegen_data_helper.hpp | 14 + test/unit/codegen/codegen_llvm_execution.cpp | 294 ++ test/unit/codegen/codegen_llvm_ir.cpp | 106 + 63 files changed, 6117 insertions(+), 349 deletions(-) delete mode 100644 docker/docker-compose.yml delete mode 100644 docker/recipe/Dockerfile delete mode 100755 docker/recipe/entrypoint create mode 100644 docs/CC2023/PAD.md create mode 100644 
docs/CC2023/README.md create mode 100644 test/benchmark/NMODL_paper_graphs.ipynb create mode 100644 test/benchmark/artifacts/artifact_script.py create mode 100644 test/benchmark/artifacts/expsyn.mod create mode 100644 test/benchmark/artifacts/hh.mod create mode 100644 test/benchmark/benchmark_script.py create mode 100644 test/benchmark/compiler_flags.json create mode 100644 test/benchmark/cpu_docker/Dockerfile create mode 100644 test/benchmark/gpu_docker/Dockerfile create mode 100644 test/benchmark/install_gpu_docker_env.sh create mode 100644 test/benchmark/kernels/expsyn.cpp create mode 100644 test/benchmark/kernels/expsyn_openacc.cpp create mode 100644 test/benchmark/kernels/hh_openacc.cpp create mode 100644 test/benchmark/plot_benchmarks_cpu_gpu.py create mode 100644 test/benchmark/plot_benchmarks_cpu_only.py create mode 100644 test/benchmark/reference_data/compute_bound.pickle create mode 100644 test/benchmark/reference_data/expsyn_cpu_results.pickle create mode 100644 test/benchmark/reference_data/expsyn_gpu.pickle create mode 100644 test/benchmark/reference_data/expsyn_gpu_100mil_1024x128.pickle create mode 100644 test/benchmark/reference_data/expsyn_icc_clang.pickle create mode 100644 test/benchmark/reference_data/hh.pickle create mode 100644 test/benchmark/reference_data/hh_expsyn_mavx512f.pickle create mode 100644 test/benchmark/reference_data/hh_expsyn_nvhpc_cpu.pickle create mode 100644 test/benchmark/reference_data/hh_gpu.pickle create mode 100644 test/benchmark/reference_data/hh_gpu_20mil_1024x128.pickle create mode 100644 test/benchmark/reference_data/hh_ic_clang_gcc_w_wout_svml.pickle create mode 100644 test/benchmark/reference_data/memory_bound.pickle create mode 100644 test/benchmark/run_benchmark_script.sh create mode 100644 test/benchmark/run_benchmark_script_cpu_gpu.sh create mode 100644 test/benchmark/run_benchmark_script_cpu_only.sh diff --git a/.gitignore b/.gitignore index 65b55e2b82..5306898462 100644 --- a/.gitignore +++ b/.gitignore @@ -49,3 +49,19 @@ venv.bak/ .cmake-format.yaml .pre-commit-config.yaml .ipynb_checkpoints + +# Benchmark outputs +test/benchmark/*.ll +test/benchmark/*.ptx +test/benchmark/*.out +test/benchmark/*.log +test/benchmark/*.cpp +test/benchmark/*.txt +test/benchmark/core.* +test/benchmark/memory_bound_* +test/benchmark/memory-bound_* +test/benchmark/hh_* +test/benchmark/compute_bound_* +test/benchmark/compute-bound_* +test/benchmark/llvm_benchmark_* +test/benchmark/v* diff --git a/azure-pipelines.yml b/azure-pipelines.yml index bbc5ba2591..d5e550bdf7 100644 --- a/azure-pipelines.yml +++ b/azure-pipelines.yml @@ -183,7 +183,7 @@ stages: - checkout: self submodules: True - script: | - brew install flex bison cmake python@3 gcc@8 llvm + brew install flex bison cmake python@3 gcc@8 llvm@13 python3 -m pip install --upgrade pip setuptools python3 -m pip install --user 'Jinja2>=2.9.3' 'PyYAML>=3.13' pytest pytest-cov numpy 'sympy>=1.3,<1.9' displayName: 'Install Dependencies' @@ -191,7 +191,7 @@ stages: export PATH=/usr/local/opt/flex/bin:/usr/local/opt/bison/bin:$PATH; mkdir -p $(Build.Repository.LocalPath)/build cd $(Build.Repository.LocalPath)/build - cmake .. -DPYTHON_EXECUTABLE=$(which python3) -DCMAKE_INSTALL_PREFIX=$HOME/nmodl -DCMAKE_BUILD_TYPE=RelWithDebInfo -DNMODL_ENABLE_PYTHON_BINDINGS=OFF -DLLVM_DIR=$(brew --prefix llvm)/lib/cmake/llvm -DNMODL_ENABLE_LLVM=ON + cmake .. 
-DPYTHON_EXECUTABLE=$(which python3) -DCMAKE_INSTALL_PREFIX=$HOME/nmodl -DCMAKE_BUILD_TYPE=RelWithDebInfo -DNMODL_ENABLE_PYTHON_BINDINGS=OFF -DLLVM_DIR=$(brew --prefix llvm@13)/lib/cmake/llvm -DNMODL_ENABLE_LLVM=ON make -j 2 if [ $? -ne 0 ] then diff --git a/docker/docker-compose.yml b/docker/docker-compose.yml deleted file mode 100644 index f28457f513..0000000000 --- a/docker/docker-compose.yml +++ /dev/null @@ -1,22 +0,0 @@ -version: '3.7' -services: - notebook: - image: bluebrain/nmodl:latest - hostname: ${HOSTNAME} - ports: - - "8888:8888" - environment: - - USER_LOGIN=${USER} - - USER_ID=${DUID} - - GROUP_ID=${DGID} - volumes: - - $PWD/notebooks:/nmodl/notebooks/my_notebooks - command: - - jupyter - - notebook - - --port=8888 - - --no-browser - - --ip=0.0.0.0 - - --allow-root - - --notebook-dir=/nmodl/notebooks - diff --git a/docker/recipe/Dockerfile b/docker/recipe/Dockerfile deleted file mode 100644 index c5660b54c2..0000000000 --- a/docker/recipe/Dockerfile +++ /dev/null @@ -1,50 +0,0 @@ -FROM alpine:3.9 AS builder - -WORKDIR /nmodl/src - -RUN apk add --update build-base gcc g++ make cmake flex flex-dev bison git python3-dev - -RUN pip3 install --trusted-host pypi.python.org jinja2 pyyaml pytest sympy - - -ARG NMODL_VERSION=master - -RUN git clone --recursive https://github.com/BlueBrain/nmodl.git && \ - cd nmodl && \ - git checkout ${NMODL_VERSION} - -WORKDIR /nmodl/src/nmodl - -RUN python3 setup.py build - -FROM alpine:3.9 - - -RUN apk add --no-cache --update shadow python3 libgfortran libstdc++ openblas && \ - apk add --no-cache --update \ - --repository http://dl-cdn.alpinelinux.org/alpine/edge/testing gosu && \ - apk add --no-cache --virtual build-dependencies \ - build-base linux-headers openblas-dev freetype-dev \ - pkgconfig gfortran python3-dev && \ - pip3 install --no-cache-dir --trusted-host pypi.python.org \ - jinja2 pyyaml pytest sympy numpy matplotlib jupyter && \ - apk del build-dependencies && \ - rm -rf /var/cache/apk/* - -WORKDIR /usr/lib/python3.6/site-packages/nmodl - -COPY --from=builder /nmodl/src/nmodl/build/lib.linux-x86_64-3.6/nmodl . - -ENV LANG en_US.utf8 -ENV SHELL=/bin/bash - -ADD entrypoint /usr/bin/ -ENTRYPOINT ["/usr/bin/entrypoint"] - -EXPOSE 8888 -WORKDIR /nmodl/notebooks - -COPY --from=builder /nmodl/src/nmodl/docs/notebooks ./examples - -CMD ["jupyter", "notebook", "--port=8888", "--no-browser", "--ip=0.0.0.0", "--allow-root", "--notebook-dir=/nmodl/notebooks"] - diff --git a/docker/recipe/entrypoint b/docker/recipe/entrypoint deleted file mode 100755 index 9082326f80..0000000000 --- a/docker/recipe/entrypoint +++ /dev/null @@ -1,41 +0,0 @@ -#!/bin/sh -e - -# Create fake user and group with the same ids than -# the host user, and use its identity all along -# in the container so that all created files in -# mounted volumes belongs to the host user. -# -# https://stackoverflow.com/questions/41857462 - -if [ "x$GROUP_ID" = x -o "x$USER_ID" = x ] ;then - echo 'Error: $USER_ID and $GROUP_ID environment variable not set' >&2 - echo Abort >&2 - exit 1 -fi - -# Create fake user -CUR_GROUP=$(grep ":${GROUP_ID}:" /etc/group | cut -d: -f1) -if [ "x$CUR_GROUP" != x ] ;then - groupmod --new-name dummy "$CUR_GROUP" -else - grep -q ^dummy: /etc/group || groupadd -g $GROUP_ID dummy -fi -grep -q ^dummy: /etc/passwd || useradd -m -u $USER_ID -g $GROUP_ID dummy -s /bin/bash - - -chown -R dummy:dummy /home/dummy -chown -R dummy:dummy /nmodl - -chmod -R "u=rwX,go=rX" "/nmodl" - -# Run the given command as root if bash or sh, -# the fake user otherwise. 
-case "$1" in - sh|bash) - exec $@ - ;; - *) - cd /nmodl/notebooks - gosu dummy "$@" - ;; -esac diff --git a/docs/CC2023/PAD.md b/docs/CC2023/PAD.md new file mode 100644 index 0000000000..384bb8af83 --- /dev/null +++ b/docs/CC2023/PAD.md @@ -0,0 +1,51 @@ + +# MOD2IR: High-Performance Code Generation for a Biophysically Detailed Neuronal Simulation DSL + +## Preliminary Artifact Description + +### Broad Description + +This artifact provides all the necessary code, scripts and results to compile the NMODL transpiler +with the MOD2IR extension and run all benchmarks described in the manuscript. To simplify the +evaluation process, we provide along with the instructions a Dockerfile that will set up a viable +system for the benchmarks. The driver script compiles the membrane mechanism model `hh.mod` and the +synapse mechanism model `expsyn.mod` with various compile-time configurations and then runs the +generated binaries, comparing their runtimes. More specifically, the benchmark compares the execution +runtime of the binaries generated via the two-step compilation process (MOD to C++ to binary) using various +open-source and commercial compiler frameworks with the one-step ahead-of-time and just-in-time +processes of MOD2IR. +MOD2IR is implemented as a code generation backend inside the NMODL Framework and it makes heavy +use of the LLVM IR and compilation passes. Most of the relevant code of the described work can be +found [here](https://github.com/BlueBrain/nmodl/tree/llvm/src/codegen/llvm) and +[here](https://github.com/BlueBrain/nmodl/tree/llvm/test/benchmark). The instructions to reproduce +the results can be found [here](https://github.com/BlueBrain/nmodl/blob/3365551b332829699c1af3bea3c0fbe820a30800/docs/CC2023/README.md). + +### Badge + +Blue Badge (results validated). We hope that, using the provided Dockerfile and scripts, the +evaluators will be able to fully build our code and reproduce our benchmark setup as well as +obtain benchmarking results. Please note that in all likelihood the runtimes obtained by the +evaluators will differ slightly from the results presented in the paper, as they depend heavily on +the hardware and system software used. We believe, however, that the results should nevertheless be +qualitatively the same as the ones we have presented. + +### Hardware requisites + +The provided artifact can in theory be run on any x86 hardware platform. For the purpose of closely +reproducing our benchmark results we recommend using a workstation (or cloud instance) with an Intel Xeon +Skylake (or newer) CPU and an NVIDIA Volta V100 (or newer) GPU. All benchmark runs are single-core +and have relatively low memory requirements. For building the Docker image (and more specifically the +NMODL Framework) we, however, recommend a system with plenty of cores and at least 32 GB of RAM +and 20 GB of disk space available. + +### Software requisites + +Any reasonably up-to-date Linux system with Docker should be sufficient. If GPU results are to be +reproduced, an up-to-date CUDA installation (11.0 or newer) should be present. + + +### Expectations + +We expect that all setup and benchmarks can be completed within one working day. The expected time for building +the docker image is around 10 minutes using a modern multicore system with a stable internet connection. +The expected runtime of the benchmarks is around 4 hours.
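+
+As a quick orientation, the end-to-end evaluation flow looks roughly as follows
+(a minimal sketch of the CPU-only path; README.md documents the exact commands,
+image tags and the additional GPU setup steps):
+
+```
+docker run -it -v $PWD:/opt/mount bluebrain/nmodl:mod2ir-cpu-benchmark
+# then, inside the container:
+cd nmodl/test/benchmark
+bash run_benchmark_script_cpu_only.sh   # run all CPU benchmarks
+python3 plot_benchmarks_cpu_only.py     # regenerate the plots
+cp -r graphs_output_pandas /opt/mount   # copy the plots out of the container
+```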
diff --git a/docs/CC2023/README.md b/docs/CC2023/README.md new file mode 100644 index 0000000000..e9f1fc22d8 --- /dev/null +++ b/docs/CC2023/README.md @@ -0,0 +1,74 @@ + +# MOD2IR: High-Performance Code Generation for a Biophysically Detailed Neuronal Simulation DSL + +Please refer to the PAD.md file for an overview and necessary prerequisites. + +## Benchmarking Instructions + +To reproduce our environment as closely as possible and to lower the burden of +installing the different compilers and libraries, we have created Docker images which take +care of installing all the necessary packages and compilers to build MOD2IR and execute the +benchmarks. +Due to technical restrictions imposed by Docker, some extra steps are needed to run a Docker image +that can execute applications on NVIDIA GPUs. For this reason we have created two +different `Dockerfile`s, one that takes care of both the CPU and GPU benchmarks and one for CPU-only +execution if there is no NVIDIA GPU available in the test system. + +### CPU and GPU docker image + +The image that targets both CPU and GPU can be found in `test/benchmark/gpu_docker/Dockerfile`. +To launch the Docker image you can execute the following: + +``` +cd test/benchmark/gpu_docker # Enter the directory that contains the Dockerfile +bash install_gpu_docker_env.sh # Installs docker and NVIDIA docker runtime (needs sudo permission) +docker run -it -v $PWD:/opt/mount --gpus all bluebrain/nmodl:mod2ir-gpu-benchmark # Execute docker image (~16GB) +``` + +After building and launching the docker image we can now execute the benchmarks and generate the same +plots as the ones we included in the paper, showing the new results alongside the reference plots from the paper. +To do this we need to execute the following two scripts inside the docker image environment: + +``` +cd nmodl/test/benchmark # Enter the directory where the scripts are inside the docker image +bash run_benchmark_script_cpu_gpu.sh # Runs all the benchmarks on CPU and GPU +python3 plot_benchmarks_cpu_gpu.py # Generate the plots based on the outputs of the previous script +cp -r graphs_output_pandas /opt/mount # Copy the graphs from the docker image to your environment +``` + +Executing `run_benchmark_script_cpu_gpu.sh` will generate two pickle files that include the results +in `hh_expsyn_cpu/benchmark_results.pickle` for the CPU benchmarks and `hh_expsyn_gpu/benchmark_results.pickle` +for the GPU benchmarks. Those will then be loaded by `plot_benchmarks_cpu_gpu.py` to generate the plots. +Now you can exit the docker image terminal and open the above files, which now exist in your local directory. + + +### CPU only docker image + +In case there is no GPU available, you can run a +CPU-only container instead of the one above.
+To do this you need to: + +``` +cd test/benchmark/cpu_docker # Enter the directory that contains the Dockerfile +docker run -it -v $PWD:/opt/mount bluebrain/nmodl:mod2ir-cpu-benchmark # Execute docker image (~16GB) +``` + +Then inside the docker shell: + +``` +cd nmodl/test/benchmark # Enter the directory where the scripts are inside the docker image +bash run_benchmark_script_cpu_only.sh # Runs all the benchmarks on CPU +python3 plot_benchmarks_cpu_only.py # Generate the plots based on the outputs of the previous script +cp -r graphs_output_pandas /opt/mount # Copy the graphs from the docker image to your environment +``` + +By executing `run_benchmark_script_cpu_only.sh`, only `hh_expsyn_cpu/benchmark_results.pickle` +will be generated, containing the CPU results. + + +## Notes + +1. Acceleration results with the `GCC` compiler might be better in the docker container than in the paper + due to the newer OS we're using in the Dockerfile. Latest Ubuntu versions come with GLIBC 2.3x, which + includes `libmvec`; this provides vectorized math implementations to the `GCC` compiler, enabling the + vectorization of the kernels even without providing the `SVML` library to `GCC`. diff --git a/src/codegen/codegen_acc_visitor.cpp b/src/codegen/codegen_acc_visitor.cpp index 80e784063e..6aa6c33bb7 100644 --- a/src/codegen/codegen_acc_visitor.cpp +++ b/src/codegen/codegen_acc_visitor.cpp @@ -57,7 +57,7 @@ void CodegenAccVisitor::print_channel_iteration_block_parallel_hint(BlockType ty void CodegenAccVisitor::print_atomic_reduction_pragma() { - if (!info.artificial_cell) { + if (info.point_process) { printer->add_line("nrn_pragma_acc(atomic update)"); printer->add_line("nrn_pragma_omp(atomic update)"); } diff --git a/src/codegen/llvm/codegen_llvm_visitor.cpp b/src/codegen/llvm/codegen_llvm_visitor.cpp index 32bc1b8b9d..7f882c7392 100644 --- a/src/codegen/llvm/codegen_llvm_visitor.cpp +++ b/src/codegen/llvm/codegen_llvm_visitor.cpp @@ -533,53 +533,121 @@ void CodegenLLVMVisitor::visit_codegen_atomic_statement(const ast::CodegenAtomic ast::BinaryOp op = ir_builder.extract_atomic_op(atomic_op); // For different platforms, we handle atomic updates differently! - if (platform.is_cpu_with_simd()) { - throw std::runtime_error("Error: no atomic update support for SIMD CPUs\n"); - } else if (platform.is_gpu()) { - const auto& identifier = var->get_name(); - - // We only need to support atomic updates to instance struct members. - if (!identifier->is_codegen_instance_var()) - throw std::runtime_error("Error: atomic updates for non-instance variable\n"); - - const auto& node = std::dynamic_pointer_cast<ast::CodegenInstanceVar>(identifier); - const auto& instance_name = node->get_instance_var()->get_node_name(); - const auto& member_node = node->get_member_var(); - const auto& member_name = member_node->get_node_name(); - - if (!instance_var_helper.is_an_instance_variable(member_name)) - throw std::runtime_error("Error: " + member_name + " is not a member of the instance variable\n"); - - llvm::Value* instance_ptr = ir_builder.create_load(instance_name); - int member_index = instance_var_helper.get_variable_index(member_name); - llvm::Value* member_ptr = ir_builder.get_struct_member_ptr(instance_ptr, member_index); - - // Some sanity checks.
- auto codegen_var_with_type = instance_var_helper.get_variable(member_name); - if (!codegen_var_with_type->get_is_pointer()) - throw std::runtime_error( - "Error: atomic updates are allowed on pointer variables only\n"); - const auto& member_var_name = std::dynamic_pointer_cast<ast::VarName>(member_node); - if (!member_var_name->get_name()->is_indexed_name()) - throw std::runtime_error("Error: " + member_name + " is not an IndexedName\n"); - const auto& member_indexed_name = std::dynamic_pointer_cast<ast::IndexedName>( member_var_name->get_name()); - if (!member_indexed_name->get_length()->is_name()) - throw std::runtime_error("Error: " + member_name + " must be indexed with a variable!"); - - llvm::Value* i64_index = get_index(*member_indexed_name); - llvm::Value* instance_member = ir_builder.create_load(member_ptr); - llvm::Value* ptr = ir_builder.create_inbounds_gep(instance_member, i64_index); - ir_builder.create_atomic_op(ptr, rhs, op); - } else { - // For non-SIMD CPUs, updates don't have to be atomic at all! + // For non-SIMD CPUs (or any scalar code on SIMD CPUs), updates don't have to be atomic at all! + const bool non_SIMD_cpu = platform.is_cpu() && !platform.is_cpu_with_simd(); + if (non_SIMD_cpu || (platform.is_cpu_with_simd() && !ir_builder.vectorizing())) { llvm::Value* lhs = accept_and_get(node.get_lhs()); ir_builder.create_binary_op(lhs, rhs, op); llvm::Value* result = ir_builder.pop_last_value(); write_to_variable(*var, result); + return; + } + + // Otherwise, we either have a GPU or a SIMD CPU. Double-check to be sure. + if (!platform.is_gpu() && !platform.is_cpu_with_simd()) + throw std::runtime_error("Error: unknown platform - " + platform.get_name() + "\n"); + + const auto& identifier = var->get_name(); + if (!identifier->is_codegen_instance_var()) + throw std::runtime_error("Error: atomic updates for non-instance variable\n"); + + const auto& codegen_instance_node = std::dynamic_pointer_cast<ast::CodegenInstanceVar>( identifier); + const auto& instance_name = codegen_instance_node->get_instance_var()->get_node_name(); + const auto& member_node = codegen_instance_node->get_member_var(); + const auto& member_name = member_node->get_node_name(); + + // Sanity checks. Note that there is a bit of duplication with `read_from_or_write_to_instance` + // but this is not crucial for now. + // TODO: remove this duplication! + if (!instance_var_helper.is_an_instance_variable(member_name)) + throw std::runtime_error("Error: " + member_name + " is not a member of the instance variable\n"); + auto codegen_var_with_type = instance_var_helper.get_variable(member_name); + if (!codegen_var_with_type->get_is_pointer()) + throw std::runtime_error("Error: atomic updates are allowed on pointer variables only\n"); + const auto& member_var_name = std::dynamic_pointer_cast<ast::VarName>(member_node); + if (!member_var_name->get_name()->is_indexed_name()) + throw std::runtime_error("Error: " + member_name + " is not an IndexedName\n"); + const auto& member_indexed_name = std::dynamic_pointer_cast<ast::IndexedName>( member_var_name->get_name()); + if (!member_indexed_name->get_length()->is_name()) + throw std::runtime_error("Error: " + member_name + " must be indexed with a variable!"); + + // First, load the pointer variable from the instance struct and process its index.
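+    // (In C-like pseudo-code, the address computed below for lane `i` is roughly
+    //  &instance->member[index[i]], where `member` is itself a pointer field of
+    //  the instance struct; a sketch for illustration only.)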
+ llvm::Value* instance_ptr = ir_builder.create_load(instance_name); + const int member_index = instance_var_helper.get_variable_index(member_name); + llvm::Value* member_ptr = ir_builder.get_struct_member_ptr(instance_ptr, member_index); + llvm::Value* instance_member = ir_builder.create_load(member_ptr); + llvm::Value* i64_index = get_index(*member_indexed_name); + + // For GPUs, we just need to create atomic add/subtract. + if (platform.is_gpu()) { + llvm::Value* ptr = ir_builder.create_inbounds_gep(instance_member, i64_index); + ir_builder.create_atomic_op(ptr, rhs, op); + } else { + // SIMD case is more elaborate. We will create a scalar block that will perform the necessary + // update. The overall structure will be + // +---------------------------+ + // | (current body block) | + // | ... | + // | br %atomic | + // +---------------------------+ + // | + // V + // +-----------------------------+ + // | (%atomic.update block) | + // | %cmp = ... |<------+ + // | cond_br %cmp, %atomic, %rem | | + // +-----------------------------+ | + // | | | + // | +---------------+ + // V + // +---------------------------+ + // | (%for.body.remaining) | + // | ... | + // +---------------------------+ + + // Step 1: Create a vector of (replicated) starting addresses of the given member. + llvm::Value* start = ir_builder.create_member_addresses(instance_member); + + // Step 2: Create a vector alloca that will store addresses of member values. Then also + // create an array of these addresses (as pointers). While this can be moved to `IRBuilder`, + // the amount of code is rather negligible and thus can be left here. + const int vector_width = platform.get_instruction_width(); + llvm::Type* vi64_type = llvm::FixedVectorType::get(ir_builder.get_i64_type(), vector_width); + llvm::Type* array_type = llvm::ArrayType::get(ir_builder.get_fp_ptr_type(), vector_width); + + llvm::Value* ptrs_vec = ir_builder.create_alloca(/*name=*/"ptrs", vi64_type); + llvm::Value* ptrs_arr = + ir_builder.create_bitcast(ptrs_vec, + llvm::PointerType::get(array_type, /*AddressSpace=*/0)); + + // Step 3: Calculate offsets of the values in the member by: + // offset = start + (index * sizeof(fp_type)) + // Store this vector to a temporary for later reuse. + llvm::Value* offsets = ir_builder.create_member_offsets(start, i64_index); + ir_builder.create_store(ptrs_vec, offsets); + + // Step 4: Create a new block that will be used for atomic code generation. + llvm::BasicBlock* body_bb = ir_builder.get_current_block(); + llvm::BasicBlock* cond_bb = body_bb->getNextNode(); + llvm::Function* func = body_bb->getParent(); + llvm::BasicBlock* atomic_bb = + llvm::BasicBlock::Create(*context, /*Name=*/"atomic.update", func, cond_bb); + llvm::BasicBlock* remaining_body_bb = + llvm::BasicBlock::Create(*context, /*Name=*/"for.body.remaining", func, cond_bb); + ir_builder.create_br_and_set_insertion_point(atomic_bb); + + // Step 5: Generate code for the atomic update: go through each element in the vector + // performing the computation. + llvm::Value* cmp = ir_builder.create_atomic_loop(ptrs_arr, rhs, op); + + // Create branch to close the loop and restore the insertion point.
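+        // Together with `create_atomic_loop`, the generated loop corresponds to
+        // the following scalar pseudo-code (a sketch for illustration, not the
+        // emitted IR):
+        //     mask = (1 << vector_width) - 1;     // all lanes active
+        //     do {
+        //         i = cttz(mask);                 // next active lane
+        //         mask &= ~(1 << i);              // retire lane i
+        //         *ptrs[i] = *ptrs[i] op rhs[i];  // scalar update of lane i
+        //     } while (mask != 0);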
+ ir_builder.create_cond_br(cmp, remaining_body_bb, atomic_bb); + ir_builder.set_insertion_point(remaining_body_bb); } } diff --git a/src/codegen/llvm/llvm_ir_builder.cpp b/src/codegen/llvm/llvm_ir_builder.cpp index efbd7aa050..f0682fff91 100644 --- a/src/codegen/llvm/llvm_ir_builder.cpp +++ b/src/codegen/llvm/llvm_ir_builder.cpp @@ -315,6 +315,85 @@ void IRBuilder::create_atomic_op(llvm::Value* ptr, llvm::Value* update, ast::Bin llvm::AtomicOrdering::SequentiallyConsistent); } +llvm::Value* IRBuilder::create_member_addresses(llvm::Value* member_ptr) { + llvm::Module* m = builder.GetInsertBlock()->getParent()->getParent(); + + // Treat this member address as integer value. + llvm::Type* int_ptr_type = m->getDataLayout().getIntPtrType(builder.getContext()); + llvm::Value* ptr_to_int = builder.CreatePtrToInt(member_ptr, int_ptr_type); + + // Create a vector that has the address at position 0. + llvm::Type* vector_type = llvm::FixedVectorType::get(int_ptr_type, platform.get_instruction_width()); + llvm::Value* zero = get_scalar_constant(get_i32_type(), 0); + llvm::Value* tmp = builder.CreateInsertElement(llvm::UndefValue::get(vector_type), ptr_to_int, zero); + + // Finally, use `shufflevector` with zeroinitializer to replicate the 0th element. + llvm::Value* select = llvm::Constant::getNullValue(vector_type); + return builder.CreateShuffleVector(tmp, llvm::UndefValue::get(vector_type), select); +} + +llvm::Value* IRBuilder::create_member_offsets(llvm::Value* start, llvm::Value* indices) { + llvm::Value* factor = get_vector_constant(get_i64_type(), platform.get_precision() / 8); + llvm::Value* offset = builder.CreateMul(indices, factor); + return builder.CreateAdd(start, offset); +} + +llvm::Value* IRBuilder::create_atomic_loop(llvm::Value* ptrs_arr, + llvm::Value* rhs, + ast::BinaryOp op) { + const int vector_width = platform.get_instruction_width(); + llvm::BasicBlock* curr = get_current_block(); + llvm::BasicBlock* prev = curr->getPrevNode(); + llvm::BasicBlock* next = curr->getNextNode(); + + // Some constant values. + llvm::Value* false_value = get_scalar_constant(get_boolean_type(), 0); + llvm::Value* zero = get_scalar_constant(get_i64_type(), 0); + llvm::Value* one = get_scalar_constant(get_i64_type(), 1); + llvm::Value* minus_one = get_scalar_constant(get_i64_type(), -1); + + // First, we create a PHI node that holds the mask of active vector elements. + llvm::PHINode* mask = builder.CreatePHI(get_i64_type(), /*NumReservedValues=*/2); + + // Initially, all elements are active. + llvm::Value* init_value = get_scalar_constant(get_i64_type(), ~((~0) << vector_width)); + + // Find the index of the next active element and update the mask. This can be easily computed + // with: + // index = cttz(mask) + // new_mask = mask & ((1 << index) ^ -1) + llvm::Value* index = + builder.CreateIntrinsic(llvm::Intrinsic::cttz, {get_i64_type()}, {mask, false_value}); + llvm::Value* new_mask = builder.CreateShl(one, index); + new_mask = builder.CreateXor(new_mask, minus_one); + new_mask = builder.CreateAnd(mask, new_mask); + + // Update PHI with appropriate values.
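+    // (The atomic update block is entered once from the preceding block with all
+    //  lanes active, and re-entered from itself with the just-processed lane
+    //  cleared, hence the two incoming edges registered below.)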
+ mask->addIncoming(init_value, prev); + mask->addIncoming(new_mask, curr); + + // Get the pointer to the current value, the value itself and the update. + llvm::Value* gep = + builder.CreateGEP(ptrs_arr->getType()->getPointerElementType(), ptrs_arr, {zero, index}); + llvm::Value* ptr = create_load(gep); + llvm::Value* source = create_load(ptr); + llvm::Value* update = builder.CreateExtractElement(rhs, index); + + // Perform the update and store the result back. + // source = *ptr + // *ptr = source op update + create_binary_op(source, update, op); + llvm::Value* result = pop_last_value(); + create_store(ptr, result); + + // Return condition to break out of atomic update loop. + return builder.CreateICmpEQ(new_mask, zero); +} + void IRBuilder::create_binary_op(llvm::Value* lhs, llvm::Value* rhs, ast::BinaryOp op) { // Check that both lhs and rhs have the same types. if (lhs->getType() != rhs->getType()) diff --git a/src/codegen/llvm/llvm_ir_builder.hpp b/src/codegen/llvm/llvm_ir_builder.hpp index 67db6fcded..537682b930 100644 --- a/src/codegen/llvm/llvm_ir_builder.hpp +++ b/src/codegen/llvm/llvm_ir_builder.hpp @@ -313,6 +313,19 @@ class IRBuilder { /// Generates an inbounds GEP instruction for the given value and returns calculated address. llvm::Value* create_inbounds_gep(llvm::Value* variable, llvm::Value* index); + /// Creates a vector splat of starting addresses of the given member. + llvm::Value* create_member_addresses(llvm::Value* member_ptr); + + /// Creates IR for calculating the offset to member values. For more context, see + /// `visit_codegen_atomic_statement` in LLVM visitor. + llvm::Value* create_member_offsets(llvm::Value* start, llvm::Value* indices); + + /// Creates IR to perform scalar updates to the instance member based on `ptrs_arr` for every + /// element in a vector by + /// *ptrs_arr[i] = *ptrs_arr[i] op rhs[i]. + /// Returns condition (i1 value) to break out of atomic update loop. + llvm::Value* create_atomic_loop(llvm::Value* ptrs_arr, llvm::Value* rhs, ast::BinaryOp op); + private: /// Generates an inbounds GEP instruction for the given name and returns calculated address. llvm::Value* create_inbounds_gep(const std::string& variable_name, llvm::Value* index); diff --git a/src/codegen/llvm/replace_with_lib_functions.cpp b/src/codegen/llvm/replace_with_lib_functions.cpp index 07d6dd8f04..b8617950b2 100644 --- a/src/codegen/llvm/replace_with_lib_functions.cpp +++ b/src/codegen/llvm/replace_with_lib_functions.cpp @@ -28,7 +28,7 @@ bool ReplaceMathFunctions::runOnModule(Module& module) { // If the platform supports SIMD, replace math intrinsics with library // functions. - if (platform->is_cpu_with_simd()) { + if (platform->is_cpu()) { // First, get the target library information and add vectorizable functions for the // specified vector library.
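        // (For example, with the SVML vector library selected, the math
        // intrinsics emitted for the kernels can be lowered to the
        // corresponding SVML routines via the TargetLibraryInfo mappings;
        // SLEEF is handled analogously. An explanatory note, not emitted code.)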
Triple triple(sys::getDefaultTargetTriple()); diff --git a/src/main.cpp b/src/main.cpp index a820868643..f17a737ef0 100644 --- a/src/main.cpp +++ b/src/main.cpp @@ -67,7 +67,7 @@ int main(int argc, const char* argv[]) { int num_experiments = 100; /// benchmark external kernel with JIT - bool external_kernel; + std::string external_kernel_library; /// X dimension of grid in blocks for GPU execution int llvm_cuda_grid_dim_x = 1; @@ -271,9 +271,9 @@ int main(int argc, const char* argv[]) { benchmark_opt->add_option("--repeat", num_experiments, fmt::format("Number of experiments for benchmarking ({})", num_experiments))->ignore_case(); - benchmark_opt->add_flag("--external", - external_kernel, - fmt::format("Benchmark external kernel ({})", external_kernel))->ignore_case(); + benchmark_opt->add_option("--external", + external_kernel_library, + fmt::format("Benchmark external kernels from shared library ({})", external_kernel_library))->ignore_case()->check(CLI::ExistingFile); benchmark_opt->add_option("--grid-dim-x", llvm_cuda_grid_dim_x, fmt::format("Grid dimension X ({})", llvm_cuda_grid_dim_x))->ignore_case(); @@ -449,7 +449,7 @@ int main(int argc, const char* argv[]) { platform, cfg.llvm_opt_level_ir, cfg.llvm_opt_level_codegen, - external_kernel, + external_kernel_library, gpu_execution_parameters); benchmark.run(); } diff --git a/src/pybind/pynmodl.cpp b/src/pybind/pynmodl.cpp index b0b4ff8634..422995fc88 100644 --- a/src/pybind/pynmodl.cpp +++ b/src/pybind/pynmodl.cpp @@ -188,10 +188,11 @@ class JitDriver { } - benchmark::BenchmarkResults run(std::shared_ptr<ast::Program> node, + benchmark::BenchmarkResults run(const std::shared_ptr<ast::Program> node, std::string& modname, int num_experiments, int instance_size, + std::string& external_kernel_library, int cuda_grid_dim_x, int cuda_block_dim_x) { // The directories need to be created beforehand, otherwise the output files cannot be created if (cfg.nmodl_ast || cfg.json_ast || cfg.json_perfstat) { utils::make_path(cfg.scratch_dir); } - cg_driver.prepare_mod(node, modname); + utils::make_path(cfg.output_dir); + // Make a copy of the node to be able to re-run the visitors with any changes in the + // configuration and execute the mechanisms' functions multiple times + auto new_node = std::make_shared<ast::Program>(*node); + cg_driver.prepare_mod(new_node, modname); nmodl::codegen::CodegenLLVMVisitor visitor(modname, cfg.output_dir, platform, 0); - visitor.visit_program(*node); + visitor.visit_program(*new_node); const GPUExecutionParameters gpu_execution_parameters{cuda_grid_dim_x, cuda_block_dim_x}; nmodl::benchmark::LLVMBenchmark benchmark(visitor, modname, @@ -212,7 +217,7 @@ class JitDriver { platform, cfg.llvm_opt_level_ir, cfg.llvm_opt_level_codegen, - false, + external_kernel_library, gpu_execution_parameters); return benchmark.run(); } @@ -308,8 +313,9 @@ PYBIND11_MODULE(_nmodl, m_nmodl) { &nmodl::JitDriver::run, "node"_a, "modname"_a, - "num_experiments"_a, "instance_size"_a, + "num_experiments"_a = 1, + "external_kernel_library"_a = "", "cuda_grid_dim_x"_a = 1, "cuda_block_dim_x"_a = 1); #else diff --git a/test/benchmark/CMakeLists.txt b/test/benchmark/CMakeLists.txt index 292a7ff05f..6e10a72b22 100644 --- a/test/benchmark/CMakeLists.txt +++ b/test/benchmark/CMakeLists.txt @@ -39,11 +39,6 @@ if(NMODL_ENABLE_PYTHON_BINDINGS) file(GLOB modfiles "${NMODL_PROJECT_SOURCE_DIR}/test/benchmark/kernels/*.mod") list(APPEND modfiles "${NMODL_PROJECT_SOURCE_DIR}/test/integration/mod/test_math.mod") foreach(modfile ${modfiles}) - # For expsyn.mod set the
vector width to 1 since atomic operations are not supported for vector - # widths > 1. See https://github.com/BlueBrain/nmodl/issues/857 - if(${modfile} STREQUAL "${NMODL_PROJECT_SOURCE_DIR}/test/benchmark/kernels/expsyn.mod") - set(extra_args "--vec 1") - endif() get_filename_component(modfile_name "${modfile}" NAME) add_test(NAME "PyJIT/${modfile_name}" COMMAND ${PYTHON_EXECUTABLE} ${NMODL_PROJECT_SOURCE_DIR}/test/benchmark/benchmark.py diff --git a/test/benchmark/NMODL_paper_graphs.ipynb b/test/benchmark/NMODL_paper_graphs.ipynb new file mode 100644 index 0000000000..beab469912 --- /dev/null +++ b/test/benchmark/NMODL_paper_graphs.ipynb @@ -0,0 +1,2518 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": null, + "id": "5d21079d", + "metadata": {}, + "outputs": [], + "source": [ + "import pickle\n", + "from matplotlib import pyplot as plt\n", + "import seaborn as sns\n", + "import json\n", + "import pandas as pd\n", + "import os\n", + "import numpy as np" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "06d06d97", + "metadata": {}, + "outputs": [], + "source": [ + "# from IPython.display import display, HTML\n", + "# display(HTML(\"\"))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "2b857df7", + "metadata": {}, + "outputs": [], + "source": [ + "pickle_files = [\"./reference_data/hh.pickle\", \"./reference_data/memory_bound.pickle\", \"./reference_data/compute_bound.pickle\"]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "b6423985", + "metadata": {}, + "outputs": [], + "source": [ + "results = {}\n", + "def load_pickle_result_file(pickle_files, results):\n", + " def _merge(a, b, path=None):\n", + " if path is None: path = []\n", + " for key in b:\n", + " if key in a:\n", + " if isinstance(a[key], dict) and isinstance(b[key], dict):\n", + " _merge(a[key], b[key], path + [str(key)])\n", + " elif a[key] == b[key]:\n", + " pass # same leaf value\n", + " else:\n", + " raise Exception('Conflict at %s' % '.'.join(path + [str(key)]))\n", + " else:\n", + " a[key] = b[key]\n", + " return a\n", + " for pickle_file in pickle_files:\n", + " with open(pickle_file, 'rb') as handle:\n", + " results = _merge(results, pickle.load(handle))\n", + " return results\n", + "\n", + "results = load_pickle_result_file(pickle_files, results)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "8b203b54", + "metadata": {}, + "outputs": [], + "source": [ + "def _get_flags_string(flags):\n", + " return flags.replace(\" \", \"_\").replace('-','').replace('=','_')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "db628bac", + "metadata": {}, + "outputs": [], + "source": [ + "# colors = ['#f0f9e8','#bae4bc','#7bccc4','#43a2ca','#0868ac'] # print friendly colors\n", + "# sns.set_palette(sns.color_palette(colors))\n", + "# sns.set_palette(\"Set3\") # NEURON paper palette\n", + "colors = ['#6baed6', '#0570b0', '#66c2a4','#238b45','#fdd49e','#fc8d59','#9ebcda','#8c96c6','#969696','#525252']\n", + "# pick colors according to the following order which matches the order of \n", + "# intel #6baed6\n", + "# intel svml #0570b0\n", + "# gcc #66c2a4\n", + "# gcc_svml #238b45\n", + "# clang #fdd49e\n", + "# clang_svml #fc8d59\n", + "# mod2ir #9ebcda\n", + "# mod2ir_svml #8c96c6\n", + "# mor2it_jit_svml #969696\n", + "# mod2ir_jit_sleef #525252\n", + "sns.set_palette(sns.color_palette(colors))\n", + "def generate_graph_pandas(results, compilers_comparison_config, graph_suffix, output_dir, 
print_values=False):\n", + " os.makedirs(output_dir, exist_ok=True)\n", + " compiler_flags = json.loads(compilers_comparison_config)\n", + " for modname in results:\n", + " # state\n", + " bar_data_state_cpu_panda = {}\n", + " bar_data_state_cpu_panda[\"architecture\"] = []\n", + " bar_data_state_cpu_panda[\"compiler\"] = []\n", + " bar_data_state_cpu_panda[\"runtime\"] = []\n", + " # current\n", + " bar_data_cur_cpu_panda = {}\n", + " bar_data_cur_cpu_panda[\"architecture\"] = []\n", + " bar_data_cur_cpu_panda[\"compiler\"] = []\n", + " bar_data_cur_cpu_panda[\"runtime\"] = []\n", + " for architecture in results[modname]:\n", + " for compiler in compiler_flags:\n", + " if compiler in results[modname][architecture] and architecture in compiler_flags[compiler]:\n", + " for flags in compiler_flags[compiler][architecture]:\n", + " if compiler == \"nmodl_jit\":\n", + " state_kernel_name = \"nrn_state_{}\".format(modname.replace(\"-\", \"_\"))\n", + " cur_kernel_name = \"nrn_cur_{}\".format(modname.replace(\"-\", \"_\"))\n", + " else:\n", + " state_kernel_name = \"nrn_state_ext\"\n", + " cur_kernel_name = \"nrn_cur_ext\"\n", + " if compiler == \"clang\" and \"jit\" in flags:\n", + " compiler_name = \"mod2ir\"\n", + " elif compiler == \"nmodl_jit\":\n", + " compiler_name = \"mod2ir_jit\"\n", + " else:\n", + " compiler_name = compiler\n", + " if \"svml\" in flags or \"SVML\" in flags:\n", + " compiler_name = compiler_name + \"_svml\"\n", + " elif \"sleef\" in flags or \"SLEEF\" in flags:\n", + " compiler_name = compiler_name + \"_sleef\"\n", + " if architecture == \"default\":\n", + " architecture_label = \"auto-scalar\"\n", + " elif architecture == \"nehalem\":\n", + " architecture_label = \"nehalem-sse2\"\n", + " elif architecture == \"broadwell\":\n", + " architecture_label = \"broadwell-avx2\"\n", + " else: # skylake-avx512\n", + " architecture_label = architecture\n", + " bar_data_state_cpu_panda[\"architecture\"].append(architecture_label)\n", + " bar_data_state_cpu_panda[\"compiler\"].append(compiler_name)\n", + " if _get_flags_string(flags) not in results[modname][architecture][compiler]:\n", + " bar_data_state_cpu_panda[\"runtime\"].append(0)\n", + " else:\n", + " bar_data_state_cpu_panda[\"runtime\"].append(results[modname][architecture][compiler][_get_flags_string(flags)][state_kernel_name][0])\n", + " bar_data_cur_cpu_panda[\"architecture\"].append(architecture_label)\n", + " bar_data_cur_cpu_panda[\"compiler\"].append(compiler_name)\n", + " if _get_flags_string(flags) not in results[modname][architecture][compiler]:\n", + " bar_data_cur_cpu_panda[\"runtime\"].append(0)\n", + " else:\n", + " bar_data_cur_cpu_panda[\"runtime\"].append(results[modname][architecture][compiler][_get_flags_string(flags)][cur_kernel_name][0])\n", + " fig, axes = plt.subplots(1, 2, squeeze=False, figsize=(48,24))\n", + " df_state = pd.DataFrame(bar_data_state_cpu_panda, columns=[\"architecture\", \"compiler\", \"runtime\"])\n", + " sns.barplot(x='architecture', y='runtime', hue='compiler', data=df_state, ax=axes[0,0])\n", + " axes[0,0].xaxis.label.set_visible(False)\n", + " axes[0,0].yaxis.label.set_visible(False)\n", + " axes[0,0].set_title(\"nrn_state runtime for {}\".format(modname))\n", + " axes[0,0].get_legend().remove()\n", + " if print_values:\n", + " for i in axes[0,0].containers:\n", + " axes[0,0].bar_label(i,)\n", + " df_cur = pd.DataFrame(bar_data_cur_cpu_panda, columns=[\"architecture\", \"compiler\", \"runtime\"])\n", + " sns.barplot(x='architecture', y='runtime', hue='compiler', data=df_cur, 
ax=axes[0,1])\n", + " axes[0,1].xaxis.label.set_visible(False)\n", + " axes[0,1].yaxis.label.set_visible(False)\n", + " axes[0,1].set_title(\"nrn_cur runtime for {}\".format(modname))\n", + " if print_values:\n", + " for i in axes[0,1].containers:\n", + " axes[0,1].bar_label(i,)\n", + " fig.text(0.5, 0.04, 'Target Microarchitecture-Instruction Set', ha='center', va='center')\n", + " fig.text(0.06, 0.5, 'Runtime (s)', ha='center', va='center', rotation='vertical')\n", + " plt.legend(bbox_to_anchor=(1.04,1), loc=\"upper left\")\n", + " plt.savefig(\"{}/{}_benchmark_{}.pdf\".format(output_dir, modname, graph_suffix), format=\"pdf\", bbox_inches=\"tight\")\n", + " plt.show()\n", + " plt.close()\n", + "\n", + "compilers_comparison_config = \"\"\"\n", + "{\n", + " \"intel\": {\n", + " \"default\": [\n", + " \"-O2 -prec-div\"\n", + " ],\n", + " \"nehalem\": [\n", + " \"-O2 -msse2 -prec-div -fimf-use-svml\"\n", + " ],\n", + " \"broadwell\": [\n", + " \"-O2 -march=broadwell -mtune=broadwell -prec-div -fimf-use-svml\"\n", + " ],\n", + " \"skylake-avx512\": [\n", + " \"-O2 -march=skylake-avx512 -mtune=skylake -prec-div -fimf-use-svml\"\n", + " ]\n", + " },\n", + " \"gcc\": {\n", + " \"default\": [\n", + " \"-O3 -ffast-math -ftree-vectorize\",\n", + " \"-O3 -ffast-math -ftree-vectorize -mveclibabi=svml\"\n", + " ],\n", + " \"nehalem\": [\n", + " \"-O3 -march=nehalem -mtune=nehalem -ffast-math -ftree-vectorize -fopenmp\",\n", + " \"-O3 -march=nehalem -mtune=nehalem -ffast-math -ftree-vectorize -mveclibabi=svml -fopenmp\"\n", + " ],\n", + " \"broadwell\": [\n", + " \"-O3 -march=broadwell -mtune=broadwell -ffast-math -ftree-vectorize -fopenmp\",\n", + " \"-O3 -march=broadwell -mtune=broadwell -ffast-math -ftree-vectorize -mveclibabi=svml -fopenmp\"\n", + " ],\n", + " \"skylake-avx512\": [\n", + " \"-O3 -march=skylake-avx512 -mtune=skylake -ffast-math -ftree-vectorize -fopenmp\",\n", + " \"-O3 -march=skylake-avx512 -mtune=skylake -ffast-math -ftree-vectorize -mveclibabi=svml -fopenmp\"\n", + " ]\n", + " },\n", + " \"clang\": {\n", + " \"default\": [\n", + " \"-O3 -ffast-math\",\n", + " \"-O3 -ffast-math -fveclib=SVML\",\n", + " \"-O3 -ffast-math jit SVML\",\n", + " \"-O3 -ffast-math jit SLEEF\"\n", + " ],\n", + " \"nehalem\": [\n", + " \"-O3 -march=nehalem -mtune=nehalem -ffast-math -fopenmp\",\n", + " \"-O3 -march=nehalem -mtune=nehalem -ffast-math -fopenmp -fveclib=SVML\",\n", + " \"-O3 -march=nehalem -mtune=nehalem -ffast-math -fopenmp jit SVML\",\n", + " \"-O3 -march=nehalem -mtune=nehalem -ffast-math -fopenmp jit SLEEF\"\n", + " ],\n", + " \"broadwell\": [\n", + " \"-O3 -march=broadwell -mtune=broadwell -ffast-math -fopenmp\",\n", + " \"-O3 -march=broadwell -mtune=broadwell -ffast-math -fopenmp -fveclib=SVML\",\n", + " \"-O3 -march=broadwell -mtune=broadwell -ffast-math -fopenmp jit SVML\",\n", + " \"-O3 -march=broadwell -mtune=broadwell -ffast-math -fopenmp jit SLEEF\"\n", + " ],\n", + " \"skylake-avx512\": [\n", + " \"-O3 -march=skylake-avx512 -mtune=skylake -ffast-math -fopenmp\",\n", + " \"-O3 -march=skylake-avx512 -mtune=skylake -ffast-math -fopenmp -fveclib=SVML\",\n", + " \"-O3 -march=skylake-avx512 -mtune=skylake -ffast-math -fopenmp jit SVML\",\n", + " \"-O3 -march=skylake-avx512 -mtune=skylake -ffast-math -fopenmp jit SLEEF\"\n", + " ]\n", + " },\n", + " \"nmodl_jit\": {\n", + " \"default\": [\n", + " \"SVML_nnancontractafn\",\n", + " \"SLEEF_nnancontractafn\"\n", + " ],\n", + " \"nehalem\": [\n", + " \"SVML_nnancontractafn\",\n", + " \"SLEEF_nnancontractafn\"\n", + " ],\n", + " 
\"broadwell\": [\n", + " \"SVML_nnancontractafn\",\n", + " \"SLEEF_nnancontractafn\"\n", + " ],\n", + " \"skylake-avx512\": [\n", + " \"SVML_nnancontractafn\",\n", + " \"SLEEF_nnancontractafn\"\n", + " ]\n", + " }\n", + "}\n", + "\"\"\"\n", + "\n", + "generate_graph_pandas(results, compilers_comparison_config, \"cpu_all_compilers\", \"graphs_output_pandas\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "c788f88f", + "metadata": {}, + "outputs": [], + "source": [ + "compilers_comparison_config = \"\"\"\n", + "{\n", + " \"intel\": {\n", + " \"nehalem\": [\n", + " \"-O2 -msse2 -prec-div -fimf-use-svml\"\n", + " ],\n", + " \"broadwell\": [\n", + " \"-O2 -march=broadwell -mtune=broadwell -prec-div -fimf-use-svml\"\n", + " ],\n", + " \"skylake-avx512\": [\n", + " \"-O2 -march=skylake-avx512 -mtune=skylake -prec-div -fimf-use-svml\"\n", + " ]\n", + " },\n", + " \"nmodl_jit\": {\n", + " \"nehalem\": [\n", + " \"SVML_nnancontractafn\"\n", + " ],\n", + " \"broadwell\": [\n", + " \"SVML_nnancontractafn\"\n", + " ],\n", + " \"skylake-avx512\": [\n", + " \"SVML_nnancontractafn\"\n", + " ]\n", + " }\n", + "}\n", + "\"\"\"\n", + "colors = ['#0570b0','#969696']\n", + "sns.set_palette(sns.color_palette(colors))\n", + "\n", + "generate_graph_pandas(results, compilers_comparison_config, \"cpu_intel_vs_nmodl\", \"graphs_output_pandas\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "d18eced2", + "metadata": {}, + "outputs": [], + "source": [ + "def generate_graph_pandas_gpu(results, compilers_comparison_config, graph_suffix, output_dir, print_values=False):\n", + " compiler_flags = json.loads(compilers_comparison_config)\n", + " for modname in results:\n", + " df = None\n", + " bar_data_gpu_panda = {}\n", + " bar_data_gpu_panda[\"kernel\"] = []\n", + " bar_data_gpu_panda[\"compiler\"] = []\n", + " bar_data_gpu_panda[\"runtime\"] = []\n", + " architecture = \"nvptx64\"\n", + " for compiler in results[modname][architecture]:\n", + " if compiler in compiler_flags and architecture in compiler_flags[compiler]:\n", + " for flags in compiler_flags[compiler][architecture]:\n", + " dict_label = \"{}_{}_{}\".format(architecture, compiler, _get_flags_string(flags))\n", + " if compiler == \"nmodl_jit\":\n", + " state_kernel_name = \"nrn_state_{}\".format(modname.replace(\"-\", \"_\"))\n", + " cur_kernel_name = \"nrn_cur_{}\".format(modname.replace(\"-\", \"_\"))\n", + " else:\n", + " state_kernel_name = \"nrn_state_ext\"\n", + " cur_kernel_name = \"nrn_cur_ext\"\n", + " if compiler == \"clang\" and \"jit\" in flags:\n", + " compiler_name = \"mod2ir\"\n", + " elif compiler == \"nmodl_jit\":\n", + " compiler_name = \"mod2ir_jit\"\n", + " else:\n", + " compiler_name = compiler\n", + " bar_data_gpu_panda[\"kernel\"].append(\"nrn_state\")\n", + " bar_data_gpu_panda[\"compiler\"].append(compiler_name)\n", + " if _get_flags_string(flags) not in results[modname][architecture][compiler]:\n", + " bar_data_gpu_panda[\"runtime\"].append(0)\n", + " else:\n", + " bar_data_gpu_panda[\"runtime\"].append(results[modname][architecture][compiler][_get_flags_string(flags)][state_kernel_name][0])\n", + " bar_data_gpu_panda[\"kernel\"].append(\"nrn_current\")\n", + " bar_data_gpu_panda[\"compiler\"].append(compiler_name)\n", + " if _get_flags_string(flags) not in results[modname][architecture][compiler]:\n", + " bar_data_gpu_panda[\"runtime\"].append(0)\n", + " else:\n", + " 
bar_data_gpu_panda[\"runtime\"].append(results[modname][architecture][compiler][_get_flags_string(flags)][cur_kernel_name][0])\n", + " df_state = pd.DataFrame(bar_data_gpu_panda, columns=[\"kernel\", \"compiler\", \"runtime\"])\n", + " fig, ax = plt.subplots(figsize=(6,6))\n", + " ax = sns.barplot(x='kernel', y='runtime', hue='compiler', data=df_state, ax=ax)\n", + " if print_values:\n", + " for i in ax.containers:\n", + " ax.bar_label(i,)\n", + " plt.xlabel(\"Kernel Name\")\n", + " plt.ylabel(\"Runtime (s)\")\n", + " plt.title(\"OpenACC and MOD2IR comparison for {}\".format(modname))\n", + " plt.legend(bbox_to_anchor=(1.04,1), loc=\"upper left\")\n", + " plt.savefig(\"{}/{}_benchmark_{}.pdf\".format(output_dir, modname, graph_suffix), format=\"pdf\", bbox_inches=\"tight\")\n", + " plt.show()\n", + " plt.close()\n", + "\n", + "compilers_comparison_config = \"\"\"\n", + "{\n", + " \"nvhpc\": {\n", + " \"nvptx64\": [\n", + " \"-O3 -gpu=nordc,fastmath\"\n", + " ]\n", + " },\n", + " \"nmodl_jit\": {\n", + " \"nvptx64\": [\n", + " \"libdevice_nnancontractafn\"\n", + " ]\n", + " }\n", + "}\n", + "\"\"\"\n", + "colors = ['#b2df8a','#bdbdbd']\n", + "sns.set_palette(sns.color_palette(colors))\n", + "\n", + "generate_graph_pandas_gpu(results, compilers_comparison_config, \"hh_openacc_vs_nmodl\", \"graphs_output_pandas\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "422cdca2", + "metadata": {}, + "outputs": [], + "source": [ + "colors = ['#0570b0','#66c2a4','#238b45','#fdd49e','#fc8d59','#9ebcda','#8c96c6','#969696','#525252']\n", + "# pick colors according to the following order which matches the order of \n", + "# intel #023858\n", + "# gcc #66c2a4\n", + "# gcc_svml #238b45\n", + "# clang #fdd49e\n", + "# clang_svml #fc8d59\n", + "# mod2ir #9ebcda\n", + "# mod2ir_svml #8c96c6\n", + "# mor2it_jit_svml #969696\n", + "# mod2ir_jit_sleef #525252\n", + "sns.set_palette(sns.color_palette(colors))\n", + "compilers_comparison_config = \"\"\"\n", + "{\n", + " \"intel\": {\n", + " \"broadwell\": [\n", + " \"-O2 -march=broadwell -mtune=broadwell -prec-div -fimf-use-svml\"\n", + " ],\n", + " \"skylake-avx512\": [\n", + " \"-O2 -march=skylake-avx512 -mtune=skylake -prec-div -fimf-use-svml\"\n", + " ]\n", + " },\n", + " \"clang\": {\n", + " \"broadwell\": [\n", + " \"-O3 -march=broadwell -mtune=broadwell -ffast-math -fopenmp\",\n", + " \"-O3 -march=broadwell -mtune=broadwell -ffast-math -fopenmp -fveclib=SVML\",\n", + " \"-O3 -march=broadwell -mtune=broadwell -ffast-math -fopenmp jit SVML\",\n", + " \"-O3 -march=broadwell -mtune=broadwell -ffast-math -fopenmp jit SLEEF\"\n", + " ],\n", + " \"skylake-avx512\": [\n", + " \"-O3 -march=skylake-avx512 -mtune=skylake -ffast-math -fopenmp\",\n", + " \"-O3 -march=skylake-avx512 -mtune=skylake -ffast-math -fopenmp -fveclib=SVML\",\n", + " \"-O3 -march=skylake-avx512 -mtune=skylake -ffast-math -fopenmp jit SVML\",\n", + " \"-O3 -march=skylake-avx512 -mtune=skylake -ffast-math -fopenmp jit SLEEF\"\n", + " ]\n", + " },\n", + " \"nmodl_jit\": {\n", + " \"broadwell\": [\n", + " \"SVML_nnancontractafn\",\n", + " \"SLEEF_nnancontractafn\"\n", + " ],\n", + " \"skylake-avx512\": [\n", + " \"SVML_nnancontractafn\",\n", + " \"SLEEF_nnancontractafn\"\n", + " ]\n", + " }\n", + "}\n", + "\"\"\"\n", + "\n", + "generate_graph_pandas(results, compilers_comparison_config, \"cpu_intel_clang_mod2ir_compilers\", \"graphs_output_pandas\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "0c77de9d", + "metadata": {}, + "outputs": [], + 
"source": [ + "results_expsyn = {}\n", + "results_expsyn = load_pickle_result_file([\"./reference_data/expsyn_icc_clang.pickle\"], results_expsyn)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "c08266df", + "metadata": {}, + "outputs": [], + "source": [ + "compilers_comparison_config = \"\"\"\n", + "{\n", + " \"intel\": {\n", + " \"default\": [\n", + " \"-O2 -prec-div\"\n", + " ],\n", + " \"nehalem\": [\n", + " \"-O2 -msse2 -prec-div -fimf-use-svml\"\n", + " ],\n", + " \"broadwell\": [\n", + " \"-O2 -march=broadwell -mtune=broadwell -prec-div -fimf-use-svml\"\n", + " ],\n", + " \"skylake-avx512\": [\n", + " \"-O2 -march=skylake-avx512 -mtune=skylake -prec-div -fimf-use-svml\"\n", + " ]\n", + " },\n", + " \"clang\": {\n", + " \"default\": [\n", + " \"-O3 -ffast-math\",\n", + " \"-O3 -ffast-math -fveclib=SVML\",\n", + " \"-O3 -ffast-math jit SVML\",\n", + " \"-O3 -ffast-math jit SLEEF\"\n", + " ],\n", + " \"nehalem\": [\n", + " \"-O3 -march=nehalem -mtune=nehalem -ffast-math -fopenmp\",\n", + " \"-O3 -march=nehalem -mtune=nehalem -ffast-math -fopenmp -fveclib=SVML\",\n", + " \"-O3 -march=nehalem -mtune=nehalem -ffast-math -fopenmp jit SVML\",\n", + " \"-O3 -march=nehalem -mtune=nehalem -ffast-math -fopenmp jit SLEEF\"\n", + " ],\n", + " \"broadwell\": [\n", + " \"-O3 -march=broadwell -mtune=broadwell -ffast-math -fopenmp\",\n", + " \"-O3 -march=broadwell -mtune=broadwell -ffast-math -fopenmp -fveclib=SVML\",\n", + " \"-O3 -march=broadwell -mtune=broadwell -ffast-math -fopenmp jit SVML\",\n", + " \"-O3 -march=broadwell -mtune=broadwell -ffast-math -fopenmp jit SLEEF\"\n", + " ],\n", + " \"skylake-avx512\": [\n", + " \"-O3 -march=skylake-avx512 -mtune=skylake -ffast-math -fopenmp\",\n", + " \"-O3 -march=skylake-avx512 -mtune=skylake -ffast-math -fopenmp -fveclib=SVML\",\n", + " \"-O3 -march=skylake-avx512 -mtune=skylake -ffast-math -fopenmp jit SVML\",\n", + " \"-O3 -march=skylake-avx512 -mtune=skylake -ffast-math -fopenmp jit SLEEF\"\n", + " ]\n", + " },\n", + " \"gcc\": {\n", + " \"default\": [\n", + " \"-O3 -ffast-math -ftree-vectorize\",\n", + " \"-O3 -ffast-math -ftree-vectorize -mveclibabi=svml\"\n", + " ],\n", + " \"nehalem\": [\n", + " \"-O3 -march=nehalem -mtune=nehalem -ffast-math -ftree-vectorize -fopenmp\",\n", + " \"-O3 -march=nehalem -mtune=nehalem -ffast-math -ftree-vectorize -mveclibabi=svml -fopenmp\"\n", + " ],\n", + " \"broadwell\": [\n", + " \"-O3 -march=broadwell -mtune=broadwell -ffast-math -ftree-vectorize -fopenmp\",\n", + " \"-O3 -march=broadwell -mtune=broadwell -ffast-math -ftree-vectorize -mveclibabi=svml -fopenmp\"\n", + " ],\n", + " \"skylake-avx512\": [\n", + " \"-O3 -march=skylake-avx512 -mtune=skylake -ffast-math -ftree-vectorize -fopenmp\",\n", + " \"-O3 -march=skylake-avx512 -mtune=skylake -ffast-math -ftree-vectorize -mveclibabi=svml -fopenmp\"\n", + " ]\n", + " },\n", + " \"nmodl_jit\": {\n", + " \"default\": [\n", + " \"SVML_nnancontractafn\",\n", + " \"SLEEF_nnancontractafn\"\n", + " ],\n", + " \"nehalem\": [\n", + " \"SVML_nnancontractafn\",\n", + " \"SLEEF_nnancontractafn\"\n", + " ],\n", + " \"broadwell\": [\n", + " \"SVML_nnancontractafn\",\n", + " \"SLEEF_nnancontractafn\"\n", + " ],\n", + " \"skylake-avx512\": [\n", + " \"SVML_nnancontractafn\",\n", + " \"SLEEF_nnancontractafn\"\n", + " ]\n", + " }\n", + "}\n", + "\"\"\"\n", + "generate_graph_pandas(results_expsyn, compilers_comparison_config, \"cpu_intel_clang_mod2ir_compilers_expsyn\", \"graphs_output_pandas\")" + ] + }, + { + "cell_type": "code", + "execution_count": 
null, + "id": "7769a0b4", + "metadata": {}, + "outputs": [], + "source": [ + "results_hh = {}\n", + "results_hh = load_pickle_result_file([\"./reference_data/hh_ic_clang_gcc_w_wout_svml.pickle\"], results_hh)\n", + "colors = ['#6baed6', '#0570b0', '#66c2a4','#238b45','#fdd49e','#fc8d59','#9ebcda','#8c96c6','#969696','#525252']\n", + "# pick colors according to the following order which matches the order of \n", + "# intel #6baed6\n", + "# intel svml #0570b0\n", + "# gcc #66c2a4\n", + "# gcc_svml #238b45\n", + "# clang #fdd49e\n", + "# clang_svml #fc8d59\n", + "# mod2ir #9ebcda\n", + "# mod2ir_svml #8c96c6\n", + "# mor2it_jit_svml #969696\n", + "# mod2ir_jit_sleef #525252\n", + "sns.set_palette(sns.color_palette(colors))\n", + "compilers_comparison_config = \"\"\"\n", + "{\n", + " \"intel\": {\n", + " \"default\": [\n", + " \"-O2 -prec-div\",\n", + " \"-O2 -prec-div -fimf-use-svml\"\n", + " ],\n", + " \"nehalem\": [\n", + " \"-O2 -msse2 -prec-div\",\n", + " \"-O2 -msse2 -prec-div -fimf-use-svml\"\n", + " ],\n", + " \"broadwell\": [\n", + " \"-O2 -march=broadwell -mtune=broadwell -prec-div\",\n", + " \"-O2 -march=broadwell -mtune=broadwell -prec-div -fimf-use-svml\"\n", + " ],\n", + " \"skylake-avx512\": [\n", + " \"-O2 -march=skylake-avx512 -mtune=skylake -prec-div -fopenmp\",\n", + " \"-O2 -march=skylake-avx512 -mtune=skylake -prec-div -fimf-use-svml -fopenmp\"\n", + " ]\n", + " },\n", + " \"gcc\": {\n", + " \"default\": [\n", + " \"-O3 -ffast-math -ftree-vectorize\",\n", + " \"-O3 -ffast-math -ftree-vectorize -mveclibabi=svml -fopenmp\"\n", + " ],\n", + " \"nehalem\": [\n", + " \"-O3 -march=nehalem -mtune=nehalem -ffast-math -ftree-vectorize -fopenmp\",\n", + " \"-O3 -march=nehalem -mtune=nehalem -ffast-math -ftree-vectorize -mveclibabi=svml -fopenmp\"\n", + " ],\n", + " \"broadwell\": [\n", + " \"-O3 -march=broadwell -mtune=broadwell -ffast-math -ftree-vectorize -fopenmp\",\n", + " \"-O3 -march=broadwell -mtune=broadwell -ffast-math -ftree-vectorize -mveclibabi=svml -fopenmp\"\n", + " ],\n", + " \"skylake-avx512\": [\n", + " \"-O3 -march=skylake-avx512 -mtune=skylake -ffast-math -ftree-vectorize -fopenmp\",\n", + " \"-O3 -march=skylake-avx512 -mtune=skylake -ffast-math -ftree-vectorize -mveclibabi=svml -fopenmp\"\n", + " ]\n", + " },\n", + " \"clang\": {\n", + " \"default\": [\n", + " \"-O3 -ffast-math\",\n", + " \"-O3 -ffast-math -fopenmp -fveclib=SVML\",\n", + " \"-O3 -ffast-math jit SVML\",\n", + " \"-O3 -ffast-math jit SLEEF\"\n", + " ],\n", + " \"nehalem\": [\n", + " \"-O3 -march=nehalem -mtune=nehalem -ffast-math -fopenmp\",\n", + " \"-O3 -march=nehalem -mtune=nehalem -ffast-math -fopenmp -fveclib=SVML\",\n", + " \"-O3 -march=nehalem -mtune=nehalem -ffast-math -fopenmp jit SVML\",\n", + " \"-O3 -march=nehalem -mtune=nehalem -ffast-math -fopenmp jit SLEEF\"\n", + " ],\n", + " \"broadwell\": [\n", + " \"-O3 -march=broadwell -mtune=broadwell -ffast-math -fopenmp\",\n", + " \"-O3 -march=broadwell -mtune=broadwell -ffast-math -fopenmp -fveclib=SVML\",\n", + " \"-O3 -march=broadwell -mtune=broadwell -ffast-math -fopenmp jit SVML\",\n", + " \"-O3 -march=broadwell -mtune=broadwell -ffast-math -fopenmp jit SLEEF\"\n", + " ],\n", + " \"skylake-avx512\": [\n", + " \"-O3 -march=skylake-avx512 -mtune=skylake -ffast-math -fopenmp\",\n", + " \"-O3 -march=skylake-avx512 -mtune=skylake -ffast-math -fopenmp -fveclib=SVML\",\n", + " \"-O3 -march=skylake-avx512 -mtune=skylake -ffast-math -fopenmp jit SVML\",\n", + " \"-O3 -march=skylake-avx512 -mtune=skylake -ffast-math -fopenmp jit SLEEF\"\n", + " 
]\n", + " },\n", + " \"nmodl_jit\": {\n", + " \"default\": [\n", + " \"SVML_nnancontractafn\",\n", + " \"SLEEF_nnancontractafn\"\n", + " ],\n", + " \"nehalem\": [\n", + " \"SVML_nnancontractafn\",\n", + " \"SLEEF_nnancontractafn\"\n", + " ],\n", + " \"broadwell\": [\n", + " \"SVML_nnancontractafn\",\n", + " \"SLEEF_nnancontractafn\"\n", + " ],\n", + " \"skylake-avx512\": [\n", + " \"SVML_nnancontractafn\",\n", + " \"SLEEF_nnancontractafn\"\n", + " ]\n", + " }\n", + "}\n", + "\"\"\"\n", + "generate_graph_pandas(results_hh, compilers_comparison_config, \"hh_icc_clang_gcc_w_wout_svml\", \"graphs_output_pandas\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "4259c982", + "metadata": {}, + "outputs": [], + "source": [ + "colors = ['#6baed6', '#0570b0', '#66c2a4','#238b45','#fdd49e','#fc8d59','#9ebcda','#8c96c6','#969696','#525252']\n", + "# pick colors according to the following order which matches the order of \n", + "# intel #6baed6\n", + "# intel svml #0570b0\n", + "# gcc #66c2a4\n", + "# gcc_svml #238b45\n", + "# clang #fdd49e\n", + "# clang_svml #fc8d59\n", + "# mod2ir #9ebcda\n", + "# mod2ir_svml #8c96c6\n", + "# mor2it_jit_svml #969696\n", + "# mod2ir_jit_sleef #525252\n", + "sns.set_palette(sns.color_palette(colors))\n", + "compilers_comparison_config = \"\"\"\n", + "{\n", + " \"intel\": {\n", + " \"default\": [\n", + " \"-O2 -prec-div\",\n", + " \"-O2 -prec-div -fimf-use-svml\"\n", + " ],\n", + " \"skylake-avx512\": [\n", + " \"-O2 -march=skylake-avx512 -mtune=skylake -prec-div -fopenmp\",\n", + " \"-O2 -march=skylake-avx512 -mtune=skylake -prec-div -fimf-use-svml -fopenmp\"\n", + " ]\n", + " },\n", + " \"gcc\": {\n", + " \"default\": [\n", + " \"-O3 -ffast-math -ftree-vectorize\",\n", + " \"-O3 -ffast-math -ftree-vectorize -mveclibabi=svml -fopenmp\"\n", + " ],\n", + " \"skylake-avx512\": [\n", + " \"-O3 -march=skylake-avx512 -mtune=skylake -ffast-math -ftree-vectorize -fopenmp\",\n", + " \"-O3 -march=skylake-avx512 -mtune=skylake -ffast-math -ftree-vectorize -mveclibabi=svml -fopenmp\"\n", + " ]\n", + " },\n", + " \"clang\": {\n", + " \"default\": [\n", + " \"-O3 -ffast-math\",\n", + " \"-O3 -ffast-math -fopenmp -fveclib=SVML\",\n", + " \"-O3 -ffast-math jit SVML\",\n", + " \"-O3 -ffast-math jit SLEEF\"\n", + " ],\n", + " \"skylake-avx512\": [\n", + " \"-O3 -march=skylake-avx512 -mtune=skylake -ffast-math -fopenmp\",\n", + " \"-O3 -march=skylake-avx512 -mtune=skylake -ffast-math -fopenmp -fveclib=SVML\",\n", + " \"-O3 -march=skylake-avx512 -mtune=skylake -ffast-math -fopenmp jit SVML\",\n", + " \"-O3 -march=skylake-avx512 -mtune=skylake -ffast-math -fopenmp jit SLEEF\"\n", + " ]\n", + " },\n", + " \"nmodl_jit\": {\n", + " \"default\": [\n", + " \"SVML_nnancontractafn\",\n", + " \"SLEEF_nnancontractafn\"\n", + " ],\n", + " \"skylake-avx512\": [\n", + " \"SVML_nnancontractafn\",\n", + " \"SLEEF_nnancontractafn\"\n", + " ]\n", + " }\n", + "}\n", + "\"\"\"\n", + "generate_graph_pandas(results_hh, compilers_comparison_config, \"hh_icc_clang_gcc_w_wout_svml\", \"graphs_output_pandas\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "a9d862a8", + "metadata": {}, + "outputs": [], + "source": [ + "expsyn_cpu_results = {}\n", + "expsyn_cpu_results= load_pickle_result_file([\"./reference_data/expsyn_cpu_results.pickle\"], expsyn_cpu_results)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "523af569", + "metadata": {}, + "outputs": [], + "source": [ + "generate_graph_pandas(expsyn_cpu_results, 
compilers_comparison_config, \"expsyn_icc_clang_gcc_w_wout_svml\", \"graphs_output_pandas\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "a468366e", + "metadata": {}, + "outputs": [], + "source": [ + "expsyn_gpu_results = {}\n", + "expsyn_gpu_results = load_pickle_result_file([\"./reference_data/expsyn_gpu.pickle\"], expsyn_gpu_results)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "0640f8af", + "metadata": {}, + "outputs": [], + "source": [ + "compilers_comparison_config = \"\"\"\n", + "{\n", + " \"nvhpc\": {\n", + " \"nvptx64\": [\n", + " \"-O3 -gpu=nordc,fastmath\"\n", + " ]\n", + " },\n", + " \"nmodl_jit\": {\n", + " \"nvptx64\": [\n", + " \"libdevice_nnancontractafn\"\n", + " ]\n", + " }\n", + "}\n", + "\"\"\"\n", + "colors = ['#b2df8a','#bdbdbd']\n", + "sns.set_palette(sns.color_palette(colors))\n", + "\n", + "generate_graph_pandas_gpu(expsyn_gpu_results, compilers_comparison_config, \"exp_openacc_vs_nmodl\", \"graphs_output_pandas\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "ff31a4a0", + "metadata": {}, + "outputs": [], + "source": [ + "gpu_results = {}\n", + "gpu_results = load_pickle_result_file([\"./reference_data/hh_gpu.pickle\", \"./reference_data/expsyn_gpu.pickle\"], gpu_results)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "597d5032", + "metadata": {}, + "outputs": [], + "source": [ + "def generate_graph_pandas_gpu_combined(results, compilers_comparison_config, graph_suffix, output_dir, print_values=False):\n", + " compiler_flags = json.loads(compilers_comparison_config)\n", + " fig = plt.figure(figsize=(12, 6))\n", + " for i, modname in enumerate(results):\n", + " df = None\n", + " bar_data_gpu_panda = {}\n", + " bar_data_gpu_panda[\"kernel\"] = []\n", + " bar_data_gpu_panda[\"compiler\"] = []\n", + " bar_data_gpu_panda[\"runtime\"] = []\n", + " architecture = \"nvptx64\"\n", + " for compiler in results[modname][architecture]:\n", + " if compiler in compiler_flags and architecture in compiler_flags[compiler]:\n", + " for flags in compiler_flags[compiler][architecture]:\n", + " dict_label = \"{}_{}_{}\".format(architecture, compiler, _get_flags_string(flags))\n", + " if compiler == \"nmodl_jit\":\n", + " state_kernel_name = \"nrn_state_{}\".format(modname.replace(\"-\", \"_\"))\n", + " cur_kernel_name = \"nrn_cur_{}\".format(modname.replace(\"-\", \"_\"))\n", + " else:\n", + " state_kernel_name = \"nrn_state_ext\"\n", + " cur_kernel_name = \"nrn_cur_ext\"\n", + " if compiler == \"clang\" and \"jit\" in flags:\n", + " compiler_name = \"mod2ir\"\n", + " elif compiler == \"nmodl_jit\":\n", + " compiler_name = \"mod2ir_jit\"\n", + " else:\n", + " compiler_name = compiler\n", + " bar_data_gpu_panda[\"kernel\"].append(\"nrn_state\")\n", + " bar_data_gpu_panda[\"compiler\"].append(compiler_name)\n", + " if _get_flags_string(flags) not in results[modname][architecture][compiler]:\n", + " bar_data_gpu_panda[\"runtime\"].append(0)\n", + " else:\n", + " bar_data_gpu_panda[\"runtime\"].append(results[modname][architecture][compiler][_get_flags_string(flags)][state_kernel_name][0])\n", + " bar_data_gpu_panda[\"kernel\"].append(\"nrn_current\")\n", + " bar_data_gpu_panda[\"compiler\"].append(compiler_name)\n", + " if _get_flags_string(flags) not in results[modname][architecture][compiler]:\n", + " bar_data_gpu_panda[\"runtime\"].append(0)\n", + " else:\n", + " 
bar_data_gpu_panda[\"runtime\"].append(results[modname][architecture][compiler][_get_flags_string(flags)][cur_kernel_name][0])\n", + " df_state = pd.DataFrame(bar_data_gpu_panda, columns=[\"kernel\", \"compiler\", \"runtime\"])\n", + " ax = fig.add_subplot(1, 2, i+1)\n", + " ax = sns.barplot(x='kernel', y='runtime', hue='compiler', data=df_state, ax=ax)\n", + " if print_values:\n", + " for i in ax.containers:\n", + " ax.bar_label(i,)\n", + " plt.xlabel(\"Kernel Name\")\n", + " if i == 0:\n", + " plt.ylabel(\"Runtime (s)\")\n", + " else:\n", + " ax.set(ylabel=None)\n", + " plt.title(\"OpenACC and MOD2IR comparison for {}\".format(modname))\n", + " # plt.legend(bbox_to_anchor=(1.04,1), loc=\"upper left\")\n", + " # plt.savefig(\"{}/{}_benchmark_{}.pdf\".format(output_dir, modname, graph_suffix), format=\"pdf\", bbox_inches=\"tight\")\n", + " plt.savefig(\"{}/gpu_combined_benchmark_{}.pdf\".format(output_dir, graph_suffix), format=\"pdf\", bbox_inches=\"tight\")\n", + " plt.show()\n", + " plt.close()\n", + "\n", + "compilers_comparison_config = \"\"\"\n", + "{\n", + " \"nvhpc\": {\n", + " \"nvptx64\": [\n", + " \"-O3 -gpu=nordc,fastmath\"\n", + " ]\n", + " },\n", + " \"nmodl_jit\": {\n", + " \"nvptx64\": [\n", + " \"libdevice_nnancontractafn\"\n", + " ]\n", + " }\n", + "}\n", + "\"\"\"\n", + "colors = ['#b2df8a','#bdbdbd']\n", + "sns.set_palette(sns.color_palette(colors))\n", + "\n", + "generate_graph_pandas_gpu_combined(gpu_results, compilers_comparison_config, \"hh_expsyn_gpu\", \"graphs_output_pandas\")\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "e31acf4f", + "metadata": {}, + "outputs": [], + "source": [ + "hh_expsyn_avx512f_results = {}\n", + "hh_expsyn_avx512f_results = load_pickle_result_file([\"./reference_data/hh_expsyn_mavx512f.pickle\"], hh_expsyn_avx512f_results)\n", + "colors = ['#6baed6', '#0570b0', '#66c2a4','#238b45','#fdd49e','#fc8d59','#9ebcda','#8c96c6','#969696','#525252']\n", + "# pick colors according to the following order which matches the order of \n", + "# intel #6baed6\n", + "# intel svml #0570b0\n", + "# gcc #66c2a4\n", + "# gcc_svml #238b45\n", + "# clang #fdd49e\n", + "# clang_svml #fc8d59\n", + "# mod2ir #9ebcda\n", + "# mod2ir_svml #8c96c6\n", + "# mor2it_jit_svml #969696\n", + "# mod2ir_jit_sleef #525252\n", + "sns.set_palette(sns.color_palette(colors))\n", + "compilers_comparison_config = \"\"\"\n", + "{\n", + " \"intel\": {\n", + " \"default\": [\n", + " \"-O2 -prec-div\",\n", + " \"-O2 -prec-div -fimf-use-svml\"\n", + " ],\n", + " \"skylake-avx512\": [\n", + " \"-O2 -mavx512f -prec-div -fopenmp\",\n", + " \"-O2 -mavx512f -prec-div -fimf-use-svml -fopenmp\"\n", + " ]\n", + " },\n", + " \"gcc\": {\n", + " \"default\": [\n", + " \"-O3 -ffast-math -ftree-vectorize\",\n", + " \"-O3 -ffast-math -ftree-vectorize -mveclibabi=svml -fopenmp\"\n", + " ],\n", + " \"skylake-avx512\": [\n", + " \"-O3 -march=skylake-avx512 -mtune=skylake -mavx512f -ffast-math -ftree-vectorize -fopenmp\",\n", + " \"-O3 -march=skylake-avx512 -mtune=skylake -mavx512f -ffast-math -ftree-vectorize -mveclibabi=svml -fopenmp\"\n", + " ]\n", + " },\n", + " \"clang\": {\n", + " \"default\": [\n", + " \"-O3 -ffast-math\",\n", + " \"-O3 -ffast-math -fopenmp -fveclib=SVML\",\n", + " \"-O3 -ffast-math jit SVML\",\n", + " \"-O3 -ffast-math jit SLEEF\"\n", + " ],\n", + " \"skylake-avx512\": [\n", + " \"-O3 -march=skylake-avx512 -mtune=skylake -ffast-math -fopenmp\",\n", + " \"-O3 -march=skylake-avx512 -mtune=skylake -ffast-math -fopenmp -fveclib=SVML\",\n", + " \"-O3 
-march=skylake-avx512 -mtune=skylake -ffast-math -fopenmp jit SVML\",\n", + " \"-O3 -march=skylake-avx512 -mtune=skylake -ffast-math -fopenmp jit SLEEF\"\n", + " ]\n", + " },\n", + " \"nmodl_jit\": {\n", + " \"default\": [\n", + " \"SVML_nnancontractafn\",\n", + " \"SLEEF_nnancontractafn\"\n", + " ],\n", + " \"skylake-avx512\": [\n", + " \"SVML_nnancontractafn\",\n", + " \"SLEEF_nnancontractafn\"\n", + " ]\n", + " }\n", + "}\n", + "\"\"\"\n", + "generate_graph_pandas(hh_expsyn_avx512f_results, compilers_comparison_config, \"mavfx512f\", \"graphs_output_pandas\", True)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "d3dac17d", + "metadata": {}, + "outputs": [], + "source": [ + "def compare_average_diff_percentage(original_vector, comparison_vector):\n", + " diff_vector = []\n", + " for i in range(len(original_vector)):\n", + " diff_vector.append(100 * (original_vector[i] - comparison_vector[i]) / original_vector[i])\n", + " return np.sum(diff_vector) / len(diff_vector)\n", + "\n", + "def return_results_from_config(results, compilers_comparison_config, modnames):\n", + " return_vec = []\n", + " compiler_flags = json.loads(compilers_comparison_config)\n", + " for modname in results:\n", + " if modname in modnames:\n", + " for architecture in results[modname]:\n", + " for compiler in compiler_flags:\n", + " if compiler in results[modname][architecture] and architecture in compiler_flags[compiler]:\n", + " for flags in compiler_flags[compiler][architecture]:\n", + " if compiler == \"nmodl_jit\":\n", + " state_kernel_name = \"nrn_state_{}\".format(modname.replace(\"-\", \"_\"))\n", + " cur_kernel_name = \"nrn_cur_{}\".format(modname.replace(\"-\", \"_\"))\n", + " else:\n", + " state_kernel_name = \"nrn_state_ext\"\n", + " cur_kernel_name = \"nrn_cur_ext\"\n", + " # print(\"{} {} {} {}\".format(modname, architecture, compiler, flags))\n", + " if _get_flags_string(flags) in results[modname][architecture][compiler]:\n", + " return_vec.append(results[modname][architecture][compiler][_get_flags_string(flags)][state_kernel_name][0])\n", + " if _get_flags_string(flags) in results[modname][architecture][compiler]:\n", + " return_vec.append(results[modname][architecture][compiler][_get_flags_string(flags)][cur_kernel_name][0])\n", + " return return_vec\n", + "\n", + "def calculate_overall_averages(results, modnames):\n", + "\n", + " intel_compiler_comparison_config = \"\"\"\n", + " {\n", + " \"intel\": {\n", + " \"skylake-avx512\": [\n", + " \"-O2 -mavx512f -prec-div -fimf-use-svml -fopenmp\"\n", + " ]\n", + " }\n", + " }\n", + " \"\"\"\n", + "\n", + " intel_results = return_results_from_config(hh_expsyn_avx512f_results, intel_compiler_comparison_config, modnames)\n", + " # print(\"intel_results: {}\".format(intel_results))\n", + "\n", + " mod2ir_jit_svml_compiler_comparison_config = \"\"\"\n", + " {\n", + " \"nmodl_jit\": {\n", + " \"skylake-avx512\": [\n", + " \"SVML_nnancontractafn\"\n", + " ]\n", + " }\n", + " }\n", + " \"\"\"\n", + " mod2ir_jit_svml_results = return_results_from_config(hh_expsyn_avx512f_results, mod2ir_jit_svml_compiler_comparison_config, modnames)\n", + " # print(\"mod2ir_jit_svml_results: {}\".format(mod2ir_jit_svml_results))\n", + "\n", + " mod2ir_jit_sleef_compiler_comparison_config = \"\"\"\n", + " {\n", + " \"nmodl_jit\": {\n", + " \"skylake-avx512\": [\n", + " \"SLEEF_nnancontractafn\"\n", + " ]\n", + " }\n", + " }\n", + " \"\"\"\n", + " mod2ir_jit_sleef_results = return_results_from_config(hh_expsyn_avx512f_results, 
mod2ir_jit_sleef_compiler_comparison_config, modnames)\n", + " # print(\"mod2ir_jit_sleef_results: {}\".format(mod2ir_jit_svml_results))\n", + "\n", + " print(\"Intel vs MOD2IR SVML diff (%) {} : {}\".format(modnames, compare_average_diff_percentage(intel_results, mod2ir_jit_svml_results)))\n", + " print(\"Intel vs MOD2IR SLEEF diff (%) {} : {}\".format(modnames, compare_average_diff_percentage(intel_results, mod2ir_jit_sleef_results)))\n", + "\n", + " mo2ir_svml_sleef_compiler_comparison_config = \"\"\"\n", + " {\n", + " \"clang\": {\n", + " \"default\": [\n", + " \"-O3 -ffast-math jit SVML\",\n", + " \"-O3 -ffast-math jit SLEEF\"\n", + " ],\n", + " \"skylake-avx512\": [\n", + " \"-O3 -march=skylake-avx512 -mtune=skylake -ffast-math -fopenmp jit SVML\",\n", + " \"-O3 -march=skylake-avx512 -mtune=skylake -ffast-math -fopenmp jit SLEEF\"\n", + " ]\n", + " }\n", + " }\n", + " \"\"\"\n", + "\n", + " mod2ir_svml_sleef_results = return_results_from_config(hh_expsyn_avx512f_results, mo2ir_svml_sleef_compiler_comparison_config, modnames)\n", + " # print(\"mod2ir_svml_sleef_results: {}\".format(mod2ir_svml_sleef_results))\n", + "\n", + " mo2ir_jit_svml_sleef_compiler_comparison_config = \"\"\"\n", + " {\n", + " \"nmodl_jit\": {\n", + " \"default\": [\n", + " \"SVML_nnancontractafn\",\n", + " \"SLEEF_nnancontractafn\"\n", + " ],\n", + " \"skylake-avx512\": [\n", + " \"SVML_nnancontractafn\",\n", + " \"SLEEF_nnancontractafn\"\n", + " ]\n", + " }\n", + " }\n", + " \"\"\"\n", + "\n", + " mod2ir_jit_svml_sleef_results = return_results_from_config(hh_expsyn_avx512f_results, mo2ir_jit_svml_sleef_compiler_comparison_config, modnames)\n", + " # print(\"mod2ir_jit_svml_sleef_results: {}\".format(mod2ir_jit_svml_sleef_results))\n", + "\n", + " print(\"MOD2IR vs MOD2IR JIT diff (%) {} : {}\".format(modnames, compare_average_diff_percentage(mod2ir_svml_sleef_results, mod2ir_jit_svml_sleef_results)))\n", + "\n", + "calculate_overall_averages(hh_expsyn_avx512f_results, [\"hh\"])\n", + "calculate_overall_averages(hh_expsyn_avx512f_results, [\"expsyn\"])\n", + "calculate_overall_averages(hh_expsyn_avx512f_results, [\"hh\", \"expsyn\"])" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "a9bf8e29", + "metadata": {}, + "outputs": [], + "source": [ + "def generate_graph_pandas_cpu_combined(results, compilers_comparison_config, graph_suffix, output_dir, print_values=False, xaxis_label=None, plot_size=(12,6), show_xlabels=False):\n", + " os.makedirs(output_dir, exist_ok=True)\n", + " compiler_flags = json.loads(compilers_comparison_config)\n", + " fig, axes = plt.subplots(1, 3, squeeze=False, figsize=plot_size)\n", + " ax_index = 0\n", + " for modname in results:\n", + " # state\n", + " bar_data_state_cpu_panda = {}\n", + " bar_data_state_cpu_panda[\"architecture\"] = []\n", + " bar_data_state_cpu_panda[\"compiler\"] = []\n", + " bar_data_state_cpu_panda[\"runtime\"] = []\n", + " # current\n", + " bar_data_cur_cpu_panda = {}\n", + " bar_data_cur_cpu_panda[\"architecture\"] = []\n", + " bar_data_cur_cpu_panda[\"compiler\"] = []\n", + " bar_data_cur_cpu_panda[\"runtime\"] = []\n", + " for architecture in results[modname]:\n", + " for compiler in compiler_flags:\n", + " if compiler in results[modname][architecture] and architecture in compiler_flags[compiler]:\n", + " for flags in compiler_flags[compiler][architecture]:\n", + " if compiler == \"nmodl_jit\":\n", + " state_kernel_name = \"nrn_state_{}\".format(modname.replace(\"-\", \"_\"))\n", + " cur_kernel_name = 
\"nrn_cur_{}\".format(modname.replace(\"-\", \"_\"))\n", + " else:\n", + " state_kernel_name = \"nrn_state_ext\"\n", + " cur_kernel_name = \"nrn_cur_ext\"\n", + " if compiler == \"clang\" and \"jit\" in flags:\n", + " compiler_name = \"mod2ir\"\n", + " elif compiler == \"nmodl_jit\":\n", + " compiler_name = \"mod2ir_jit\"\n", + " else:\n", + " compiler_name = compiler\n", + " if \"svml\" in flags or \"SVML\" in flags:\n", + " compiler_name = compiler_name + \"_svml\"\n", + " elif \"sleef\" in flags or \"SLEEF\" in flags:\n", + " compiler_name = compiler_name + \"_sleef\"\n", + " if architecture == \"default\":\n", + " architecture_label = \"auto-scalar\"\n", + " elif architecture == \"nehalem\":\n", + " architecture_label = \"nehalem-sse2\"\n", + " elif architecture == \"broadwell\":\n", + " architecture_label = \"broadwell-avx2\"\n", + " else: # skylake-avx512\n", + " architecture_label = architecture\n", + " if modname != \"expsyn\":\n", + " bar_data_state_cpu_panda[\"architecture\"].append(architecture_label)\n", + " bar_data_state_cpu_panda[\"compiler\"].append(compiler_name)\n", + " if _get_flags_string(flags) not in results[modname][architecture][compiler]:\n", + " bar_data_state_cpu_panda[\"runtime\"].append(0)\n", + " else:\n", + " bar_data_state_cpu_panda[\"runtime\"].append(results[modname][architecture][compiler][_get_flags_string(flags)][state_kernel_name][0])\n", + " bar_data_cur_cpu_panda[\"architecture\"].append(architecture_label)\n", + " bar_data_cur_cpu_panda[\"compiler\"].append(compiler_name)\n", + " if _get_flags_string(flags) not in results[modname][architecture][compiler]:\n", + " bar_data_cur_cpu_panda[\"runtime\"].append(0)\n", + " else:\n", + " bar_data_cur_cpu_panda[\"runtime\"].append(results[modname][architecture][compiler][_get_flags_string(flags)][cur_kernel_name][0])\n", + " if modname != \"expsyn\":\n", + " df_state = pd.DataFrame(bar_data_state_cpu_panda, columns=[\"architecture\", \"compiler\", \"runtime\"])\n", + " sns.barplot(x='architecture', y='runtime', hue='compiler', data=df_state, ax=axes[0,ax_index])\n", + " axes[0,ax_index].xaxis.label.set_visible(False)\n", + " axes[0,ax_index].yaxis.label.set_visible(False)\n", + " axes[0,ax_index].set_title(\"nrn_state_{}\".format(modname))\n", + " axes[0,ax_index].get_legend().remove()\n", + " if not show_xlabels:\n", + " axes[0,ax_index].get_xaxis().set_visible(False)\n", + " if print_values:\n", + " for i in axes[0,ax_index].containers:\n", + " axes[0,ax_index].bar_label(i,)\n", + " ax_index += 1\n", + " df_cur = pd.DataFrame(bar_data_cur_cpu_panda, columns=[\"architecture\", \"compiler\", \"runtime\"])\n", + " sns.barplot(x='architecture', y='runtime', hue='compiler', data=df_cur, ax=axes[0,ax_index])\n", + " axes[0,ax_index].xaxis.label.set_visible(False)\n", + " axes[0,ax_index].yaxis.label.set_visible(False)\n", + " axes[0,ax_index].set_title(\"nrn_cur_{}\".format(modname))\n", + " axes[0,ax_index].get_legend().remove()\n", + " if not show_xlabels:\n", + " axes[0,ax_index].get_xaxis().set_visible(False)\n", + " if print_values:\n", + " for i in axes[0,ax_index].containers:\n", + " axes[0,ax_index].bar_label(i,)\n", + " ax_index += 1\n", + " if xaxis_label is not None:\n", + " fig.text(0.5, 0.04, xaxis_label, ha='center', va='center')\n", + " # else:\n", + " # fig.text(0.5, 0.04, 'Target Microarchitecture-Instruction Set', ha='center', va='center')\n", + " fig.text(0.06, 0.5, 'Runtime (s)', ha='center', va='center', rotation='vertical')\n", + " plt.legend(bbox_to_anchor=(1.04,1), loc=\"upper left\")\n", 
+ " plt.savefig(\"{}/combined_benchmark_{}.pdf\".format(output_dir, graph_suffix), format=\"pdf\", bbox_inches=\"tight\")\n", + " plt.show()\n", + " plt.close()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "bf1371c1", + "metadata": {}, + "outputs": [], + "source": [ + "colors = ['#6baed6', '#0570b0', '#66c2a4','#238b45','#fdd49e','#fc8d59','#9ebcda','#8c96c6','#969696','#525252']\n", + "# pick colors according to the following order which matches the order of \n", + "# intel #6baed6\n", + "# intel svml #0570b0\n", + "# gcc #66c2a4\n", + "# gcc_svml #238b45\n", + "# clang #fdd49e\n", + "# clang_svml #fc8d59\n", + "# mod2ir #9ebcda\n", + "# mod2ir_svml #8c96c6\n", + "# mor2it_jit_svml #969696\n", + "# mod2ir_jit_sleef #525252\n", + "sns.set_palette(sns.color_palette(colors))\n", + "compilers_comparison_config = \"\"\"\n", + "{\n", + " \"intel\": {\n", + " \"skylake-avx512\": [\n", + " \"-O2 -mavx512f -prec-div -fopenmp\",\n", + " \"-O2 -mavx512f -prec-div -fimf-use-svml -fopenmp\"\n", + " ]\n", + " },\n", + " \"gcc\": {\n", + " \"skylake-avx512\": [\n", + " \"-O3 -march=skylake-avx512 -mtune=skylake -mavx512f -ffast-math -ftree-vectorize -fopenmp\",\n", + " \"-O3 -march=skylake-avx512 -mtune=skylake -mavx512f -ffast-math -ftree-vectorize -mveclibabi=svml -fopenmp\"\n", + " ]\n", + " },\n", + " \"clang\": {\n", + " \"skylake-avx512\": [\n", + " \"-O3 -march=skylake-avx512 -mtune=skylake -ffast-math -fopenmp\",\n", + " \"-O3 -march=skylake-avx512 -mtune=skylake -ffast-math -fopenmp -fveclib=SVML\",\n", + " \"-O3 -march=skylake-avx512 -mtune=skylake -ffast-math -fopenmp jit SVML\",\n", + " \"-O3 -march=skylake-avx512 -mtune=skylake -ffast-math -fopenmp jit SLEEF\"\n", + " ]\n", + " },\n", + " \"nmodl_jit\": {\n", + " \"skylake-avx512\": [\n", + " \"SVML_nnancontractafn\",\n", + " \"SLEEF_nnancontractafn\"\n", + " ]\n", + " }\n", + "}\n", + "\"\"\"\n", + "generate_graph_pandas_cpu_combined(hh_expsyn_avx512f_results, compilers_comparison_config, \"hh_expsyn_cpu\", \"graphs_output_pandas\", False, xaxis_label=\"Target Microarchitecture-Instruction Set\", show_xlabels=True)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "ccc9c259", + "metadata": {}, + "outputs": [], + "source": [ + "hh_expsyn_gpu_1024x128 = {}\n", + "hh_expsyn_gpu_1024x128 = load_pickle_result_file([\"./reference_data/hh_gpu_20mil_1024x128.pickle\", \"./reference_data/expsyn_gpu_100mil_1024x128.pickle\"], hh_expsyn_gpu_1024x128)\n", + "compilers_comparison_config = \"\"\"\n", + "{\n", + " \"nvhpc\": {\n", + " \"nvptx64\": [\n", + " \"-O3 -gpu=nordc,fastmath\"\n", + " ]\n", + " },\n", + " \"nmodl_jit\": {\n", + " \"nvptx64\": [\n", + " \"libdevice_nnancontractafn\"\n", + " ]\n", + " }\n", + "}\n", + "\"\"\"\n", + "colors = ['#b2df8a','#bdbdbd']\n", + "sns.set_palette(sns.color_palette(colors))\n", + "\n", + "generate_graph_pandas_cpu_combined(hh_expsyn_gpu_1024x128, compilers_comparison_config, \"hh_expsyn_gpu\", \"graphs_output_pandas\", print_values=False, plot_size=(13,5))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "c2e1fb4e", + "metadata": {}, + "outputs": [], + "source": [ + "def generate_graph_pandas_combined_relative(results, compilers_comparison_config, graph_suffix, output_dir, print_values=False, xaxis_label=None, plot_size=(12,6), baseline_name=\"intel_svml\"):\n", + " os.makedirs(output_dir, exist_ok=True)\n", + " compiler_flags = json.loads(compilers_comparison_config)\n", + " fig, axes = plt.subplots(1, 3, squeeze=False, 
figsize=plot_size)\n", + " ax_index = 0\n", + " for modname in results:\n", + " # state\n", + " bar_data_state_cpu_panda = {}\n", + " bar_data_state_cpu_panda[\"architecture\"] = []\n", + " bar_data_state_cpu_panda[\"compiler\"] = []\n", + " bar_data_state_cpu_panda[\"runtime\"] = []\n", + " # current\n", + " bar_data_cur_cpu_panda = {}\n", + " bar_data_cur_cpu_panda[\"architecture\"] = []\n", + " bar_data_cur_cpu_panda[\"compiler\"] = []\n", + " bar_data_cur_cpu_panda[\"runtime\"] = []\n", + " baseline_cur = 0.0\n", + " for architecture in results[modname]:\n", + " for compiler in compiler_flags:\n", + " if compiler in results[modname][architecture] and architecture in compiler_flags[compiler]:\n", + " for flags in compiler_flags[compiler][architecture]:\n", + " if compiler == \"nmodl_jit\":\n", + " state_kernel_name = \"nrn_state_{}\".format(modname.replace(\"-\", \"_\"))\n", + " cur_kernel_name = \"nrn_cur_{}\".format(modname.replace(\"-\", \"_\"))\n", + " else:\n", + " state_kernel_name = \"nrn_state_ext\"\n", + " cur_kernel_name = \"nrn_cur_ext\"\n", + " if compiler == \"clang\" and \"jit\" in flags:\n", + " compiler_name = \"mod2ir\"\n", + " elif compiler == \"nmodl_jit\":\n", + " compiler_name = \"mod2ir_jit\"\n", + " else:\n", + " compiler_name = compiler\n", + " if \"svml\" in flags or \"SVML\" in flags:\n", + " compiler_name = compiler_name + \"_svml\"\n", + " if architecture != \"nvptx64\" and compiler == \"intel\":\n", + " baseline_state = results[modname][architecture][\"intel\"][_get_flags_string(flags)][state_kernel_name][0]\n", + " baseline_cur = results[modname][architecture][\"intel\"][_get_flags_string(flags)][cur_kernel_name][0]\n", + " elif \"sleef\" in flags or \"SLEEF\" in flags:\n", + " compiler_name = compiler_name + \"_sleef\"\n", + " if architecture == \"default\":\n", + " architecture_label = \"auto-scalar\"\n", + " elif architecture == \"nehalem\":\n", + " architecture_label = \"nehalem-sse2\"\n", + " elif architecture == \"broadwell\":\n", + " architecture_label = \"broadwell-avx2\"\n", + " elif architecture == \"nvptx64\":\n", + " architecture_label = architecture\n", + " if compiler == \"nvhpc\":\n", + " baseline_state = results[modname][architecture][\"nvhpc\"][_get_flags_string(flags)][state_kernel_name][0]\n", + " baseline_cur = results[modname][architecture][\"nvhpc\"][_get_flags_string(flags)][cur_kernel_name][0]\n", + " else: # skylake-avx512\n", + " architecture_label = architecture\n", + " if modname != \"expsyn\":\n", + " bar_data_state_cpu_panda[\"architecture\"].append(architecture_label)\n", + " bar_data_state_cpu_panda[\"compiler\"].append(compiler_name)\n", + " if _get_flags_string(flags) not in results[modname][architecture][compiler]:\n", + " bar_data_state_cpu_panda[\"runtime\"].append(0)\n", + " else:\n", + " bar_data_state_cpu_panda[\"runtime\"].append(results[modname][architecture][compiler][_get_flags_string(flags)][state_kernel_name][0])\n", + " bar_data_cur_cpu_panda[\"architecture\"].append(architecture_label)\n", + " bar_data_cur_cpu_panda[\"compiler\"].append(compiler_name)\n", + " if _get_flags_string(flags) not in results[modname][architecture][compiler]:\n", + " bar_data_cur_cpu_panda[\"runtime\"].append(0)\n", + " else:\n", + " bar_data_cur_cpu_panda[\"runtime\"].append(results[modname][architecture][compiler][_get_flags_string(flags)][cur_kernel_name][0])\n", + " for i, runtime in enumerate(bar_data_state_cpu_panda[\"runtime\"]):\n", + " bar_data_state_cpu_panda[\"runtime\"][i] = runtime/baseline_state\n", + " for i, runtime in 
enumerate(bar_data_cur_cpu_panda[\"runtime\"]):\n", + " bar_data_cur_cpu_panda[\"runtime\"][i] = runtime/baseline_cur\n", + " pd.options.display.float_format = \"{:,.2f}\".format\n", + " if modname != \"expsyn\":\n", + " df_state = pd.DataFrame(bar_data_state_cpu_panda, columns=[\"architecture\", \"compiler\", \"runtime\"])\n", + " print(df_state, type(df_state))\n", + " sns.barplot(x='architecture', y='runtime', hue='compiler', data=df_state, ax=axes[0,ax_index])\n", + " axes[0,ax_index].axhline(1., ls='--', color =\"black\")\n", + " axes[0,ax_index].xaxis.label.set_visible(False)\n", + " axes[0,ax_index].yaxis.label.set_visible(False)\n", + " axes[0,ax_index].set_title(\"nrn_state_{}\".format(modname))\n", + " axes[0,ax_index].get_legend().remove()\n", + " if xaxis_label is not None:\n", + " axes[0,ax_index].get_xaxis().set_visible(False)\n", + " if print_values:\n", + " for i in axes[0,ax_index].containers:\n", + " axes[0,ax_index].bar_label(i,)\n", + " ax_index += 1\n", + " df_cur = pd.DataFrame(bar_data_cur_cpu_panda, columns=[\"architecture\", \"compiler\", \"runtime\"])\n", + " ax = sns.barplot(x='architecture', y='runtime', hue='compiler', data=df_cur, ax=axes[0,ax_index])\n", + " axes[0,ax_index].axhline(1., ls='--', color =\"black\")\n", + " print(df_cur, type(df_cur))\n", + " #axes[0,ax_index].axhline(1., linewidth=2, color=(0, 0, 0, 0.9))\n", + " axes[0,ax_index].xaxis.label.set_visible(False)\n", + " axes[0,ax_index].yaxis.label.set_visible(False)\n", + " axes[0,ax_index].set_title(\"nrn_cur_{}\".format(modname))\n", + " axes[0,ax_index].get_legend().remove()\n", + " if xaxis_label is not None:\n", + " axes[0,ax_index].get_xaxis().set_visible(False)\n", + " if print_values:\n", + " for i in axes[0,ax_index].containers:\n", + " axes[0,ax_index].bar_label(i,)\n", + " ax_index += 1\n", + " \n", + " #if xaxis_label is not None:\n", + " # fig.text(0.5, 0.04, xaxis_label, ha='center', va='center')\n", + " #else:\n", + " # fig.text(0.5, 0.04, 'Target Microarchitecture-Instruction Set', ha='center', va='center')\n", + " fig.text(0.06, 0.5, 'Relative Performance ({} = 1)'.format(baseline_name), ha='center', va='center', rotation='vertical')\n", + " plt.legend(bbox_to_anchor=(1.04,1), loc=\"upper left\")\n", + " plt.savefig(\"{}/combined_benchmark_{}.pdf\".format(output_dir, graph_suffix), format=\"pdf\", bbox_inches=\"tight\")\n", + " plt.show()\n", + " plt.close()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "aa3cfb6c", + "metadata": {}, + "outputs": [], + "source": [ + "colors = ['#6baed6', '#0570b0', '#66c2a4','#238b45','#b2df8a','#fdd49e','#fc8d59','#9ebcda','#8c96c6','#969696','#525252']\n", + "# pick colors according to the following order which matches the order of \n", + "# intel #6baed6\n", + "# intel svml #0570b0\n", + "# gcc #66c2a4\n", + "# gcc_svml #238b45\n", + "# nvhpc #b2df8a\n", + "# clang #fdd49e\n", + "# clang_svml #fc8d59\n", + "# mod2ir #9ebcda\n", + "# mod2ir_svml #8c96c6\n", + "# mor2it_jit_svml #969696\n", + "# mod2ir_jit_sleef #525252\n", + "sns.set_palette(sns.color_palette(colors))\n", + "compilers_comparison_config = \"\"\"\n", + "{\n", + " \"intel\": {\n", + " \"skylake-avx512\": [\n", + " \"-O2 -mavx512f -prec-div -fopenmp\",\n", + " \"-O2 -mavx512f -prec-div -fimf-use-svml -fopenmp\"\n", + " ]\n", + " },\n", + " \"gcc\": {\n", + " \"skylake-avx512\": [\n", + " \"-O3 -march=skylake-avx512 -mtune=skylake -mavx512f -ffast-math -ftree-vectorize -fopenmp\",\n", + " \"-O3 -march=skylake-avx512 -mtune=skylake -mavx512f -ffast-math 
-ftree-vectorize -mveclibabi=svml -fopenmp\"\n", + " ]\n", + " },\n", + " \"nvhpc\": {\n", + " \"skylake-avx512\": [\n", + " \"-fast -O3 -mp=autopar -tp=skylake -Msafeptr=all -Minfo -Mvect=simd:512,gather -mavx512vbmi -mavx512vbmi2 -mavx512vl\"\n", + " ]\n", + " },\n", + " \"clang\": {\n", + " \"skylake-avx512\": [\n", + " \"-O3 -march=skylake-avx512 -mtune=skylake -ffast-math -fopenmp\",\n", + " \"-O3 -march=skylake-avx512 -mtune=skylake -ffast-math -fopenmp -fveclib=SVML\",\n", + " \"-O3 -march=skylake-avx512 -mtune=skylake -ffast-math -fopenmp jit SVML\",\n", + " \"-O3 -march=skylake-avx512 -mtune=skylake -ffast-math -fopenmp jit SLEEF\"\n", + " ]\n", + " },\n", + " \"nmodl_jit\": {\n", + " \"skylake-avx512\": [\n", + " \"SVML_nnancontractafn\",\n", + " \"SLEEF_nnancontractafn\"\n", + " ]\n", + " }\n", + "}\n", + "\"\"\"\n", + "hh_expsyn_cpu_results = {}\n", + "hh_expsyn_cpu_results = load_pickle_result_file([\"./reference_data/hh_expsyn_mavx512f.pickle\", \"./reference_data/hh_expsyn_nvhpc_cpu.pickle\"], hh_expsyn_cpu_results)\n", + "json_object = json.dumps(hh_expsyn_cpu_results, indent = 4) \n", + "generate_graph_pandas_combined_relative(hh_expsyn_cpu_results, compilers_comparison_config, \"hh_expsyn_cpu_relative\", \"graphs_output_pandas\", False, xaxis_label=\"skylake-avx512 Target Microarchitecture\", plot_size=(10,3.5))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "2bf934fb", + "metadata": {}, + "outputs": [], + "source": [ + "hh_expsyn_gpu_1024x128 = {}\n", + "hh_expsyn_gpu_1024x128 = load_pickle_result_file([\"./reference_data/hh_gpu_20mil_1024x128.pickle\", \"./reference_data/expsyn_gpu_100mil_1024x128.pickle\"], hh_expsyn_gpu_1024x128)\n", + "compilers_comparison_config = \"\"\"\n", + "{\n", + " \"nvhpc\": {\n", + " \"nvptx64\": [\n", + " \"-O3 -gpu=nordc,fastmath\"\n", + " ]\n", + " },\n", + " \"nmodl_jit\": {\n", + " \"nvptx64\": [\n", + " \"libdevice_nnancontractafn\"\n", + " ]\n", + " }\n", + "}\n", + "\"\"\"\n", + "colors = ['#b2df8a','#969696']\n", + "sns.set_palette(sns.color_palette(colors))\n", + "\n", + "generate_graph_pandas_combined_relative(hh_expsyn_gpu_1024x128, compilers_comparison_config, \"hh_expsyn_gpu_relative\", \"graphs_output_pandas\", xaxis_label=\"NVPTX64 Architecture\", print_values=False, plot_size=(8,8), baseline_name=\"nvhpc\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "079437d0", + "metadata": {}, + "outputs": [], + "source": [ + "def generate_graph_pandas_combined_relative_gpu(results, compilers_comparison_config, graph_suffix, output_dir, print_values=False, xaxis_label=None, plot_size=(12,6), baseline_name=\"intel_svml\"):\n", + " os.makedirs(output_dir, exist_ok=True)\n", + " compiler_flags = json.loads(compilers_comparison_config)\n", + " fig, axes = plt.subplots(1, 1, squeeze=False, figsize=plot_size)\n", + " ax = axes[0,0]\n", + " bar_data_gpu_panda = {}\n", + " bar_data_gpu_panda[\"kernel\"] = []\n", + " bar_data_gpu_panda[\"compiler\"] = []\n", + " bar_data_gpu_panda[\"runtime\"] = []\n", + " baseline_kernel = {}\n", + " for modname in results:\n", + " for architecture in results[modname]:\n", + " for compiler in compiler_flags:\n", + " if compiler in results[modname][architecture] and architecture in compiler_flags[compiler]:\n", + " for flags in compiler_flags[compiler][architecture]:\n", + " if compiler == \"nmodl_jit\":\n", + " state_kernel_name = \"nrn_state_{}\".format(modname.replace(\"-\", \"_\"))\n", + " cur_kernel_name = \"nrn_cur_{}\".format(modname.replace(\"-\", 
\"_\"))\n", + " else:\n", + " state_kernel_name = \"nrn_state_ext\"\n", + " cur_kernel_name = \"nrn_cur_ext\"\n", + " label_state_name = \"nrn_state_{}\".format(modname.replace(\"-\", \"_\"))\n", + " label_cur_name = \"nrn_cur_{}\".format(modname.replace(\"-\", \"_\"))\n", + " if compiler == \"clang\" and \"jit\" in flags:\n", + " compiler_name = \"mod2ir\"\n", + " elif compiler == \"nmodl_jit\":\n", + " compiler_name = \"mod2ir_jit\"\n", + " else:\n", + " compiler_name = compiler\n", + " if architecture == \"nvptx64\":\n", + " architecture_label = architecture\n", + " if compiler == \"nvhpc\":\n", + " baseline_kernel[label_state_name] = results[modname][architecture][\"nvhpc\"][_get_flags_string(flags)][state_kernel_name][0]\n", + " baseline_kernel[label_cur_name] = results[modname][architecture][\"nvhpc\"][_get_flags_string(flags)][cur_kernel_name][0]\n", + " else: # skylake-avx512\n", + " architecture_label = architecture\n", + " if modname != \"expsyn\":\n", + " bar_data_gpu_panda[\"kernel\"].append(label_state_name)\n", + " bar_data_gpu_panda[\"compiler\"].append(compiler_name)\n", + " if _get_flags_string(flags) not in results[modname][architecture][compiler]:\n", + " bar_data_gpu_panda[\"runtime\"].append(0)\n", + " else:\n", + " bar_data_gpu_panda[\"runtime\"].append(results[modname][architecture][compiler][_get_flags_string(flags)][state_kernel_name][0])\n", + " bar_data_gpu_panda[\"kernel\"].append(label_cur_name)\n", + " bar_data_gpu_panda[\"compiler\"].append(compiler_name)\n", + " if _get_flags_string(flags) not in results[modname][architecture][compiler]:\n", + " bar_data_gpu_panda[\"runtime\"].append(0)\n", + " else:\n", + " bar_data_gpu_panda[\"runtime\"].append(results[modname][architecture][compiler][_get_flags_string(flags)][cur_kernel_name][0])\n", + " for i, runtime in enumerate(bar_data_gpu_panda[\"runtime\"]):\n", + " kernel = bar_data_gpu_panda[\"kernel\"][i]\n", + " print(\"Scaling kernel {} arch {}\".format(kernel, bar_data_gpu_panda[\"compiler\"][i]))\n", + " bar_data_gpu_panda[\"runtime\"][i] = runtime/baseline_kernel[kernel]\n", + " pd.options.display.float_format = \"{:,.2f}\".format\n", + " print(bar_data_gpu_panda)\n", + " df_kernels = pd.DataFrame(bar_data_gpu_panda, columns=[\"kernel\", \"compiler\", \"runtime\"])\n", + " print(df_kernels, type(df_kernels))\n", + " sns.barplot(x='kernel', y='runtime', hue='compiler', data=df_kernels, ax=ax)\n", + " ax.axhline(1., ls='--', color =\"black\")\n", + " ax.xaxis.label.set_visible(False)\n", + " # ax.yaxis.label.set_visible(False)\n", + " plt.ylabel('Relative Performance ({} = 1)'.format(baseline_name))\n", + " # ax.get_legend().remove()\n", + " plt.legend(loc=\"lower left\")\n", + " # if xaxis_label is not None:\n", + " # ax.get_xaxis().set_visible(False)\n", + " if print_values:\n", + " for i in ax.containers:\n", + " ax.bar_label(i,) \n", + " #if xaxis_label is not None:\n", + " # fig.text(0.5, 0.04, xaxis_label, ha='center', va='center')\n", + " #else:\n", + " # fig.text(0.5, 0.04, 'Target Microarchitecture-Instruction Set', ha='center', va='center')\n", + " # fig.text(0.06, 0.5, 'Relative Performance ({} = 1)'.format(baseline_name), ha='center', va='center', rotation='vertical')\n", + " # plt.xlabel(\"Kernel Name\")\n", + " # plt.legend(bbox_to_anchor=(1.04,1), loc=\"upper left\")\n", + " plt.savefig(\"{}/combined_benchmark_{}.pdf\".format(output_dir, graph_suffix), format=\"pdf\", bbox_inches=\"tight\")\n", + " plt.show()\n", + " plt.close()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + 
"id": "f4cfa3c3", + "metadata": {}, + "outputs": [], + "source": [ + "hh_expsyn_gpu_1024x128 = {}\n", + "hh_expsyn_gpu_1024x128 = load_pickle_result_file([\"./reference_data/hh_gpu_20mil_1024x128.pickle\", \"./reference_data/expsyn_gpu_100mil_1024x128.pickle\"], hh_expsyn_gpu_1024x128)\n", + "compilers_comparison_config = \"\"\"\n", + "{\n", + " \"nvhpc\": {\n", + " \"nvptx64\": [\n", + " \"-O3 -gpu=nordc,fastmath\"\n", + " ]\n", + " },\n", + " \"nmodl_jit\": {\n", + " \"nvptx64\": [\n", + " \"libdevice_nnancontractafn\"\n", + " ]\n", + " }\n", + "}\n", + "\"\"\"\n", + "colors = ['#b2df8a','#969696']\n", + "sns.set_palette(sns.color_palette(colors))\n", + "\n", + "generate_graph_pandas_combined_relative_gpu(hh_expsyn_gpu_1024x128, compilers_comparison_config, \"hh_expsyn_gpu_relative_one_plot\", \"graphs_output_pandas\", xaxis_label=\"NVPTX64 Architecture\", print_values=False, plot_size=(4,3), baseline_name=\"nvhpc\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "1e31e6d6", + "metadata": {}, + "outputs": [], + "source": [ + "def generate_graph_pandas_combined_relative_log(results, compilers_comparison_config, graph_suffix, output_dir, print_values=False, xaxis_label=None, plot_size=(12,6), baseline_name=\"intel_svml\"):\n", + " os.makedirs(output_dir, exist_ok=True)\n", + " compiler_flags = json.loads(compilers_comparison_config)\n", + " fig, axes = plt.subplots(1, 3, squeeze=False, figsize=plot_size)\n", + " ax_index = 0\n", + " for modname in results:\n", + " # state\n", + " bar_data_state_cpu_panda = {}\n", + " bar_data_state_cpu_panda[\"architecture\"] = []\n", + " bar_data_state_cpu_panda[\"compiler\"] = []\n", + " bar_data_state_cpu_panda[\"runtime\"] = []\n", + " # current\n", + " bar_data_cur_cpu_panda = {}\n", + " bar_data_cur_cpu_panda[\"architecture\"] = []\n", + " bar_data_cur_cpu_panda[\"compiler\"] = []\n", + " bar_data_cur_cpu_panda[\"runtime\"] = []\n", + " baseline_cur = 0.0\n", + " for architecture in results[modname]:\n", + " for compiler in compiler_flags:\n", + " if compiler in results[modname][architecture] and architecture in compiler_flags[compiler]:\n", + " for flags in compiler_flags[compiler][architecture]:\n", + " if compiler == \"nmodl_jit\":\n", + " state_kernel_name = \"nrn_state_{}\".format(modname.replace(\"-\", \"_\"))\n", + " cur_kernel_name = \"nrn_cur_{}\".format(modname.replace(\"-\", \"_\"))\n", + " else:\n", + " state_kernel_name = \"nrn_state_ext\"\n", + " cur_kernel_name = \"nrn_cur_ext\"\n", + " if compiler == \"clang\" and \"jit\" in flags:\n", + " compiler_name = \"mod2ir\"\n", + " elif compiler == \"nmodl_jit\":\n", + " compiler_name = \"mod2ir_jit\"\n", + " else:\n", + " compiler_name = compiler\n", + " if \"svml\" in flags or \"SVML\" in flags:\n", + " compiler_name = compiler_name + \"_svml\"\n", + " if architecture != \"nvptx64\" and compiler == \"intel\":\n", + " baseline_state = results[modname][architecture][\"intel\"][_get_flags_string(flags)][state_kernel_name][0]\n", + " baseline_cur = results[modname][architecture][\"intel\"][_get_flags_string(flags)][cur_kernel_name][0]\n", + " elif \"sleef\" in flags or \"SLEEF\" in flags:\n", + " compiler_name = compiler_name + \"_sleef\"\n", + " if architecture == \"default\":\n", + " architecture_label = \"auto-scalar\"\n", + " elif architecture == \"nehalem\":\n", + " architecture_label = \"nehalem-sse2\"\n", + " elif architecture == \"broadwell\":\n", + " architecture_label = \"broadwell-avx2\"\n", + " elif architecture == \"nvptx64\":\n", + " architecture_label = 
architecture\n", + " if compiler == \"nvhpc\":\n", + " baseline_state = results[modname][architecture][\"nvhpc\"][_get_flags_string(flags)][state_kernel_name][0]\n", + " baseline_cur = results[modname][architecture][\"nvhpc\"][_get_flags_string(flags)][cur_kernel_name][0]\n", + " else: # skylake-avx512\n", + " architecture_label = architecture\n", + " if modname != \"expsyn\":\n", + " bar_data_state_cpu_panda[\"architecture\"].append(architecture_label)\n", + " bar_data_state_cpu_panda[\"compiler\"].append(compiler_name)\n", + " if _get_flags_string(flags) not in results[modname][architecture][compiler]:\n", + " bar_data_state_cpu_panda[\"runtime\"].append(0)\n", + " else:\n", + " bar_data_state_cpu_panda[\"runtime\"].append(results[modname][architecture][compiler][_get_flags_string(flags)][state_kernel_name][0])\n", + " bar_data_cur_cpu_panda[\"architecture\"].append(architecture_label)\n", + " bar_data_cur_cpu_panda[\"compiler\"].append(compiler_name)\n", + " if _get_flags_string(flags) not in results[modname][architecture][compiler]:\n", + " bar_data_cur_cpu_panda[\"runtime\"].append(0)\n", + " else:\n", + " bar_data_cur_cpu_panda[\"runtime\"].append(results[modname][architecture][compiler][_get_flags_string(flags)][cur_kernel_name][0])\n", + " for i, runtime in enumerate(bar_data_state_cpu_panda[\"runtime\"]):\n", + " bar_data_state_cpu_panda[\"runtime\"][i] = baseline_state/runtime\n", + " for i, runtime in enumerate(bar_data_cur_cpu_panda[\"runtime\"]):\n", + " bar_data_cur_cpu_panda[\"runtime\"][i] = baseline_cur/runtime\n", + " pd.options.display.float_format = \"{:,.2f}\".format\n", + " if modname != \"expsyn\":\n", + " df_state = pd.DataFrame(bar_data_state_cpu_panda, columns=[\"architecture\", \"compiler\", \"runtime\"])\n", + " print(df_state, type(df_state))\n", + " sns.barplot(x='architecture', y='runtime', hue='compiler', data=df_state, ax=axes[0,ax_index])\n", + " axes[0,ax_index].set_yscale('symlog', base=2, linthresh=0.015)\n", + " axes[0,ax_index].set_ylim(0.125,2)\n", + " axes[0,ax_index].set_yticks([0.125, 0.25, 0.5, 1, 2])\n", + " axes[0,ax_index].axhline(1., ls='--', color =\"black\")\n", + " axes[0,ax_index].xaxis.label.set_visible(False)\n", + " axes[0,ax_index].yaxis.label.set_visible(False)\n", + " axes[0,ax_index].set_title(\"nrn_state_{}\".format(modname))\n", + " axes[0,ax_index].get_legend().remove()\n", + " if xaxis_label is not None:\n", + " axes[0,ax_index].get_xaxis().set_visible(False)\n", + " if print_values:\n", + " for i in axes[0,ax_index].containers:\n", + " axes[0,ax_index].bar_label(i,)\n", + " ax_index += 1\n", + " df_cur = pd.DataFrame(bar_data_cur_cpu_panda, columns=[\"architecture\", \"compiler\", \"runtime\"])\n", + " ax = sns.barplot(x='architecture', y='runtime', hue='compiler', data=df_cur, ax=axes[0,ax_index])\n", + " axes[0,ax_index].axhline(1., ls='--', color =\"black\")\n", + " print(df_cur, type(df_cur))\n", + " axes[0,ax_index].set_yscale('symlog', base=2, linthresh=0.015)\n", + " axes[0,ax_index].set_ylim(0.125,2)\n", + " axes[0,ax_index].set_yticks([0.125, 0.25, 0.5, 1, 2])\n", + " # axes[0,ax_index].set_yticklabels([0.5, 1, 2, 4, 8])\n", + " #axes[0,ax_index].axhline(1., linewidth=2, color=(0, 0, 0, 0.9))\n", + " axes[0,ax_index].xaxis.label.set_visible(False)\n", + " axes[0,ax_index].yaxis.label.set_visible(False)\n", + " axes[0,ax_index].set_title(\"nrn_cur_{}\".format(modname))\n", + " axes[0,ax_index].get_legend().remove()\n", + " if xaxis_label is not None:\n", + " axes[0,ax_index].get_xaxis().set_visible(False)\n", + " if 
print_values:\n", + " for i in axes[0,ax_index].containers:\n", + " axes[0,ax_index].bar_label(i,)\n", + " ax_index += 1\n", + " \n", + " #if xaxis_label is not None:\n", + " # fig.text(0.5, 0.04, xaxis_label, ha='center', va='center')\n", + " #else:\n", + " # fig.text(0.5, 0.04, 'Target Microarchitecture-Instruction Set', ha='center', va='center')\n", + " fig.text(0.06, 0.5, 'Speedup relative to {}'.format(baseline_name), ha='center', va='center', rotation='vertical')\n", + " plt.legend(bbox_to_anchor=(1.04,1), loc=\"upper left\")\n", + " plt.savefig(\"{}/combined_benchmark_{}.pdf\".format(output_dir, graph_suffix), format=\"pdf\", bbox_inches=\"tight\")\n", + " plt.show()\n", + " plt.close()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "b2667393", + "metadata": {}, + "outputs": [], + "source": [ + "colors = ['#6baed6', '#0570b0', '#66c2a4','#238b45','#b2df8a','#fdd49e','#fc8d59','#9ebcda','#8c96c6','#969696','#525252']\n", + "# pick colors according to the following order which matches the order of \n", + "# intel #6baed6\n", + "# intel svml #0570b0\n", + "# gcc #66c2a4\n", + "# gcc_svml #238b45\n", + "# nvhpc #b2df8a\n", + "# clang #fdd49e\n", + "# clang_svml #fc8d59\n", + "# mod2ir #9ebcda\n", + "# mod2ir_svml #8c96c6\n", + "# mor2it_jit_svml #969696\n", + "# mod2ir_jit_sleef #525252\n", + "sns.set_palette(sns.color_palette(colors))\n", + "compilers_comparison_config = \"\"\"\n", + "{\n", + " \"intel\": {\n", + " \"skylake-avx512\": [\n", + " \"-O2 -mavx512f -prec-div -fopenmp\",\n", + " \"-O2 -mavx512f -prec-div -fimf-use-svml -fopenmp\"\n", + " ]\n", + " },\n", + " \"gcc\": {\n", + " \"skylake-avx512\": [\n", + " \"-O3 -march=skylake-avx512 -mtune=skylake -mavx512f -ffast-math -ftree-vectorize -fopenmp\",\n", + " \"-O3 -march=skylake-avx512 -mtune=skylake -mavx512f -ffast-math -ftree-vectorize -mveclibabi=svml -fopenmp\"\n", + " ]\n", + " },\n", + " \"nvhpc\": {\n", + " \"skylake-avx512\": [\n", + " \"-fast -O3 -mp=autopar -tp=skylake -Msafeptr=all -Minfo -Mvect=simd:512,gather -mavx512vbmi -mavx512vbmi2 -mavx512vl\"\n", + " ]\n", + " },\n", + " \"clang\": {\n", + " \"skylake-avx512\": [\n", + " \"-O3 -march=skylake-avx512 -mtune=skylake -ffast-math -fopenmp\",\n", + " \"-O3 -march=skylake-avx512 -mtune=skylake -ffast-math -fopenmp -fveclib=SVML\",\n", + " \"-O3 -march=skylake-avx512 -mtune=skylake -ffast-math -fopenmp jit SVML\",\n", + " \"-O3 -march=skylake-avx512 -mtune=skylake -ffast-math -fopenmp jit SLEEF\"\n", + " ]\n", + " },\n", + " \"nmodl_jit\": {\n", + " \"skylake-avx512\": [\n", + " \"SVML_nnancontractafn\",\n", + " \"SLEEF_nnancontractafn\"\n", + " ]\n", + " }\n", + "}\n", + "\"\"\"\n", + "hh_expsyn_cpu_results = {}\n", + "hh_expsyn_cpu_results = load_pickle_result_file([\"./reference_data/hh_expsyn_mavx512f.pickle\", \"./reference_data/hh_expsyn_nvhpc_cpu.pickle\"], hh_expsyn_cpu_results)\n", + "json_object = json.dumps(hh_expsyn_cpu_results, indent = 4) \n", + "generate_graph_pandas_combined_relative_log(hh_expsyn_cpu_results, compilers_comparison_config, \"hh_expsyn_cpu_relative_log\", \"graphs_output_pandas\", False, xaxis_label=\"skylake-avx512 Target Microarchitecture\", plot_size=(10,3.5))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "e2eff5a6", + "metadata": {}, + "outputs": [], + "source": [ + "def generate_graph_pandas_combined_relative_gpu_log(results, compilers_comparison_config, graph_suffix, output_dir, print_values=False, xaxis_label=None, plot_size=(12,6), baseline_name=\"intel_svml\"):\n", + " 
os.makedirs(output_dir, exist_ok=True)\n", + " compiler_flags = json.loads(compilers_comparison_config)\n", + " fig, axes = plt.subplots(1, 1, squeeze=False, figsize=plot_size)\n", + " ax = axes[0,0]\n", + " bar_data_gpu_panda = {}\n", + " bar_data_gpu_panda[\"kernel\"] = []\n", + " bar_data_gpu_panda[\"compiler\"] = []\n", + " bar_data_gpu_panda[\"runtime\"] = []\n", + " baseline_kernel = {}\n", + " for modname in results:\n", + " for architecture in results[modname]:\n", + " for compiler in compiler_flags:\n", + " if compiler in results[modname][architecture] and architecture in compiler_flags[compiler]:\n", + " for flags in compiler_flags[compiler][architecture]:\n", + " if compiler == \"nmodl_jit\":\n", + " state_kernel_name = \"nrn_state_{}\".format(modname.replace(\"-\", \"_\"))\n", + " cur_kernel_name = \"nrn_cur_{}\".format(modname.replace(\"-\", \"_\"))\n", + " else:\n", + " state_kernel_name = \"nrn_state_ext\"\n", + " cur_kernel_name = \"nrn_cur_ext\"\n", + " label_state_name = \"nrn_state_{}\".format(modname.replace(\"-\", \"_\"))\n", + " label_cur_name = \"nrn_cur_{}\".format(modname.replace(\"-\", \"_\"))\n", + " if compiler == \"clang\" and \"jit\" in flags:\n", + " compiler_name = \"mod2ir\"\n", + " elif compiler == \"nmodl_jit\":\n", + " compiler_name = \"mod2ir_jit\"\n", + " else:\n", + " compiler_name = compiler\n", + " if architecture == \"nvptx64\":\n", + " architecture_label = architecture\n", + " if compiler == \"nvhpc\":\n", + " baseline_kernel[label_state_name] = results[modname][architecture][\"nvhpc\"][_get_flags_string(flags)][state_kernel_name][0]\n", + " baseline_kernel[label_cur_name] = results[modname][architecture][\"nvhpc\"][_get_flags_string(flags)][cur_kernel_name][0]\n", + " else: # skylake-avx512\n", + " architecture_label = architecture\n", + " if modname != \"expsyn\":\n", + " bar_data_gpu_panda[\"kernel\"].append(label_state_name)\n", + " bar_data_gpu_panda[\"compiler\"].append(compiler_name)\n", + " if _get_flags_string(flags) not in results[modname][architecture][compiler]:\n", + " bar_data_gpu_panda[\"runtime\"].append(0)\n", + " else:\n", + " bar_data_gpu_panda[\"runtime\"].append(results[modname][architecture][compiler][_get_flags_string(flags)][state_kernel_name][0])\n", + " bar_data_gpu_panda[\"kernel\"].append(label_cur_name)\n", + " bar_data_gpu_panda[\"compiler\"].append(compiler_name)\n", + " if _get_flags_string(flags) not in results[modname][architecture][compiler]:\n", + " bar_data_gpu_panda[\"runtime\"].append(0)\n", + " else:\n", + " bar_data_gpu_panda[\"runtime\"].append(results[modname][architecture][compiler][_get_flags_string(flags)][cur_kernel_name][0])\n", + " for i, runtime in enumerate(bar_data_gpu_panda[\"runtime\"]):\n", + " kernel = bar_data_gpu_panda[\"kernel\"][i]\n", + " print(\"Scaling kernel {} arch {}\".format(kernel, bar_data_gpu_panda[\"compiler\"][i]))\n", + " bar_data_gpu_panda[\"runtime\"][i] = baseline_kernel[kernel]/runtime\n", + " pd.options.display.float_format = \"{:,.2f}\".format\n", + " print(bar_data_gpu_panda)\n", + " df_kernels = pd.DataFrame(bar_data_gpu_panda, columns=[\"kernel\", \"compiler\", \"runtime\"])\n", + " print(df_kernels, type(df_kernels))\n", + " sns.barplot(x='kernel', y='runtime', hue='compiler', data=df_kernels, ax=ax)\n", + " ax.axhline(1., ls='--', color =\"black\")\n", + " ax.xaxis.label.set_visible(False)\n", + " ax.set_yscale('symlog', base=2, linthresh=0.015)\n", + " ax.set_ylim(0.5, 2)\n", + " ax.set_yticks([0.5, 1, 2])\n", + " # ax.yaxis.label.set_visible(False)\n", + " 
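# (added comment) tick marks at 0.5, 1 and 2 follow the base-2 symlog scale configured above\n",
+ "    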
plt.ylabel('Speedup relative to {}'.format(baseline_name))\n", + " # ax.get_legend().remove()\n", + " plt.legend(loc=\"upper right\")\n", + " # if xaxis_label is not None:\n", + " # ax.get_xaxis().set_visible(False)\n", + " if print_values:\n", + " for i in ax.containers:\n", + " ax.bar_label(i,) \n", + " #if xaxis_label is not None:\n", + " # fig.text(0.5, 0.04, xaxis_label, ha='center', va='center')\n", + " #else:\n", + " # fig.text(0.5, 0.04, 'Target Microarchitecture-Instruction Set', ha='center', va='center')\n", + " # fig.text(0.06, 0.5, 'Relative Performance ({} = 1)'.format(baseline_name), ha='center', va='center', rotation='vertical')\n", + " # plt.xlabel(\"Kernel Name\")\n", + " # plt.legend(bbox_to_anchor=(1.04,1), loc=\"upper left\")\n", + " plt.savefig(\"{}/combined_benchmark_{}.pdf\".format(output_dir, graph_suffix), format=\"pdf\", bbox_inches=\"tight\")\n", + " plt.show()\n", + " plt.close()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "3dc55905", + "metadata": {}, + "outputs": [], + "source": [ + "hh_expsyn_gpu_1024x128 = {}\n", + "hh_expsyn_gpu_1024x128 = load_pickle_result_file([\"./reference_data/hh_gpu_20mil_1024x128.pickle\", \"./reference_data/expsyn_gpu_100mil_1024x128.pickle\"], hh_expsyn_gpu_1024x128)\n", + "compilers_comparison_config = \"\"\"\n", + "{\n", + " \"nvhpc\": {\n", + " \"nvptx64\": [\n", + " \"-O3 -gpu=nordc,fastmath\"\n", + " ]\n", + " },\n", + " \"nmodl_jit\": {\n", + " \"nvptx64\": [\n", + " \"libdevice_nnancontractafn\"\n", + " ]\n", + " }\n", + "}\n", + "\"\"\"\n", + "colors = ['#b2df8a','#969696']\n", + "sns.set_palette(sns.color_palette(colors))\n", + "\n", + "generate_graph_pandas_combined_relative_gpu_log(hh_expsyn_gpu_1024x128, compilers_comparison_config, \"hh_expsyn_gpu_relative_one_plot_log\", \"graphs_output_pandas\", xaxis_label=\"NVPTX64 Architecture\", print_values=False, plot_size=(4,3), baseline_name=\"nvhpc\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "6642369c", + "metadata": {}, + "outputs": [], + "source": [ + "def generate_graph_pandas_combined_relative_log_hatches(results, compilers_comparison_config, graph_suffix, output_dir, print_values=False, xaxis_label=None, plot_size=(12,6), fontsize=14, baseline_name=\"intel_svml\", hatches=['/', '//', '+', '++', 'X', '-', '--', '|', '||', '.', 'o'], hatches_colors=None, colors=['#6baed6', '#0570b0', '#66c2a4','#238b45','#b2df8a','#fdd49e','#fc8d59','#9ebcda','#8c96c6','#969696','#525252']):\n", + " # pick colors according to the following order which matches the order of \n", + " # intel #6baed6\n", + " # intel svml #0570b0\n", + " # gcc #66c2a4\n", + " # gcc_svml #238b45\n", + " # nvhpc #b2df8a\n", + " # clang #fdd49e\n", + " # clang_svml #fc8d59\n", + " # mod2ir #9ebcda\n", + " # mod2ir_svml #8c96c6\n", + " # mor2it_jit_svml #969696\n", + " # mod2ir_jit_sleef #525252\n", + " sns.set_palette(sns.color_palette(colors))\n", + " os.makedirs(output_dir, exist_ok=True)\n", + " compiler_flags = json.loads(compilers_comparison_config)\n", + " fig, axes = plt.subplots(1, 3, squeeze=False, figsize=plot_size)\n", + " ax_index = 0\n", + " for modname in results:\n", + " # state\n", + " bar_data_state_cpu_panda = {}\n", + " bar_data_state_cpu_panda[\"architecture\"] = []\n", + " bar_data_state_cpu_panda[\"compiler\"] = []\n", + " bar_data_state_cpu_panda[\"runtime\"] = []\n", + " # current\n", + " bar_data_cur_cpu_panda = {}\n", + " bar_data_cur_cpu_panda[\"architecture\"] = []\n", + " bar_data_cur_cpu_panda[\"compiler\"] = []\n", + " 
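# (added comment) rows accumulated here are later rescaled to speedup against the intel/nvhpc baseline\n",
+ "        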
bar_data_cur_cpu_panda[\"runtime\"] = []\n", + " baseline_cur = 0.0\n", + " for architecture in results[modname]:\n", + " for compiler in compiler_flags:\n", + " if compiler in results[modname][architecture] and architecture in compiler_flags[compiler]:\n", + " for flags in compiler_flags[compiler][architecture]:\n", + " if compiler == \"nmodl_jit\":\n", + " state_kernel_name = \"nrn_state_{}\".format(modname.replace(\"-\", \"_\"))\n", + " cur_kernel_name = \"nrn_cur_{}\".format(modname.replace(\"-\", \"_\"))\n", + " else:\n", + " state_kernel_name = \"nrn_state_ext\"\n", + " cur_kernel_name = \"nrn_cur_ext\"\n", + " if compiler == \"clang\" and \"jit\" in flags:\n", + " compiler_name = \"mod2ir\"\n", + " elif compiler == \"nmodl_jit\":\n", + " compiler_name = \"mod2ir_jit\"\n", + " else:\n", + " compiler_name = compiler\n", + " if \"svml\" in flags or \"SVML\" in flags:\n", + " compiler_name = compiler_name + \"_svml\"\n", + " if architecture != \"nvptx64\" and compiler == \"intel\":\n", + " baseline_state = results[modname][architecture][\"intel\"][_get_flags_string(flags)][state_kernel_name][0]\n", + " baseline_cur = results[modname][architecture][\"intel\"][_get_flags_string(flags)][cur_kernel_name][0]\n", + " elif \"sleef\" in flags or \"SLEEF\" in flags:\n", + " compiler_name = compiler_name + \"_sleef\"\n", + " if architecture == \"default\":\n", + " architecture_label = \"auto-scalar\"\n", + " elif architecture == \"nehalem\":\n", + " architecture_label = \"nehalem-sse2\"\n", + " elif architecture == \"broadwell\":\n", + " architecture_label = \"broadwell-avx2\"\n", + " elif architecture == \"nvptx64\":\n", + " architecture_label = architecture\n", + " if compiler == \"nvhpc\":\n", + " baseline_state = results[modname][architecture][\"nvhpc\"][_get_flags_string(flags)][state_kernel_name][0]\n", + " baseline_cur = results[modname][architecture][\"nvhpc\"][_get_flags_string(flags)][cur_kernel_name][0]\n", + " else: # skylake-avx512\n", + " architecture_label = architecture\n", + " if modname != \"expsyn\":\n", + " bar_data_state_cpu_panda[\"architecture\"].append(architecture_label)\n", + " bar_data_state_cpu_panda[\"compiler\"].append(compiler_name)\n", + " if _get_flags_string(flags) not in results[modname][architecture][compiler]:\n", + " bar_data_state_cpu_panda[\"runtime\"].append(0)\n", + " else:\n", + " bar_data_state_cpu_panda[\"runtime\"].append(results[modname][architecture][compiler][_get_flags_string(flags)][state_kernel_name][0])\n", + " bar_data_cur_cpu_panda[\"architecture\"].append(architecture_label)\n", + " bar_data_cur_cpu_panda[\"compiler\"].append(compiler_name)\n", + " if _get_flags_string(flags) not in results[modname][architecture][compiler]:\n", + " bar_data_cur_cpu_panda[\"runtime\"].append(0)\n", + " else:\n", + " bar_data_cur_cpu_panda[\"runtime\"].append(results[modname][architecture][compiler][_get_flags_string(flags)][cur_kernel_name][0])\n", + " for i, runtime in enumerate(bar_data_state_cpu_panda[\"runtime\"]):\n", + " bar_data_state_cpu_panda[\"runtime\"][i] = baseline_state/runtime\n", + " for i, runtime in enumerate(bar_data_cur_cpu_panda[\"runtime\"]):\n", + " bar_data_cur_cpu_panda[\"runtime\"][i] = baseline_cur/runtime\n", + " pd.options.display.float_format = \"{:,.2f}\".format\n", + " if modname != \"expsyn\":\n", + " df_state = pd.DataFrame(bar_data_state_cpu_panda, columns=[\"architecture\", \"compiler\", \"runtime\"])\n", + " # print(df_state, type(df_state))\n", + " sns.barplot(x='architecture', y='runtime', hue='compiler', data=df_state, 
ax=axes[0,ax_index])\n", + " axes[0,ax_index].set_yscale('symlog', base=2, linthresh=0.015)\n", + " axes[0,ax_index].set_ylim(0.125,2)\n", + " axes[0,ax_index].set_yticks([0.125, 0.25, 0.5, 1, 2])\n", + " axes[0,ax_index].axhline(1., ls='--', color =\"black\")\n", + " axes[0,ax_index].xaxis.label.set_visible(False)\n", + " axes[0,ax_index].yaxis.label.set_visible(False)\n", + " axes[0,ax_index].set_title(\"nrn_state_{}\".format(modname), fontsize=fontsize)\n", + " axes[0,ax_index].get_legend().remove()\n", + " if xaxis_label is not None:\n", + " axes[0,ax_index].get_xaxis().set_visible(False)\n", + " if print_values:\n", + " for i in axes[0,ax_index].containers:\n", + " axes[0,ax_index].bar_label(i,)\n", + " import itertools\n", + " num_locations = len(bar_data_state_cpu_panda[\"runtime\"])\n", + " assert num_locations == len(hatches), \"Number of bars should be equal to length of hatches\"\n", + " hatches = itertools.cycle(hatches)\n", + " for i, bar in enumerate(axes[0,ax_index].patches):\n", + " # print(\"i: {} bar: {}\".format(i, bar))\n", + " # if i % num_locations == 0:\n", + " hatch = next(hatches)\n", + " bar.set_hatch(hatch)\n", + " if hatches_colors is not None:\n", + " hatches_colors = itertools.cycle(hatches_colors)\n", + " for i, bar in enumerate(axes[0,ax_index].patches):\n", + " hatch_color = next(hatches_colors)\n", + " bar.set_edgecolor(hatch_color)\n", + " ax_index += 1\n", + " df_cur = pd.DataFrame(bar_data_cur_cpu_panda, columns=[\"architecture\", \"compiler\", \"runtime\"])\n", + " ax = sns.barplot(x='architecture', y='runtime', hue='compiler', data=df_cur, ax=axes[0,ax_index])\n", + " axes[0,ax_index].axhline(1., ls='--', color =\"black\")\n", + " # print(df_cur, type(df_cur))\n", + " axes[0,ax_index].set_yscale('symlog', base=2, linthresh=0.015)\n", + " axes[0,ax_index].set_ylim(0.125,2)\n", + " axes[0,ax_index].set_yticks([0.125, 0.25, 0.5, 1, 2])\n", + " # axes[0,ax_index].set_yticklabels([0.5, 1, 2, 4, 8])\n", + " #axes[0,ax_index].axhline(1., linewidth=2, color=(0, 0, 0, 0.9))\n", + " axes[0,ax_index].xaxis.label.set_visible(False)\n", + " axes[0,ax_index].yaxis.label.set_visible(False)\n", + " axes[0,ax_index].set_title(\"nrn_cur_{}\".format(modname), fontsize=fontsize)\n", + " axes[0,ax_index].get_legend().remove()\n", + " if xaxis_label is not None:\n", + " axes[0,ax_index].get_xaxis().set_visible(False)\n", + " if print_values:\n", + " for i in axes[0,ax_index].containers:\n", + " axes[0,ax_index].bar_label(i,)\n", + " hatches = itertools.cycle(hatches)\n", + " for i, bar in enumerate(axes[0,ax_index].patches):\n", + " # print(\"i: {} bar: {}\".format(i, bar))\n", + " # if i % num_locations == 0:\n", + " hatch = next(hatches)\n", + " bar.set_hatch(hatch)\n", + " if hatches_colors is not None:\n", + " hatches_colors = itertools.cycle(hatches_colors)\n", + " for i, bar in enumerate(axes[0,ax_index].patches):\n", + " hatch_color = next(hatches_colors)\n", + " bar.set_edgecolor(hatch_color)\n", + " ax_index += 1\n", + " \n", + " #if xaxis_label is not None:\n", + " # fig.text(0.5, 0.04, xaxis_label, ha='center', va='center')\n", + " #else:\n", + " # fig.text(0.5, 0.04, 'Target Microarchitecture-Instruction Set', ha='center', va='center')\n", + " fig.text(0.06, 0.5, 'Speedup over {}'.format(baseline_name), ha='center', va='center', rotation='vertical', fontsize=fontsize)\n", + " plt.legend(bbox_to_anchor=(1,1), loc=\"upper left\", fontsize=fontsize)\n", + " plt.savefig(\"{}/combined_benchmark_{}.pdf\".format(output_dir, graph_suffix), format=\"pdf\", 
bbox_inches=\"tight\")\n", + " plt.show()\n", + " plt.close()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "46921dde", + "metadata": {}, + "outputs": [], + "source": [ + "compilers_comparison_config = \"\"\"\n", + "{\n", + " \"intel\": {\n", + " \"skylake-avx512\": [\n", + " \"-O2 -mavx512f -prec-div -fopenmp\",\n", + " \"-O2 -mavx512f -prec-div -fimf-use-svml -fopenmp\"\n", + " ]\n", + " },\n", + " \"gcc\": {\n", + " \"skylake-avx512\": [\n", + " \"-O3 -march=skylake-avx512 -mtune=skylake -mavx512f -ffast-math -ftree-vectorize -fopenmp\",\n", + " \"-O3 -march=skylake-avx512 -mtune=skylake -mavx512f -ffast-math -ftree-vectorize -mveclibabi=svml -fopenmp\"\n", + " ]\n", + " },\n", + " \"nvhpc\": {\n", + " \"skylake-avx512\": [\n", + " \"-fast -O3 -mp=autopar -tp=skylake -Msafeptr=all -Minfo -Mvect=simd:512,gather -mavx512vbmi -mavx512vbmi2 -mavx512vl\"\n", + " ]\n", + " },\n", + " \"clang\": {\n", + " \"skylake-avx512\": [\n", + " \"-O3 -march=skylake-avx512 -mtune=skylake -ffast-math -fopenmp\",\n", + " \"-O3 -march=skylake-avx512 -mtune=skylake -ffast-math -fopenmp -fveclib=SVML\",\n", + " \"-O3 -march=skylake-avx512 -mtune=skylake -ffast-math -fopenmp jit SVML\",\n", + " \"-O3 -march=skylake-avx512 -mtune=skylake -ffast-math -fopenmp jit SLEEF\"\n", + " ]\n", + " },\n", + " \"nmodl_jit\": {\n", + " \"skylake-avx512\": [\n", + " \"SVML_nnancontractafn\",\n", + " \"SLEEF_nnancontractafn\"\n", + " ]\n", + " }\n", + "}\n", + "\"\"\"\n", + "hh_expsyn_cpu_results = {}\n", + "hh_expsyn_cpu_results = load_pickle_result_file([\"./reference_data/hh_expsyn_mavx512f.pickle\", \"./reference_data/hh_expsyn_nvhpc_cpu.pickle\"], hh_expsyn_cpu_results)\n", + "json_object = json.dumps(hh_expsyn_cpu_results, indent = 4) \n", + "generate_graph_pandas_combined_relative_log_hatches(hh_expsyn_cpu_results, compilers_comparison_config, \"hh_expsyn_cpu_relative_log\", \"graphs_output_pandas\", False, xaxis_label=\"skylake-avx512 Target Microarchitecture\", plot_size=(13,3.5), fontsize=11, hatches=['/', '\\\\', '//', '\\\\\\\\', '--', '+', '++', '/|', '-\\\\', '', 'X'], hatches_colors=['#6baed6', '#0570b0', '#66c2a4','#238b45','#b2df8a','#fdd49e','#fc8d59','#9ebcda','#8c96c6','#969696','#525252'], colors=['w', 'w', 'w','w','w','w','w','w','w','w','w'])\n", + "generate_graph_pandas_combined_relative_log_hatches(hh_expsyn_cpu_results, compilers_comparison_config, \"hh_expsyn_cpu_relative_log_darker_hatchgroupped\", \"graphs_output_pandas\", False, xaxis_label=\"skylake-avx512 Target Microarchitecture\", plot_size=(13,3.5), fontsize=11, hatches=['/', '\\\\', '//', '\\\\\\\\', '--', '+', '++', '/|', '-\\\\', '', 'X'], hatches_colors=['#4292c6', '#2171b5', '#41ab5d','#238b45','#006d2c','#fd8d3c','#a63603','#6a51a3','#54278f','#737373','#252525'], colors=['w', 'w', 'w','w','w','w','w','w','w','w','w'])\n", + "generate_graph_pandas_combined_relative_log_hatches(hh_expsyn_cpu_results, compilers_comparison_config, \"hh_expsyn_cpu_relative_log_filled\", \"graphs_output_pandas\", False, xaxis_label=\"skylake-avx512 Target Microarchitecture\", plot_size=(13,3.5), fontsize=11, hatches=['/', '\\\\', '//', '\\\\\\\\', '--', '+', '++', '/|', '-\\\\', '', 'X'], hatches_colors=['#252525', '#252525', '#252525','#252525','#252525','#252525','#252525','#252525','#252525','#252525','#252525'])\n", + "generate_graph_pandas_combined_relative_log_hatches(hh_expsyn_cpu_results, compilers_comparison_config, \"hh_expsyn_cpu_relative_log_filled_light\", \"graphs_output_pandas\", False, xaxis_label=\"skylake-avx512 
Target Microarchitecture\", plot_size=(13,3.5), fontsize=11, hatches=['/', '\\\\', '//', '\\\\\\\\', '--', '+', '++', '/|', '-\\\\', '', 'X'], colors=['#deebf7', '#9ecae1', '#c7e9c0','#a1d99b','#74c476','#fee6ce','#fdd0a2','#dadaeb','#bcbddc','#d9d9d9','#bdbdbd'])\n", + "generate_graph_pandas_combined_relative_log_hatches(hh_expsyn_cpu_results, compilers_comparison_config, \"hh_expsyn_cpu_relative_log_filled_light_hatchesgroupped\", \"graphs_output_pandas\", False, xaxis_label=\"skylake-avx512 Target Microarchitecture\", plot_size=(13,3.5), fontsize=11, hatches=['/', '\\\\', '//', '\\\\\\\\', '--', '+', '++', '/|', '-\\\\', '', 'X'], hatches_colors=['#252525', '#252525', '#252525','#252525','#252525','#252525','#252525','#252525','#252525','#252525','#252525'], colors=['#deebf7', '#9ecae1', '#c7e9c0','#a1d99b','#74c476','#fee6ce','#fdd0a2','#dadaeb','#bcbddc','#d9d9d9','#bdbdbd'])" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "043256fd", + "metadata": {}, + "outputs": [], + "source": [ + "def generate_graph_pandas_combined_relative_gpu_log_hatches(results, compilers_comparison_config, graph_suffix, output_dir, print_values=False, xaxis_label=None, plot_size=(12,6), fontsize=14, baseline_name=\"intel_svml\", hatches=['X', '-']):\n", + " os.makedirs(output_dir, exist_ok=True)\n", + " compiler_flags = json.loads(compilers_comparison_config)\n", + " fig, axes = plt.subplots(1, 1, squeeze=False, figsize=plot_size)\n", + " ax = axes[0,0]\n", + " bar_data_gpu_panda = {}\n", + " bar_data_gpu_panda[\"kernel\"] = []\n", + " bar_data_gpu_panda[\"compiler\"] = []\n", + " bar_data_gpu_panda[\"runtime\"] = []\n", + " baseline_kernel = {}\n", + " for modname in results:\n", + " for architecture in results[modname]:\n", + " for compiler in compiler_flags:\n", + " if compiler in results[modname][architecture] and architecture in compiler_flags[compiler]:\n", + " for flags in compiler_flags[compiler][architecture]:\n", + " if compiler == \"nmodl_jit\":\n", + " state_kernel_name = \"nrn_state_{}\".format(modname.replace(\"-\", \"_\"))\n", + " cur_kernel_name = \"nrn_cur_{}\".format(modname.replace(\"-\", \"_\"))\n", + " else:\n", + " state_kernel_name = \"nrn_state_ext\"\n", + " cur_kernel_name = \"nrn_cur_ext\"\n", + " label_state_name = \"nrn_state_{}\".format(modname.replace(\"-\", \"_\"))\n", + " label_cur_name = \"nrn_cur_{}\".format(modname.replace(\"-\", \"_\"))\n", + " if compiler == \"clang\" and \"jit\" in flags:\n", + " compiler_name = \"mod2ir\"\n", + " elif compiler == \"nmodl_jit\":\n", + " compiler_name = \"mod2ir_jit\"\n", + " else:\n", + " compiler_name = compiler\n", + " if architecture == \"nvptx64\":\n", + " architecture_label = architecture\n", + " if compiler == \"nvhpc\":\n", + " baseline_kernel[label_state_name] = results[modname][architecture][\"nvhpc\"][_get_flags_string(flags)][state_kernel_name][0]\n", + " baseline_kernel[label_cur_name] = results[modname][architecture][\"nvhpc\"][_get_flags_string(flags)][cur_kernel_name][0]\n", + " else: # skylake-avx512\n", + " architecture_label = architecture\n", + " if modname != \"expsyn\":\n", + " bar_data_gpu_panda[\"kernel\"].append(label_state_name)\n", + " bar_data_gpu_panda[\"compiler\"].append(compiler_name)\n", + " if _get_flags_string(flags) not in results[modname][architecture][compiler]:\n", + " bar_data_gpu_panda[\"runtime\"].append(0)\n", + " else:\n", + " bar_data_gpu_panda[\"runtime\"].append(results[modname][architecture][compiler][_get_flags_string(flags)][state_kernel_name][0])\n", + " 
bar_data_gpu_panda[\"kernel\"].append(label_cur_name)\n", + " bar_data_gpu_panda[\"compiler\"].append(compiler_name)\n", + " if _get_flags_string(flags) not in results[modname][architecture][compiler]:\n", + " bar_data_gpu_panda[\"runtime\"].append(0)\n", + " else:\n", + " bar_data_gpu_panda[\"runtime\"].append(results[modname][architecture][compiler][_get_flags_string(flags)][cur_kernel_name][0])\n", + " for i, runtime in enumerate(bar_data_gpu_panda[\"runtime\"]):\n", + " kernel = bar_data_gpu_panda[\"kernel\"][i]\n", + " print(\"Scaling kernel {} arch {}\".format(kernel, bar_data_gpu_panda[\"compiler\"][i]))\n", + " bar_data_gpu_panda[\"runtime\"][i] = baseline_kernel[kernel]/runtime\n", + " pd.options.display.float_format = \"{:,.2f}\".format\n", + " print(bar_data_gpu_panda)\n", + " df_kernels = pd.DataFrame(bar_data_gpu_panda, columns=[\"kernel\", \"compiler\", \"runtime\"])\n", + " print(df_kernels, type(df_kernels))\n", + " sns.barplot(x='kernel', y='runtime', hue='compiler', data=df_kernels, ax=ax)\n", + " ax.axhline(1., ls='--', color =\"black\")\n", + " ax.xaxis.label.set_visible(False)\n", + " ax.xaxis.set_tick_params(labelsize=fontsize)\n", + " ax.set_yscale('symlog', base=2, linthresh=0.015)\n", + " ax.set_ylim(0.5, 2)\n", + " ax.set_yticks([0.5, 1, 2])\n", + " import itertools\n", + " # hatches = itertools.cycle(hatches)\n", + " # hatch = next(hatches)\n", + " hatches_colors = ['#b2df8a','#969696']\n", + " for i, bar in enumerate(ax.patches):\n", + " print(\"i: {} bar:{}\".format(i, bar))\n", + " hatch_index = 0 if i < len(set(bar_data_gpu_panda[\"kernel\"])) else 1\n", + " hatch = hatches[hatch_index]\n", + " bar.set_hatch(hatch)\n", + " bar.set_edgecolor(hatches_colors[hatch_index])\n", + " # ax.yaxis.label.set_visible(False)\n", + " plt.ylabel('Speedup over {}'.format(baseline_name), fontsize=fontsize)\n", + " # ax.get_legend().remove()\n", + " plt.legend(loc=\"upper right\", fontsize=fontsize)\n", + " # if xaxis_label is not None:\n", + " # ax.get_xaxis().set_visible(False)\n", + " if print_values:\n", + " for i in ax.containers:\n", + " ax.bar_label(i,) \n", + " #if xaxis_label is not None:\n", + " # fig.text(0.5, 0.04, xaxis_label, ha='center', va='center')\n", + " #else:\n", + " # fig.text(0.5, 0.04, 'Target Microarchitecture-Instruction Set', ha='center', va='center')\n", + " # fig.text(0.06, 0.5, 'Relative Performance ({} = 1)'.format(baseline_name), ha='center', va='center', rotation='vertical')\n", + " # plt.xlabel(\"Kernel Name\")\n", + " # plt.legend(bbox_to_anchor=(1.04,1), loc=\"upper left\")\n", + " plt.savefig(\"{}/combined_benchmark_{}.pdf\".format(output_dir, graph_suffix), format=\"pdf\", bbox_inches=\"tight\")\n", + " plt.show()\n", + " plt.close()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "8e597d4b", + "metadata": {}, + "outputs": [], + "source": [ + "hh_expsyn_gpu_1024x128 = {}\n", + "hh_expsyn_gpu_1024x128 = load_pickle_result_file([\"./reference_data/hh_gpu_20mil_1024x128.pickle\", \"./reference_data/expsyn_gpu_100mil_1024x128.pickle\"], hh_expsyn_gpu_1024x128)\n", + "compilers_comparison_config = \"\"\"\n", + "{\n", + " \"nvhpc\": {\n", + " \"nvptx64\": [\n", + " \"-O3 -gpu=nordc,fastmath\"\n", + " ]\n", + " },\n", + " \"nmodl_jit\": {\n", + " \"nvptx64\": [\n", + " \"libdevice_nnancontractafn\"\n", + " ]\n", + " }\n", + "}\n", + "\"\"\"\n", + "colors = ['w','w']\n", + "sns.set_palette(sns.color_palette(colors))\n", + "\n", + "generate_graph_pandas_combined_relative_gpu_log_hatches(hh_expsyn_gpu_1024x128, 
compilers_comparison_config, \"hh_expsyn_gpu_relative_one_plot_log\", \"graphs_output_pandas\", xaxis_label=\"NVPTX64 Architecture\", print_values=False, plot_size=(7,4.5), baseline_name=\"nvhpc\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "a8c862f5", + "metadata": {}, + "outputs": [], + "source": [ + "def generate_graph_pandas_cpu_combined_variance(results, compilers_comparison_config, graph_suffix, output_dir, print_values=False, xaxis_label=None, plot_size=(12,6), show_xlabels=False):\n", + " os.makedirs(output_dir, exist_ok=True)\n", + " compiler_flags = json.loads(compilers_comparison_config)\n", + " fig, axes = plt.subplots(1, 3, squeeze=False, figsize=plot_size)\n", + " ax_index = 0\n", + " for modname in results:\n", + " # state\n", + " bar_data_state_cpu_panda = {}\n", + " bar_data_state_cpu_panda[\"architecture\"] = []\n", + " bar_data_state_cpu_panda[\"compiler\"] = []\n", + " bar_data_state_cpu_panda[\"runtime\"] = []\n", + " # current\n", + " bar_data_cur_cpu_panda = {}\n", + " bar_data_cur_cpu_panda[\"architecture\"] = []\n", + " bar_data_cur_cpu_panda[\"compiler\"] = []\n", + " bar_data_cur_cpu_panda[\"runtime\"] = []\n", + " for architecture in results[modname]:\n", + " for compiler in compiler_flags:\n", + " if compiler in results[modname][architecture] and architecture in compiler_flags[compiler]:\n", + " for flags in compiler_flags[compiler][architecture]:\n", + " if compiler == \"nmodl_jit\":\n", + " state_kernel_name = \"nrn_state_{}\".format(modname.replace(\"-\", \"_\"))\n", + " cur_kernel_name = \"nrn_cur_{}\".format(modname.replace(\"-\", \"_\"))\n", + " else:\n", + " state_kernel_name = \"nrn_state_ext\"\n", + " cur_kernel_name = \"nrn_cur_ext\"\n", + " if compiler == \"clang\" and \"jit\" in flags:\n", + " compiler_name = \"mod2ir\"\n", + " elif compiler == \"nmodl_jit\":\n", + " compiler_name = \"mod2ir_jit\"\n", + " else:\n", + " compiler_name = compiler\n", + " if \"svml\" in flags or \"SVML\" in flags:\n", + " compiler_name = compiler_name + \"_svml\"\n", + " elif \"sleef\" in flags or \"SLEEF\" in flags:\n", + " compiler_name = compiler_name + \"_sleef\"\n", + " if architecture == \"default\":\n", + " architecture_label = \"auto-scalar\"\n", + " elif architecture == \"nehalem\":\n", + " architecture_label = \"nehalem-sse2\"\n", + " elif architecture == \"broadwell\":\n", + " architecture_label = \"broadwell-avx2\"\n", + " else: # skylake-avx512\n", + " architecture_label = architecture\n", + " if modname != \"expsyn\":\n", + " bar_data_state_cpu_panda[\"architecture\"].append(architecture_label)\n", + " bar_data_state_cpu_panda[\"compiler\"].append(compiler_name)\n", + " if _get_flags_string(flags) not in results[modname][architecture][compiler]:\n", + " bar_data_state_cpu_panda[\"runtime\"].append(0)\n", + " else:\n", + " bar_data_state_cpu_panda[\"runtime\"].append(results[modname][architecture][compiler][_get_flags_string(flags)][state_kernel_name][1]*results[modname][architecture][compiler][_get_flags_string(flags)][state_kernel_name][1])\n", + " bar_data_cur_cpu_panda[\"architecture\"].append(architecture_label)\n", + " bar_data_cur_cpu_panda[\"compiler\"].append(compiler_name)\n", + " if _get_flags_string(flags) not in results[modname][architecture][compiler]:\n", + " bar_data_cur_cpu_panda[\"runtime\"].append(0)\n", + " else:\n", + " 
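# index [1] of each result appears to hold the stddev across experiments;\n",
+    "    # it is squared here so the plotted quantity is the variance.\n",
+    "    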
bar_data_cur_cpu_panda[\"runtime\"].append(results[modname][architecture][compiler][_get_flags_string(flags)][cur_kernel_name][1]*results[modname][architecture][compiler][_get_flags_string(flags)][cur_kernel_name][1])\n", + " if modname != \"expsyn\":\n", + " df_state = pd.DataFrame(bar_data_state_cpu_panda, columns=[\"architecture\", \"compiler\", \"runtime\"])\n", + " sns.barplot(x='architecture', y='runtime', hue='compiler', data=df_state, ax=axes[0,ax_index])\n", + " axes[0,ax_index].xaxis.label.set_visible(False)\n", + " axes[0,ax_index].yaxis.label.set_visible(False)\n", + " axes[0,ax_index].set_title(\"nrn_state_{}\".format(modname))\n", + " axes[0,ax_index].get_legend().remove()\n", + " if not show_xlabels:\n", + " axes[0,ax_index].get_xaxis().set_visible(False)\n", + " if print_values:\n", + " for i in axes[0,ax_index].containers:\n", + " axes[0,ax_index].bar_label(i,)\n", + " ax_index += 1\n", + " df_cur = pd.DataFrame(bar_data_cur_cpu_panda, columns=[\"architecture\", \"compiler\", \"runtime\"])\n", + " sns.barplot(x='architecture', y='runtime', hue='compiler', data=df_cur, ax=axes[0,ax_index])\n", + " axes[0,ax_index].xaxis.label.set_visible(False)\n", + " axes[0,ax_index].yaxis.label.set_visible(False)\n", + " axes[0,ax_index].set_title(\"nrn_cur_{}\".format(modname))\n", + " axes[0,ax_index].get_legend().remove()\n", + " if not show_xlabels:\n", + " axes[0,ax_index].get_xaxis().set_visible(False)\n", + " if print_values:\n", + " for i in axes[0,ax_index].containers:\n", + " axes[0,ax_index].bar_label(i,)\n", + " ax_index += 1\n", + " if xaxis_label is not None:\n", + " fig.text(0.5, 0.04, xaxis_label, ha='center', va='center')\n", + " # else:\n", + " # fig.text(0.5, 0.04, 'Target Microarchitecture-Instruction Set', ha='center', va='center')\n", + " fig.text(0.06, 0.5, 'Runtime (s)', ha='center', va='center', rotation='vertical')\n", + " plt.legend(bbox_to_anchor=(1.04,1), loc=\"upper left\")\n", + " plt.savefig(\"{}/combined_benchmark_{}.pdf\".format(output_dir, graph_suffix), format=\"pdf\", bbox_inches=\"tight\")\n", + " plt.show()\n", + " plt.close()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "d959dade", + "metadata": {}, + "outputs": [], + "source": [ + "hh_expsyn_avx512f_results = {}\n", + "hh_expsyn_avx512f_results = load_pickle_result_file([\"./reference_data/hh_expsyn_mavx512f.pickle\", \"./reference_data/hh_expsyn_nvhpc_cpu.pickle\"], hh_expsyn_avx512f_results)\n", + "colors = ['#6baed6', '#0570b0', '#66c2a4','#238b45','#b2df8a','#fdd49e','#fc8d59','#9ebcda','#8c96c6','#969696','#525252']\n", + "# pick colors according to the following order which matches the order of \n", + "# intel #6baed6\n", + "# intel svml #0570b0\n", + "# gcc #66c2a4\n", + "# gcc_svml #238b45\n", + "# nvhpc #b2df8a\n", + "# clang #fdd49e\n", + "# clang_svml #fc8d59\n", + "# mod2ir #9ebcda\n", + "# mod2ir_svml #8c96c6\n", + "# mor2it_jit_svml #969696\n", + "# mod2ir_jit_sleef #525252\n", + "sns.set_palette(sns.color_palette(colors))\n", + "compilers_comparison_config = \"\"\"\n", + "{\n", + " \"intel\": {\n", + " \"skylake-avx512\": [\n", + " \"-O2 -mavx512f -prec-div -fopenmp\",\n", + " \"-O2 -mavx512f -prec-div -fimf-use-svml -fopenmp\"\n", + " ]\n", + " },\n", + " \"gcc\": {\n", + " \"skylake-avx512\": [\n", + " \"-O3 -march=skylake-avx512 -mtune=skylake -mavx512f -ffast-math -ftree-vectorize -fopenmp\",\n", + " \"-O3 -march=skylake-avx512 -mtune=skylake -mavx512f -ffast-math -ftree-vectorize -mveclibabi=svml -fopenmp\"\n", + " ]\n", + " },\n", + " \"nvhpc\": 
{\n", + " \"skylake-avx512\": [\n", + " \"-fast -O3 -mp=autopar -tp=skylake -Msafeptr=all -Minfo -Mvect=simd:512,gather -mavx512vbmi -mavx512vbmi2 -mavx512vl\"\n", + " ]\n", + " },\n", + " \"clang\": {\n", + " \"skylake-avx512\": [\n", + " \"-O3 -march=skylake-avx512 -mtune=skylake -ffast-math -fopenmp\",\n", + " \"-O3 -march=skylake-avx512 -mtune=skylake -ffast-math -fopenmp -fveclib=SVML\",\n", + " \"-O3 -march=skylake-avx512 -mtune=skylake -ffast-math -fopenmp jit SVML\",\n", + " \"-O3 -march=skylake-avx512 -mtune=skylake -ffast-math -fopenmp jit SLEEF\"\n", + " ]\n", + " },\n", + " \"nmodl_jit\": {\n", + " \"skylake-avx512\": [\n", + " \"SVML_nnancontractafn\",\n", + " \"SLEEF_nnancontractafn\"\n", + " ]\n", + " }\n", + "}\n", + "\"\"\"\n", + "generate_graph_pandas_cpu_combined_variance(hh_expsyn_avx512f_results, compilers_comparison_config, \"variance\", \"graphs_output_pandas\", True)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "9c7c6395", + "metadata": {}, + "outputs": [], + "source": [ + "def generate_graph_pandas_gpu_combined_variance(results, compilers_comparison_config, graph_suffix, output_dir, print_values=False):\n", + " compiler_flags = json.loads(compilers_comparison_config)\n", + " fig = plt.figure(figsize=(12, 6))\n", + " for i, modname in enumerate(results):\n", + " df = None\n", + " bar_data_gpu_panda = {}\n", + " bar_data_gpu_panda[\"kernel\"] = []\n", + " bar_data_gpu_panda[\"compiler\"] = []\n", + " bar_data_gpu_panda[\"runtime\"] = []\n", + " architecture = \"nvptx64\"\n", + " for compiler in results[modname][architecture]:\n", + " if compiler in compiler_flags and architecture in compiler_flags[compiler]:\n", + " for flags in compiler_flags[compiler][architecture]:\n", + " dict_label = \"{}_{}_{}\".format(architecture, compiler, _get_flags_string(flags))\n", + " if compiler == \"nmodl_jit\":\n", + " state_kernel_name = \"nrn_state_{}\".format(modname.replace(\"-\", \"_\"))\n", + " cur_kernel_name = \"nrn_cur_{}\".format(modname.replace(\"-\", \"_\"))\n", + " else:\n", + " state_kernel_name = \"nrn_state_ext\"\n", + " cur_kernel_name = \"nrn_cur_ext\"\n", + " if compiler == \"clang\" and \"jit\" in flags:\n", + " compiler_name = \"mod2ir\"\n", + " elif compiler == \"nmodl_jit\":\n", + " compiler_name = \"mod2ir_jit\"\n", + " else:\n", + " compiler_name = compiler\n", + " bar_data_gpu_panda[\"kernel\"].append(\"nrn_state\")\n", + " bar_data_gpu_panda[\"compiler\"].append(compiler_name)\n", + " if _get_flags_string(flags) not in results[modname][architecture][compiler]:\n", + " bar_data_gpu_panda[\"runtime\"].append(0)\n", + " else:\n", + " bar_data_gpu_panda[\"runtime\"].append(results[modname][architecture][compiler][_get_flags_string(flags)][state_kernel_name][1]*results[modname][architecture][compiler][_get_flags_string(flags)][state_kernel_name][1])\n", + " bar_data_gpu_panda[\"kernel\"].append(\"nrn_current\")\n", + " bar_data_gpu_panda[\"compiler\"].append(compiler_name)\n", + " if _get_flags_string(flags) not in results[modname][architecture][compiler]:\n", + " bar_data_gpu_panda[\"runtime\"].append(0)\n", + " else:\n", + " bar_data_gpu_panda[\"runtime\"].append(results[modname][architecture][compiler][_get_flags_string(flags)][cur_kernel_name][1]*results[modname][architecture][compiler][_get_flags_string(flags)][cur_kernel_name][1])\n", + " df_state = pd.DataFrame(bar_data_gpu_panda, columns=[\"kernel\", \"compiler\", \"runtime\"])\n", + " ax = fig.add_subplot(1, 2, i+1)\n", + " ax = sns.barplot(x='kernel', y='runtime', 
hue='compiler', data=df_state, ax=ax)\n", + " if print_values:\n", + " for i in ax.containers:\n", + " ax.bar_label(i,)\n", + " plt.xlabel(\"Kernel Name\")\n", + " if i == 0:\n", + " plt.ylabel(\"Runtime (s)\")\n", + " else:\n", + " ax.set(ylabel=None)\n", + " plt.title(\"OpenACC and MOD2IR comparison for {}\".format(modname))\n", + " # plt.legend(bbox_to_anchor=(1.04,1), loc=\"upper left\")\n", + " # plt.savefig(\"{}/{}_benchmark_{}.pdf\".format(output_dir, modname, graph_suffix), format=\"pdf\", bbox_inches=\"tight\")\n", + " plt.savefig(\"{}/gpu_combined_benchmark_{}.pdf\".format(output_dir, graph_suffix), format=\"pdf\", bbox_inches=\"tight\")\n", + " plt.show()\n", + " plt.close()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "f7434d4c", + "metadata": {}, + "outputs": [], + "source": [ + "hh_expsyn_gpu_1024x128 = {}\n", + "hh_expsyn_gpu_1024x128 = load_pickle_result_file([\"./reference_data/hh_gpu_20mil_1024x128.pickle\", \"./reference_data/expsyn_gpu_100mil_1024x128.pickle\"], hh_expsyn_gpu_1024x128)\n", + "compilers_comparison_config = \"\"\"\n", + "{\n", + " \"nvhpc\": {\n", + " \"nvptx64\": [\n", + " \"-O3 -gpu=nordc,fastmath\"\n", + " ]\n", + " },\n", + " \"nmodl_jit\": {\n", + " \"nvptx64\": [\n", + " \"libdevice_nnancontractafn\"\n", + " ]\n", + " }\n", + "}\n", + "\"\"\"\n", + "colors = ['#b2df8a','#bdbdbd']\n", + "sns.set_palette(sns.color_palette(colors))\n", + "\n", + "generate_graph_pandas_gpu_combined_variance(hh_expsyn_gpu_1024x128, compilers_comparison_config, \"hh_expsyn_gpu\", \"graphs_output_pandas\", print_values=False)" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.8.10" + }, + "vscode": { + "interpreter": { + "hash": "916dbcbb3f70747c44a77c7bcd40155683ae19c65e1c03b4aa3499c5328201f1" + } + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/test/benchmark/artifacts/artifact_script.py b/test/benchmark/artifacts/artifact_script.py new file mode 100644 index 0000000000..bfac021a99 --- /dev/null +++ b/test/benchmark/artifacts/artifact_script.py @@ -0,0 +1,34 @@ +import numpy as np +import nmodl.dsl as nmodl +from nmodl import ast, visitor + +# HH and expsyn mod files +models = ["hh.mod", "expsyn.mod"] + +# parse NMODL mechanisms, create ASTs and find their SUFFIX +models_ast = [] +models_name = [] +driver = nmodl.NmodlDriver() +lookup_visitor = visitor.AstLookupVisitor() +for mod in models: + models_ast.append(driver.parse_file(mod)) + models_name.append( + lookup_visitor.lookup(models_ast[-1], ast.AstNodeType.SUFFIX)[0].get_node_name() + ) + +# code generation and JIT configuration +cfg = nmodl.CodeGenConfig() +cfg.llvm_vector_width = 4 +cfg.llvm_opt_level_ir = 3 +cfg.nmodl_ast = True +jit = nmodl.Jit(cfg) + +# simulation configuration +dt = 0.025 +tstop = 1000 +instances_number = 10000 + +# representative simulator loop +for t in np.arange(0, tstop, dt): + for i, modast in enumerate(models_ast): + jit.run(modast, models_name[i], instances_number) diff --git a/test/benchmark/artifacts/expsyn.mod b/test/benchmark/artifacts/expsyn.mod new file mode 100644 index 0000000000..431836ba9a --- /dev/null +++ b/test/benchmark/artifacts/expsyn.mod @@ -0,0 +1,42 @@ +NEURON { + POINT_PROCESS ExpSyn + RANGE tau, e, 
i + NONSPECIFIC_CURRENT i +} + +UNITS { + (nA) = (nanoamp) + (mV) = (millivolt) + (uS) = (microsiemens) +} + +PARAMETER { + tau = 0.1 (ms) <1e-9,1e9> + e = 0 (mV) +} + +ASSIGNED { + v (mV) + i (nA) +} + +STATE { + g_state (uS) +} + +INITIAL { + g_state=0 +} + +BREAKPOINT { + SOLVE state METHOD cnexp + i = g_state*(v - e) +} + +DERIVATIVE state { + g_state' = -g_state/tau +} + +NET_RECEIVE(weight (uS)) { + g_state = g_state + weight +} diff --git a/test/benchmark/artifacts/hh.mod b/test/benchmark/artifacts/hh.mod new file mode 100644 index 0000000000..053a15f43f --- /dev/null +++ b/test/benchmark/artifacts/hh.mod @@ -0,0 +1,125 @@ +TITLE hh.mod squid sodium, potassium, and leak channels + +COMMENT + This is the original Hodgkin-Huxley treatment for the set of sodium, + potassium, and leakage channels found in the squid giant axon membrane. + ("A quantitative description of membrane current and its application + conduction and excitation in nerve" J.Physiol. (Lond.) 117:500-544 (1952).) + Membrane voltage is in absolute mV and has been reversed in polarity + from the original HH convention and shifted to reflect a resting potential + of -65 mV. + Remember to set celsius=6.3 (or whatever) in your HOC file. + See squid.hoc for an example of a simulation using this model. + SW Jaslove 6 March, 1992 +ENDCOMMENT + +UNITS { + (mA) = (milliamp) + (mV) = (millivolt) + (S) = (siemens) +} + +? interface +NEURON { + SUFFIX hh + USEION na READ ena WRITE ina + USEION k READ ek WRITE ik + NONSPECIFIC_CURRENT il + RANGE gnabar, gkbar, gl, el, gna, gk + :GLOBAL minf, hinf, ninf, mtau, htau, ntau + RANGE minf, hinf, ninf, mtau, htau, ntau + THREADSAFE : assigned GLOBALs will be per thread +} + +PARAMETER { + gnabar = .12 (S/cm2) <0,1e9> + gkbar = .036 (S/cm2) <0,1e9> + gl = .0003 (S/cm2) <0,1e9> + el = -54.3 (mV) +} + +STATE { + m h n +} + +ASSIGNED { + v (mV) + celsius (degC) + ena (mV) + ek (mV) + + gna (S/cm2) + gk (S/cm2) + ina (mA/cm2) + ik (mA/cm2) + il (mA/cm2) + minf hinf ninf + mtau (ms) htau (ms) ntau (ms) +} + +? currents +BREAKPOINT { + SOLVE states METHOD cnexp + gna = gnabar*m*m*m*h + ina = gna*(v - ena) + gk = gkbar*n*n*n*n + ik = gk*(v - ek) + il = gl*(v - el) +} + + +INITIAL { + rates(v) + m = minf + h = hinf + n = ninf +} + +? states +DERIVATIVE states { + rates(v) + m' = (minf-m)/mtau + h' = (hinf-h)/htau + n' = (ninf-n)/ntau +} + +:LOCAL q10 + + +? rates +PROCEDURE rates(v(mV)) { :Computes rate and other constants at current v. + :Call once from HOC to initialize inf at resting v. + LOCAL alpha, beta, sum, q10 +: TABLE minf, mtau, hinf, htau, ninf, ntau DEPEND celsius FROM -100 TO 100 WITH 200 + +UNITSOFF + q10 = 3^((celsius - 6.3)/10) + :"m" sodium activation system + alpha = .1 * vtrap(-(v+40),10) + beta = 4 * exp(-(v+65)/18) + sum = alpha + beta + mtau = 1/(q10*sum) + minf = alpha/sum + :"h" sodium inactivation system + alpha = .07 * exp(-(v+65)/20) + beta = 1 / (exp(-(v+35)/10) + 1) + sum = alpha + beta + htau = 1/(q10*sum) + hinf = alpha/sum + :"n" potassium activation system + alpha = .01*vtrap(-(v+55),10) + beta = .125*exp(-(v+65)/80) + sum = alpha + beta + ntau = 1/(q10*sum) + ninf = alpha/sum +} + +FUNCTION vtrap(x,y) { :Traps for 0 in denominator of rate eqns. 
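+    : As x/y -> 0 the closed form x/(exp(x/y)-1) approaches 0/0, so the first
+    : branch below substitutes its Taylor expansion y*(1 - x/y/2) instead.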
+ if (fabs(x/y) < 1e-6) { + vtrap = y*(1 - x/y/2) + }else{ + vtrap = x/(exp(x/y) - 1) + } +} + +UNITSON diff --git a/test/benchmark/benchmark_script.py b/test/benchmark/benchmark_script.py new file mode 100644 index 0000000000..865964b5ef --- /dev/null +++ b/test/benchmark/benchmark_script.py @@ -0,0 +1,541 @@ +import argparse +from dataclasses import dataclass, field +import json +import os +from pathlib import Path +import pickle +import re +import shutil +import subprocess + +import nmodl.dsl as nmodl + +@dataclass +class CompilersConfig: + svml_lib: str = "" + intel_exe: str = "" + sleef_lib: str = "" + clang_exe: str = "" + llc_exe: str = "" + gcc_exe: str = "" + nvhpc_exe: str = "" + libdevice_lib: str = "" + nmodl_exe: str = "" + + def get_compiler_cmd(self, compiler): + if compiler == "intel": + return self.intel_exe + elif compiler == "clang": + return self.clang_exe + elif compiler == "gcc": + return self.gcc_exe + elif compiler == "nvhpc": + return self.nvhpc_exe + else: + raise Exception("Unknown compiler") + +@dataclass +class BenchmarkConfig: + math_libraries = ["SVML", "SLEEF"] + llvm_fast_math_flags = ["nnan", "contract", "afn"] + + mod_files: str = "" + architectures: str = "" + compilers: str = "" + external_kernel: bool = False + nmodl_jit: bool = True + output_directory: str = "benchmark_output" + instances: int = 100000000 + experiments: int = 5 + modfile_directory: str = "." + ext_lib_name: str = "libextkernel.so" + compiler_flags: dict = field(init=False) + gpu_target_architecture: str = "sm_70" + + def __post_init__(self): + with open('compiler_flags.json','r') as fp: + self.compiler_flags = json.load(fp) + + +class Benchmark: + + def __init__(self, compiler_config, benchmark_config): + self.results = {} + self.compiler_config = compiler_config + self.benchmark_config = benchmark_config + + def translate_mod_file_to_cpp(self, mod_file): + """Translate mod file to cpp file that can be + compiled by the compilers specified and then + executed by NMODL + """ + pass + + def compile_llvm_ir_clang(self, llvm_ir_file_path, flags, external_lib_path): + """Compile LLVM IR file with clang""" + print("Compiling LLVM IR file with clang") + compiler_cmd = self.compiler_config.get_compiler_cmd("clang") + bash_command = [compiler_cmd] + flags.split(" ") + ["./"+llvm_ir_file_path, "-fpic", "-shared", "-o {}".format(external_lib_path)] + print("Executing command: {} {}".format(compiler_cmd, ' '.join(bash_command))) + result = subprocess.run(" ".join(bash_command), capture_output=True, text=True, shell=True, env=os.environ.copy()) + print("stdout:", result.stdout) + print("stderr:", result.stderr) + result.check_returncode() + + def translate_mod_file_to_llvm_ir_for_clang(self, + modfile_str, + modname, + compiler, + architecture, + math_lib, + flags): + """Translate mod file to cpp wrapper file and + LLVM IR file that can be compiled by the compilers + specified and then executed by NMODL + """ + cfg = nmodl.CodeGenConfig() + cfg.llvm_ir = True + cfg.llvm_opt_level_ir = 3 + cfg.llvm_math_library = math_lib + cfg.llvm_fast_math_flags = self.benchmark_config.llvm_fast_math_flags + cfg.llvm_cpu_name = architecture + if architecture == "skylake-avx512": + cfg.llvm_vector_width = 8 + elif architecture == "broadwell": + cfg.llvm_vector_width = 4 + elif architecture == "nehalem": + cfg.llvm_vector_width = 2 + else: + cfg.llvm_vector_width = 1 + cfg.llvm_opt_level_codegen = 3 + if math_lib == "SVML": + cfg.shared_lib_paths = [self.compiler_config.svml_lib] + elif math_lib == "SLEEF": + 
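# SLEEF is the open-source vectorized math library used instead of SVML here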
+            cfg.shared_lib_paths = [self.compiler_config.sleef_lib]
+        cfg.output_dir = str((Path(self.benchmark_config.output_directory)
+                              / modname
+                              / compiler
+                              / architecture
+                              / self._get_flags_string(flags+"_"+math_lib)))
+        modast = self.init_ast(modfile_str)
+        # Run JIT to generate the LLVM IR with the wrappers needed to run JIT later
+        jit = nmodl.Jit(cfg)
+        res = jit.run(modast, modname, 1, 1)
+        jit_llvm_ir_file_path = str(Path(cfg.output_dir) / "v{}_{}_opt.ll".format(cfg.llvm_vector_width, modname))
+        jit_llvm_ir_file_ext_path = str(Path(cfg.output_dir) / "v{}_{}_opt_ext.ll".format(cfg.llvm_vector_width, modname))
+        with open(jit_llvm_ir_file_path, "r") as inf:
+            llvm_ir_file_content = inf.read()
+        llvm_ir_file_content = re.sub(r'nrn_state_{}'.format(modname.replace('-','_')), r'_Z13nrn_state_extPv', llvm_ir_file_content)
+        llvm_ir_file_content = re.sub(r'nrn_cur_{}'.format(modname.replace('-','_')), r'_Z11nrn_cur_extPv', llvm_ir_file_content)
+        with open(jit_llvm_ir_file_ext_path, "w") as outf:
+            outf.write(llvm_ir_file_content)
+        return jit_llvm_ir_file_ext_path
+
+    def _get_flags_string(self, flags):
+        return flags.replace(" ", "_").replace('-','').replace('=','_')
+
+    def _make_external_lib_basepath(self, cpp_file, compiler, architecture, flags):
+        cpp_basename = os.path.splitext(os.path.basename(cpp_file))[0]
+        external_lib_dir = (Path(self.benchmark_config.output_directory)
+                            / cpp_basename
+                            / compiler
+                            / architecture
+                            / self._get_flags_string(flags))
+        if not os.path.exists(external_lib_dir):
+            os.makedirs(external_lib_dir)
+        return external_lib_dir
+
+    def _get_external_lib_path(self, cpp_file, compiler, architecture, flags):
+        external_lib_path = self._make_external_lib_basepath(
+            cpp_file, compiler, architecture, flags) / self.benchmark_config.ext_lib_name
+        return external_lib_path
+
+    def compile_external_library(self, cpp_file, compiler, architecture, flags):
+        """Compile cpp_file to an external shared library
+        that holds the state and current kernels and can
+        then be loaded by NMODL to execute these kernels
+        """
+        print("Compiling external library with {} compiler ({}, {})".format(compiler, architecture, flags))
+        compiler_cmd = self.compiler_config.get_compiler_cmd(compiler)
+
+        cpp_basename = os.path.splitext(os.path.basename(cpp_file))[0]
+        external_lib_dir = self._make_external_lib_basepath(cpp_file, compiler, architecture, flags)
+        # OpenACC execution of the expsyn mod file is different from the rest of the mod files
+        if compiler == "nvhpc" and architecture == "nvptx64":
+            cpp_basename_org = cpp_basename
+            cpp_basename = cpp_basename + "_openacc"
+            cpp_file_org = cpp_file
+            cpp_file = cpp_file.replace(cpp_basename_org, cpp_basename)
+            print("Changing {} file to {}".format(cpp_file_org, cpp_file))
+        # Replace the pragma in cpp_file with the one suited to this compiler and write the result to a new file
+        sed_replaced_cpp_file = external_lib_dir / (Path(cpp_basename + "_ext.cpp"))
+        with open(cpp_file, "r") as inf:
+            cpp_file_content = inf.read()
+        if "-fopenmp" in flags or "-mp=autopar" in flags or compiler == "intel":
+            cpp_file_content = re.sub(r'#pragma.*', r'#pragma omp simd', cpp_file_content)
+        elif compiler == "clang":
+            cpp_file_content = re.sub(r'#pragma.*', r'#pragma clang loop vectorize(enable)', cpp_file_content)
+        elif compiler == "gcc":
+            cpp_file_content = re.sub(r'#pragma.*', r'#pragma GCC ivdep', cpp_file_content)
+        elif compiler == "nvhpc" and "openacc" not in cpp_basename and architecture == "nvptx64":
+            cpp_file_content = re.sub(r'#pragma.*', r'#pragma acc parallel loop deviceptr(inst)',
cpp_file_content) + with open(sed_replaced_cpp_file, "w") as outf: + outf.write(cpp_file_content) + + if "openacc" in cpp_file: + cpp_file = cpp_file.replace("_openacc", "") + + external_lib_path = self._get_external_lib_path(cpp_file, compiler, architecture, flags) + intel_lib_dir = os.path.dirname(self.compiler_config.svml_lib) + if architecture != "nvptx64" and ("svml" in flags or "SVML" in flags): + bash_command = [compiler_cmd] + flags.split(" ") + ["./"+str(sed_replaced_cpp_file), "-fpic", "-shared", "-o {}".format(external_lib_path), "-Wl,-rpath,{}".format(intel_lib_dir), "-L{}".format(intel_lib_dir), "-lsvml"] + elif architecture != "nvptx64": + bash_command = [compiler_cmd] + flags.split(" ") + ["./"+str(sed_replaced_cpp_file), "-fpic", "-shared", "-o {}".format(external_lib_path), "-Wl,-rpath,{}".format(intel_lib_dir)] + else: + bash_command = [compiler_cmd] + flags.split(" ") + ["./"+str(sed_replaced_cpp_file), "-fPIC", "-shared", "-o {}".format(external_lib_path), "-acc", "-nomp", "-gpu=cc70"] + if "-fopenmp" in flags: + if compiler == "gcc": + bash_command.append("-Wl,-rpath,{}".format("/".join(self.compiler_config.gcc_exe.split("/")[0:-2]+["lib64"]))) + elif compiler == "clang": + bash_command.append("-Wl,-rpath,{}".format("/".join(self.compiler_config.clang_exe.split("/")[0:-2]+["lib"]))) + if compiler == "gcc" or compiler == "clang": + bash_command.append("-save-temps=obj") + print("Executing command: {}".format(' '.join(bash_command))) + result = subprocess.run(" ".join(bash_command), capture_output=True, text=True, shell=True, env=os.environ.copy()) + print("stdout:", result.stdout) + print("stderr:", result.stderr) + result.check_returncode() + + def run_external_kernel( + self, + modfile_str, + modname, + compiler, + architecture, + gpu_target_architecture, + flags, + instances, + experiments, + ): + """Runs all external kernel related benchmarks""" + """Runs NMODL JIT kernels""" + cfg = nmodl.CodeGenConfig() + cfg.llvm_ir = True + cfg.llvm_opt_level_ir = 3 + if architecture != "nvptx64": + cfg.llvm_math_library = "SVML" + else: + cfg.llvm_math_library = "libdevice" + cfg.llvm_fast_math_flags = self.benchmark_config.llvm_fast_math_flags + if architecture != "nvptx64": + cfg.llvm_cpu_name = architecture + if architecture == "skylake-avx512": + cfg.llvm_vector_width = 8 + elif architecture == "broadwell": + cfg.llvm_vector_width = 4 + elif architecture == "nehalem": + cfg.llvm_vector_width = 2 + else: + cfg.llvm_vector_width = 1 + cfg.llvm_opt_level_codegen = 3 + if architecture != "nvptx64": + cfg.shared_lib_paths = [self.compiler_config.svml_lib] + else: + cfg.shared_lib_paths = [self.compiler_config.libdevice_lib] + cfg.llvm_gpu_name = "nvptx64" + cfg.llvm_gpu_target_architecture = gpu_target_architecture + cfg.output_dir = str((Path(self.benchmark_config.output_directory) + / modname + / compiler + / architecture + / self._get_flags_string(flags))) + modast = self.init_ast(modfile_str) + jit = nmodl.Jit(cfg) + external_lib_path = "./" / self._get_external_lib_path(modname+".cpp", compiler, architecture, flags) + res = jit.run(modast, modname, (int)(experiments), (int)(instances), str(external_lib_path)) + return res + + def run_JIT_kernels( + self, + modfile_str, + modname, + architecture, + gpu_target_architecture, + fast_math_flags, + math_lib, + instances, + experiments, + gpu_grid_dim = 1, + gpu_block_dim = 1 + ): + """Runs NMODL JIT kernels""" + cfg = nmodl.CodeGenConfig() + cfg.llvm_ir = True + cfg.llvm_opt_level_ir = 3 + cfg.llvm_math_library = math_lib + 
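# LLVM fast-math flags: "nnan" assumes no NaNs, "contract" allows fusing
+        # multiply-adds and "afn" permits approximate math-function calls.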
+        cfg.llvm_fast_math_flags = fast_math_flags
+        if architecture != "nvptx64":
+            cfg.llvm_cpu_name = architecture
+        else:
+            cfg.llvm_gpu_name = "nvptx64"
+            cfg.llvm_gpu_target_architecture = gpu_target_architecture
+        if architecture == "skylake-avx512":
+            cfg.llvm_vector_width = 8
+        elif architecture == "broadwell":
+            cfg.llvm_vector_width = 4
+        elif architecture == "nehalem":
+            cfg.llvm_vector_width = 2
+        else:
+            cfg.llvm_vector_width = 1
+        cfg.llvm_opt_level_codegen = 3
+        if math_lib == "SVML":
+            cfg.shared_lib_paths = [self.compiler_config.svml_lib]
+        elif math_lib == "SLEEF":
+            cfg.shared_lib_paths = [self.compiler_config.sleef_lib]
+        elif math_lib == "libdevice":
+            cfg.shared_lib_paths = [self.compiler_config.libdevice_lib]
+        cfg.output_dir = str((Path(self.benchmark_config.output_directory)
+                              / modname
+                              / "nmodl_jit"
+                              / architecture
+                              / math_lib))
+        modast = self.init_ast(modfile_str)
+        jit = nmodl.Jit(cfg)
+        res = jit.run(modast, modname, (int)(experiments), (int)(instances), "", gpu_grid_dim, gpu_block_dim)
+        return res
+
+    def init_ast(self, mod_file_string):
+        driver = nmodl.NmodlDriver()
+        modast = driver.parse_string(mod_file_string)
+        return modast
+
+    def run_benchmark(self):
+        for modfile in self.benchmark_config.mod_files:
+            modname = os.path.splitext(os.path.basename(modfile))[0]
+            print("Running benchmark for mod file: {}".format(modfile))
+            if modname not in self.results:
+                self.results[modname] = {}
+            # Make the number of instances smaller for the hh kernel due to its already large memory footprint
+            if modname == "hh":
+                kernel_instance_size = self.benchmark_config.instances / 5
+            else:
+                kernel_instance_size = self.benchmark_config.instances
+
+            with open(modfile) as f:
+                modfile_str = f.read()
+
+            # Delete any existing output directory for this mod file
+            output_dir = os.path.join(self.benchmark_config.output_directory, modname)
+            if os.path.isdir(output_dir):
+                shutil.rmtree(output_dir)
+
+            for architecture in self.benchmark_config.architectures:
+                print('Architecture: {}'.format(architecture))
+                if architecture not in self.results[modname]:
+                    self.results[modname][architecture] = {}
+                if self.benchmark_config.external_kernel:
+                    for compiler in self.benchmark_config.compilers:
+                        # Skip nvptx64 for every compiler except nvhpc, and skip nvhpc on architectures other than skylake-avx512 and nvptx64
+                        if (architecture == "nvptx64" and compiler != "nvhpc") or (architecture not in ["skylake-avx512", "nvptx64"] and compiler == "nvhpc"):
+                            continue
+                        if compiler not in self.results[modname][architecture]:
+                            self.results[modname][architecture][compiler] = {}
+                        for flags in self.benchmark_config.compiler_flags[compiler][
+                            architecture
+                        ]:
+                            # Translate the mod file to .cpp to be compiled by the given compiler
+                            # TODO: see above
+                            # Compile the .cpp file to a shared library
+                            self.compile_external_library(os.path.join("kernels", modname+".cpp"), compiler, architecture, flags)
+                            # Run NMODL JIT with the external shared library
+                            self.results[modname][architecture][compiler][self._get_flags_string(flags)] = self.run_external_kernel(modfile_str,
+                                                                                                                                    modname,
+                                                                                                                                    compiler,
+                                                                                                                                    architecture,
+                                                                                                                                    self.benchmark_config.gpu_target_architecture,
+                                                                                                                                    flags,
+                                                                                                                                    kernel_instance_size,
+                                                                                                                                    self.benchmark_config.experiments)
+                            print(
+                                "self.results[modname][architecture][compiler][flags] = jit.run(modast, modname, self.config.instances, self.config.experiments, external_lib)"
+                            )
+                            if compiler == "clang" and "SVML" not in flags:
+                                for math_lib in self.benchmark_config.math_libraries:
+                                    # Generate LLVM IR from NMODL JIT
+                                    # sed the
nrn_state_hh name to _Z16nrn_state_hh_extPv to match the external kernel signature name of the external shared lib + # compile LLVM IR using clang and the compiler flags of the architecture used and generate shared library + # Run NMODL JIT with external shared library + jit_llvm_ir_file_ext_path = self.translate_mod_file_to_llvm_ir_for_clang(modfile_str, modname, compiler, architecture, math_lib, flags+"_jit") + external_lib_path = self._get_external_lib_path(modname, "clang", architecture, flags+"_jit_"+math_lib) + self.compile_llvm_ir_clang(jit_llvm_ir_file_ext_path, flags, external_lib_path) + self.results[modname][architecture][compiler][self._get_flags_string(flags)+"_jit_"+math_lib] = self.run_external_kernel(modfile_str, + modname, + compiler, + architecture, + self.benchmark_config.gpu_target_architecture, + flags+"_jit_"+math_lib, + kernel_instance_size, + self.benchmark_config.experiments) + print( + 'self.results[modname][architecture][compiler][flags+"jit"] = jit.run(modast, modname, self.config.instances, self.config.experiments, external_lib)' + ) + if self.benchmark_config.nmodl_jit: + self.results[modname][architecture]["nmodl_jit"] = {} + for fast_math in [True]: + if fast_math: + fast_math_flags = self.benchmark_config.llvm_fast_math_flags + fast_math_name = "nnancontractafn" + else: + fast_math_flags = [""] + fast_math_name = "nonfastmath" + if architecture != "nvptx64": + for math_lib in self.benchmark_config.math_libraries: + # Run NMODL JIT on CPU + print( + 'self.results[modname][architecture]["nmodl_jit"][math_lib+fast_math_name] = jit.run(modast, modname, self.config.instances, self.config.experiments)' + ) + self.results[modname][architecture]["nmodl_jit"][ + math_lib + "_" + fast_math_name + ] = self.run_JIT_kernels( + modfile_str, + modname, + architecture, + "", + fast_math_flags, + math_lib, + kernel_instance_size, + self.benchmark_config.experiments + ) + else: + # Run NMODL JIT on GPU + self.results[modname][architecture]["nmodl_jit"][ + "libdevice_" + fast_math_name + ] = self.run_JIT_kernels( + modfile_str, + modname, + architecture, + self.benchmark_config.gpu_target_architecture, + fast_math_flags, + "libdevice", + kernel_instance_size, + self.benchmark_config.experiments, + 1024, + 128 + ) + print( + 'self.results[modname][architecture]["nmodl_jit_cuda"]["libdevice"+fast_math_name] = jit.run(modast, modname, self.config.instances, self.config.experiments)' + ) + if not self.benchmark_config.external_kernel and not self.benchmark_config.nmodl_jit: + raise Exception("No kernel to run. 
Select --external_kernel and/or --nmodl_jit")
+        print(self.results)
+        with open('{}/benchmark_results.pickle'.format(self.benchmark_config.output_directory), 'wb') as handle:
+            pickle.dump(self.results, handle, protocol=pickle.HIGHEST_PROTOCOL)
+
+def parse_arguments():
+    parser = argparse.ArgumentParser(description="Benchmark script for NMODL LLVM.")
+    # Arguments to initialize BenchmarkConfig
+    parser.add_argument(
+        "--modfiles",
+        nargs="+",
+        help="Mod files to benchmark",
+        required=True,
+    )
+    parser.add_argument(
+        "--architectures", nargs="+", help="Architectures to benchmark", required=True
+    )
+    parser.add_argument(
+        "--compilers", nargs="+", help="Compilers to benchmark", required=True,
+        choices=["intel", "clang", "gcc", "nvhpc"]
+    )
+    parser.add_argument(
+        "--external_kernel",
+        help="Run external kernel benchmarks",
+        action="store_true",
+    )
+    parser.add_argument(
+        "--nmodl_jit",
+        help="Run JIT benchmarks with NMODL generated kernels",
+        action="store_true",
+    )
+    parser.add_argument(
+        "--output", help="Output directory for benchmark results", required=True
+    )
+    parser.add_argument(
+        "--instances",
+        type=int,
+        default=100000000,
+        help="Number of mechanism instances to benchmark",
+        required=False,
+    )
+    parser.add_argument(
+        "--experiments",
+        type=int,
+        default=5,
+        help="Number of times each kernel is executed",
+        required=False,
+    )
+    # Arguments to initialize CompilersConfig
+    parser.add_argument(
+        "--svml_lib", type=str, help="SVML library directory to use", required=True
+    )
+    parser.add_argument(
+        "--intel_exe", type=str, help="Intel compiler executable to use", required=True
+    )
+    parser.add_argument(
+        "--sleef_lib", type=str, help="Sleef library directory to use", required=True
+    )
+    parser.add_argument(
+        "--clang_exe", type=str, help="Clang compiler executable to use", required=True
+    )
+    parser.add_argument(
+        "--llc_exe", type=str, help="LLC compiler executable to use", required=True
+    )
+    parser.add_argument(
+        "--gcc_exe", type=str, help="GCC compiler executable to use", required=True
+    )
+    parser.add_argument(
+        "--nvhpc_exe", type=str, help="NVHPC compiler executable to use", required=True
+    )
+    parser.add_argument(
+        "--libdevice_lib",
+        type=str,
+        help="Libdevice library directory to use",
+        required=True,
+    )
+    parser.add_argument(
+        "--nmodl_exe", type=str, help="NMODL executable to use", required=True
+    )
+
+    args, _ = parser.parse_known_args()
+    return args
+
+
+def main():
+    args = parse_arguments()
+    benchmark_config = BenchmarkConfig(
+        args.modfiles,
+        args.architectures,
+        args.compilers,
+        args.external_kernel,
+        args.nmodl_jit,
+        args.output,
+        args.instances,
+        args.experiments
+    )
+    compilers_config = CompilersConfig(
+        args.svml_lib,
+        args.intel_exe,
+        args.sleef_lib,
+        args.clang_exe,
+        args.llc_exe,
+        args.gcc_exe,
+        args.nvhpc_exe,
+        args.libdevice_lib,
+        args.nmodl_exe
+    )
+    benchmark = Benchmark(compilers_config, benchmark_config)
+    benchmark.run_benchmark()
+    return
+
+if __name__ == "__main__":
+    main()
diff --git a/test/benchmark/compiler_flags.json b/test/benchmark/compiler_flags.json
new file mode 100644
index 0000000000..4e9162f5a9
--- /dev/null
+++ b/test/benchmark/compiler_flags.json
@@ -0,0 +1,64 @@
+{
+    "intel": {
+        "skylake-avx512": [
+            "-O2 -mavx512f -prec-div -fopenmp",
+            "-O2 -mavx512f -prec-div -fimf-use-svml -fopenmp"
+        ],
+        "broadwell": [
+            "-O2 -march=broadwell -mtune=broadwell -prec-div",
+            "-O2 -march=broadwell -mtune=broadwell -prec-div -fimf-use-svml"
+        ],
+        "nehalem": [
+            "-O2 -msse2 -prec-div",
+            "-O2 -msse2 -prec-div 
-fimf-use-svml" + ], + "default": [ + "-O2 -prec-div", + "-O2 -prec-div -fimf-use-svml" + ] + }, + "clang": { + "skylake-avx512": [ + "-O3 -march=skylake-avx512 -mtune=skylake -ffast-math -fopenmp", + "-O3 -march=skylake-avx512 -mtune=skylake -ffast-math -fopenmp -fveclib=SVML" + ], + "broadwell": [ + "-O3 -march=broadwell -mtune=broadwell -ffast-math -fopenmp", + "-O3 -march=broadwell -mtune=broadwell -ffast-math -fopenmp -fveclib=SVML" + ], + "nehalem": [ + "-O3 -march=nehalem -mtune=nehalem -ffast-math -fopenmp", + "-O3 -march=nehalem -mtune=nehalem -ffast-math -fopenmp -fveclib=SVML" + ], + "default": [ + "-O3 -ffast-math", + "-O3 -ffast-math -fopenmp -fveclib=SVML" + ] + }, + "gcc": { + "skylake-avx512": [ + "-O3 -march=skylake-avx512 -mtune=skylake -mavx512f -ffast-math -ftree-vectorize -fopenmp", + "-O3 -march=skylake-avx512 -mtune=skylake -mavx512f -ffast-math -ftree-vectorize -mveclibabi=svml -fopenmp" + ], + "broadwell": [ + "-O3 -march=broadwell -mtune=broadwell -ffast-math -ftree-vectorize -fopenmp", + "-O3 -march=broadwell -mtune=broadwell -ffast-math -ftree-vectorize -mveclibabi=svml -fopenmp" + ], + "nehalem": [ + "-O3 -march=nehalem -mtune=nehalem -ffast-math -ftree-vectorize -fopenmp", + "-O3 -march=nehalem -mtune=nehalem -ffast-math -ftree-vectorize -mveclibabi=svml -fopenmp" + ], + "default": [ + "-O3 -ffast-math -ftree-vectorize", + "-O3 -ffast-math -ftree-vectorize -mveclibabi=svml -fopenmp" + ] + }, + "nvhpc": { + "skylake-avx512": [ + "-fast -O3 -mp=autopar -tp=skylake -Msafeptr=all -Minfo -Mvect=simd:512,gather -mavx512vbmi -mavx512vbmi2 -mavx512vl" + ], + "nvptx64": [ + "-O3 -gpu=nordc,fastmath" + ] + } +} diff --git a/test/benchmark/cpu_docker/Dockerfile b/test/benchmark/cpu_docker/Dockerfile new file mode 100644 index 0000000000..5e11e32006 --- /dev/null +++ b/test/benchmark/cpu_docker/Dockerfile @@ -0,0 +1,83 @@ +FROM ubuntu:22.04 + +RUN su - + +RUN apt update + +RUN apt install -y wget gpg vim + +RUN apt install -y build-essential + +# Install Intel Compiler Classic +# Download the key to system keyring +RUN wget -O- https://apt.repos.intel.com/intel-gpg-keys/GPG-PUB-KEY-INTEL-SW-PRODUCTS.PUB \ +| gpg --dearmor | tee /usr/share/keyrings/oneapi-archive-keyring.gpg > /dev/null + +# Add signed entry to apt sources and configure the APT client to use Intel repository: +RUN echo "deb [signed-by=/usr/share/keyrings/oneapi-archive-keyring.gpg] https://apt.repos.intel.com/oneapi all main" | tee /etc/apt/sources.list.d/oneAPI.list + +# Update apt +RUN apt update + +# Install only Intel Compiler Classic and DPC++ compilers +RUN apt install -y intel-oneapi-compiler-dpcpp-cpp-and-cpp-classic-2022.2.1 + +# Install sleef library 3.5.1 +RUN apt install -y libsleef3 + +# Install LLVM 13 +# First install dependencies +RUN apt install -y lsb-release software-properties-common gnupg + +# Install LLVM 13 +RUN wget https://apt.llvm.org/llvm.sh && \ +chmod +x llvm.sh && \ +./llvm.sh 13 + +# Install libomp for LLVM 13 +RUN apt install -y libomp-13-dev + +# Install NVHPC 22.3 +RUN wget https://developer.download.nvidia.com/hpc-sdk/22.3/nvhpc_2022_223_Linux_x86_64_cuda_11.6.tar.gz && \ +tar xpzf nvhpc_2022_223_Linux_x86_64_cuda_11.6.tar.gz && \ +nvhpc_2022_223_Linux_x86_64_cuda_11.6/install && \ +rm -rf nvhpc_2022_223_Linux_x86_64_cuda_11.6.tar.gz nvhpc_2022_223_Linux_x86_64_cuda_11.6 + +# Needed to run makelocalrc +RUN apt install -y gfortran + +# Run makelocalrc to set up nvc++, nvc +RUN /opt/nvidia/hpc_sdk/Linux_x86_64/22.3/compilers/bin/makelocalrc 
/opt/nvidia/hpc_sdk/Linux_x86_64/22.3/compilers/bin -v -x /opt/nvidia/hpc_sdk/Linux_x86_64/22.3/compilers/bin + +# Set up necessary environment variables to pass to the benchmark script +ENV svml_lib=/opt/intel/oneapi/compiler/2022.2.1/linux/compiler/lib/intel64_lin/libsvml.so +ENV intel_exe=/opt/intel/oneapi/compiler/2022.2.1/linux/bin/intel64/icpc +ENV sleef_lib=/lib/x86_64-linux-gnu/libsleefgnuabi.so.3.5 +ENV clang_exe=/usr/bin/clang++-13 +ENV llc_exe=/usr/bin/llc-13 +ENV gcc_exe=/usr/bin/g++ +ENV nvhpc_exe=/opt/nvidia/hpc_sdk/Linux_x86_64/22.3/compilers/bin/nvc++ +ENV libdevice_lib=/opt/nvidia/hpc_sdk/Linux_x86_64/22.3/cuda/11.6/nvvm/libdevice/libdevice.10.bc + +# Install prerequisites necessary to build NMODL +RUN apt install -y git cmake flex bison python3-pip +RUN pip install Jinja2 PyYAML sympy pytest + +# Clone NMODL branch for benchmarking LLVM +RUN git clone --recursive -b magkanar/python_benchmark https://github.com/BlueBrain/nmodl.git + +# Setup Intel compiler specific variables to the environment +RUN echo ". /opt/intel/oneapi/setvars.sh" >> $HOME/.bashrc + +# Install NMODL for CPU benchmarking +RUN cd nmodl && \ +mkdir build && \ +cd build && \ +cmake .. \ + -DCMAKE_CXX_COMPILER=$(which clang++-13) \ + -DNMODL_ENABLE_LLVM=ON \ + -DCMAKE_INSTALL_PREFIX=./install && \ +cmake --build . --target install --parallel + +# Install python packages needed for plotting results +RUN pip install seaborn diff --git a/test/benchmark/cuda_driver.cpp b/test/benchmark/cuda_driver.cpp index b65caeff0d..673662f79b 100644 --- a/test/benchmark/cuda_driver.cpp +++ b/test/benchmark/cuda_driver.cpp @@ -23,6 +23,17 @@ namespace nmodl { namespace runner { +CUDADriver::~CUDADriver() { + if (cudaModule) { + CUresult err = cuModuleUnload(cudaModule); + checkCudaErrors(err); + } + if (context) { + CUresult err = cuCtxDestroy(context); + checkCudaErrors(err); + } +} + void CUDADriver::checkCudaErrors(CUresult err) { if (err != CUDA_SUCCESS) { const char* ret = NULL; diff --git a/test/benchmark/cuda_driver.hpp b/test/benchmark/cuda_driver.hpp index 3fd02fd55e..83e031b940 100644 --- a/test/benchmark/cuda_driver.hpp +++ b/test/benchmark/cuda_driver.hpp @@ -69,6 +69,8 @@ class CUDADriver { explicit CUDADriver(std::unique_ptr m) : module(std::move(m)) {} + ~CUDADriver(); + /// Initializes the CUDA GPU JIT driver. void init(const codegen::Platform& platform, BenchmarkInfo* benchmark_info = nullptr); diff --git a/test/benchmark/ext_kernel.cpp b/test/benchmark/ext_kernel.cpp index 632328e03f..aabccca6f7 100644 --- a/test/benchmark/ext_kernel.cpp +++ b/test/benchmark/ext_kernel.cpp @@ -10,6 +10,8 @@ #include // external kernel stub -void nrn_state_hh_ext(void* ) { - throw std::runtime_error("Error: this should have been external nrn_state_hh_ext kernel, check library and LD_LIBRARY_PATH\n"); +void nrn_state_hh_ext(void*) { + throw std::runtime_error( + "Error: this should have been external nrn_state_hh_ext kernel, check library and " + "LD_LIBRARY_PATH\n"); } diff --git a/test/benchmark/ext_kernel.hpp b/test/benchmark/ext_kernel.hpp index faf7895a09..041fd2295b 100644 --- a/test/benchmark/ext_kernel.hpp +++ b/test/benchmark/ext_kernel.hpp @@ -4,6 +4,6 @@ * This file is part of NMODL distributed under the terms of the GNU * Lesser General Public License. See top-level LICENSE file for details. 
*************************************************************************/ -#pragma once +#pragma once void nrn_state_hh_ext(void*); diff --git a/test/benchmark/gpu_docker/Dockerfile b/test/benchmark/gpu_docker/Dockerfile new file mode 100644 index 0000000000..e302827633 --- /dev/null +++ b/test/benchmark/gpu_docker/Dockerfile @@ -0,0 +1,86 @@ +FROM nvidia/cuda:11.8.0-base-ubuntu22.04 + +RUN su - + +RUN apt update + +RUN apt install -y wget gpg vim + +RUN apt install -y build-essential + +# Install Intel Compiler Classic +# Download the key to system keyring +RUN wget -O- https://apt.repos.intel.com/intel-gpg-keys/GPG-PUB-KEY-INTEL-SW-PRODUCTS.PUB \ +| gpg --dearmor | tee /usr/share/keyrings/oneapi-archive-keyring.gpg > /dev/null + +# Add signed entry to apt sources and configure the APT client to use Intel repository: +RUN echo "deb [signed-by=/usr/share/keyrings/oneapi-archive-keyring.gpg] https://apt.repos.intel.com/oneapi all main" | tee /etc/apt/sources.list.d/oneAPI.list + +# Update apt +RUN apt update + +# Install only Intel Compiler Classic and DPC++ compilers +RUN apt install -y intel-oneapi-compiler-dpcpp-cpp-and-cpp-classic-2022.2.1 + +# Install sleef library 3.5.1 +RUN apt install -y libsleef3 + +# Install LLVM 13 +# First install dependencies +RUN apt install -y lsb-release software-properties-common gnupg + +# Install LLVM 13 +RUN wget https://apt.llvm.org/llvm.sh && \ +chmod +x llvm.sh && \ +./llvm.sh 13 + +# Install libomp for LLVM 13 +RUN apt install -y libomp-13-dev + +# Install NVHPC 22.3 +RUN wget https://developer.download.nvidia.com/hpc-sdk/22.3/nvhpc_2022_223_Linux_x86_64_cuda_11.6.tar.gz && \ +tar xpzf nvhpc_2022_223_Linux_x86_64_cuda_11.6.tar.gz && \ +nvhpc_2022_223_Linux_x86_64_cuda_11.6/install && \ +rm -rf nvhpc_2022_223_Linux_x86_64_cuda_11.6.tar.gz nvhpc_2022_223_Linux_x86_64_cuda_11.6 + +# Needed to run makelocalrc +RUN apt install -y gfortran + +# Run makelocalrc to set up nvc++, nvc +RUN /opt/nvidia/hpc_sdk/Linux_x86_64/22.3/compilers/bin/makelocalrc /opt/nvidia/hpc_sdk/Linux_x86_64/22.3/compilers/bin -v -x /opt/nvidia/hpc_sdk/Linux_x86_64/22.3/compilers/bin + +# Set up necessary environment variables to pass to the benchmark script +ENV svml_lib=/opt/intel/oneapi/compiler/2022.2.1/linux/compiler/lib/intel64_lin/libsvml.so +ENV intel_exe=/opt/intel/oneapi/compiler/2022.2.1/linux/bin/intel64/icpc +ENV sleef_lib=/lib/x86_64-linux-gnu/libsleefgnuabi.so.3.5 +ENV clang_exe=/usr/bin/clang++-13 +ENV llc_exe=/usr/bin/llc-13 +ENV gcc_exe=/usr/bin/g++ +ENV nvhpc_exe=/opt/nvidia/hpc_sdk/Linux_x86_64/22.3/compilers/bin/nvc++ +ENV libdevice_lib=/opt/nvidia/hpc_sdk/Linux_x86_64/22.3/cuda/11.6/nvvm/libdevice/libdevice.10.bc + +# Install prerequisites necessary to build NMODL +RUN apt install -y git cmake flex bison python3-pip +RUN pip install Jinja2 PyYAML sympy pytest + +# Clone NMODL branch for benchmarking LLVM +RUN git clone --recursive -b magkanar/python_benchmark https://github.com/BlueBrain/nmodl.git + +# Setup Intel compiler specific variables to the environment +RUN echo ". /opt/intel/oneapi/setvars.sh" >> $HOME/.bashrc + +# Install NMODL with GPU enabled (needs separate Docker file with NVIDIA runtime enabled) +RUN cd nmodl && \ +mkdir build && \ +cd build && \ +cmake .. \ + -DCMAKE_CXX_COMPILER=$(which clang++-13) \ + -DNMODL_ENABLE_LLVM=ON \ + -DNMODL_ENABLE_LLVM_GPU=ON \ + -DNMODL_ENABLE_LLVM_CUDA=ON \ + -DCMAKE_CUDA_COMPILER=/opt/nvidia/hpc_sdk/Linux_x86_64/22.3/cuda/11.6/bin/nvcc \ + -DCMAKE_INSTALL_PREFIX=./install && \ +cmake --build . 
--target install --parallel + +# Install python packages needed for plotting results +RUN pip install seaborn diff --git a/test/benchmark/install_gpu_docker_env.sh b/test/benchmark/install_gpu_docker_env.sh new file mode 100644 index 0000000000..b5847e68cf --- /dev/null +++ b/test/benchmark/install_gpu_docker_env.sh @@ -0,0 +1,40 @@ +#!/usr/bin/env bash + +set -x +set -e + +# Update apt and install basic packages +sudo apt-get -y update +sudo apt-get -y --no-install-recommends install \ + curl + +# Install docker +sudo add-apt-repository -y ppa:graphics-drivers/ppa +sudo apt-get install \ + ca-certificates \ + curl \ + gnupg \ + lsb-release +sudo mkdir -p /etc/apt/keyrings +curl -fsSL https://download.docker.com/linux/ubuntu/gpg | sudo gpg --dearmor -o /etc/apt/keyrings/docker.gpg +echo \ + "deb [arch=$(dpkg --print-architecture) signed-by=/etc/apt/keyrings/docker.gpg] https://download.docker.com/linux/ubuntu \ + $(lsb_release -cs) stable" | sudo tee /etc/apt/sources.list.d/docker.list > /dev/null +sudo apt-get update +sudo apt-get install docker-ce docker-ce-cli containerd.io docker-compose-plugin + +# Install NVIDIA GPU driver +sudo apt-get update +sudo apt-get install -y \ + nvidia-driver-470 \ + nvidia-utils-470 + +# Install nvidia-docker2 and reload the Docker daemon configuration +distribution=$(. /etc/os-release;echo $ID$VERSION_ID) \ + && curl -fsSL https://nvidia.github.io/libnvidia-container/gpgkey | sudo gpg --dearmor -o /usr/share/keyrings/nvidia-container-toolkit-keyring.gpg \ + && curl -s -L https://nvidia.github.io/libnvidia-container/$distribution/libnvidia-container.list | \ + sed 's#deb https://#deb [signed-by=/usr/share/keyrings/nvidia-container-toolkit-keyring.gpg] https://#g' | \ + sudo tee /etc/apt/sources.list.d/nvidia-container-toolkit.list +sudo apt-get update +sudo apt-get install -y nvidia-docker2 +sudo systemctl restart docker diff --git a/test/benchmark/jit_driver.hpp b/test/benchmark/jit_driver.hpp index 3569c4bd4f..96b46a447c 100644 --- a/test/benchmark/jit_driver.hpp +++ b/test/benchmark/jit_driver.hpp @@ -60,7 +60,7 @@ class JITDriver { if (!expected_symbol) throw std::runtime_error("Error: entry-point symbol not found in JIT\n"); - auto(*res)() = (ReturnType(*)())(intptr_t) expected_symbol->getAddress(); + auto (*res)() = (ReturnType(*)())(intptr_t) expected_symbol->getAddress(); ReturnType result = res(); return result; } @@ -72,7 +72,7 @@ class JITDriver { if (!expected_symbol) throw std::runtime_error("Error: entry-point symbol not found in JIT\n"); - auto(*res)(ArgType) = (ReturnType(*)(ArgType))(intptr_t) expected_symbol->getAddress(); + auto (*res)(ArgType) = (ReturnType(*)(ArgType))(intptr_t) expected_symbol->getAddress(); ReturnType result = res(arg); return result; } diff --git a/test/benchmark/kernels/compute-bound.cpp b/test/benchmark/kernels/compute-bound.cpp index b331b22f76..8145abe93d 100644 --- a/test/benchmark/kernels/compute-bound.cpp +++ b/test/benchmark/kernels/compute-bound.cpp @@ -1,29 +1,71 @@ #include -struct hh_Instance { // address - double* __restrict__ minf; // 0 - double* __restrict__ mtau; // 8 - double* __restrict__ m; // 16 - double* __restrict__ Dm; // 24 - double* __restrict__ v_unused; // 32 - double* __restrict__ g_unused; // 40 - double* __restrict__ voltage; // 48 - int* __restrict__ node_index; // 56 - double t; // 64 - double dt; // 72 - double celsius; // 80 - int secondorder; // 88 - int node_count; // 92 +struct hh_Instance { + const double* __restrict__ gl; + const double* __restrict__ el; + double* __restrict__ 
minf; + double* __restrict__ mtau; + double* __restrict__ il; + double* __restrict__ m; + double* __restrict__ Dm; + double* __restrict__ v_unused; + double* __restrict__ g_unused; + double* __restrict__ voltage; + int* __restrict__ node_index; + double* __restrict__ vec_rhs; + double* __restrict__ vec_d; + double* __restrict__ _shadow_rhs; + double* __restrict__ _shadow_d; + double t; + double dt; + double celsius; + int secondorder; + int node_count; }; -void nrn_state_hh_ext(void* __restrict__ mech){ +void nrn_cur_ext(void* __restrict__ mech) { + auto inst = static_cast(mech); + int id; + int node_id; + double v, g, rhs, v_org, current; + + #pragma ivdep + for (id = 0; id < inst->node_count; id++) { + node_id = inst->node_index[id]; + v = inst->voltage[node_id]; + v_org = v; + v = v + 0.001; + { + current = 0.0; + inst->il[id] = inst->gl[id] * (v - inst->el[id]); + current += inst->il[id]; + g = current; + } + v = v_org; + { + current = 0.0; + inst->il[id] = inst->gl[id] * (v - inst->el[id]); + current += inst->il[id]; + rhs = current; + } + g = (g-rhs)/0.001; + inst->vec_rhs[node_id] -= rhs; + inst->vec_d[node_id] += g; + } +} + +void nrn_state_ext(void* __restrict__ mech) { auto inst = static_cast(mech); int id; int node_id; double v; - for(int id = 0; idnode_count; ++id) { + + #pragma ivdep + for (int id = 0; id < inst->node_count; ++id) { node_id = inst->node_index[id]; v = inst->voltage[node_id]; - inst->m[id] = exp(inst->m[id])+exp(inst->minf[id])+(inst->minf[id]-inst->m[id])/inst->mtau[id]+inst->m[id]+inst->minf[id]*inst->mtau[id]; + inst->m[id] = exp(inst->m[id]) + exp(inst->minf[id]) + + (inst->minf[id] - inst->m[id]) / inst->mtau[id] + inst->m[id] + + inst->minf[id] * inst->mtau[id]; } } diff --git a/test/benchmark/kernels/compute-bound.mod b/test/benchmark/kernels/compute-bound.mod index 500524563e..33a32a001a 100644 --- a/test/benchmark/kernels/compute-bound.mod +++ b/test/benchmark/kernels/compute-bound.mod @@ -1,5 +1,5 @@ NEURON { - SUFFIX hh + SUFFIX compute_bound NONSPECIFIC_CURRENT il RANGE minf, mtau, gl, el } diff --git a/test/benchmark/kernels/expsyn.cpp b/test/benchmark/kernels/expsyn.cpp new file mode 100644 index 0000000000..690360b0b5 --- /dev/null +++ b/test/benchmark/kernels/expsyn.cpp @@ -0,0 +1,81 @@ +#include + +struct ExpSyn_Instance { + const double* __restrict__ tau; + const double* __restrict__ e; + double* __restrict__ i; + double* __restrict__ g_state; + double* __restrict__ Dg_state; + double* __restrict__ v_unused; + double* __restrict__ g_unused; + double* __restrict__ tsave; + const double* __restrict__ node_area; + const double* __restrict__ point_process; + int* __restrict__ node_area_index; + int* __restrict__ point_process_index; + double* __restrict__ voltage; + int* __restrict__ node_index; + double* __restrict__ vec_rhs; + double* __restrict__ vec_d; + double* __restrict__ _shadow_rhs; + double* __restrict__ _shadow_d; + double t; + double dt; + double celsius; + int secondorder; + int node_count; +}; + +void nrn_cur_ext(void* __restrict__ mech) { + auto inst = static_cast(mech); + int id; + int node_id, node_area_id; + double v, g, rhs, v_org, current, mfactor; + + #pragma ivdep + for (id = 0; id < inst->node_count; id++) { + node_id = inst->node_index[id]; + node_area_id = inst->node_area_index[id]; + v = inst->voltage[node_id]; + v_org = v; + v = v + 0.001; + { + current = 0.0; + inst->i[id] = inst->g_state[id] * (v - inst->e[id]); + current += inst->i[id]; + g = current; + } + v = v_org; + { + current = 0.0; + inst->i[id] = 
inst->g_state[id] * (v - inst->e[id]); + current += inst->i[id]; + rhs = current; + } + g = (g-rhs)/0.001; + mfactor = 1.e2/inst->node_area[node_area_id]; + g = g*mfactor; + rhs = rhs*mfactor; + inst->_shadow_rhs[id] = rhs; + inst->_shadow_d[id] = g; + } + for (id = 0; id < inst->node_count; id++) { + node_id = inst->node_index[id]; + inst->vec_rhs[node_id] -= inst->_shadow_rhs[id]; + inst->vec_d[node_id] += inst->_shadow_d[id]; + } +} + +void nrn_state_ext(void* __restrict__ mech) { + auto inst = static_cast(mech); + int id; + int node_id; + double v; + + #pragma ivdep + for (id = 0; id < inst->node_count; ++id) { + node_id = inst->node_index[id]; + v = inst->voltage[node_id]; + inst->g_state[id] = inst->g_state[id] + (1.0 - exp(inst->dt * (( -1.0) / inst->tau[id]))) * ( -(0.0) / (( -1.0) / inst->tau[id]) - inst->g_state[id]); + } +} diff --git a/test/benchmark/kernels/expsyn.mod b/test/benchmark/kernels/expsyn.mod index 56ddde3b19..431836ba9a 100644 --- a/test/benchmark/kernels/expsyn.mod +++ b/test/benchmark/kernels/expsyn.mod @@ -21,22 +21,22 @@ ASSIGNED { } STATE { - g (uS) + g_state (uS) } INITIAL { - g=0 + g_state=0 } BREAKPOINT { SOLVE state METHOD cnexp - i = g*(v - e) + i = g_state*(v - e) } DERIVATIVE state { - g' = -g/tau + g_state' = -g_state/tau } NET_RECEIVE(weight (uS)) { - g = g + weight + g_state = g_state + weight } diff --git a/test/benchmark/kernels/expsyn_openacc.cpp b/test/benchmark/kernels/expsyn_openacc.cpp new file mode 100644 index 0000000000..9b2b10dc5d --- /dev/null +++ b/test/benchmark/kernels/expsyn_openacc.cpp @@ -0,0 +1,78 @@ +#include + +struct ExpSyn_Instance { + const double* __restrict__ tau; + const double* __restrict__ e; + double* __restrict__ i; + double* __restrict__ g_state; + double* __restrict__ Dg_state; + double* __restrict__ v_unused; + double* __restrict__ g_unused; + double* __restrict__ tsave; + const double* __restrict__ node_area; + const double* __restrict__ point_process; + int* __restrict__ node_area_index; + int* __restrict__ point_process_index; + double* __restrict__ voltage; + int* __restrict__ node_index; + double* __restrict__ vec_rhs; + double* __restrict__ vec_d; + double* __restrict__ _shadow_rhs; + double* __restrict__ _shadow_d; + double t; + double dt; + double celsius; + int secondorder; + int node_count; +}; + +void nrn_cur_ext(void* __restrict__ mech) { + auto inst = static_cast(mech); + int id; + int node_id, node_area_id; + double v, g, rhs, v_org, current, mfactor; + + #pragma acc parallel loop deviceptr(inst) + for (id = 0; id < inst->node_count; id++) { + node_id = inst->node_index[id]; + node_area_id = inst->node_area_index[id]; + v = inst->voltage[node_id]; + v_org = v; + v = v + 0.001; + { + current = 0.0; + inst->i[id] = inst->g_state[id] * (v - inst->e[id]); + current += inst->i[id]; + g = current; + } + v = v_org; + { + current = 0.0; + inst->i[id] = inst->g_state[id] * (v - inst->e[id]); + current += inst->i[id]; + rhs = current; + } + g = (g-rhs)/0.001; + mfactor = 1.e2/inst->node_area[node_area_id]; + g = g*mfactor; + rhs = rhs*mfactor; + #pragma acc atomic update + inst->vec_rhs[node_id] -= rhs; + #pragma acc atomic update + inst->vec_d[node_id] += g; + } +} + +void nrn_state_ext(void* __restrict__ mech) { + auto inst = static_cast(mech); + int id; + int node_id; + double v; + + #pragma acc parallel loop deviceptr(inst) + for (id = 0; id < inst->node_count; ++id) { + node_id = inst->node_index[id]; + v = inst->voltage[node_id]; + inst->g_state[id] = inst->g_state[id] + (1.0 - exp(inst->dt * (( -1.0) / 
inst->tau[id]))) * ( -(0.0) / (( -1.0) / inst->tau[id]) - inst->g_state[id]); + } +} diff --git a/test/benchmark/kernels/hh.cpp b/test/benchmark/kernels/hh.cpp index 86c1bfb996..a790e45203 100644 --- a/test/benchmark/kernels/hh.cpp +++ b/test/benchmark/kernels/hh.cpp @@ -1,10 +1,10 @@ #include -struct hh_Instance { - double* __restrict__ gnabar; - double* __restrict__ gkbar; - double* __restrict__ gl; - double* __restrict__ el; +struct hh_Instance { + const double* __restrict__ gnabar; + const double* __restrict__ gkbar; + const double* __restrict__ gl; + const double* __restrict__ el; double* __restrict__ gna; double* __restrict__ gk; double* __restrict__ il; @@ -26,10 +26,10 @@ struct hh_Instance { double* __restrict__ ik; double* __restrict__ v_unused; double* __restrict__ g_unused; - double* __restrict__ ion_ena; + const double* __restrict__ ion_ena; double* __restrict__ ion_ina; double* __restrict__ ion_dinadv; - double* __restrict__ ion_ek; + const double* __restrict__ ion_ek; double* __restrict__ ion_ik; double* __restrict__ ion_dikdv; int* __restrict__ ion_ena_index; @@ -40,6 +40,10 @@ struct hh_Instance { int* __restrict__ ion_dikdv_index; double* __restrict__ voltage; int* __restrict__ node_index; + double* __restrict__ vec_rhs; + double* __restrict__ vec_d; + double* __restrict__ _shadow_rhs; + double* __restrict__ _shadow_d; double t; double dt; double celsius; @@ -47,13 +51,71 @@ struct hh_Instance { int node_count; }; -void nrn_state_hh_ext(void* __restrict__ mech){ +void nrn_cur_ext(void* __restrict__ mech) { + auto inst = static_cast(mech); + int id; + int node_id, ena_id, ek_id, ion_dinadv_id, ion_dikdv_id, ion_ina_id, ion_ik_id; + double v, g, rhs, v_org, current, dina, dik; + + #pragma ivdep + for (id = 0; id < inst->node_count; id++) { + node_id = inst->node_index[id]; + ena_id = inst->ion_ena_index[id]; + ek_id = inst->ion_ek_index[id]; + ion_dinadv_id = inst->ion_dinadv_index[id]; + ion_dikdv_id = inst->ion_dikdv_index[id]; + ion_ina_id = inst->ion_ina_index[id]; + ion_ik_id = inst->ion_ik_index[id]; + v = inst->voltage[node_id]; + inst->ena[id] = inst->ion_ena[ena_id]; + inst->ek[id] = inst->ion_ek[ek_id]; + v_org = v; + v = v + 0.001; + { + current = 0.0; + inst->gna[id] = inst->gnabar[id] * inst->m[id] * inst->m[id] * inst->m[id] * inst->h[id]; + inst->ina[id] = inst->gna[id] * (v - inst->ena[id]); + inst->gk[id] = inst->gkbar[id] * inst->n[id] * inst->n[id] * inst->n[id] * inst->n[id]; + inst->ik[id] = inst->gk[id] * (v - inst->ek[id]); + inst->il[id] = inst->gl[id] * (v - inst->el[id]); + current += inst->il[id]; + current += inst->ina[id]; + current += inst->ik[id]; + g = current; + } + dina = inst->ina[id]; + dik = inst->ik[id]; + v = v_org; + { + current = 0.0; + inst->gna[id] = inst->gnabar[id] * inst->m[id] * inst->m[id] * inst->m[id] * inst->h[id]; + inst->ina[id] = inst->gna[id] * (v - inst->ena[id]); + inst->gk[id] = inst->gkbar[id] * inst->n[id] * inst->n[id] * inst->n[id] * inst->n[id]; + inst->ik[id] = inst->gk[id] * (v - inst->ek[id]); + inst->il[id] = inst->gl[id] * (v - inst->el[id]); + current += inst->il[id]; + current += inst->ina[id]; + current += inst->ik[id]; + rhs = current; + } + g = (g-rhs)/0.001; + inst->ion_dinadv[ion_dinadv_id] += (dina-inst->ina[id])/0.001; + inst->ion_dikdv[ion_dikdv_id] += (dik-inst->ik[id])/0.001; + inst->ion_ina[ion_ina_id] += inst->ina[id]; + inst->ion_ik[ion_ik_id] += inst->ik[id]; + inst->vec_rhs[node_id] -= rhs; + inst->vec_d[node_id] += g; + } +} + +void nrn_state_ext(void* __restrict__ mech) { auto inst = 
static_cast(mech); int id; int node_id, ena_id, ek_id; double v; - #pragma omp simd - for(id = 0; idnode_count; ++id) { + + #pragma ivdep + for (id = 0; id < inst->node_count; ++id) { node_id = inst->node_index[id]; ena_id = inst->ion_ena_index[id]; ek_id = inst->ion_ek_index[id]; @@ -61,24 +123,47 @@ void nrn_state_hh_ext(void* __restrict__ mech){ inst->ena[id] = inst->ion_ena[ena_id]; inst->ek[id] = inst->ion_ek[ek_id]; { - double alpha, beta, sum, q10, vtrap_in_0, v_in_1; + double alpha, beta, sum, q10, vtrap_in_0, vtrap_in_1, v_in_1; v_in_1 = v; - q10 = 3*((inst->celsius-6.3)/10); - alpha = .07*exp(-(v_in_1+65)/20); - beta = 1/(exp(-(v_in_1+35)/10)+1); - sum = alpha+beta; - inst->htau[id] = 1/(q10*sum); - inst->hinf[id] = alpha/sum; + q10 = pow(3.0, ((inst->celsius - 6.3) / 10.0)); { double x_in_0, y_in_0; - x_in_0 = alpha; - y_in_0 = alpha; - vtrap_in_0 = y_in_0*(1-x_in_0/y_in_0/2); + x_in_0 = -(v_in_1 + 40.0); + y_in_0 = 10.0; + if (fabs(x_in_0 / y_in_0) < 1e-6) { + vtrap_in_0 = y_in_0 * (1.0 - x_in_0 / y_in_0 / 2.0); + } else { + vtrap_in_0 = x_in_0 / (exp(x_in_0 / y_in_0) - 1.0); + } + } + alpha = .1 * vtrap_in_0; + beta = 4.0 * exp( -(v_in_1 + 65.0) / 18.0); + sum = alpha + beta; + inst->mtau[id] = 1.0 / (q10 * sum); + inst->minf[id] = alpha / sum; + alpha = .07 * exp( -(v_in_1 + 65.0) / 20.0); + beta = 1.0 / (exp( -(v_in_1 + 35.0) / 10.0) + 1.0); + sum = alpha + beta; + inst->htau[id] = 1.0 / (q10 * sum); + inst->hinf[id] = alpha / sum; + { + double x_in_1, y_in_1; + x_in_1 = -(v_in_1 + 55.0); + y_in_1 = 10.0; + if (fabs(x_in_1 / y_in_1) < 1e-6) { + vtrap_in_1 = y_in_1 * (1.0 - x_in_1 / y_in_1 / 2.0); + } else { + vtrap_in_1 = x_in_1 / (exp(x_in_1 / y_in_1) - 1.0); + } } - inst->hinf[id] = vtrap_in_0; + alpha = .01 * vtrap_in_1; + beta = .125 * exp( -(v_in_1 + 65.0) / 80.0); + sum = alpha + beta; + inst->ntau[id] = 1.0 / (q10 * sum); + inst->ninf[id] = alpha / sum; } - inst->m[id] = inst->m[id]+(1.0-exp(inst->dt*((((-1.0)))/inst->mtau[id])))*(-(((inst->minf[id]))/inst->mtau[id])/((((-1.0)))/inst->mtau[id])-inst->m[id]); - inst->h[id] = inst->h[id]+(1.0-exp(inst->dt*((((-1.0)))/inst->htau[id])))*(-(((inst->hinf[id]))/inst->htau[id])/((((-1.0)))/inst->htau[id])-inst->h[id]); - inst->n[id] = inst->n[id]+(1.0-exp(inst->dt*((((-1.0)))/inst->ntau[id])))*(-(((inst->ninf[id]))/inst->ntau[id])/((((-1.0)))/inst->ntau[id])-inst->n[id]); + inst->m[id] = inst->m[id] + (1.0 - exp(inst->dt * (((( -1.0))) / inst->mtau[id]))) * ( -(((inst->minf[id])) / inst->mtau[id]) / (((( -1.0))) / inst->mtau[id]) - inst->m[id]); + inst->h[id] = inst->h[id] + (1.0 - exp(inst->dt * (((( -1.0))) / inst->htau[id]))) * ( -(((inst->hinf[id])) / inst->htau[id]) / (((( -1.0))) / inst->htau[id]) - inst->h[id]); + inst->n[id] = inst->n[id] + (1.0 - exp(inst->dt * (((( -1.0))) / inst->ntau[id]))) * ( -(((inst->ninf[id])) / inst->ntau[id]) / (((( -1.0))) / inst->ntau[id]) - inst->n[id]); } } diff --git a/test/benchmark/kernels/hh_openacc.cpp b/test/benchmark/kernels/hh_openacc.cpp new file mode 100644 index 0000000000..2d91581615 --- /dev/null +++ b/test/benchmark/kernels/hh_openacc.cpp @@ -0,0 +1,171 @@ +#include + +struct hh_Instance { + const double* __restrict__ gnabar; + const double* __restrict__ gkbar; + const double* __restrict__ gl; + const double* __restrict__ el; + double* __restrict__ gna; + double* __restrict__ gk; + double* __restrict__ il; + double* __restrict__ minf; + double* __restrict__ hinf; + double* __restrict__ ninf; + double* __restrict__ mtau; + double* __restrict__ htau; + double* __restrict__ 
ntau; + double* __restrict__ m; + double* __restrict__ h; + double* __restrict__ n; + double* __restrict__ Dm; + double* __restrict__ Dh; + double* __restrict__ Dn; + double* __restrict__ ena; + double* __restrict__ ek; + double* __restrict__ ina; + double* __restrict__ ik; + double* __restrict__ v_unused; + double* __restrict__ g_unused; + const double* __restrict__ ion_ena; + double* __restrict__ ion_ina; + double* __restrict__ ion_dinadv; + const double* __restrict__ ion_ek; + double* __restrict__ ion_ik; + double* __restrict__ ion_dikdv; + int* __restrict__ ion_ena_index; + int* __restrict__ ion_ina_index; + int* __restrict__ ion_dinadv_index; + int* __restrict__ ion_ek_index; + int* __restrict__ ion_ik_index; + int* __restrict__ ion_dikdv_index; + double* __restrict__ voltage; + int* __restrict__ node_index; + double* __restrict__ vec_rhs; + double* __restrict__ vec_d; + double* __restrict__ _shadow_rhs; + double* __restrict__ _shadow_d; + double t; + double dt; + double celsius; + int secondorder; + int node_count; +}; + +void nrn_cur_ext(void* __restrict__ mech) { + auto inst = static_cast(mech); + int id; + int node_id, ena_id, ek_id, ion_dinadv_id, ion_dikdv_id, ion_ina_id, ion_ik_id; + double v, g, rhs, v_org, current, dina, dik; + + #pragma acc parallel loop deviceptr(inst) + for (id = 0; id < inst->node_count; id++) { + node_id = inst->node_index[id]; + ena_id = inst->ion_ena_index[id]; + ek_id = inst->ion_ek_index[id]; + ion_dinadv_id = inst->ion_dinadv_index[id]; + ion_dikdv_id = inst->ion_dikdv_index[id]; + ion_ina_id = inst->ion_ina_index[id]; + ion_ik_id = inst->ion_ik_index[id]; + v = inst->voltage[node_id]; + inst->ena[id] = inst->ion_ena[ena_id]; + inst->ek[id] = inst->ion_ek[ek_id]; + v_org = v; + v = v + 0.001; + { + current = 0.0; + inst->gna[id] = inst->gnabar[id] * inst->m[id] * inst->m[id] * inst->m[id] * inst->h[id]; + inst->ina[id] = inst->gna[id] * (v - inst->ena[id]); + inst->gk[id] = inst->gkbar[id] * inst->n[id] * inst->n[id] * inst->n[id] * inst->n[id]; + inst->ik[id] = inst->gk[id] * (v - inst->ek[id]); + inst->il[id] = inst->gl[id] * (v - inst->el[id]); + current += inst->il[id]; + current += inst->ina[id]; + current += inst->ik[id]; + g = current; + } + dina = inst->ina[id]; + dik = inst->ik[id]; + v = v_org; + { + current = 0.0; + inst->gna[id] = inst->gnabar[id] * inst->m[id] * inst->m[id] * inst->m[id] * inst->h[id]; + inst->ina[id] = inst->gna[id] * (v - inst->ena[id]); + inst->gk[id] = inst->gkbar[id] * inst->n[id] * inst->n[id] * inst->n[id] * inst->n[id]; + inst->ik[id] = inst->gk[id] * (v - inst->ek[id]); + inst->il[id] = inst->gl[id] * (v - inst->el[id]); + current += inst->il[id]; + current += inst->ina[id]; + current += inst->ik[id]; + rhs = current; + } + g = (g-rhs)/0.001; + inst->ion_dinadv[ion_dinadv_id] += (dina-inst->ina[id])/0.001; + inst->ion_dikdv[ion_dikdv_id] += (dik-inst->ik[id])/0.001; + inst->ion_ina[ion_ina_id] += inst->ina[id]; + inst->ion_ik[ion_ik_id] += inst->ik[id]; + #pragma acc atomic update + inst->vec_rhs[node_id] -= rhs; + #pragma acc atomic update + inst->vec_d[node_id] += g; + } +} + +void nrn_state_ext(void* __restrict__ mech) { + auto inst = static_cast(mech); + int id; + int node_id, ena_id, ek_id; + double v; + + #pragma acc parallel loop deviceptr(inst) + for (id = 0; id < inst->node_count; ++id) { + node_id = inst->node_index[id]; + ena_id = inst->ion_ena_index[id]; + ek_id = inst->ion_ek_index[id]; + v = inst->voltage[node_id]; + inst->ena[id] = inst->ion_ena[ena_id]; + inst->ek[id] = inst->ion_ek[ek_id]; + { 
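+            // Gating-variable rate block: computes the steady states (minf,
+            // hinf, ninf) and time constants (mtau, htau, ntau) at the current
+            // voltage with a Q10 temperature factor; the vtrap branches guard
+            // the removable singularity of x / (exp(x / y) - 1) at x = 0.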
+ double alpha, beta, sum, q10, vtrap_in_0, vtrap_in_1, v_in_1; + v_in_1 = v; + q10 = pow(3.0, ((inst->celsius - 6.3) / 10.0)); + { + double x_in_0, y_in_0; + x_in_0 = -(v_in_1 + 40.0); + y_in_0 = 10.0; + if (fabs(x_in_0 / y_in_0) < 1e-6) { + vtrap_in_0 = y_in_0 * (1.0 - x_in_0 / y_in_0 / 2.0); + } else { + vtrap_in_0 = x_in_0 / (exp(x_in_0 / y_in_0) - 1.0); + } + } + alpha = .1 * vtrap_in_0; + beta = 4.0 * exp( -(v_in_1 + 65.0) / 18.0); + sum = alpha + beta; + inst->mtau[id] = 1.0 / (q10 * sum); + inst->minf[id] = alpha / sum; + alpha = .07 * exp( -(v_in_1 + 65.0) / 20.0); + beta = 1.0 / (exp( -(v_in_1 + 35.0) / 10.0) + 1.0); + sum = alpha + beta; + inst->htau[id] = 1.0 / (q10 * sum); + inst->hinf[id] = alpha / sum; + { + double x_in_1, y_in_1; + x_in_1 = -(v_in_1 + 55.0); + y_in_1 = 10.0; + if (fabs(x_in_1 / y_in_1) < 1e-6) { + vtrap_in_1 = y_in_1 * (1.0 - x_in_1 / y_in_1 / 2.0); + } else { + vtrap_in_1 = x_in_1 / (exp(x_in_1 / y_in_1) - 1.0); + } + } + alpha = .01 * vtrap_in_1; + beta = .125 * exp( -(v_in_1 + 65.0) / 80.0); + sum = alpha + beta; + inst->ntau[id] = 1.0 / (q10 * sum); + inst->ninf[id] = alpha / sum; + } + inst->m[id] = inst->m[id] + (1.0 - exp(inst->dt * (((( -1.0))) / inst->mtau[id]))) * ( -(((inst->minf[id])) / inst->mtau[id]) / (((( -1.0))) / inst->mtau[id]) - inst->m[id]); + inst->h[id] = inst->h[id] + (1.0 - exp(inst->dt * (((( -1.0))) / inst->htau[id]))) * ( -(((inst->hinf[id])) / inst->htau[id]) / (((( -1.0))) / inst->htau[id]) - inst->h[id]); + inst->n[id] = inst->n[id] + (1.0 - exp(inst->dt * (((( -1.0))) / inst->ntau[id]))) * ( -(((inst->ninf[id])) / inst->ntau[id]) / (((( -1.0))) / inst->ntau[id]) - inst->n[id]); + } +} diff --git a/test/benchmark/kernels/memory-bound.cpp b/test/benchmark/kernels/memory-bound.cpp index 8beead4fde..744f3211b5 100644 --- a/test/benchmark/kernels/memory-bound.cpp +++ b/test/benchmark/kernels/memory-bound.cpp @@ -1,19 +1,26 @@ struct hh_Instance { + const double* __restrict__ gl; + const double* __restrict__ el; double* __restrict__ minf; double* __restrict__ mtau; + double* __restrict__ il; double* __restrict__ m; double* __restrict__ nai; double* __restrict__ Dm; double* __restrict__ v_unused; double* __restrict__ g_unused; double* __restrict__ ion_nai; - double* __restrict__ style_na; + const double* __restrict__ style_na; int* __restrict__ ion_nai_index; int* __restrict__ style_na_index; double* __restrict__ voltage; int* __restrict__ node_index; + double* __restrict__ vec_rhs; + double* __restrict__ vec_d; + double* __restrict__ _shadow_rhs; + double* __restrict__ _shadow_d; double t; double dt; double celsius; @@ -21,18 +28,54 @@ struct hh_Instance { int node_count; }; -void nrn_state_hh_ext(void* __restrict__ mech){ +void nrn_cur_ext(void* __restrict__ mech) { + auto inst = static_cast(mech); + int id; + int node_id, ion_nai_index; + double v, g, rhs, v_org, current; + + #pragma ivdep + for (id = 0; id < inst->node_count; id++) { + node_id = inst->node_index[id]; + ion_nai_index = inst->ion_nai_index[id]; + v = inst->voltage[node_id]; + inst->nai[id] = inst->ion_nai[ion_nai_index]; + v_org = v; + v = v + 0.001; + { + current = 0.0; + inst->il[id] = inst->gl[id] * (v - inst->el[id]); + current += inst->il[id]; + g = current; + } + v = v_org; + { + current = 0.0; + inst->il[id] = inst->gl[id] * (v - inst->el[id]); + current += inst->il[id]; + rhs = current; + } + g = (g-rhs)/0.001; + inst->ion_nai[ion_nai_index] = inst->nai[id]; + inst->vec_rhs[node_id] -= rhs; + inst->vec_d[node_id] += g; + } +} + +void nrn_state_ext(void* 
__restrict__ mech) { auto inst = static_cast(mech); int id; int node_id, nai_id, ion_nai_id; double v; - for(int id = 0; idnode_count; ++id) { + + #pragma ivdep + for (int id = 0; id < inst->node_count; ++id) { node_id = inst->node_index[id]; nai_id = inst->ion_nai_index[id]; ion_nai_id = inst->ion_nai_index[id]; v = inst->voltage[node_id]; inst->nai[id] = inst->ion_nai[nai_id]; - inst->m[id] = (inst->minf[id]-inst->m[id])/inst->mtau[id]; + inst->m[id] = (inst->minf[id] - inst->m[id]) / inst->mtau[id]; inst->ion_nai[ion_nai_id] = inst->nai[id]; } } diff --git a/test/benchmark/kernels/memory-bound.mod b/test/benchmark/kernels/memory-bound.mod index c5b9f3fd04..a92897e455 100644 --- a/test/benchmark/kernels/memory-bound.mod +++ b/test/benchmark/kernels/memory-bound.mod @@ -1,5 +1,5 @@ NEURON { - SUFFIX hh + SUFFIX memory_bound NONSPECIFIC_CURRENT il RANGE x, minf, mtau, gl, el USEION na WRITE nai diff --git a/test/benchmark/llvm_benchmark.cpp b/test/benchmark/llvm_benchmark.cpp index cb8ca6bd8c..f80d39ad41 100644 --- a/test/benchmark/llvm_benchmark.cpp +++ b/test/benchmark/llvm_benchmark.cpp @@ -7,6 +7,7 @@ #include #include +#include #include #include "llvm_benchmark.hpp" @@ -17,10 +18,6 @@ #include "ext_kernel.hpp" #include "test/unit/codegen/codegen_data_helper.hpp" -#ifdef NMODL_LLVM_CUDA_BACKEND -#include "test/benchmark/cuda_driver.hpp" -#endif - namespace nmodl { namespace benchmark { @@ -42,71 +39,6 @@ void LLVMBenchmark::generate_llvm() { logger->info("Created LLVM IR module from NMODL AST in {} sec", diff.count()); } -#ifdef NMODL_LLVM_CUDA_BACKEND -void checkCudaErrors(cudaError error) { - if (error != cudaSuccess) { - throw std::runtime_error(fmt::format("CUDA Execution Error: {}\n", cudaGetErrorString(error))); - } -} - -void* copy_instance_data_gpu(const codegen::CodegenInstanceData& data) { - void* dev_base_ptr; - const auto ptr_vars_size = data.num_ptr_members * sizeof(double*); - auto scalar_vars_size = 0; - const auto num_scalar_vars = data.members.size() - data.num_ptr_members; - for (int i = 0; i < num_scalar_vars; i++) { - scalar_vars_size += data.members_size[i + data.num_ptr_members]; - } - checkCudaErrors(cudaMalloc(&dev_base_ptr, ptr_vars_size + scalar_vars_size)); - for (auto i = 0; i < data.num_ptr_members; i++) { - // Allocate a vector with the correct size - void* dev_member_ptr; - auto size_of_var = data.members_size[i]; - checkCudaErrors(cudaMalloc(&dev_member_ptr, size_of_var * data.num_elements)); - checkCudaErrors(cudaMemcpy(dev_member_ptr, - data.members[i], - size_of_var * data.num_elements, - cudaMemcpyHostToDevice)); - // Copy the pointer addresses to the struct - auto offseted_place = (char*) dev_base_ptr + data.offsets[i]; - checkCudaErrors( - cudaMemcpy(offseted_place, &dev_member_ptr, sizeof(double*), cudaMemcpyHostToDevice)); - } - // memcpy the scalar values - auto offseted_place_dev = (char*) dev_base_ptr + data.offsets[data.num_ptr_members]; - auto offseted_place_host = (char*) (data.base_ptr) + data.offsets[data.num_ptr_members]; - checkCudaErrors(cudaMemcpy( - offseted_place_dev, offseted_place_host, scalar_vars_size, cudaMemcpyHostToDevice)); - return dev_base_ptr; -} - -void copy_instance_data_host(codegen::CodegenInstanceData& data, void* dev_base_ptr) { - const auto ptr_vars_size = data.num_ptr_members * sizeof(double*); - auto scalar_vars_size = 0; - const auto num_scalar_vars = data.members.size() - data.num_ptr_members; - for (int i = 0; i < num_scalar_vars; i++) { - scalar_vars_size += data.members_size[i + data.num_ptr_members]; - } - 
const auto host_base_ptr = data.base_ptr;
-    for (auto i = 0; i < data.num_ptr_members; i++) {
-        auto size_of_var = data.members_size[i];
-        void* offset_dev_ptr = (char*) dev_base_ptr + data.offsets[i];
-        void* gpu_offset_addr;
-        checkCudaErrors(
-            cudaMemcpy(&gpu_offset_addr, offset_dev_ptr, sizeof(double*), cudaMemcpyDeviceToHost));
-        checkCudaErrors(cudaMemcpy(data.members[i],
-                                   gpu_offset_addr,
-                                   size_of_var * data.num_elements,
-                                   cudaMemcpyDeviceToHost));
-    }
-    // memcpy the scalar values
-    void* offseted_place_dev = (char*) dev_base_ptr + data.offsets[data.num_ptr_members];
-    void* offseted_place_host = (char*) (data.base_ptr) + data.offsets[data.num_ptr_members];
-    checkCudaErrors(cudaMemcpy(
-        offseted_place_host, offseted_place_dev, scalar_vars_size, cudaMemcpyDeviceToHost));
-}
-#endif
-
 BenchmarkResults LLVMBenchmark::run_benchmark() {
     // Set the codegen data helper and find the kernels.
     auto codegen_data = codegen::CodegenDataHelper(llvm_visitor.get_instance_struct_ptr());
@@ -129,34 +61,63 @@ BenchmarkResults LLVMBenchmark::run_benchmark() {
 
     std::unique_ptr<llvm::Module> m = llvm_visitor.get_module();
 
-    // Create the benchmark runner and initialize it.
+    if (external_kernel_library.empty()) {
+        // Create the benchmark runner and initialize it.
 #ifdef NMODL_LLVM_CUDA_BACKEND
-    if (platform.is_CUDA_gpu()) {
-        std::string filename = "cuda_" + mod_filename;
-        cuda_runner = std::make_unique(
-            std::move(m), filename, output_dir, shared_libs, opt_level_ir, opt_level_codegen);
-        cuda_runner->initialize_driver(platform);
-    } else {
+        if (platform.is_CUDA_gpu()) {
+            std::string filename = "cuda_" + mod_filename;
+            cuda_runner = std::make_unique(
+                std::move(m), filename, output_dir, shared_libs, opt_level_ir, opt_level_codegen);
+            cuda_runner->initialize_driver(platform);
+        } else {
 #endif
-    std::string filename = "v" + std::to_string(llvm_visitor.get_vector_width()) + "_" +
-                           mod_filename;
-    cpu_runner = std::make_unique(std::move(m),
-                                  filename,
-                                  output_dir,
-                                  backend_name,
-                                  shared_libs,
-                                  opt_level_ir,
-                                  opt_level_codegen);
-    cpu_runner->initialize_driver();
+        std::string filename = "v" + std::to_string(llvm_visitor.get_vector_width()) + "_" +
+                               mod_filename;
+        cpu_runner = std::make_unique(std::move(m),
+                                      filename,
+                                      output_dir,
+                                      backend_name,
+                                      shared_libs,
+                                      opt_level_ir,
+                                      opt_level_codegen);
+        cpu_runner->initialize_driver();
 #ifdef NMODL_LLVM_CUDA_BACKEND
-    }
+        }
 #endif
+    }
 
     BenchmarkResults results{};
-    if (external_kernel) {
+
+    // Kernel function pointers loaded from the external shared library
+    std::unordered_map<std::string, void (*)(void*)> kernel_functions;
+    void* external_kernel_lib_handle = nullptr;
+    if (!external_kernel_library.empty()) {
         // benchmark external kernel
         logger->info("Benchmarking external kernels");
-        kernel_names = {"nrn_state_hh_ext"};
+        kernel_names = {"nrn_cur_ext", "nrn_state_ext"};
+        std::unordered_map<std::string, std::string> kernel_names_map = {
+            {"nrn_cur_ext", "_Z11nrn_cur_extPv"},
+            {"nrn_state_ext", "_Z13nrn_state_extPv"}
+        };
+        // Dlopen the shared library
+        logger->info("Loading external kernel library: {}", external_kernel_library);
+        external_kernel_lib_handle = dlopen(external_kernel_library.c_str(), RTLD_LAZY);
+        if (!external_kernel_lib_handle) {
+            logger->error("Cannot open shared library: {}", dlerror());
+            exit(EXIT_FAILURE);
+        }
+        // Get the function pointers
+        for (auto& kernel_name: kernel_names) {
+            auto func_ptr = dlsym(external_kernel_lib_handle, kernel_names_map[kernel_name].c_str());
+            if (!func_ptr) {
+                logger->error("Cannot find function {} in shared library {}",
+                              kernel_name,
                              external_kernel_library);
+                exit(EXIT_FAILURE);
+            }
+            kernel_functions[kernel_name] = reinterpret_cast<void (*)(void*)>(
+                func_ptr);
+        }
     }
 
     // Benchmark every kernel.
     for (const auto& kernel_name: kernel_names) {
@@ -166,42 +127,50 @@ BenchmarkResults LLVMBenchmark::run_benchmark() {
             // Initialise the data.
             auto instance_data = codegen_data.create_data(instance_size, /*seed=*/1);
 #ifdef NMODL_LLVM_CUDA_BACKEND
-            void* dev_ptr;
             if (platform.is_CUDA_gpu()) {
-                dev_ptr = copy_instance_data_gpu(instance_data);
+                instance_data.copy_instance_data_gpu();
             }
 #endif
             // Log instance size once.
             if (i == 0) {
                 double size_mbs = instance_data.num_bytes / (1024.0 * 1024.0);
-                logger->info("Benchmarking kernel '{}' with {} MBs dataset",
-                             kernel_name,
-                             size_mbs);
+                logger->info("Benchmarking kernel '{}' with {} MBs dataset", kernel_name, size_mbs);
             }
 
             // Record the execution time of the kernel.
             std::string wrapper_name = "__" + kernel_name + "_wrapper";
-            auto start = std::chrono::steady_clock::now();
-            if (external_kernel) {
-                nrn_state_hh_ext(instance_data.base_ptr);
+            std::chrono::steady_clock::time_point start, end;
+            if (!external_kernel_library.empty()) {
+                if (platform.is_CUDA_gpu()) {
+                    start = std::chrono::steady_clock::now();
+                    kernel_functions[kernel_name](instance_data.dev_base_ptr);
+                    end = std::chrono::steady_clock::now();
+                } else {
+                    start = std::chrono::steady_clock::now();
+                    kernel_functions[kernel_name](instance_data.base_ptr);
+                    end = std::chrono::steady_clock::now();
+                }
             } else {
 #ifdef NMODL_LLVM_CUDA_BACKEND
                 if (platform.is_CUDA_gpu()) {
+                    start = std::chrono::steady_clock::now();
                     cuda_runner->run_with_argument(wrapper_name,
-                                                   dev_ptr,
+                                                   instance_data.dev_base_ptr,
                                                    gpu_execution_parameters);
+                    end = std::chrono::steady_clock::now();
                 } else {
 #endif
+                    start = std::chrono::steady_clock::now();
                     cpu_runner->run_with_argument(wrapper_name, instance_data.base_ptr);
+                    end = std::chrono::steady_clock::now();
 #ifdef NMODL_LLVM_CUDA_BACKEND
                 }
 #endif
             }
-            auto end = std::chrono::steady_clock::now();
             std::chrono::duration<double> diff = end - start;
 #ifdef NMODL_LLVM_CUDA_BACKEND
             if (platform.is_CUDA_gpu()) {
-                copy_instance_data_host(instance_data, dev_ptr);
+                instance_data.copy_instance_data_host();
             }
 #endif
             // Log the time taken for each run.
@@ -213,12 +182,12 @@ BenchmarkResults LLVMBenchmark::run_benchmark() {
         // Calculate statistics
         double time_mean = std::accumulate(times.begin(), times.end(), 0.0) / num_experiments;
         double time_var = std::accumulate(times.begin(),
-                                          times.end(),
-                                          0.0,
-                                          [time_mean](const double& pres, const double& e) {
-                                              return (e - time_mean) * (e - time_mean);
-                                          }) /
-                          num_experiments;
+                                           times.end(),
+                                           0.0,
+                                           [time_mean](const double& pres, const double& e) {
+                                               return pres + (e - time_mean) * (e - time_mean);
+                                           }) /
+                           num_experiments;
         double time_stdev = std::sqrt(time_var);
         double time_min = *std::min_element(times.begin(), times.end());
         double time_max = *std::max_element(times.begin(), times.end());
@@ -229,6 +198,10 @@ BenchmarkResults LLVMBenchmark::run_benchmark() {
         logger->info("Maximum compute time = {:.6f}\n", time_max);
         results[kernel_name] = {time_mean, time_stdev, time_min, time_max};
     }
+    // Close the shared library handle in case it was dlopened.
+ if (external_kernel_lib_handle) { + dlclose(external_kernel_lib_handle); + } return results; } diff --git a/test/benchmark/llvm_benchmark.hpp b/test/benchmark/llvm_benchmark.hpp index 9428560322..9dd036f5e8 100644 --- a/test/benchmark/llvm_benchmark.hpp +++ b/test/benchmark/llvm_benchmark.hpp @@ -70,7 +70,7 @@ class LLVMBenchmark { int opt_level_codegen; /// Benchmark external kernel - bool external_kernel; + std::string external_kernel_library; /// Filestream for dumping logs to the file. std::ofstream ofs; @@ -93,7 +93,7 @@ class LLVMBenchmark { const Platform& platform, int opt_level_ir, int opt_level_codegen, - bool external_kernel) + std::string external_kernel_library) : llvm_visitor(llvm_visitor) , mod_filename(mod_filename) , output_dir(output_dir) @@ -103,7 +103,7 @@ class LLVMBenchmark { , platform(platform) , opt_level_ir(opt_level_ir) , opt_level_codegen(opt_level_codegen) - , external_kernel(external_kernel) {} + , external_kernel_library(external_kernel_library) {} LLVMBenchmark(codegen::CodegenLLVMVisitor& llvm_visitor, const std::string& mod_filename, const std::string& output_dir, @@ -113,7 +113,7 @@ class LLVMBenchmark { const Platform& platform, int opt_level_ir, int opt_level_codegen, - bool external_kernel, + std::string external_kernel_library, const GPUExecutionParameters& gpu_exec_params) : llvm_visitor(llvm_visitor) , mod_filename(mod_filename) @@ -124,7 +124,7 @@ class LLVMBenchmark { , platform(platform) , opt_level_ir(opt_level_ir) , opt_level_codegen(opt_level_codegen) - , external_kernel(external_kernel) + , external_kernel_library(external_kernel_library) , gpu_execution_parameters(gpu_exec_params) {} /// Runs the benchmark. diff --git a/test/benchmark/nmodl-llvm-time.sh b/test/benchmark/nmodl-llvm-time.sh index 06c325f65b..62d75e8724 100755 --- a/test/benchmark/nmodl-llvm-time.sh +++ b/test/benchmark/nmodl-llvm-time.sh @@ -252,7 +252,8 @@ for kernel_target in ${KERNEL_TARGETS}; do ${debug} sed -i 's/#pragma.*/#pragma GCC ivdep/g' ${kernels_path}/${kernel_target}.cpp fi fi - ${debug} ${!compiler_exe} ${flags} ${kernels_path}/${kernel_target}.cpp -shared -fpic -o ${ext_lib} + # Create shared library that is linked with SVML shared library and add its RPATH to the generated shared library to be found later by NMODL + ${debug} ${!compiler_exe} ${flags} ${kernels_path}/${kernel_target}.cpp -shared -fpic -o ${ext_lib} -Wl,-rpath $intel_library_dir -L$intel_library_dir -lsvml ${debug} eval "objdump ${ext_lib} -d > ${ext_lib::-1}" ${debug} cd .. 
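The external kernels above are compiled as plain C++, so the shared library exports them under their Itanium-mangled names; that is why llvm_benchmark.cpp resolves them with dlopen/dlsym using the hardcoded strings _Z11nrn_cur_extPv and _Z13nrn_state_extPv, and why the -Wl,-rpath entry added here lets that dlopen resolve libsvml at run time. Below is a minimal illustrative sketch of this lookup; the load_kernel helper and its error handling are placeholders, not part of the patch. Compiling the kernels with extern "C" linkage would avoid the mangled names entirely.

#include <dlfcn.h>
#include <stdexcept>

using kernel_fn = void (*)(void*);

// Hypothetical helper: resolve one benchmark kernel from an already
// dlopen()ed library handle. The caller keeps the handle open while the
// kernels run and dlclose()s it afterwards, mirroring what
// LLVMBenchmark::run_benchmark() does with external_kernel_lib_handle.
kernel_fn load_kernel(void* handle, const char* mangled_name) {
    void* sym = dlsym(handle, mangled_name);  // e.g. "_Z11nrn_cur_extPv"
    if (!sym) {
        throw std::runtime_error(dlerror());
    }
    return reinterpret_cast<kernel_fn>(sym);
}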
diff --git a/test/benchmark/plot_benchmarks_cpu_gpu.py b/test/benchmark/plot_benchmarks_cpu_gpu.py new file mode 100644 index 0000000000..99c6292088 --- /dev/null +++ b/test/benchmark/plot_benchmarks_cpu_gpu.py @@ -0,0 +1,507 @@ +#!/usr/bin/python3 + +import json +import matplotlib.pyplot as plt +import os +import pandas as pd +import pickle +import seaborn as sns + + +def _get_flags_string(flags): + return flags.replace(" ", "_").replace('-','').replace('=','_') + +def load_pickle_result_file(pickle_files, results): + def _merge(a, b, path=None): + if path is None: + path = [] + for key in b: + if key in a: + if isinstance(a[key], dict) and isinstance(b[key], dict): + _merge(a[key], b[key], path + [str(key)]) + elif a[key] == b[key]: + pass # same leaf value + else: + raise Exception("Conflict at %s" % ".".join(path + [str(key)])) + else: + a[key] = b[key] + return a + + for pickle_file in pickle_files: + with open(pickle_file, "rb") as handle: + results = _merge(results, pickle.load(handle)) + return results + + +def generate_graph_pandas_combined_relative_log( + results, + compilers_comparison_config, + graph_suffix, + output_dir, + print_values=False, + xaxis_label=None, + plot_size=(12, 6), + baseline_name="intel_svml", + reference=False, +): + os.makedirs(output_dir, exist_ok=True) + compiler_flags = json.loads(compilers_comparison_config) + ref_title_str = " (reference)" if reference else "" + fig, axes = plt.subplots(1, 3, squeeze=False, figsize=plot_size) + ax_index = 0 + for modname in results: + # state + bar_data_state_cpu_panda = {} + bar_data_state_cpu_panda["architecture"] = [] + bar_data_state_cpu_panda["compiler"] = [] + bar_data_state_cpu_panda["runtime"] = [] + # current + bar_data_cur_cpu_panda = {} + bar_data_cur_cpu_panda["architecture"] = [] + bar_data_cur_cpu_panda["compiler"] = [] + bar_data_cur_cpu_panda["runtime"] = [] + baseline_cur = 0.0 + for architecture in results[modname]: + for compiler in compiler_flags: + if ( + compiler in results[modname][architecture] + and architecture in compiler_flags[compiler] + ): + for flags in compiler_flags[compiler][architecture]: + if compiler == "nmodl_jit": + state_kernel_name = "nrn_state_{}".format( + modname.replace("-", "_") + ) + cur_kernel_name = "nrn_cur_{}".format( + modname.replace("-", "_") + ) + else: + state_kernel_name = "nrn_state_ext" + cur_kernel_name = "nrn_cur_ext" + if compiler == "clang" and "jit" in flags: + compiler_name = "mod2ir" + elif compiler == "nmodl_jit": + compiler_name = "mod2ir_jit" + else: + compiler_name = compiler + if "svml" in flags or "SVML" in flags: + compiler_name = compiler_name + "_svml" + if architecture != "nvptx64" and compiler == "intel": + baseline_state = results[modname][architecture][ + "intel" + ][_get_flags_string(flags)][state_kernel_name][0] + baseline_cur = results[modname][architecture]["intel"][ + _get_flags_string(flags) + ][cur_kernel_name][0] + elif "sleef" in flags or "SLEEF" in flags: + compiler_name = compiler_name + "_sleef" + if architecture == "default": + architecture_label = "auto-scalar" + elif architecture == "nehalem": + architecture_label = "nehalem-sse2" + elif architecture == "broadwell": + architecture_label = "broadwell-avx2" + elif architecture == "nvptx64": + architecture_label = architecture + if compiler == "nvhpc": + baseline_state = results[modname][architecture][ + "nvhpc" + ][_get_flags_string(flags)][state_kernel_name][0] + baseline_cur = results[modname][architecture]["nvhpc"][ + _get_flags_string(flags) + ][cur_kernel_name][0] + else: 
# skylake-avx512 + architecture_label = architecture + if modname != "expsyn": + bar_data_state_cpu_panda["architecture"].append( + architecture_label + ) + bar_data_state_cpu_panda["compiler"].append(compiler_name) + if ( + _get_flags_string(flags) + not in results[modname][architecture][compiler] + ): + bar_data_state_cpu_panda["runtime"].append(0) + else: + bar_data_state_cpu_panda["runtime"].append( + results[modname][architecture][compiler][ + _get_flags_string(flags) + ][state_kernel_name][0] + ) + bar_data_cur_cpu_panda["architecture"].append( + architecture_label + ) + bar_data_cur_cpu_panda["compiler"].append(compiler_name) + if ( + _get_flags_string(flags) + not in results[modname][architecture][compiler] + ): + bar_data_cur_cpu_panda["runtime"].append(0) + else: + bar_data_cur_cpu_panda["runtime"].append( + results[modname][architecture][compiler][ + _get_flags_string(flags) + ][cur_kernel_name][0] + ) + for i, runtime in enumerate(bar_data_state_cpu_panda["runtime"]): + bar_data_state_cpu_panda["runtime"][i] = baseline_state / runtime + for i, runtime in enumerate(bar_data_cur_cpu_panda["runtime"]): + bar_data_cur_cpu_panda["runtime"][i] = baseline_cur / runtime + pd.options.display.float_format = "{:,.2f}".format + if modname != "expsyn": + df_state = pd.DataFrame( + bar_data_state_cpu_panda, + columns=["architecture", "compiler", "runtime"], + ) + print(df_state, type(df_state)) + sns.barplot( + x="architecture", + y="runtime", + hue="compiler", + data=df_state, + ax=axes[0, ax_index], + ) + axes[0, ax_index].set_yscale("symlog", base=2, linthresh=0.015) + axes[0, ax_index].set_ylim(0.125, 2) + axes[0, ax_index].set_yticks( + [0.125, 0.25, 0.5, 1, 2], [0.125, 0.25, 0.5, 1, 2] + ) + axes[0, ax_index].axhline(1.0, ls="--", color="black") + axes[0, ax_index].xaxis.label.set_visible(False) + axes[0, ax_index].yaxis.label.set_visible(False) + axes[0, ax_index].set_title(f"nrn_state_{modname}{ref_title_str}") + axes[0, ax_index].get_legend().remove() + if xaxis_label is not None: + axes[0, ax_index].get_xaxis().set_visible(False) + if print_values: + for i in axes[0, ax_index].containers: + axes[0, ax_index].bar_label( + i, + ) + ax_index += 1 + df_cur = pd.DataFrame( + bar_data_cur_cpu_panda, columns=["architecture", "compiler", "runtime"] + ) + ax = sns.barplot( + x="architecture", + y="runtime", + hue="compiler", + data=df_cur, + ax=axes[0, ax_index], + ) + axes[0, ax_index].axhline(1.0, ls="--", color="black") + print(df_cur, type(df_cur)) + axes[0, ax_index].set_yscale("symlog", base=2, linthresh=0.015) + axes[0, ax_index].set_ylim(0.125, 2) + axes[0, ax_index].set_yticks([0.125, 0.25, 0.5, 1, 2], [0.125, 0.25, 0.5, 1, 2]) + axes[0, ax_index].xaxis.label.set_visible(False) + axes[0, ax_index].yaxis.label.set_visible(False) + axes[0, ax_index].set_title(f"nrn_cur_{modname}{ref_title_str}") + axes[0, ax_index].get_legend().remove() + if xaxis_label is not None: + axes[0, ax_index].get_xaxis().set_visible(False) + if print_values: + for i in axes[0, ax_index].containers: + axes[0, ax_index].bar_label( + i, + ) + ax_index += 1 + + fig.text( + 0.06, + 0.5, + "Speedup relative to {}".format(baseline_name), + ha="center", + va="center", + rotation="vertical", + ) + plt.legend(bbox_to_anchor=(1.04, 1), loc="upper left") + plt.savefig( + "{}/combined_benchmark_{}.pdf".format(output_dir, graph_suffix), + format="pdf", + bbox_inches="tight", + ) + plt.show() + plt.close() + + +def generate_graph_pandas_combined_relative_gpu_log( + results, + compilers_comparison_config, + graph_suffix, + 
output_dir, + print_values=False, + xaxis_label=None, + plot_size=(12, 6), + baseline_name="intel_svml", + reference=False, +): + os.makedirs(output_dir, exist_ok=True) + compiler_flags = json.loads(compilers_comparison_config) + ref_title_str = " (reference)" if reference else "" + fig, axes = plt.subplots(1, 1, squeeze=False, figsize=plot_size) + ax = axes[0, 0] + bar_data_gpu_panda = {} + bar_data_gpu_panda["kernel"] = [] + bar_data_gpu_panda["compiler"] = [] + bar_data_gpu_panda["runtime"] = [] + baseline_kernel = {} + for modname in results: + for architecture in results[modname]: + for compiler in compiler_flags: + if ( + compiler in results[modname][architecture] + and architecture in compiler_flags[compiler] + ): + for flags in compiler_flags[compiler][architecture]: + if compiler == "nmodl_jit": + state_kernel_name = "nrn_state_{}".format( + modname.replace("-", "_") + ) + cur_kernel_name = "nrn_cur_{}".format( + modname.replace("-", "_") + ) + else: + state_kernel_name = "nrn_state_ext" + cur_kernel_name = "nrn_cur_ext" + label_state_name = "nrn_state_{}".format( + modname.replace("-", "_") + ) + label_cur_name = "nrn_cur_{}".format(modname.replace("-", "_")) + if compiler == "clang" and "jit" in flags: + compiler_name = "mod2ir" + elif compiler == "nmodl_jit": + compiler_name = "mod2ir_jit" + else: + compiler_name = compiler + if architecture == "nvptx64": + architecture_label = architecture + if compiler == "nvhpc": + baseline_kernel[label_state_name] = results[modname][ + architecture + ]["nvhpc"][_get_flags_string(flags)][state_kernel_name][ + 0 + ] + baseline_kernel[label_cur_name] = results[modname][ + architecture + ]["nvhpc"][_get_flags_string(flags)][cur_kernel_name][0] + else: # skylake-avx512 + architecture_label = architecture + if modname != "expsyn": + bar_data_gpu_panda["kernel"].append(label_state_name) + bar_data_gpu_panda["compiler"].append(compiler_name) + if ( + _get_flags_string(flags) + not in results[modname][architecture][compiler] + ): + bar_data_gpu_panda["runtime"].append(0) + else: + bar_data_gpu_panda["runtime"].append( + results[modname][architecture][compiler][ + _get_flags_string(flags) + ][state_kernel_name][0] + ) + bar_data_gpu_panda["kernel"].append(label_cur_name) + bar_data_gpu_panda["compiler"].append(compiler_name) + if ( + _get_flags_string(flags) + not in results[modname][architecture][compiler] + ): + bar_data_gpu_panda["runtime"].append(0) + else: + bar_data_gpu_panda["runtime"].append( + results[modname][architecture][compiler][ + _get_flags_string(flags) + ][cur_kernel_name][0] + ) + for i, runtime in enumerate(bar_data_gpu_panda["runtime"]): + kernel = bar_data_gpu_panda["kernel"][i] + print( + "Scaling kernel {} arch {}".format( + kernel, bar_data_gpu_panda["compiler"][i] + ) + ) + bar_data_gpu_panda["runtime"][i] = baseline_kernel[kernel] / runtime + pd.options.display.float_format = "{:,.2f}".format + print(bar_data_gpu_panda) + df_kernels = pd.DataFrame( + bar_data_gpu_panda, columns=["kernel", "compiler", "runtime"] + ) + print(df_kernels, type(df_kernels)) + sns.barplot(x="kernel", y="runtime", hue="compiler", data=df_kernels, ax=ax) + ax.axhline(1.0, ls="--", color="black") + ax.xaxis.label.set_visible(False) + ax.set_yscale("symlog", base=2, linthresh=0.015) + ax.set_ylim(0.5, 2) + ax.set_yticks([0.5, 1, 2], [0.5, 1, 2]) + ax.set_title(f"GPU benchmarks{ref_title_str}") + plt.ylabel("Speedup relative to {}".format(baseline_name)) + plt.legend(loc="upper right") + if print_values: + for i in ax.containers: + ax.bar_label( + i, 
+ ) + plt.savefig( + "{}/combined_benchmark_{}.pdf".format(output_dir, graph_suffix), + format="pdf", + bbox_inches="tight", + ) + plt.show() + plt.close() + + +def plot_cpu_results(): + colors = [ + "#6baed6", # intel + "#0570b0", # intel svml + "#66c2a4", # gcc + "#238b45", # gcc svml + "#b2df8a", # nvhpc + "#fdd49e", # clang + "#fc8d59", # clang svml + "#9ebcda", # mod2ir + "#8c96c6", # mod2ir svml + "#969696", # mod2ir jit svml + "#525252", # mod2ir jit sleef + ] + + sns.set_palette(sns.color_palette(colors)) + compilers_comparison_config = """ + { + "intel": { + "skylake-avx512": [ + "-O2 -mavx512f -prec-div -fopenmp", + "-O2 -mavx512f -prec-div -fimf-use-svml -fopenmp" + ] + }, + "gcc": { + "skylake-avx512": [ + "-O3 -march=skylake-avx512 -mtune=skylake -mavx512f -ffast-math -ftree-vectorize -fopenmp", + "-O3 -march=skylake-avx512 -mtune=skylake -mavx512f -ffast-math -ftree-vectorize -mveclibabi=svml -fopenmp" + ] + }, + "nvhpc": { + "skylake-avx512": [ + "-fast -O3 -mp=autopar -tp=skylake -Msafeptr=all -Minfo -Mvect=simd:512,gather -mavx512vbmi -mavx512vbmi2 -mavx512vl" + ] + }, + "clang": { + "skylake-avx512": [ + "-O3 -march=skylake-avx512 -mtune=skylake -ffast-math -fopenmp", + "-O3 -march=skylake-avx512 -mtune=skylake -ffast-math -fopenmp -fveclib=SVML", + "-O3 -march=skylake-avx512 -mtune=skylake -ffast-math -fopenmp jit SVML", + "-O3 -march=skylake-avx512 -mtune=skylake -ffast-math -fopenmp jit SLEEF" + ] + }, + "nmodl_jit": { + "skylake-avx512": [ + "SVML_nnancontractafn", + "SLEEF_nnancontractafn" + ] + } + } + """ + # reference + hh_expsyn_cpu_reference = load_pickle_result_file( + [ + "./reference_data/hh_expsyn_mavx512f.pickle", + "./reference_data/hh_expsyn_nvhpc_cpu.pickle", + ], + {}, + ) + json_object = json.dumps(hh_expsyn_cpu_reference, indent=4) + generate_graph_pandas_combined_relative_log( + hh_expsyn_cpu_reference, + compilers_comparison_config, + "reference_hh_expsyn_cpu_relative_log", + "graphs_output_pandas", + False, + xaxis_label="skylake-avx512 Target Microarchitecture", + plot_size=(10, 3.5), + ) + # newly collected data + hh_expsyn_cpu_results = load_pickle_result_file( + [ + "./hh_expsyn_cpu/benchmark_results.pickle" + ], + {}, + ) + json_object = json.dumps(hh_expsyn_cpu_results, indent=4) + generate_graph_pandas_combined_relative_log( + hh_expsyn_cpu_results, + compilers_comparison_config, + "hh_expsyn_cpu_relative_log", + "graphs_output_pandas", + False, + xaxis_label="skylake-avx512 Target Microarchitecture", + plot_size=(10, 3.5), + ) + + +def plot_gpu_results(): + colors = ["#b2df8a", "#969696"] + sns.set_palette(sns.color_palette(colors)) + compilers_comparison_config = """ + { + "nvhpc": { + "nvptx64": [ + "-O3 -gpu=nordc,fastmath" + ] + }, + "nmodl_jit": { + "nvptx64": [ + "libdevice_nnancontractafn" + ] + } + } + """ + # reference + hh_expsyn_gpu_reference = load_pickle_result_file( + [ + "./reference_data/hh_gpu_20mil_1024x128.pickle", + "./reference_data/expsyn_gpu_100mil_1024x128.pickle", + ], + {}, + ) + + generate_graph_pandas_combined_relative_gpu_log( + hh_expsyn_gpu_reference, + compilers_comparison_config, + "reference_hh_expsyn_gpu_relative_one_plot_log", + "graphs_output_pandas", + xaxis_label="NVPTX64 Architecture", + print_values=False, + plot_size=(4, 3), + baseline_name="nvhpc", + reference=True, + ) + + # newly collected + hh_expsyn_gpu_results = load_pickle_result_file( + [ + "./hh_expsyn_gpu/benchmark_results.pickle" + ], + {}, + ) + + generate_graph_pandas_combined_relative_gpu_log( + hh_expsyn_gpu_results, + 
compilers_comparison_config, + "hh_expsyn_gpu_relative_one_plot_log", + "graphs_output_pandas", + xaxis_label="NVPTX64 Architecture", + print_values=False, + plot_size=(4, 3), + baseline_name="nvhpc", + reference=False, + ) + + +def main(): + plot_cpu_results() + plot_gpu_results() + + +if __name__ == "__main__": + main() diff --git a/test/benchmark/plot_benchmarks_cpu_only.py b/test/benchmark/plot_benchmarks_cpu_only.py new file mode 100644 index 0000000000..e5d3736fd4 --- /dev/null +++ b/test/benchmark/plot_benchmarks_cpu_only.py @@ -0,0 +1,321 @@ +#!/usr/bin/python3 + +import json +import matplotlib.pyplot as plt +import os +import pandas as pd +import pickle +import seaborn as sns + + +def _get_flags_string(flags): + return flags.replace(" ", "_").replace('-','').replace('=','_') + +def load_pickle_result_file(pickle_files, results): + def _merge(a, b, path=None): + if path is None: + path = [] + for key in b: + if key in a: + if isinstance(a[key], dict) and isinstance(b[key], dict): + _merge(a[key], b[key], path + [str(key)]) + elif a[key] == b[key]: + pass # same leaf value + else: + raise Exception("Conflict at %s" % ".".join(path + [str(key)])) + else: + a[key] = b[key] + return a + + for pickle_file in pickle_files: + with open(pickle_file, "rb") as handle: + results = _merge(results, pickle.load(handle)) + return results + + +def generate_graph_pandas_combined_relative_log( + results, + compilers_comparison_config, + graph_suffix, + output_dir, + print_values=False, + xaxis_label=None, + plot_size=(12, 6), + baseline_name="intel_svml", + reference=False, +): + os.makedirs(output_dir, exist_ok=True) + compiler_flags = json.loads(compilers_comparison_config) + ref_title_str = " (reference)" if reference else "" + fig, axes = plt.subplots(1, 3, squeeze=False, figsize=plot_size) + ax_index = 0 + for modname in results: + # state + bar_data_state_cpu_panda = {} + bar_data_state_cpu_panda["architecture"] = [] + bar_data_state_cpu_panda["compiler"] = [] + bar_data_state_cpu_panda["runtime"] = [] + # current + bar_data_cur_cpu_panda = {} + bar_data_cur_cpu_panda["architecture"] = [] + bar_data_cur_cpu_panda["compiler"] = [] + bar_data_cur_cpu_panda["runtime"] = [] + baseline_cur = 0.0 + for architecture in results[modname]: + for compiler in compiler_flags: + if ( + compiler in results[modname][architecture] + and architecture in compiler_flags[compiler] + ): + for flags in compiler_flags[compiler][architecture]: + if compiler == "nmodl_jit": + state_kernel_name = "nrn_state_{}".format( + modname.replace("-", "_") + ) + cur_kernel_name = "nrn_cur_{}".format( + modname.replace("-", "_") + ) + else: + state_kernel_name = "nrn_state_ext" + cur_kernel_name = "nrn_cur_ext" + if compiler == "clang" and "jit" in flags: + compiler_name = "mod2ir" + elif compiler == "nmodl_jit": + compiler_name = "mod2ir_jit" + else: + compiler_name = compiler + if "svml" in flags or "SVML" in flags: + compiler_name = compiler_name + "_svml" + if architecture != "nvptx64" and compiler == "intel": + baseline_state = results[modname][architecture][ + "intel" + ][_get_flags_string(flags)][state_kernel_name][0] + baseline_cur = results[modname][architecture]["intel"][ + _get_flags_string(flags) + ][cur_kernel_name][0] + elif "sleef" in flags or "SLEEF" in flags: + compiler_name = compiler_name + "_sleef" + if architecture == "default": + architecture_label = "auto-scalar" + elif architecture == "nehalem": + architecture_label = "nehalem-sse2" + elif architecture == "broadwell": + architecture_label = "broadwell-avx2" 
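+                        # (the "nvptx64" branch below also records the nvhpc
+                        # baseline used for the relative GPU numbers)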
+ elif architecture == "nvptx64": + architecture_label = architecture + if compiler == "nvhpc": + baseline_state = results[modname][architecture][ + "nvhpc" + ][_get_flags_string(flags)][state_kernel_name][0] + baseline_cur = results[modname][architecture]["nvhpc"][ + _get_flags_string(flags) + ][cur_kernel_name][0] + else: # skylake-avx512 + architecture_label = architecture + if modname != "expsyn": + bar_data_state_cpu_panda["architecture"].append( + architecture_label + ) + bar_data_state_cpu_panda["compiler"].append(compiler_name) + if ( + _get_flags_string(flags) + not in results[modname][architecture][compiler] + ): + bar_data_state_cpu_panda["runtime"].append(0) + else: + bar_data_state_cpu_panda["runtime"].append( + results[modname][architecture][compiler][ + _get_flags_string(flags) + ][state_kernel_name][0] + ) + bar_data_cur_cpu_panda["architecture"].append( + architecture_label + ) + bar_data_cur_cpu_panda["compiler"].append(compiler_name) + if ( + _get_flags_string(flags) + not in results[modname][architecture][compiler] + ): + bar_data_cur_cpu_panda["runtime"].append(0) + else: + bar_data_cur_cpu_panda["runtime"].append( + results[modname][architecture][compiler][ + _get_flags_string(flags) + ][cur_kernel_name][0] + ) + for i, runtime in enumerate(bar_data_state_cpu_panda["runtime"]): + bar_data_state_cpu_panda["runtime"][i] = baseline_state / runtime + for i, runtime in enumerate(bar_data_cur_cpu_panda["runtime"]): + bar_data_cur_cpu_panda["runtime"][i] = baseline_cur / runtime + pd.options.display.float_format = "{:,.2f}".format + if modname != "expsyn": + df_state = pd.DataFrame( + bar_data_state_cpu_panda, + columns=["architecture", "compiler", "runtime"], + ) + print(df_state, type(df_state)) + sns.barplot( + x="architecture", + y="runtime", + hue="compiler", + data=df_state, + ax=axes[0, ax_index], + ) + axes[0, ax_index].set_yscale("symlog", base=2, linthresh=0.015) + axes[0, ax_index].set_ylim(0.125, 2) + axes[0, ax_index].set_yticks( + [0.125, 0.25, 0.5, 1, 2], [0.125, 0.25, 0.5, 1, 2] + ) + axes[0, ax_index].axhline(1.0, ls="--", color="black") + axes[0, ax_index].xaxis.label.set_visible(False) + axes[0, ax_index].yaxis.label.set_visible(False) + axes[0, ax_index].set_title(f"nrn_state_{modname}{ref_title_str}") + axes[0, ax_index].get_legend().remove() + if xaxis_label is not None: + axes[0, ax_index].get_xaxis().set_visible(False) + if print_values: + for i in axes[0, ax_index].containers: + axes[0, ax_index].bar_label( + i, + ) + ax_index += 1 + df_cur = pd.DataFrame( + bar_data_cur_cpu_panda, columns=["architecture", "compiler", "runtime"] + ) + ax = sns.barplot( + x="architecture", + y="runtime", + hue="compiler", + data=df_cur, + ax=axes[0, ax_index], + ) + axes[0, ax_index].axhline(1.0, ls="--", color="black") + print(df_cur, type(df_cur)) + axes[0, ax_index].set_yscale("symlog", base=2, linthresh=0.015) + axes[0, ax_index].set_ylim(0.125, 2) + axes[0, ax_index].set_yticks([0.125, 0.25, 0.5, 1, 2], [0.125, 0.25, 0.5, 1, 2]) + axes[0, ax_index].xaxis.label.set_visible(False) + axes[0, ax_index].yaxis.label.set_visible(False) + axes[0, ax_index].set_title(f"nrn_cur_{modname}{ref_title_str}") + axes[0, ax_index].get_legend().remove() + if xaxis_label is not None: + axes[0, ax_index].get_xaxis().set_visible(False) + if print_values: + for i in axes[0, ax_index].containers: + axes[0, ax_index].bar_label( + i, + ) + ax_index += 1 + + fig.text( + 0.06, + 0.5, + "Speedup relative to {}".format(baseline_name), + ha="center", + va="center", + rotation="vertical", + ) 
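+    # NB: the panels share a base-2 symlog y-scale (set above), so equal
+    # speedups and slowdowns sit symmetrically around the dashed 1.0 baseline.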
+ plt.legend(bbox_to_anchor=(1.04, 1), loc="upper left") + plt.savefig( + "{}/combined_benchmark_{}.pdf".format(output_dir, graph_suffix), + format="pdf", + bbox_inches="tight", + ) + plt.show() + plt.close() + + +def plot_cpu_results(): + colors = [ + "#6baed6", # intel + "#0570b0", # intel svml + "#66c2a4", # gcc + "#238b45", # gcc svml + "#b2df8a", # nvhpc + "#fdd49e", # clang + "#fc8d59", # clang svml + "#9ebcda", # mod2ir + "#8c96c6", # mod2ir svml + "#969696", # mod2ir jit svml + "#525252", # mod2ir jit sleef + ] + + sns.set_palette(sns.color_palette(colors)) + compilers_comparison_config = """ + { + "intel": { + "skylake-avx512": [ + "-O2 -mavx512f -prec-div -fopenmp", + "-O2 -mavx512f -prec-div -fimf-use-svml -fopenmp" + ] + }, + "gcc": { + "skylake-avx512": [ + "-O3 -march=skylake-avx512 -mtune=skylake -mavx512f -ffast-math -ftree-vectorize -fopenmp", + "-O3 -march=skylake-avx512 -mtune=skylake -mavx512f -ffast-math -ftree-vectorize -mveclibabi=svml -fopenmp" + ] + }, + "nvhpc": { + "skylake-avx512": [ + "-fast -O3 -mp=autopar -tp=skylake -Msafeptr=all -Minfo -Mvect=simd:512,gather -mavx512vbmi -mavx512vbmi2 -mavx512vl" + ] + }, + "clang": { + "skylake-avx512": [ + "-O3 -march=skylake-avx512 -mtune=skylake -ffast-math -fopenmp", + "-O3 -march=skylake-avx512 -mtune=skylake -ffast-math -fopenmp -fveclib=SVML", + "-O3 -march=skylake-avx512 -mtune=skylake -ffast-math -fopenmp jit SVML", + "-O3 -march=skylake-avx512 -mtune=skylake -ffast-math -fopenmp jit SLEEF" + ] + }, + "nmodl_jit": { + "skylake-avx512": [ + "SVML_nnancontractafn", + "SLEEF_nnancontractafn" + ] + } + } + """ + # reference + hh_expsyn_cpu_reference = load_pickle_result_file( + [ + "./reference_data/hh_expsyn_mavx512f.pickle", + "./reference_data/hh_expsyn_nvhpc_cpu.pickle", + ], + {}, + ) + json_object = json.dumps(hh_expsyn_cpu_reference, indent=4) + generate_graph_pandas_combined_relative_log( + hh_expsyn_cpu_reference, + compilers_comparison_config, + "reference_hh_expsyn_cpu_relative_log", + "graphs_output_pandas", + False, + xaxis_label="skylake-avx512 Target Microarchitecture", + plot_size=(10, 3.5), + ) + # newly collected data + hh_expsyn_cpu_results = load_pickle_result_file( + [ + "./hh_expsyn_cpu/benchmark_results.pickle" + ], + {}, + ) + json_object = json.dumps(hh_expsyn_cpu_results, indent=4) + generate_graph_pandas_combined_relative_log( + hh_expsyn_cpu_results, + compilers_comparison_config, + "hh_expsyn_cpu_relative_log", + "graphs_output_pandas", + False, + xaxis_label="skylake-avx512 Target Microarchitecture", + plot_size=(10, 3.5), + ) + + +def main(): + plot_cpu_results() + + +if __name__ == "__main__": + main() diff --git a/test/benchmark/reference_data/compute_bound.pickle b/test/benchmark/reference_data/compute_bound.pickle new file mode 100644 index 0000000000000000000000000000000000000000..b2822fc573e16038be3e7052cd63574412fa74c7 GIT binary patch literal 8758 zcmb`Mdmz(o8^?taB?^Vkl2CHUq4eapR=P{65UJG6Y;9#T+lVM7Jc*Q2sUC-v(t(OZ ziRdIsIV(1&P0=}(&Uvq&PS5+acz9obeYe?upZmJ6`}$t@eQzS{nW!z*^|v7=Qi$`?F?x~m9yB_|$CKang1}_+X`Ub>6&IP3Ib0gmgX!D#$pAK&O{V&A z$uvKHj2^=3PPHdf$Ie2SnNC&Jr1FLE&Y{yHQ`Pq%&L1G9p&~v!4*wF5PvO(Pf32xp zd2V1D3t@9eQT>k%(??iSL4!l;H6p^A<@{GV)J@^X`0yfSR#T}s#qdQ_$aFe|$7fOa z3^JY1rO|w8RK6FNxt=BwtA6U)Ev25J2n(1Y;}f*kO45K1#ZL;~5Y+44xiU zLn4QQgMC0XB@1Dpqf|6D^6L>%L2YByc&-76fWi+3dU!}8hyKd3z{X~dWRRz`u6NbHXYsdnK>oT8AfM~kuB6E7RwK-I z8zG*d{u17?^|8B?mIvZN&bE2|eF1LR&wT#p4-gf!qoduGXw7i7J9hdNU4R*xC(;F^t#r3XXz6dj^ou6}oArClKH(ob76F^jix_I?_+!NP; z64UBpSP{Xx{<6x5!m8w_(Q%mwQ=WK-_+sxwNU@}4{Bc 
z!9?9+8-#6UzfaDYTL)}6&-@wu350?rIz2V~6rOcHk@9RBgW^eJ;jv;6pk$4J!pY+C zXg|s)na*U_|GPH&*N0`Cw#fFIvB!mg%$E!rmw*sN@oNvss z1y%{AwC$S8AjO*qTaw`}+kPG@a8r50B!iQSpp=el-4!s*4hVS+CH#>XEK?SROJ$J1 z;gVT=A2#jBtKZdFGTn>K0EA;QCORBciNf94T|YIv(n2y+!QaZv=E= z{~07bWSuZwszaIt&MR1=aCe!?HW2)^D_kR*B(Y+mhIvw~mxai>V(#&ItDiR@OjLU# ztZW$+U{SMlaMjzbIIC6PSbytDf)&R{S6nGY*pO4x_joDi0OK5nbXJG~2*!yin!*x^ zupPC2(yN~3BFrviot}T{GlX?qTjH6zZzzcOf;&PTjvy(pBqbb>y0HOF zBwdkjpjUyg(Q=&1KiwEG;Zi9D>&C}}kX5nVFxUnUX0M%7kgU|1d+a0BN2lQwmsCsO zaxq#^+lYY6D`#>{Zfg%*9l6-M>h&8C=EP~9Ge5uu=JxEYeOYhPK(s|?E-z!_Tzzp) zL8{qk9iuUTRy#0#vibu+qn~wMyc`}OJg`z(@ugf3Xw9FE()K)F1En`_RfM(~8K$wL z66LS##6#wnUEPVd(S37H!O7*5`fpZ5n0>p}IQ?N`fo(+sskBiV*dh)EH#U%ZVCy*T zc)CVU24Qov#;l_#B|@}P)=mjJ2Q7X_rs8-tmp6E>LBri04f7_Jw?5gM{FW1}-ihA^3{i`md_2o}jUt@C&X&N>3iTBl4f4=UUv!=OZsQN(a zYNkK&Yn~*6&?k=SaLo(_k$KLCb67zVJ5lBnr>Ql=7h$}R*Y0aR=0mwZIyNoe0$O%i zb%SWFXda#wj{ggFZ``%s`6}X)Ag7I|zdB6M-v@2M?Aagn$3ZCTsa1Z*wd^I00u!A` zXxeRi7&=~4>P+@-M@Z>-;b&8+7!U=rBQvw+NYnr1m zO+gG$$1V=%HGwp&!+Urdn5T(WxH??W5EC`0{V2&k}P{`muxvj8tWyd@x*t;`yYmE0^v4?r{SmCp@MimxSCQI9= z5>`jIZd)I_V~@2SWWXYFk&aAxAbijJ_@@tcwjlBjYYL1V@qqT)ZG~r!?rF1k154YY zQ<1mY2Dkz}OmtjfMb-0ekT0qp?SZQ~GN&juGy-ATk5+8s&jbT%-ZpHY&SMJ@QIrvB zYLjuUzPPP$OKN!xb`W8UH{_0iBCPC)o+_evby7rFz4!v z!D=orvGq$WcW-$P=SB60lopmA)yL7)`?ODu-toj)r*|^!*jp=$Jc-O=DXkC`+gMIqw9*67i67SdJ@j}K0J>~T)z%#_JUbM zH5FmSh}PU5AV`HBhXmKH{8&=9>@zR$045cF-iK`Bme6PyUtPHTlH8 zqISCp6h(ds_wJNN*gSo!oHyqN0cnU}morZT#6jix26_P!k*iG3@1FA@8e!w#U>0($ z5Fpk6GemzgoPJbIZ70~DyNCz=>-PI+{qHWuziSmFZoV7)O?Wdw{0RCK%fDo|I|>li z@cu&XX(()>bknBY_O@#UHcVi5i|%D^ObhtsciG3 zoc4vc+h{j*rubbDL3T=|kk0rX*a+Nj2d9w+$^f^_`Df+{`y_J94k|v8m<8$NW|*U* zD_0J9hLjz}FoL=rzHR-VLkGkgm*sB(sCjXV)@>(h#3+bBXfG8nbdT1@x@=Xd(Q@^?QE zUrg&}L_l?xBUCuI4Yh=Ed~8gb@a!s#L&}sMzlte&99mk@PVG|=rW$Q)AG^c_@Krc2 z=M5*}<0zhbl!forzBOl)dp8=1;TG2DTNtRtLScvx3iR!O^A2I&XFfw2+IiuF&{W%3 zaH{%iJt4|m4Dg70aROm$wZ`ZW=Pb5U^@uN!@`gY?%5!oZd<@lx-5wltG<2nl~xmMN~F@?Njng)Bdoe$^L|PY z6!Pk-n`Uw4iFjg6?F;E|TQPCD@7eUXum+bc|LlV@t%2|qz4fUzo{Z1#*7%nDNkpv^ zX6>7aA3~wQ0V#5qV2@On>iX_ajkVAXX)4v_k*jc2i(aC3cfu&PNp8J#Vg<5(*>-n? 
zz{&~uW(Pg_TmpLnF_Qf}a?@BzpfLM~A|?~!5EfBqcJBK65}5R`2GeNMP7v?+cOH~l zfP?-#6aAM(?%%ZK;==ijtbXeuw4wYgqqC5v;zV!rm3uWoh$=TFE`-a9U-K2KF4}A| zDHd83TXlP@u;zy_`6){o!2B&VYHfqpRwArrjDdZ~5ekT-x}z8R&BLq&=aQgy2wBC5vh!zbKs>uf{VRPV zj;7w1jYP{>-rXBDs}M$PJF}ga1lL01b3S(cd~_g?ojQ~P&sTJ#u+G%c!97m^Q#jo%aafmW8!t;(0u$#^Vh||dD&_Dpm`>; zempJxM2$PlE2lx$q_qwb&s)VS>i*c-0Lc5gt5$~_AP<@^lL}W{|N06zlwK4)>|!pm zeiGU;!1WQV&Ctaq4gK=&f$+aRJZ#MC9zw{CQZw3gzZzK=JPAF%lhg!VcG$UTsT~kY zZfxc(4VP$qdNu#X)~h?sP}^J6s-v=zb!9EJBy39uV9kE3SzrOhFm&nJmVwSAdg5a; m^^%PJko7(DXK(DjyacQj+zS=m>EYYiP(;*R<0_*=EiGF5eZHg3{(cRm_1AmGjPL9Fem>vN`|~`41*0@)3-Eq+MY4n_ z0p5)5)W}_t^I5{q6c@6uCzJO=(4ESpc=F@~D=m^Yo#N!|?#FvQkxHkMoP6mdN&qu* zK1Lml#gp0x(l9bLNzjv;L56p#MKT@GMG&pWf|UkcnUSnX#FrRMGL!Q6*VtMYJEo>4 zVdN@e7P#Ri9iu1rD&C1~Uw{!i<(KrD`dVhBFH2_l;v^S$FBe}1h2iHl1oj0;12f3E z-U*|H_ey4*jP8c(v&kFjj8z1P)AW4PoBTo6P#^V1tjEZ(c+aKr#?RpT>b||g9&kr& z(gkjjf;)itvVL@Ra^l_p46DT?7Z);v=|yI`kzAN`3dN7&#H7*PcTo5m#P!d-Du#nG z3QkJ%6RNonIL%4DG5S|A>fBuR&i=@LzFfVpCpP96IAG*MJ?MXa)(4DwZ=`e9g}lb7 zPg=-pdoh(TXz?gCBrgE-bax~>x|0akNG>#Q3f0S-FJ;GB{e&knju>e!;La>h4Fbxz z)G7JZW+0MqazTUyU&?C5x%0JmRKV(|DF0|S;EqxCL{-W%h3z2By%P@2@#48B=;TSJ zy7De8_D^E|C{d~x%`4tuw6!zmePwVL5YSIV%(S}=;#5F{dqopplz!yZ@PpR?Mo}>- zmhB`@@F{2!$L!BRjQVqs@oHmf!Z7K7UY@kgok`kYyV{a3N7K#4*}gJb7_C|??_qZb zDx)dkBeFj352E_1kIgTId^rYN8`Nsa;G{-9#pjg#N{j{@`}Jpq+k$vrcPie9PH@P4 z$zf?`X3ifZ<6@CPkvT?|ZkYk_qV(a6kT>c>W#{@)9#bI>7*IvWm zI+W7jT6FSx{!tLENlUH-*%Ki0{{!-=L`W_}r8BZQFV5G;t<<)_h6Sm=-)yhFfYFQ7 zF-ZpHhd^9>?;hUB;mh>ovBd(jlPz#7)%yp-C-=i+ebu1&ylGs2j z(l?Pb#)duEFdd_5_bU?n<}-lgZb`M4YyglbO*Lrt+c7f9^N^-=Kj@6Ae_lym2J1J_ zQ8IO=o+*%cAEDlVlt7S34Ov1ZQ=Mp3CY|iWB)beXa(=edC`ta5V2_cgMb8cK!Rr8X zYwEhQ-rzb%F+{dP;_!$tjzL;?g>^ANA^SDk#PLeOMJ^mQcw&AhaoC%898q;Xgo?t2;Kzbc3+l-DM!+_ofce&hzE&6OfnaJ@En~ht7sm2UQ_4dvCm1 zxL|2EbaF|SPiBJ;VWZXe5|1ejkN;q29z?~4+^cqH?#Ar`@MF3uhvGMbh$~dqG&|1s z4yt!fD9&?)>@!ixxV-Sj5(vR{-j;VGAUdjl+q83Q_+Fm#@`Ur6-+mvX#kQ8thr;vH z7%3^e_L%#-Ht=aAk}RE+LFfo2KMZx`3)&w&o|Y;Vj?wgN0k7aZslyhyCLAcHujLeg+gz`aDX5#(;L~7^*e;FgbylnC~PAG=2t<7k#GUu>i z=v^9lrrQL*|LUgmvt=J`tmrlZymrPjdqi zk?WavpG}aADTO)Zoz^?DAonPT@~~ytL4Y~=es*rK1i-9ql!_?Q8OfH~DRFi!%bAdv zG2&N+Snw>=)LJXi`~<-kZHDZ&6}*6v9#aZ)s#28>tnP(grFgDvH`^6xj=x+Sd~zYs z=;*bQucVGd^U_Pt$2tTG%igeLfA+W-a6~-jajNt&RKE{4~iB> z(%o?l)deuybjtpfEo3GIVor{J%H~$Spki1$QCO}8o6xd&v5PI9uq`f2 zeckC%7fg6Nl-;JEyU4J2L+L&I`n&R*Td97mcFSofjd`2YL?q#H&(#v5mWv&`X60_ZX4`N&Y$Hd(ZP#1kR5Aqaf7YUO${2S`AWH`5a$+;V4 z`!Ldz8Ly$DlML`Bd6}CXXM<>9>)ZXnKf~EpF)tJmjNf*~52%l=VR&Ir+rm`ZCJVyV zcHhs|lX)w^%W&z5`_Ld z>s3W0GP8rGA!xDl&hB49%+WaG9s)BapV743?7X1unY*x^h`ODzX|4`bbw|`3rqvsa zDt0&QVf{pmrYytnv2}=dpfylc+3LJQ?YJzua@PZ&f-g}`#xD89h# zQqx%Tpt%^Wa7(-!S||cV5FHa-fT}_C*CxhKj39uGzYF;5nVy@VVSj4}Mn5nXDIR>A z4^f^OxOWnx4Mf4V<}>fL`7)P^h|_E*L#AsEiiF*u>S0u}6H}i5`V_jJ{`KY+M^Eu8 zcx;*8)VW*3LFxrYi&m&nVkX>zOktnIgWVF#8-~bfQ4_Smp}Y%pdlVQZ4h_D zg>D_#LY5sZnEoUm{VnJ&f2{yAbEoTpjLtTsBS4kV%alJB*lyK~~M zsZsaNABm)Hg4*D#z46eK-2}7z=fSg8C#<2EECijZZn9>JiZh&9j5X7RcF1j9We&+K z`zY+=VPa>^EmLig{Qxkol|4Sje~biEo|2gN@Bw^o-I)$Xd>2f!Ao;tjQcGkwEj=Hi7z+drHgdb z10!H}u)N`|w15!|OC|5}&T>Nwq3Q#7cD%jTMfhnnHq7B8=TVV6?zkPjk_R*XqJoBL zbCRG5cHE>fRA>c3dq&0!FFQb( zJ{0z5pIgjm{(fKFKfCmMT+KU-#zp&F!zM)B*9wRzZ%PEwy7$d7Kg5^5PxL9fy9kPH z&ve_iDYla!dooFD1Lqh(+^cm<+|*Vf47d4r++zy-rWFRJAw;{K zmh}+#(Gk%gy_)ZRRI?L(ZXJZFK>KtayCenX^XlCGRqxcLL9}pVR6p+ItpYFbb;o)) zqeOn^x($Y^t<`Wm+xg^tdm?mquEV}u(TyNpy$Q*QRW#s>+t>8!a(EpDBHu)wrmU+2 zwzH?c4ko{c$X89VzOw8zVf*m8&Nr6Jw}gJ%WqL|rg)uytT%U3JBnIYo-mLe7Hd zp@zHG!Df-qlUXJ+k9Mc39L{x^-C`2u&@4-$Z%){hYY0iO%i&Gd{0RiH(f3*VmDq3F zFPj#o4o68wdMXccG?oG>GqHD^`T&Tt<%)@F#0K!oW>U#n*IU^FQ_aHHBAP}$0T@-* 
z?4dpr%LVan^Wl!IaP;rX8XmyMX7X2qpS!=w>Ct-|j0ASXn{U>B3#GBw=0MdU`1cAo z;_&iwAq)9Z7t9o39Laz|b-kebgz%)*7(MRlHC>y9Fe>moj#~Bz@G5z1!QU44P+EOl z2}Wjt<#JONnSpT<7UIQ?U{3oIR`BY!eZayu_{*fbj_?0z4Q#q5FIXG#5+;DsyQR0Y zZGvFkjVvaIm{b!cetYn@o$r&gw1>&#mk*k{0&N{Y5-_OLy9(;CUE9P+IUZ&){v=f@ z!WIJxei-SzSX#yIg;1*;D6Y0Lo`X@<^7JOUnl(W(rpgh)yt0gfWRS3^zj1teL1}zm zeo;!YPFiAdNp50E23QeyUQu3ra%oX~YDLKu4R`y_E6og*PJZt8o`3QgW~eB++qW5= z^L24`cDJAW%RyWEwPMMX9$t{9;*!LYRG7Am&04b0XPtAm+ge-Jxu%cT-9BUH`s3X{ zE&v((lxFljFe{l-THM2#mz$rG6Q7k?0=8Ts$kto^{@toT#_>;|gJjkMohE?nG?>op70v>8na$ko4jv6# z%D-iwyM6X7g}y)~pr^7oEnU_>V|>;SR54KLj7J(jmTnp?}^Zr`Tq z5SYrb%H4i)qbl>PDGnu5dU!#aic1npQeoOMR$a0>x_-aA-KtyMdM}ku{UT%I$PJC8o3CLNJIhjc*sb!hTsquMviFwKSc_l@O$t8(t zAirqz@FAQA_8-u7t!-0hHHOQ(+w*-ne>!+o7*HhaeUIwmr$EN(bF8@*l|a`CAiEBx tGq1xw^ua7;piV>n-7cZa-0gF#Oq!&_0^IEj#Afmq7M6h=SXx}H2LMQ;jcNb@ literal 0 HcmV?d00001 diff --git a/test/benchmark/reference_data/expsyn_icc_clang.pickle b/test/benchmark/reference_data/expsyn_icc_clang.pickle new file mode 100644 index 0000000000000000000000000000000000000000..73838f74ed367f8872562e432f3025612b733f07 GIT binary patch literal 6519 zcmbuDdpuO>AIH0)l0s=C6;Y`qL?rc)^tcrwDKt&YAZ83RLn@cvLwD*Llz9`97cL`#s?jiN1;hy8l8Fc#^aY zEOsz6AtXVAC*?t-Q@OsJ?h6TTCWqz=k~iE|m%<95xqEmAc7G$y3}8~+xd9Z~22O$o zMorIK&#pUKi;=dK`N0$^Klr3>q^AC`{UBbriuhTlI0-x%;wCnS%AtL}t=ikD`Mi1t zMk;3xI|(mu#HiXZHnPo83q)4%PU{)ToCGeLC*kf(WqNi$c!cd73Y|`6a~M>P7b%K( znx2o~wgpD!bmL;h1(87TF_0~M9D|W~PC~&-^~aQe9Ks@}0|_TDVSuiw_3;34mV(Af2i5|e0mN>A9Ee}!ix1E_;;ND^w1|b#(vFVV{X2L z9Hfz_Jt8Cz_-5C=~qeI4_t0QT-1H-H%0MGFTLPAkE#^dtKk=G)-AQ!^9kZ)67^y=lm`^SR|EV z)sTB(AZEsG+8p0N%2ZWS6X87X8Ag`xvJYop*o0A4w$Ui36bBH^qK3C&LSW+ZhA|m_ z9=^o#bQ>Z|tPzFj$NW#-jQmo<-PI3OERxz~{2}6Rj7(2YIHfszIvDgOZ)qVl0=`0H z_a<5#1uEHpSMldNKbGfKxRuFbWac!Q;Tw7tJSy?I%P@KXLZIsXRuD>llmmNZ(R50v>Ha^UR*NXdwc)@l(*|B zNDTu~&3ZrhOa?(R=948%D%0JM$qAskbEx!B0YGMp^!>kY)K-Bl5k;4pS3yY#JKQp4 zq~!s|GVW=^l|6mJRG6t<&iWH0gRm3Fn?k_ikEJ(3}o{F}A|HiI9w*J_o_D~DLuQA6tppAa^=K9 zAsDIoSQRYG+m2CV@~%=T=VKsJ-L|fH{el#`UjFIu4#LB#V zssv-o%h4y@L1?|Yoza*}3fmeMXZ~0LubyCWM%apxqhrHWyNSRen&jD)d=$b=R8y4F zWJVmFq&X}&+sTM$20}ROT$^2HJp-0FLnp}J?l6d~*f#6(ON4#>$YB>X?|YpLjMgz1 zx7cST1AtEJ@H#pdM3rvUGpl)hk+jg4U889d9Z?7U@iTosMlG(X5-#DLAbeyFBu|th zNcxw<_Lnj%!XSQaC$$+2SvCb26y0OD9zF=-@t9FtLd^Pt5uEmyN}s0>Ex-Xa^MU}_ zu7b48DYoep5ZS704A-nCU=;e7!&I5LH!39SzGfrfLO4^=Jg;rdPEOPpBd|3 z>5Ha?zW;If%r~%)XB_n5je-r`vijyN&MPRsmh}(+JR_J()C?Ts>jM0!9zitU?u_kD z;<-c;cZXi@cRjK9<+`U&{eDI_1f3pV2Pf*3qe)@1>Tv8gJ=N+syn87KaZ=RWabOBb zM)TvU{Vi4Ds9#ndL4A?wg%N-Llcakqpy*0fS?rdBLxH|0qxE|C=@-0DDgEX8$8x&K z?K($&Ae;;fuZ84KuIK{Bm72~&8=%u@qPtAD9kG@awteHN#*pB<82Q~zbyQ1+ozyG|qG+WLZipj5OWiBKCx6!{AbLaea*Q z2oSGUPTLgkMF5-r6!7;m8?T)?rV-HGtWDxGUi-{7UEt3Vr2Z z!X<|(J&`~Dh(F|vt<;cd+Eb^28@pwquKYvz!(VF@;<%oGQRr7@3o|I%Rb`MOE<;PM z{xJZCTw&ytP-P}G=Av@bRZBHXYR74 zpTS6}%->+-yTw3P7$BGEa1lgl?6{g}Wm3%csm)gL7MC$fnB}`YY$rs?KLG(N9tmL6 zwWldtYz>%2IBU>9z;7Qh4K@ZEiwogMb5Idfs!Bp_HfXr4dmZBlBE_oBIL4lowB&{C z_Nv6wum>Iq)sFk!21ET>4k^k!s5V4(`D<>}?1#Lf6Jx$Mi z8-)L>le(k%&{MU0rd~Cyf!8_hXCqCvrgo2m=;uJcT?O@94CKZo$YC_6aQvyv<5vTf!WC}jI9Xs+WRd6@qBoPEHR~_!kG^0O zyQQ)7Z{%Z?tfed|y($GdBC(b{TN@^xmXVraDw+_RBpw&GCan;ez_xUy=kL4O0JByh zbI;z|lj#`U4IEpoQXT}<{cVlvSz-GIy@tSdEwZGW{c;|b*<`unMnBlb^<6{P?R|O? 
zSV&tVeoO9whYk?N%k-iHJH|i&lmyk;9GwHNu1e?F#RW062@`+3HELl0W$DqZfRX=T zR*YT=G^q`mhRx0&VHRt+D&=|RFF{`<_a!PD{o~vqY%dk>cykVlukuky*>2weC_bHc z_A%yX36g%bMh)}_7H&gn7|~0Z!D$DN0D~N_?pb*Zh;j+xw#_iBliN}nHQO`K`#D~i zWg79`j8zS8Y&+tYV+uQ=G_Gr@jIe`%QRr7|RG)p_Ug8csTaRCmGI)_Dtj)UTFSl(Z zJd#(i+IBCgFPh5P!_*W*V_}gLm!_6U+kiE{*rir9tAnu1Y>itpov0Zum#y_bmdO&l literal 0 HcmV?d00001 diff --git a/test/benchmark/reference_data/hh.pickle b/test/benchmark/reference_data/hh.pickle new file mode 100644 index 0000000000000000000000000000000000000000..518192a9fd36b518cbc706eb1b964d8a451f47a2 GIT binary patch literal 8351 zcmb`Mc_7qV|HsAEV$xzurEb}xA||QJ&6(=zsHTK=F)?NgnT45AO4*XisDx;_5?#^_ z(M8Ksc`98kqPne=Z7@BS(q`B5`D(kr>znBL_1AkwoY(oB&-uJR=bS*rnCJ;|-T$^k zOXNA6=q=F(l0j}Pwu{(H)crxxgC}Bnfs`oOQW*jt7SqkcxBDj)o)3?~6#Fn(8%5Cu z7*!|cU+eNXh>?ryuJ%8uBjKCPHN&Ht&Vjg2eQs1(BZ`(N6Tc)BxrkVwe(l}FfV_@( z78qffalA!BBS!DW+|hEslmP%*)Bp6*G0r(uth#B zmM@Db;`?}PVv%CCa~|ujFtg$ji-cvV3{QJ>%Ah5Z(Xd<WzM_Hf5w7!xHun zW**;+5ijrT(Q+3POdGa`MM*z221J%}hr@JFfT!mp5UmIq6y;%J4ZeBzvx(6RY#Dl@(^FC!y_qR%)vKW#dlDKW#?kj^{Si-Wu3DcfRX@GCmU+ZEl zfV6HaWSVB0!~xOJFG<|}TQudXF;OSET^MaB@IRvvz8=$*z20jxHnTxg?%GFHL@ErK&PpLSVbOzu6b7y9I=}+`?>WE*UqCvTd`6mF7~6)|l~Yem8jo z^+&8{XB?~uBBhu<;*kxpzP8^>%!NYk<>Bh$>cJo)-un!^R=qi9%Ef5hpBL%tA87$N z!^v}g>l_fmsIo(jk>oQ79GQ8OI|3sOetV5P#}mLeDNgV?9SO|yC z`aX|dLfs(UTb{7`CWN|9)UNHShEYSY>VY_MIIOi|Wp=EcIyorQjy`53)S=D#^EC8o z-T|MbFzIOhIuJWfH@!@+CPMvFe-V54g+~2Fdh!MB;0tqTD|GMv0*GV74s|%jf|y<= zXsWUxV?_-!#@X!sISC_&;gRLM3MH80kA;_wJ7GIC%zdIOIM}`IExwQStKQM5+3Bn$ zTLXdRvEAHa@?o$x!-m*9u<;wG9r$e%|2`>fr`>4g$z1RbFO!DA&9Qee>eRjQ%Fll| z2tm9-{rw~&u(SIO%l?qOUpL)`%JmnsOkm)7U5`2zdV2!zeXq9jt1g3ROmLtmq?7X2 zQ=4AzRAOUfx&LZXY^OU$^(sNNaf#3~)UTM6^PlU&?u<1gGPX|+*F>4Q;*kF_j7}an z9Uo%dNeusTfJhlyomM>=qL#$~$3Gno)oa~&9J~)$olpCpEhsvSQC$tw=<4J$z(C^% zDhy~M!qzv3dwo>!$D(j6j4quwPm{OyfCXFWrmL70j%o5)!|dMVTp=UqoY`{Z-gGC7 zM(Muacl9E;OsDzqkil#-5I-dkr3Pyg1bue6#-OJ-f~J8{;FeO8f(x&RFgO;6+rZy_ z5qzOwd7?cTM7{d#M=PcF7%kRvyK*T6HgVlcTaMa}YY+y@73nl>qH_1c;Tjh*yZrWq zV#F_SoOxio7lcB=Ua~4}8cn*{iUITYtSM%=~#@3n+7$nqf}$<^|~|& z#k$BgBf16ne zqHU9I?e*H}qZl5xRSg)7QOFGYj>a3Mn5NWq{J};aBPh$X$LY5gh7;@W@V`*^%LU!9 zXSzSofpk1KO=JEV2z)yubFx1j8i{s+X5qtG6=btO&j&h~r0$0U8g@%@de>o%k*tZ& zZFCo5^qkrf-X%Fgy*{^g!KdOR~@k`JNxmJ46yI{DPeUJi^0CDl5z#VFB|}( zNX<^xvVas=*1YoctGwONwXgnNC&Ur$OSpCK2h3&ciO=pZ&{YZ~3TN6K6kyyWUmQz(}UM_ur~B{hZ_I$fzQ+jg*nYSmEc6I>1)JzTM> zIB3}2S#UNqn72n|H$MhO&(RrrgL4%`mBfqTcZLXE-&|3OeY>V`<>1J5ZhUldY8?ED z=4Bf?Sa1ZbHW(u09J@nCAj@l?o^g5+M%1enOBU@+hs8_jGHjtjAZ116&rVH+PQD+m zC|x`zrD$^-MuJ@1==+h-q)6pgbvJWlxbUJaM`~d->-sf|lbiu<3$JF*>Sd}ZA^Wyd2rEf9far8tMb4#e$_hH>7BL{5k$!-VVoyj<3^j9h#-&M>8Z zI{q0*W_|X?kN@MtfB5K+AHVx_1@=vMEZ%$~^y5qCkttukzUorY+-5Vp5yvZ0n%QU|`_55*+%4XX95VH*0TCs%oy{Nq& zs>&l*Y-M4BNtwrJMBkjjOI{W!|1-?rO&cb zA56OB3dp*48%~{oSX?95p%)cO5WP2lpnhQqAg<0 zSVMm@ek98rxjLiu7F;3(X{S0IgCvv1dq4j{6OI(J+o#w5`+g9?zx@A|zxQ=GQ2)r8 zMZs3j<`pV!I|mnTCHsw<=*!{ERT4i(P$CE2G|5JlRgB&ib9Mzr%erdjqL*+2kX=up z#z{;;G~V*6xeD#UrzXm{f3qPCck9l_K5ylGG15J|f`5I3Hyqc*s(xz4@*tw*{kLv} zzM>~4jWUGYY%xCyBgVg)SIZgjAKn zl-;SnZMQsD32`jF$W?e7Jqbjar06fDWHM6pROmhD$>j*>4-c+9T-|UNqo=L|P*Uy= z5J%r0u`oDEkk0xZ(yz8mrRj#Lbx8v;QnsFLxnUOU!P0}wT4n~gNNL`9*J-=S+Yf1T z*`(+v1sF|F+dE0R6%Jt1Cfz|r*%!f}hEnVj+~yLf=KVzN?Z!^(UY5E01P5Dn7K?qI zCQpPBorSL(2y`iT7`-YH-T+8eU#WetT#Djcj67Q;v6q+q1*=~xA6?fp5>|hx_Ass6 z-JoCY82eb{`qDm^hEH2^$a*10I@xz8O)!9tsx)|3_!A{4c%^agqf$o8$cWL?z^k&i znlg;E2gV$m?1NzO#V0vH=S+1@)#ffF&|f1hqezk~Se*>EP$RGF%etxDgUH029j zfHd9RC(;1oS9+yNwyES#Qe{5i`rVk0cc%h4le77^{r@V%bd2dxQY z%l;yN>q4nDQscU3D|AVE6Rzq0xbPv=0g0CyCA8w_dVNm$`Q^%WgeVAMC{cNy&et!Q=NG0eRWwFneKdeHdLVo 
zMju_wh?{l?#1zMl9U1vV@cP#9KQ&4^*xnis&w3|s+*XUX1A;x0MAP$tQa1I$PXVG9 zG6HFo?u4mw2^ftv%J-Z}KMrAwtX5rGECmsxv~+K+CPC2WYp|zA6a7>MUxqh5>9t8_ z?r<$wn%J_|d~*;$nEg7{)MpbJ#M3GC^OBs`aK=^Z_}91*jWB!NA7<^R)q}7QFzd`; z5)kTrZupsI*!2Mc}XbuBptY`x@}dF=gSdvza$e6rtShUyvO;t zF{l2YN5Ox-^K9$9`>Ww7Xl!f}bK9IX;g}f2)K`!Oz*dX&&GQUCun+ip_VPY=YtSjv z`Pix!?|nfV(h65z869Dpc0h~Z^L}$?)#6kD6n;orb$I!3WBGb8;c2J$C(6*;l!`xW zwXxO9*n5JVM4KCWejDb~p#$D7Cu3EOBp#i5WUgrO?e!OuP2f19m%p&m1)?UhNDLnC zFdl@{E)zM%AY#cz|Bd3??{lFqJaUpPwqmR6f+^QEk3#1Z`Y14H-{8d{+EShj_;Wfb q$%1k90ke){E7`f4Aq%pd0l;X(mDtG^AY7*$n9&r_y=-Ek@qYn#P!tRR literal 0 HcmV?d00001 diff --git a/test/benchmark/reference_data/hh_expsyn_mavx512f.pickle b/test/benchmark/reference_data/hh_expsyn_mavx512f.pickle new file mode 100644 index 0000000000000000000000000000000000000000..98b994096759fe2c316f6217e5a9848125e6a7e2 GIT binary patch literal 7423 zcmc(kdpy+nzsFruq9h}iY%Y}&(&{StOl*B}Nv)#GVj9fkI>L-xBBYk0i%2C_Xo>EU zLfX=WHW?wxrJCHzEozl@>O=Df2)?<)mdj{hl2gB~84^hfQZQKECa(`y{^V ztluz7q1(k5FvBqFmDjco&k)9FAoI61W_AJWSWcAur^RVZ7Y`=KhvDPv@gD3m5YOo< zv2Q3CX}TKkKTz8N>#t79UEV7MA`S;l((WJy>Ey_boilkLr5_%a;Xp58723{r#oon0@k zc7+EVTlP5NPNX45{Tp{Cmu&~qe%77c_hyoEQOAm{2s4(A!05*pui9i)OE6V(Sc$i~ zu^m&#Do~h@Bg6a_8icxkr`k(vDV3oQ_dUdV-kdb%2vhxaP!rhJbS7A$p(8ErEOD zj_!2UX2HTzeDv+WnmcPfM!00KN;e`CqrPqJ*MiFDf;ccB`t<$)IZExLZL6L_ zMTU&o%xu_n2~fPt&Ybj}LpYb?owi~pVHoAl%hOz4*t9h^D=kPl`W0snBxN;Wv@|>8 z>QBX!!3nt~8@3e>f%wkZ{;oNnlw+Lu^3)9GVvNj|9o5kPO$$?{d&g=Qr(MC+aq|AN z#jYt69P*!YSS(w%oE&7qb|)Lp`51YnzlcoL*M;>bD|8M`tOa4uUnHL*N)9ry^5U?6 zD@JTrl_l+o*#I&qoL?bx6hQPcO?PzL5Fl#*1LQ-A(3tN_XJm6a(%j?fOr7Q3IHCFLB|r2aiD1488Du{Rd&v zgf+zG{58Q!zWkXT7;Rd@dEMy5A_z7bq*V8UxK%MmEOrtpLD}0#^Czkp*;^ebE?G1d z7QTF=BrM#7An;le{qt>tK<>Q-EIP~4lg0L?JF@9a7O4$^j=Dk?k{H>FnCZuhU4pe4 zhf8L3Xn-ilIeBYOJ}Jb5xf7ZX@4?8`Lq20`)Bvnq#9C&gUjd@}osok?B>_SCV8d5X zgf3d=&@dV!OU-WgX%^+MILnxK!^;H3Ra+$!`%R=24~0%^Ziikx$2D8X&(8xEuk^Y- zyU>rYLUUh)8-=J2PSiLbH-C4!8$*Zg>o;%C_k<13;zWokLtc2if06kywZ7Q)>h$Jv zeN?y@%?J_RbNwvrRGrrGd*j3VK`fAN%(4&%&d<~9`H6R(%eO+U&$hQeucRZ6(evcN zs$!|FV$riZ0);t1jMA4H~tS390AnPs5=jMV~95{Z%Nzy}) zp04({-1F!sj6#Dn6_iHBVD!X(w}0EPF-E+5DMbtUD+mjnClYR1!qNAjdpkPQKKS|G zIhqHX!(x1V?USD)rQR1GqxSLH{dyKg7^(XjzS}Angi&oJt*1B@9;5cz(t{Ei@66tG)~EsW6F^|5NaK~`KsaQVT|a-8g=5vOE7A!Jw4UVvL2%s(|)dM zj)P+f`=Und%lxl;tUqnX7jnJP+H2@&oi;|R=2vI;v(+$aG~KN;(03dJvq_}Y22%0! z0;fvuFj=7BV2P34*_eWSR|~M+hx^!X7av60QDxH$+lT^3e-HG_v0L|)wuBeA8zYG; z?@D#Lw4mNA6+$v;3=myBkGLOo$cRyWtfRI0WauvnbJ?%A)k7<*k21Nz>RACI&0NE! 
zMR1{{{wKsE26SCxx$UA-NLsxe1-E}i4g_-XlvVREc9}h5q<9`P?s?Wc$c*BV{jHJwd z-_72&5mvwCR{LbA2gDt^J-1qnFz@SvyIwuNwRGkzj96oON0&!t13^M3hRSQOK`?gBoZH5!m7H|6|yZRPDeV)=ap{~*iEn3*9H%8cF*G(!$HCz2& zl(j#HQUP7e?}ybIeqh*|DKD1Y;O*-n{0@oC{+U}8CJYSAl$oAkIX za`kIrC~H^VpZ~%rij=8O{;+V{u}2tbq*ID2)JkBulH1Q?G%o_7C?Av4BSo0=kIk3u zl=$DQ%%C&M8rb?@?sEk=S()#oo6Slm=)RxvkIkNm2dn8qi&IqsIUz@ zQV9O_(-}_vaFsF$FyRK~0W$xh?x0CD{2@L!Cvt`w0r4%9B~P-+j~8W%je7VC%i2S9 z;cg~YY~+0kIxlbhR8!^4aJwNXqoLB81^Jhm7|pdD7AehxtQ-1WIOT5NQV*a~24y5mePb+w`+qJxGgUCU+{o(hoK<`z}vKSDd zI>GukS1-O=?FXXf+P$OpY6QsNoGjaA9-atV2?e4yCGebUju%nS-eU|{T_EJrZMuW) zNSSIjulIf%0Y#5C$mE1%_`uOx=_lMkaRiY$n#;UeBdC-w#L~Er^W=vf1&PG7wmcD8 z2Hn;ClydNKA-Ik6jO8tx&PjoY9j$tYD8DF~Pc!AGcMkcknP#94VKlQcHN`F(l4NL0 zx>vf~QxHM+346bPNLVyNJdu&u)G?k+fIhRjhi2>Hun+)DUc8PS{VpgTtz$*g=W_<9~i5$ zMS9sANWCF1+jBlIw-PnOakl>Nr26IovqQvr!!eP&7)@Mv@K(zW87N}@%aX_D5zu?a zWcg+$NCVvG(N(UhB>#Lb{F0-d-oiJBhB4w^dpUE|)d?7tpNjkCO4l^Pr~hccY#(;> z*Bgf1(9~2hs48s$vfj&sK0gQpnOH#BUH8M}WIL}uo7G~2kyB%URqkb&D;}UL(n;?wN3XXorT@+h`CaeoC|x{k)gtwVLU?o=}-#u zwDYi06^*B~u?eD}Pg~Xfg4iD*bdA?8Wcm{Z|9=da9cdT09WoolXwm&whHAO6^&JVC z&!vjqg2-yqO>D3y-Is@UIbl?*sTeJMSA*HshLDuzuX_BWmw?!*@XZ9%7EE?@ttOCFw8A1?OA!Mxo0qH_6l)`SOoreSa)E;kEyZcf?=g)dZnUfWLaaq7P_U^b_(kId`dg%P6_TRuvd z3CNrtJ;TKixXoQ>ye26^qarc6DqK-Lz#nSDGWd~M>;QyN75({5%hntaRfg4!get=1 ze{I0*(0UP3S_&VIO=kQwe$2uqsE8S&a~$r6f|$M8$m%$`(Pe{=a{}2yP*Ha*$2~p< zcagF?o05xsET9Q06&%iQX%STNwcFLaaQoZmOHA7oa*fWv&URGC;*yGi~5HB>)T(A1XF(7iodof z5u*(U8-5IxfKjI0BznOrrag$^aCR3J5ipc*O7iBrHM+?zfiMEwO$cx3gbQY~*VUxt z^)5iONau%PvHp>0s_Z=rv|V@@Sq&Yf9_PUIs7k*#Svoum#7@145T~(3&A^xLe*qy- BlYsyL literal 0 HcmV?d00001 diff --git a/test/benchmark/reference_data/hh_expsyn_nvhpc_cpu.pickle b/test/benchmark/reference_data/hh_expsyn_nvhpc_cpu.pickle new file mode 100644 index 0000000000000000000000000000000000000000..6a3a64157d4a1d2e69496f5bb9a4e7ce4491e505 GIT binary patch literal 517 zcmZo*o%)NB0Ss!VX!Nk9RumLh=1r-c(!*DrU73@ZovNEyR$*#r1QKG+E6XTI2C*{J z5{paX{f*;u3*r+?OY#d6i{eWP;-OmNeTx&*QVU9o;uCXn;(aso((>bd%Tkj|;)^qL zQ>=id>7*x?WTY0w=RyoGOUlheW*fmdIY8Sqdbsn7^5T8puHK)K+HPuzWpsX=f>H+ zkKFA(Jnvkqs@m*s|63^cH}CBXAVZsF#x%ZCtiFA?*=hZ;ry=h4YP;m-JuJTDZvSwD W=#;c8r-6*^+nJe@m>|9_)dK)L4%f;6 literal 0 HcmV?d00001 diff --git a/test/benchmark/reference_data/hh_gpu.pickle b/test/benchmark/reference_data/hh_gpu.pickle new file mode 100644 index 0000000000000000000000000000000000000000..a05fe10a3d6667c586724af86db27a9fae57d12b GIT binary patch literal 472 zcmZo*oqCp$0Ss!V^e|;)OaW5td1VD96=o(twnh(YURg##GDuj|-#9+Kpfo-&zbGYH zCoQqKBsZ}Hq(P&HJFh4&KDo3gKDDA`iiW#=2V=Y4VvQ&6_MWW&tUH!8xZAg+oLJMi zV!ONjWT{y%oAWbEru6WFG!>U5mZZY86XjXi*j zPeqMUX@(_JN{f3q^K$c3a^kaqhJY*=&B;tkNiEAvP6awG55-+vFn0kx26R^I?+h1( zJyPy=`&8=qw^VEZiacYTqH_A2yL}sPbcgj%Eugb_5YB?=D)HGXIq~8Qce|G_vr}3s zUb)+s=pWf8AF&q5NIuK_-;NLHsvb!sSL7w;CFkdr6eT8?B&Owo9f|Ie|GT1ghpcnA zyVC2Lq`p!J=#rQ9g{B>yK$l3ZI?pIllLpIT8eMZ?{`{jBJ=U7=Im?LD3q2Wv_Hb+>PkRob7J zC+BWI=?4pkQgV37lpbD?rs9&sl2n+sqK<^%2A4>8yZgKhkySlr?)F95|F?0U*bii! 
zxzs3j;8Dqx(&8S@yxjbhocOHF5|Fbbb25`sQp+-vQ{(gU67!Pt^Gb>mlS>lQ^1x2x zf;kQ7H=xVf*g0eq7`5E(nPuGj*C{@Cw{M+e@H{u*Do`pqS8RKfI?!c22$w;06;-Hw oV(?k+ZufS|#g%uBZGi3x7joORbv=;rKwCNA^&ZG!rNza106~6+5dZ)H literal 0 HcmV?d00001 diff --git a/test/benchmark/reference_data/hh_ic_clang_gcc_w_wout_svml.pickle b/test/benchmark/reference_data/hh_ic_clang_gcc_w_wout_svml.pickle new file mode 100644 index 0000000000000000000000000000000000000000..b156034fad1089e567a14e2e13ec64a70c885260 GIT binary patch literal 7607 zcmbuEc|4WbAIGCm3@H>*vNWWUElK5EqLZaarIK71_h!Ag?j=g4QDbYF>9%huTZ>9* zlQI&~sv_ZLySB1aQ?f+A?{hVonVu2-`s?$^^E&VIIp6dBob%WsviHoyVS;~Q+*n}` z4{jJ&Ggi!<&Y*F8*n$TUPbQo0Baq^@dQ^WF-Ob%INboGdWHG6392S+np3T+7sBTNo z4gc2d7+HA!CXxBG1N?+K+B<*GI1q=n_vAf(%jU*P5-$a^X>9smueB?dK6tQ77^9^P z8TlGn`!Q-CQ^GuIe;35h;fd+Pa@kx??705HsSHnF1}Bgn806ar_6-QLh~;{KhY_u+ z*!_)F8;rm3`HGHa6%czhv~2b^lY+Fm23P*7-h+|N@IPg&9HcO6wF_;pJxGYnhto`q zasY%AE8^xuW3Cf8@RO}RmBF9|vVCdnKJWgG(z)JL$97!~MikljvS;h6G3xZcAWqHP z2_oX6kmVUgG8BqN7m zA8pty-2EGf33EFWGSx^yq@+~}yu->c3ilFyX8g+m0J#}oF#n?%fXp5{@BX+41c=H% z3}N{B)0w{hR7Mcp&BxRAzs+fR6y!ASurx+fpD*UiExChH%f*(sDi2Q(&7)U&Yo8`# zLe0CQrasNPj**!j_eAOz8b-}Q?blnSY(QM~E>18%E~uB`>)hM~Rw(|Hj?JRefsF0P z@(iVud*ziT^hRzZM!J`l)I1k)1dABGDJ5B2AXpkpb(U6? zjbi%xx%&`BB^X?qsBkLNkNJ;XC;nF_f({0as4LeXrG0}5M$X)OtLy?`VY`x-YyQeC z2O9O?~nW&U{DXx%$vC9hsS^_#iG%r1@d}0^@A@{|G zK~P+GeCRL&Qi0yNCS7sg3&M(mpvoN`uo$Agao4gRl2VkS=mNvx7+J?wEZ|ctVDLO; zxei~#3SOPklGl@o>fpqRG3g#OAG$9we-9BNy`&*1d;o!i7chMi{M~!TsIL zUkH-n-~wkMi!>1T6GjCqL;)2^R?Semt7Xr`h%qr@jhtgAM3nM^WXG2gAS`&XwP802 zbCmv#@cqQvSg!p1DU=Cb$zpCE6AKiJ?bF*7T0tb~$OVKiCdFxWFchn=Jq>Fm?N!zo zZ3aPQZg_p4uq_DoyXuh#q0l)T4;jK29oX-_G?tqO^^0XxUp9wH|N8Vb#|GT&E#)&` zz1_sZXpV`tyi6UWc1xl3`^X$?qE+g3x%)pO1Eq*R(nzh0fD^$cm&kePPSCZ*!;&~3 zy8S?O1i4L$5{U}7hB#6Q_+SXjAYfP zncsW^OW9I&Xv4O6ZGaW3N{TpMKnmOAv{6R35qkX^A2RIsGqYmPp>Q-7BfWc3qLqhbfVc3EfUex>AnLPTiM0GjW@_)95?2kLTAbgt z`tsB%VD6;s;MB5nAdEAVmxbgDDs~1@v1str|H#q-3%J|6C=nR}fX48HWneVLvCv05kmq(ZqO;)s((_e^`0wS7|ff@Y^-kq#YfMEYMM7t zNLZiiRI4MS=+V@$-SFooI4I3t9xuj#Z2rXCBD9GT1y;;c)2*)xBWi*ZJIa;iM{^IR z`v~q^eL1T|gq$ExzW(QzMCpGX=(bp2se3gdVKv;0B!3OM(`$-ReeUJ?@6w@{)qfhN z?)B_25PzG%O}vZ4_AY`&TlvtAYnl&-nx@-wt6%MbhS}7kuFhDolQ7P3@NnPF=K7K) z`xm!EgHyjKx2V#1Gw@xi3^p?0g6QlV$vR4U3P=1u7JF#rLHjl{lT$P5fujOHHFa!) 
zUL}afHRU0W4urRT*=+E&Z}fZDm(KmqgUe@rZH^4exQWq1dCeQv#SdYy5@~Gapbz3t zUCx1*&Sb7q#Hr)PPOM%Au3qFG@R!Pi<}h{DIh(odAOdgJTxc6XxY{2h{aD|{95|zE zKjzrvSsTK^$0sV|a%LJvwXaU8Dm%fYhc6oPS}Kc4#*3;yM~0X_iGd#IWS(M>`5dF_ zm(q(n?8`8!nH+hoMEDZHYw{zue&_zmmpYg+Osy9qW36>DHP5mDxIH9ze9LG6o^Z`; z@o*L?aAVIrO8TaK7^x{_nrB79;kPlHWy1Asg0lVjDr&P!A%IPP2>82MU1zy8zHwVP zMl<)6yl6V01BXg|gHlC@*C5<7PX*<}7Z#GOR%?AK>i5alVTT^m-gf-l31H~SRb7brk z@dv$DF-ze7uyWDpOVP#kP{&CPmnZk^1@UI$`I?pREs7Jn?Ej%2Qf9A{V&85G!)U(t zyZD2bRRG?FY111D+p^nRbw@RZ%v8syUWk zPL+VuHuR9eXEr}z_!*{r0Y*NL4=81Bn+rv&vsZE3ez?EX*`!DKKP%Mg4I{SC*)1uYTu6;3;rLJ;*?Gir4mz{>Aq}&C z$%NU{eFKl}TG9p(^WxL=LKgvqwXWTwimhZ2HBDyrhF5Daa(vM>b9U7cn0-;;Xw$jT zuph*5s$B+`fRGz>X7jxdZkA3H#)wLDx{&H23KW`;Ewk0m0>$JYx6b)+cK>!U8jGb= zKO{j-$cGM}x2FM4KaKf&3!Ei{K-9@xshbu<)C4D1D$qN`hvrS6K?_qGB=XKB7+h|9z%aL?AWNpzV-IZQyk>CvIx<3TKlSLsw&fC+pX z;jbp!T`JZ)0PSeR+%+>S!M;~=A$zW<)q|M&Ea_2FHQ^&=Xqf$!NMM+)y##qS))h5Yh&MS^y`Fzsa@{W zUL;J^-^e$W^O;RXjCizcexdjzIArC0N{^KE$B1W>W066Dro-DH`d-W* z+B{Jce|^LF0pkC>dRLq|7B;i01iqq_t`)90F9W29T`ZHkT0mTJW{i1G9*_LnGx%C_ z(=nPoZO_t8=QCkRt}c9dQc(oNOT(n(eVYm7{$Ly$@^?d>ujIfOSp}QNVC8u+O0gu2 zIxgvP$uQeS|+5Nb|a&Qzk>WZO=KPHx~;cu z3Jd4!Ks|@`rk>q+3q-!B{+ucS%6E>_>%yV^tOK|y0+RTx6 zpqG{Vr=5S~8c#M4)W|=3XuT$!zn1P#+z}@QEwu4M$GmCs){tlVhWFZpRtQ#M@WIYg zTfJXW_bvqcn&m&SLr+8LZfj)A9+`(xC2q3XWK{wj-vm34yEREVS^}FCU$C#?h?YVP zM0G1H_yqSzUVO4**)jO)KG@4RPo0J(S#b04&#gF|tOBRais-4j=e6sA!0h1;jqqqP z0{%mDrK>lkvB|s7iA~Fk&Oq_rm!feNYz0yD!9St{K70-#*m-K#-Zr|7#K@!l)X%RK z;nG#XTBRJZ)(#+S7rsmkpGXG5+t13^I@$$a{@*s-8yf^$nzuVL{;X*`h`85TDLLT; zgxsKmou@3Lqv!+23;z?UsoE!A5>}YoCv7Aa!$f`USw~Yc3jU*B2?h3ZY|<$&)ceu+ c894AzLGFbSoYweH9Ez(-%ZQrba00de2g@4LwEzGB literal 0 HcmV?d00001 diff --git a/test/benchmark/reference_data/memory_bound.pickle b/test/benchmark/reference_data/memory_bound.pickle new file mode 100644 index 0000000000000000000000000000000000000000..b272a62c475f0226e2c72df8554a88978a6c5dad GIT binary patch literal 8721 zcmb`Nc_38#`^PIGOR`r|AzLY=C}9qjPRLHFzM96EK^QU%WsNo^$x^m%maDz1O?zER zN(-TwnIg@MWn?c}bnE*ZTJG=OX~g&EulI}@=XIXX^LgIS^EqNf)5J8!4E{u>CP)X8 z0)s-f%<~GOQ@m3nQ*{%>y-8#OJ%Bd&Le!5!BL#p=kaRS{2ZxY|-hPb1-$+qHD0m`0 z1WyX1rRt&x=XGW7!lrr@S)A@>%ZN{fcbYlj!F~5Z^cHSopNXZVCQQV>M5PgEq;Fqi z9@E*@Vo-}BV@0A(&wgHsBFy}< zp!wp-v=9=BK_b$ELj1Op1Y$K&tINwTor%AV;giA8!`fFZ0J^S%D}lG}g~mQX54gqte!DimL(va-q9aLJ39c z?Gn)RRx?0JQ2P7j@tgc;c-PhTs|0i0zq;q`jYbsFJg(Eoo01G1H>GxFHGKq8Wm9c> z^uAyYS(E1}x$q$^*`p6LNq?dUOA3c?I{|-)rQ9{}q{kNHQ2sZ^Dr@T%fL9G)hyDA9YOF8#`L6^3kI3FWsKV;2*}h7mnWhY-Ee(5Q8!VFCy4p} z_2oxa00=!nf)W_y9e{;m@Zz#qJn)nt%D3zpd{^;X77uL${LnUlbZ+V>a=u*Kf+8*- zI$Rm2AHq8|s%spwf?%->Qxh&d-2-gOKhpW1-)Ct~paqE7p~$#eoX2&O-+*m$N5tKH zB8a)IaU$wzShVC^vEckVRkL`mliTN96baosfz#tA2~;VJYnx*jK*e7$TTXJGa4M$r zetC7~F%*%FaQ{=q2}^4me#YVTSX!^ydTpj@4@RXp6e0&g@Gx34cAD>nofjOt?`7kMVXB3N&ZeimNxmhrYYQVQ%l+EP%GFzLKl)( zNiMJ6h$1@^j`S;B?*$+`|H?b!`%t7&xiWq&%?-;odV)BG%iWcXw$!Wt5cMwk(a8I0! 
z0n~q&(7HzozWe-OKPWtsJkRdli6V97j%sD?>p)aEnnYxifspDu5xXP;+cS_5LiELd z#l;8G=oHfTS3j$*cydrMi4qtrkh|%3%XhQI;c^=6ezhs(*y5*Gw7h8u193H-Xv5hp zkek^n+87sc1I2Ykd&#wV_CRD?qzXM0O+fS;c%5`7VQcOaon`ya+H|hJNS@_z6v%+aoth&B|C%f0eElD zW2YMdu*t`zWR{Z~u-x!f!Xj_@>-Qlf)ZF}7#w&aRthFhdo|9gKBHY{<0zFh3#BrCT zMrT*Spmk-|of=T-<)z46`cp6BW+}%hNSL6w-o5b!KBBF_c&JVP1oH!kY~Gf5EhoXS+p2qRW$cH< z(YYTUIq(7)BZ5qotYBg4_Kwf8wsgi|Ek+DW9!lPyn>hE1({}MMdMIM_fWKHKx)?Ao zOsZMhSpnka;n#25y#(^I#;m*((Q*sCU%4Ziupjc9IdJBP!&7<4SkY#y;Da`U89N)x z*bzD=PwrC&De*9htQM~*s@1axkPqus96bh5ghwo>D;KW>A|Z~+j3MO(|zwy zEeVdve6bMk=dm0_yJL0@A1kyYc1)hamEvOgE);P+UmYMyh3<>{AvM!HxecfST*EZ9 zGKZn!48&Vb$UqU@`RCTC#$N&$_8D!>l&K*2x31QBpT(#~;g~!H+G?X2yHUh$ab<9s ztQmlP{KIqC9LPMLw@t2(UGFepub4A4YQZcN<$1iM1IvJ?Ugoml-VP8gIyW=PA{dy` zC>)cgY`m;iAC}F1!z}aY!uLS-**R2M8g?=-VE-cND9K^S*yHUtrd&gDt?$L!)<$A= zy?Rv~*%=yMCa3$6)!||+$Dl#;3JD^3hmryY-Ehz|O)$?Kw5RXC`)W`>UVj}zhILbJ z-{ixGujHXfQknlL{_dGzqUGZX?Q%U(q$PB0_9Bn-a2o>Lw0z^$fdX&z zBj3Je)4JLpL`_+C=L|WZ|LUg8zVGWFctd{r1DnpD zdMIPN2Sn|J=atS7LILb7F^RzW<&dLVo7Kby*1-(M8Y88Lug7|I8Hwv}m8N1j;rhQ& zkH}TyUKDk2xQimD$@9;@t+)*EjHIJCKEuS&l)7(vYgMVhyI{^SFCu8jporT3DM7O( z$3og@ik~Z(tpY;7H8Fx4kLAyht1cYby$EyGjYVN`I}V|UBrEikq9OPX*E40MpDLI- z*DLjzcMe}5Y~6iIIAK9Aiu<6@{N_}{b;y#^rsWIQmVtO6kw{Zd!b1MT0DdHfkVF~6 z6z;B^UZ=$LY3Ja=j!|?2I!pkZj)n-?y|EzPakEwD2p(+fczP9OZHrMv+pMIDeho%} zy1S$>Bbpu7a@xK1>RO9>ALA*=XG zdGU6^0q$%(H@23BBE}p}kvbzBn$Y&9z2;axUS`ak35D58kW zhdl+s-Y|`D_ulx_FA1F@H)+L{{}59JGBfoOLu{XIK@qV#y)rKylp%3-rL2yn8G zHa0{hY>S}-QBBH$$iD(X@EWN2;s`99Bb`gRQm(7hI<-;6v3t%+uc_d&oUTmi#LFvy z%Ikq&D;0)tp;V0x{2r}Ikd+17``TN^0StQ&-P9M``%Srvj#|vbs7B$kT!~IaLc$ml1nQAb3Gn;i9 z)q}yMN9VF!v53e=?oioX8ndkd_pPx14tixyx0{ACkx$N+}(EBWGwJQ zsP!x;H8(Wx@;PEz!8wVOuEgYZ33y;ThCDs zxsDln97U1XsZsJY z6${{A){;4J9omD|e)P;ez0(4@Ssor;^j4@>##eW!A69aJPG;4DH!;((KPPWL$YLA4NPTjFaHJsK)ZA zQp0*K7xG8p+1oQQ)&lQ?&9IA{vlP1FX}pcLoEdaTn-J0=dlwi$*x{wluKHDjSwDEx zp+B;_5g6`fUG6)Tb<|XsF7VbqO0~FqfwHa?Sp1WNi9CcR6%eP;^1lx=@=lE>3 z0by?cM!9;KKvMQ0!bkE^DL6E`B8J%m$Giea)(R4o13oG^9Um!T5Wo zwM&bhM-iJ9&v-UPu$uhv_M}a-z(x3}N0Rx{I|QOK@9&5%SQrflIm^wvbZtc-b#I-W zd+A6Ah#JWy+zJH@)ntUI!cG+VbBG4qiO}mQ)V^KjvycUR+FDbcmahV#U+dDDlP(aH ztvLR1T-8k!_c6Qba-)n9gi=nUa-9YAUF@mvXRC@RW1v6IK!35w{YpFDrpY;F&WJ^k zdD^GN5HC0gYE#Y6k_y6vZeLbQ30^R9_}y_)5?Ls+yy)Vg3X3lQx_QO;HALuG*vakD z8ZH?avE}GOJ4{2)-+o*5)(KY>AxAk?C~`s}wk`~{U16}r_$OZ~Ny-R5wq=d?EQ#;a zfCqiS$vdlLA$OP`H!x|PkjgB1aT{C4Jq+1wq{zb^cs23P*440I!#O?&x1;#{ET~)T zsQ$bdsHjc7l9gsHT!8^*nidlFB+r1dJ-?=8vRx3+8tD{IWLyGa9(SW;>R^C>4DusO ziD3r_{@l&_J1o|t$n5SE3I8!LH}G{1)*QRDAB3UYlFrRa0;$=GGP@j#pd*p52r|*C z2eV;i1nV2?DBK|mv^zMmx z_OTESdxViOUq3xJP09pC=148Au64+R4Z)q>XD$hSENeVo(m5FtVd%jYBbXLuvT)F* zMJWv5;K3*H*aK&uU^+az4Dlw;{XY*?|M^@i$!%=@`DPewR<7Apdh0R;MfRy@8KPOx zL>yXqQ})3*Re_rz_Hi30P2n_(+f&rZ-?kN!BQ{?~pJ&DZ@iFTqdCOA(qz+BdO7Nt| zo5b)V!lSHTeZ-qsu#u(p;UkLsH0Bw>*Cz)>664&+KGruuloqnYT|0yaFxJ1cDXW?Y z|0B?R+yA&KOggcIznZQr?*b8`cW~#QdDzlU{Ttaxo?9j!Tzn-X)eyz?T_A2{ox$iL z`?4}apg%|~D6IBZT{{fj)-mQ6TN_bazN@;XdrL4t`LNli<2QmxHIlBms6V*8bgKS; E0l}};QUCw| literal 0 HcmV?d00001 diff --git a/test/benchmark/run_benchmark_script.sh b/test/benchmark/run_benchmark_script.sh new file mode 100644 index 0000000000..642eb3dab6 --- /dev/null +++ b/test/benchmark/run_benchmark_script.sh @@ -0,0 +1,130 @@ +#!/usr/bin/env bash +#SBATCH --account=proj16 +#SBATCH --partition=prod +#SBATCH --time=08:00:00 +#SBATCH --nodes=1 +#SBATCH --cpus-per-task=2 +#SBATCH -n 40 +#SBATCH --exclusive +#SBATCH --mem=0 +#SBATCH --constraint=gpu_32g + +# +# Driver for nmodl-llvm benchmarking +# + +set -e +set -x + +module purge +unset MODULEPATH +export 
MODULEPATH=/gpfs/bbp.cscs.ch/ssd/apps/bsd/modules/_meta:/gpfs/bbp.cscs.ch/data/scratch/proj16/magkanar/spack_modules/linux-rhel7-skylake +module load unstable gcc/11.2.0 cuda/11.6.1 python-dev + +#intel paths +intel_library_dir=$(module show intel-oneapi-compilers/2021.4.0 2>&1 | grep " LD_LIBRARY_PATH " | grep "intel64_lin" | awk -F' ' '{print $3}' | head -n 1) +svml_lib=$intel_library_dir/libsvml.so +intel_exe=$(module show intel-oneapi-compilers/2021.4.0 2>&1 | grep " PATH " | awk -F' ' '{print $3}' | head -n 1)/intel64/icpc + +#sleef library +sleef_lib=/gpfs/bbp.cscs.ch/apps/hpc/llvm-install/0621/sleef-3.5.1/lib64/libsleefgnuabi.so + +#llvm path +llvm_path=$(module show llvm/13.0.0 2>&1 | grep " PATH " | awk -F' ' '{print $3}' | head -n 1) +clang_exe=${llvm_path}/clang++ +llc_exe=${llvm_path}/llc + +#gcc path +gcc_exe=$(module show gcc/11.2.0 2>&1 | grep " PATH " | awk -F' ' '{print $3}' | head -n 1)/g++ + +#nvhpc path +nvhpc_exe=$(module show nvhpc/22.3 2>&1 | grep " PATH " | awk -F' ' '{print $3}' | head -n 1)/nvc++ + +#libdevice path +libdevice_lib=${CUDA_HOME}/nvvm/libdevice/libdevice.10.bc + +#add ld library path +export LD_LIBRARY_PATH=`dirname $svml_lib`:`dirname $sleef_lib`:${llvm_path}/lib:$LD_LIBRARY_PATH + +# nmodl binary +nmodl_src_dir=$(pwd)/../../ +nmodl_exe=${nmodl_src_dir}/build_benchmark_gpu_math1/install/bin/nmodl + +# external kernel +kernels_path=${nmodl_src_dir}/test/benchmark/kernels +modfile_directory=${nmodl_src_dir}/test/benchmark/kernels +ext_lib="libextkernel.so" + +export PYTHONPATH=/gpfs/bbp.cscs.ch/data/scratch/proj16/magkanar/nmodl_llvm_benchmark/nmodl/build_benchmark_gpu_math1/install/lib:$PYTHONPATH + +execute_benchmark() { + python benchmark_script.py \ + --modfiles "./kernels/hh.mod" "./kernels/expsyn.mod" \ + --architectures "skylake-avx512" "nvptx64" \ + --compilers "intel" "gcc" "nvhpc" "clang" \ + --external \ + --nmodl_jit \ + --output "./hh_expsyn_final_cpu" \ + --instances 100000000 \ + --experiments 5 \ + --svml_lib $svml_lib \ + --intel_exe $intel_exe \ + --sleef_lib $sleef_lib \ + --clang_exe $clang_exe \ + --llc_exe $llc_exe \ + --gcc_exe $gcc_exe \ + --nvhpc_exe $nvhpc_exe \ + --libdevice_lib $libdevice_lib \ + --nmodl_exe $nmodl_exe / +} + +roofline_gpu() { + mod_name=$1 + ncu --set full -f -o "${mod_name}_full_200mil" python benchmark_script.py \ + --modfiles "./kernels/${mod_name}.mod" \ + --architectures "nvptx64" \ + --compilers "nvhpc" \ + --external \ + --nmodl_jit \ + --output "./${mod_name}_nvhpc_ncu_200mil" \ + --instances 100000000 \ + --experiments 1 \ + --svml_lib $svml_lib \ + --intel_exe $intel_exe \ + --sleef_lib $sleef_lib \ + --clang_exe $clang_exe \ + --llc_exe $llc_exe \ + --gcc_exe $gcc_exe \ + --nvhpc_exe $nvhpc_exe \ + --libdevice_lib $libdevice_lib \ + --nmodl_exe $nmodl_exe / +} + +roofline_cpu() { + mod_name=$1 + module load intel-oneapi-advisor/2021.4.0 + advisor --collect roofline --project-dir "${mod_name}_advisor_clang_avx512" python benchmark_script.py \ + --modfiles "./kernels/${mod_name}.mod" \ + --architectures "skylake-avx512" \ + --compilers "clang" \ + --external \ + --output "./${mod_name}_clang_avx512_skylake_advisor" \ + --instances 100000000 \ + --experiments 1 \ + --svml_lib $svml_lib \ + --intel_exe $intel_exe \ + --sleef_lib $sleef_lib \ + --clang_exe $clang_exe \ + --llc_exe $llc_exe \ + --gcc_exe $gcc_exe \ + --nvhpc_exe $nvhpc_exe \ + --libdevice_lib $libdevice_lib \ + --nmodl_exe $nmodl_exe / + module unload intel-oneapi-advisor/2021.4.0 +} + +execute_benchmark +# roofline_gpu hh +# 
roofline_gpu expsyn
+# roofline_cpu hh
+# roofline_cpu expsyn
diff --git a/test/benchmark/run_benchmark_script_cpu_gpu.sh b/test/benchmark/run_benchmark_script_cpu_gpu.sh
new file mode 100644
index 0000000000..3ea0e51e15
--- /dev/null
+++ b/test/benchmark/run_benchmark_script_cpu_gpu.sh
@@ -0,0 +1,63 @@
+#!/usr/bin/env bash
+#
+# Driver for MOD2IR benchmarking
+#
+
+set -e
+set -x
+
+# nmodl binary
+nmodl_src_dir=$(pwd)/../../
+nmodl_exe=${nmodl_src_dir}/build/install/bin/nmodl
+
+# external kernel
+kernels_path=${nmodl_src_dir}/test/benchmark/kernels
+modfile_directory=${nmodl_src_dir}/test/benchmark/kernels
+ext_lib="libextkernel.so"
+
+export PYTHONPATH=${nmodl_src_dir}/build/install/lib:$PYTHONPATH
+
+execute_benchmark_cpu() {
+    python3 benchmark_script.py \
+        --modfiles "./kernels/hh.mod" "./kernels/expsyn.mod" \
+        --architectures "skylake-avx512" \
+        --compilers "intel" "gcc" "nvhpc" "clang" \
+        --external \
+        --nmodl_jit \
+        --output "./hh_expsyn_cpu" \
+        --instances 100000000 \
+        --experiments 5 \
+        --svml_lib $svml_lib \
+        --intel_exe $intel_exe \
+        --sleef_lib $sleef_lib \
+        --clang_exe $clang_exe \
+        --llc_exe $llc_exe \
+        --gcc_exe $gcc_exe \
+        --nvhpc_exe $nvhpc_exe \
+        --libdevice_lib $libdevice_lib \
+        --nmodl_exe $nmodl_exe /
+}
+
+execute_benchmark_gpu() {
+    python3 benchmark_script.py \
+        --modfiles "./kernels/hh.mod" "./kernels/expsyn.mod" \
+        --architectures "nvptx64" \
+        --compilers "nvhpc" \
+        --external \
+        --nmodl_jit \
+        --output "./hh_expsyn_gpu" \
+        --instances 100000000 \
+        --experiments 5 \
+        --svml_lib $svml_lib \
+        --intel_exe $intel_exe \
+        --sleef_lib $sleef_lib \
+        --clang_exe $clang_exe \
+        --llc_exe $llc_exe \
+        --gcc_exe $gcc_exe \
+        --nvhpc_exe $nvhpc_exe \
+        --libdevice_lib $libdevice_lib \
+        --nmodl_exe $nmodl_exe /
+}
+
+execute_benchmark_cpu
+execute_benchmark_gpu
diff --git a/test/benchmark/run_benchmark_script_cpu_only.sh b/test/benchmark/run_benchmark_script_cpu_only.sh
new file mode 100644
index 0000000000..65b06570fb
--- /dev/null
+++ b/test/benchmark/run_benchmark_script_cpu_only.sh
@@ -0,0 +1,41 @@
+#!/usr/bin/env bash
+#
+# Driver for MOD2IR (CPU-only) benchmarking
+#
+
+set -e
+set -x
+
+# nmodl binary
+nmodl_src_dir=$(pwd)/../../
+nmodl_exe=${nmodl_src_dir}/build/install/bin/nmodl
+
+# external kernel
+kernels_path=${nmodl_src_dir}/test/benchmark/kernels
+modfile_directory=${nmodl_src_dir}/test/benchmark/kernels
+ext_lib="libextkernel.so"
+
+export PYTHONPATH=${nmodl_src_dir}/build/install/lib:$PYTHONPATH
+
+execute_benchmark_cpu() {
+    python3 benchmark_script.py \
+        --modfiles "./kernels/hh.mod" "./kernels/expsyn.mod" \
+        --architectures "skylake-avx512" \
+        --compilers "intel" "gcc" "nvhpc" "clang" \
+        --external \
+        --nmodl_jit \
+        --output "./hh_expsyn_cpu" \
+        --instances 100000000 \
+        --experiments 5 \
+        --svml_lib $svml_lib \
+        --intel_exe $intel_exe \
+        --sleef_lib $sleef_lib \
+        --clang_exe $clang_exe \
+        --llc_exe $llc_exe \
+        --gcc_exe $gcc_exe \
+        --nvhpc_exe $nvhpc_exe \
+        --libdevice_lib $libdevice_lib \
+        --nmodl_exe $nmodl_exe /
+}
+
+execute_benchmark_cpu
diff --git a/test/unit/CMakeLists.txt b/test/unit/CMakeLists.txt
index 818167859c..12ab7c3d38 100644
--- a/test/unit/CMakeLists.txt
+++ b/test/unit/CMakeLists.txt
@@ -146,7 +146,7 @@ if(NMODL_ENABLE_LLVM)
     printer
     ${NMODL_WRAPPER_LIBS}
     ${LLVM_LIBS_TO_LINK})
-  set(CODEGEN_TEST testllvm)
+  set(CODEGEN_TEST "testllvm;test_llvm_runner")
 endif()
 
 # =============================================================================
diff --git a/test/unit/codegen/codegen_data_helper.cpp
b/test/unit/codegen/codegen_data_helper.cpp index 9f7b3ab6b3..2e0463a076 100644 --- a/test/unit/codegen/codegen_data_helper.cpp +++ b/test/unit/codegen/codegen_data_helper.cpp @@ -9,6 +9,10 @@ #include "codegen_data_helper.hpp" +#ifdef NMODL_LLVM_CUDA_BACKEND +#include "test/benchmark/cuda_driver.hpp" +#endif + namespace nmodl { namespace codegen { @@ -18,14 +22,33 @@ const double default_nthread_t_value = 100.0; const double default_celsius_value = 34.0; const int default_second_order_value = 0; +#ifdef NMODL_LLVM_CUDA_BACKEND +void checkCudaErrors(cudaError error) { + if (error != cudaSuccess) { + throw std::runtime_error( + fmt::format("CUDA Execution Error: {}\n", cudaGetErrorString(error))); + } +} +#endif + // cleanup all members and struct base pointer CodegenInstanceData::~CodegenInstanceData() { // first free num_ptr_members members which are pointers for (size_t i = 0; i < num_ptr_members; i++) { free(members[i]); +#ifdef NMODL_LLVM_CUDA_BACKEND + if (dev_base_ptr) { + checkCudaErrors(cudaFree(dev_members[i])); + } +#endif } // and then pointer to container struct free(base_ptr); +#ifdef NMODL_LLVM_CUDA_BACKEND + if (dev_base_ptr) { + checkCudaErrors(cudaFree(dev_base_ptr)); + } +#endif } /** @@ -145,12 +168,12 @@ CodegenInstanceData CodegenDataHelper::create_data(size_t num_elements, size_t s } - int cnt{}; - for (auto& var: variables) { - // printout vars - std::cout << cnt++ << ":\t" << to_string(var->get_type()->get_type()) << '\t' - << var->get_is_pointer() << '\t' << var->get_name()->get_node_name() << '\n'; - } + // int cnt{}; + // for (auto& var: variables) { + // // printout vars + // std::cout << cnt++ << ":\t" << to_string(var->get_type()->get_type()) << '\t' + // << var->get_is_pointer() << '\t' << var->get_name()->get_node_name() << '\n'; + // } // we are now switching from pointer type to next member type (e.g. 
double)
@@ -208,5 +231,63 @@ CodegenInstanceData CodegenDataHelper::create_data(size_t num_elements, size_t s
     return data;
 }
 
+#ifdef NMODL_LLVM_CUDA_BACKEND
+void CodegenInstanceData::copy_instance_data_gpu() {
+    const auto ptr_vars_size = num_ptr_members * sizeof(double*);
+    auto scalar_vars_size = 0;
+    const auto num_scalar_vars = members.size() - num_ptr_members;
+    for (int i = 0; i < num_scalar_vars; i++) {
+        scalar_vars_size += members_size[i + num_ptr_members];
+    }
+    checkCudaErrors(cudaMalloc(&dev_base_ptr, ptr_vars_size + scalar_vars_size));
+    for (auto i = 0; i < num_ptr_members; i++) {
+        // Allocate a vector with the correct size
+        void* dev_member_ptr;
+        auto size_of_var = members_size[i];
+        checkCudaErrors(cudaMalloc(&dev_member_ptr, size_of_var * num_elements));
+        checkCudaErrors(cudaMemcpy(dev_member_ptr,
+                                   members[i],
+                                   size_of_var * num_elements,
+                                   cudaMemcpyHostToDevice));
+        // Copy the pointer addresses to the struct
+        auto offseted_place = (char*) dev_base_ptr + offsets[i];
+        checkCudaErrors(
+            cudaMemcpy(offseted_place, &dev_member_ptr, sizeof(double*), cudaMemcpyHostToDevice));
+        dev_members.push_back(dev_member_ptr);
+    }
+    // memcpy the scalar values
+    auto offseted_place_dev = (char*) dev_base_ptr + offsets[num_ptr_members];
+    auto offseted_place_host = (char*) (base_ptr) + offsets[num_ptr_members];
+    checkCudaErrors(cudaMemcpy(
+        offseted_place_dev, offseted_place_host, scalar_vars_size, cudaMemcpyHostToDevice));
+}
+
+void CodegenInstanceData::copy_instance_data_host() {
+    const auto ptr_vars_size = num_ptr_members * sizeof(double*);
+    auto scalar_vars_size = 0;
+    const auto num_scalar_vars = members.size() - num_ptr_members;
+    for (int i = 0; i < num_scalar_vars; i++) {
+        scalar_vars_size += members_size[i + num_ptr_members];
+    }
+    const auto host_base_ptr = base_ptr;
+    for (auto i = 0; i < num_ptr_members; i++) {
+        auto size_of_var = members_size[i];
+        void* offset_dev_ptr = (char*) dev_base_ptr + offsets[i];
+        void* gpu_offset_addr;
+        checkCudaErrors(
+            cudaMemcpy(&gpu_offset_addr, offset_dev_ptr, sizeof(double*), cudaMemcpyDeviceToHost));
+        checkCudaErrors(cudaMemcpy(members[i],
+                                   gpu_offset_addr,
+                                   size_of_var * num_elements,
+                                   cudaMemcpyDeviceToHost));
+    }
+    // memcpy the scalar values
+    void* offseted_place_dev = (char*) dev_base_ptr + offsets[num_ptr_members];
+    void* offseted_place_host = (char*) (base_ptr) + offsets[num_ptr_members];
+    checkCudaErrors(cudaMemcpy(
+        offseted_place_host, offseted_place_dev, scalar_vars_size, cudaMemcpyDeviceToHost));
+}
+#endif
+
 }  // namespace codegen
 }  // namespace nmodl
diff --git a/test/unit/codegen/codegen_data_helper.hpp b/test/unit/codegen/codegen_data_helper.hpp
index 54dceb8c25..ccbeba4778 100644
--- a/test/unit/codegen/codegen_data_helper.hpp
+++ b/test/unit/codegen/codegen_data_helper.hpp
@@ -32,6 +32,10 @@ struct CodegenInstanceData {
     /// to instance struct at run time
     void* base_ptr = nullptr;
 
+    /// base pointer on the device which can be type casted
+    /// to instance struct at run time
+    void* dev_base_ptr = nullptr;
+
     /// length of each member of pointer type
     size_t num_elements = 0;
 
@@ -49,9 +53,19 @@
     /// i.e. *(base_ptr + offsets[0]) will be members[0]
     std::vector<void*> members;
 
+    /// pointer to array allocated for each member variable on the device
+    /// i.e. *(dev_base_ptr + offsets[0]) will be dev_members[0]
+    std::vector<void*> dev_members;
+
     /// size in bytes
     size_t num_bytes = 0;
 
+    /// copy instance data to device
+    void copy_instance_data_gpu();
+
+    /// copy instance data to host
+    void copy_instance_data_host();
+
     // cleanup all memory allocated for type and member variables
     ~CodegenInstanceData();
 };
diff --git a/test/unit/codegen/codegen_llvm_execution.cpp b/test/unit/codegen/codegen_llvm_execution.cpp
index a204bb4bbd..8989bbfff6 100644
--- a/test/unit/codegen/codegen_llvm_execution.cpp
+++ b/test/unit/codegen/codegen_llvm_execution.cpp
@@ -627,3 +627,297 @@ SCENARIO("Vectorised kernel with simple control flow", "[llvm][runner]") {
         }
     }
 }
+
+//=============================================================================
+// Kernel with atomic updates.
+//=============================================================================
+
+SCENARIO("Kernel with atomic updates", "[llvm][runner]") {
+    GIVEN("An atomic update") {
+        std::string nmodl_text = R"(
+            NEURON {
+                SUFFIX test
+                USEION na READ ena WRITE ina
+                USEION ka READ eka WRITE ika
+            }
+
+            STATE { }
+
+            ASSIGNED {
+                v (mV)
+                ena (mV)
+                ina (mA/cm2)
+            }
+
+            BREAKPOINT { }
+
+            DERIVATIVE states { }
+
+            : The atomic update that we want to check is:
+            :
+            : ion_ina_id = mech->ion_ina_index[id]
+            : ion_ika_id = mech->ion_ika_index[id]
+            : mech->ion_ina[ion_ina_id] += mech->ina[id]
+            : mech->ion_ika[ion_ika_id] += mech->ika[id]
+        )";
+
+
+        NmodlDriver driver;
+        const auto& ast = driver.parse_string(nmodl_text);
+
+        // Run passes on the AST to generate LLVM.
+        SymtabVisitor().visit_program(*ast);
+        NeuronSolveVisitor().visit_program(*ast);
+        SolveBlockVisitor().visit_program(*ast);
+
+        codegen::Platform simd_cpu_platform(/*use_single_precision=*/false,
+                                            /*instruction_width=*/4);
+        codegen::CodegenLLVMVisitor llvm_visitor(/*mod_filename=*/"unknown",
+                                                 /*output_dir=*/".",
+                                                 simd_cpu_platform,
+                                                 /*opt_level_ir=*/3);
+        llvm_visitor.visit_program(*ast);
+        llvm_visitor.wrap_kernel_functions();
+
+        // Create the instance struct data.
+        int num_elements = 5;
+        const auto& generated_instance_struct = llvm_visitor.get_instance_struct_ptr();
+        auto codegen_data = codegen::CodegenDataHelper(generated_instance_struct);
+        auto instance_data = codegen_data.create_data(num_elements, /*seed=*/1);
+
+        // With these indices ion_ina[1] = 1 + 2 + 3 + 4 + 5 = 15.
+        std::vector<int> ion_ina_index = {1, 1, 1, 1, 1};
+        std::vector<double> ion_ina = {0.0, 0.0, 0.0, 0.0, 0.0};
+        std::vector<double> ina = {1.0, 2.0, 3.0, 4.0, 5.0};
+
+        // With these indices:
+        // ion_ika[1] = 3 + 4 = 7.
+        // ion_ika[2] = 1 + 20 = 21.
+        // ion_ika[3] = -5 + 5 = 0.
+        std::vector<int> ion_ika_index = {2, 2, 1, 1, 3};
+        std::vector<double> ion_ika = {0.0, 0.0, 0.0, -5.0, 0.0};
+        std::vector<double> ika = {1.0, 20.0, 3.0, 4.0, 5.0};
+
+        InstanceTestInfo instance_info{&instance_data,
+                                       llvm_visitor.get_instance_var_helper(),
+                                       num_elements};
+
+        initialise_instance_variable(instance_info, ion_ina_index, "ion_ina_index");
+        initialise_instance_variable(instance_info, ion_ina, "ion_ina");
+        initialise_instance_variable(instance_info, ina, "ina");
+        initialise_instance_variable(instance_info, ion_ika_index, "ion_ika_index");
+        initialise_instance_variable(instance_info, ion_ika, "ion_ika");
+        initialise_instance_variable(instance_info, ika, "ika");
+
+        // Set up the JIT runner.
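+        // Note: wrap_kernel_functions() above emits the "__nrn_cur_test_wrapper"
+        // entry point used below; it takes the instance struct base pointer as
+        // its only argument, and the TestRunner JIT-compiles the module to call it.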
+        std::unique_ptr<llvm::Module> module = llvm_visitor.get_module();
+        TestRunner runner(std::move(module));
+        runner.initialize_driver();
+
+        THEN("updates are computed correctly with vector instructions and optimizations on") {
+            runner.run_with_argument<int, void*>("__nrn_cur_test_wrapper", instance_data.base_ptr);
+            // Recall:
+            // ion_ina_id = mech->ion_ina_index[id]
+            // ion_ika_id = mech->ion_ika_index[id]
+            // mech->ion_ina[ion_ina_id] += mech->ina[id]
+            // mech->ion_ika[ion_ika_id] += mech->ika[id]
+            std::vector<double> ion_ina_expected = {0.0, 15.0, 0.0, 0.0, 0.0};
+            REQUIRE(check_instance_variable(instance_info, ion_ina_expected, "ion_ina"));
+
+            std::vector<double> ion_ika_expected = {0.0, 7.0, 21.0, 0.0, 0.0};
+            REQUIRE(check_instance_variable(instance_info, ion_ika_expected, "ion_ika"));
+        }
+    }
+
+    GIVEN("Another atomic update") {
+        std::string nmodl_text = R"(
+            NEURON {
+                SUFFIX test
+                USEION na READ ena WRITE ina
+                USEION ka READ eka WRITE ika
+            }
+
+            STATE { }
+
+            ASSIGNED {
+                v (mV)
+                ena (mV)
+                ina (mA/cm2)
+            }
+
+            BREAKPOINT { }
+
+            DERIVATIVE states { }
+
+            : The atomic update that we want to check is again:
+            :
+            : ion_ina_id = mech->ion_ina_index[id]
+            : ion_ika_id = mech->ion_ika_index[id]
+            : mech->ion_ina[ion_ina_id] += mech->ina[id]
+            : mech->ion_ika[ion_ika_id] += mech->ika[id]
+        )";
+
+
+        NmodlDriver driver;
+        const auto& ast = driver.parse_string(nmodl_text);
+
+        // Run passes on the AST to generate LLVM.
+        SymtabVisitor().visit_program(*ast);
+        NeuronSolveVisitor().visit_program(*ast);
+        SolveBlockVisitor().visit_program(*ast);
+
+        codegen::Platform simd_cpu_platform(/*use_single_precision=*/false,
+                                            /*instruction_width=*/2);
+        codegen::CodegenLLVMVisitor llvm_visitor(/*mod_filename=*/"unknown",
+                                                 /*output_dir=*/".",
+                                                 simd_cpu_platform,
+                                                 /*opt_level_ir=*/0);
+        llvm_visitor.visit_program(*ast);
+        llvm_visitor.wrap_kernel_functions();
+
+        // Create the instance struct data.
+        int num_elements = 6;
+        const auto& generated_instance_struct = llvm_visitor.get_instance_struct_ptr();
+        auto codegen_data = codegen::CodegenDataHelper(generated_instance_struct);
+        auto instance_data = codegen_data.create_data(num_elements, /*seed=*/1);
+
+        // With these indices ion_ina[1] = 1 + 3 + 5 = 9.
+        // With these indices ion_ina[4] = 2 + 4 + 6 = 12.
+        std::vector<int> ion_ina_index = {1, 4, 1, 4, 1, 4};
+        std::vector<double> ion_ina = {0.0, 0.0, 0.0, 0.0, 0.0, 0.0};
+        std::vector<double> ina = {1.0, 2.0, 3.0, 4.0, 5.0, 6.0};
+
+        // With these indices:
+        // ion_ika[1] = 3 + 4 + 5 = 12.
+        // ion_ika[2] = 1 + 20 + 6 = 27.
+        std::vector<int> ion_ika_index = {2, 2, 1, 1, 1, 2};
+        std::vector<double> ion_ika = {0.0, 0.0, 0.0, -5.0, 0.0, 0.0};
+        std::vector<double> ika = {1.0, 20.0, 3.0, 4.0, 5.0, 6.0};
+
+        InstanceTestInfo instance_info{&instance_data,
+                                       llvm_visitor.get_instance_var_helper(),
+                                       num_elements};
+
+        initialise_instance_variable(instance_info, ion_ina_index, "ion_ina_index");
+        initialise_instance_variable(instance_info, ion_ina, "ion_ina");
+        initialise_instance_variable(instance_info, ina, "ina");
+        initialise_instance_variable(instance_info, ion_ika_index, "ion_ika_index");
+        initialise_instance_variable(instance_info, ion_ika, "ion_ika");
+        initialise_instance_variable(instance_info, ika, "ika");
+
+        // Set up the JIT runner.
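+        // (This GIVEN re-runs the same kernel with instruction_width=2 and
+        // opt_level_ir=0, so it is the unoptimised atomic path that gets tested.)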
+        std::unique_ptr<llvm::Module> module = llvm_visitor.get_module();
+        TestRunner runner(std::move(module));
+        runner.initialize_driver();
+
+        THEN("Atomic updates are correct without optimizations") {
+            runner.run_with_argument<int, void*>("__nrn_cur_test_wrapper", instance_data.base_ptr);
+            // Recall:
+            // ion_ina_id = mech->ion_ina_index[id]
+            // ion_ika_id = mech->ion_ika_index[id]
+            // mech->ion_ina[ion_ina_id] += mech->ina[id]
+            // mech->ion_ika[ion_ika_id] += mech->ika[id]
+            std::vector<double> ion_ina_expected = {0.0, 9.0, 0.0, 0.0, 12.0, 0.0};
+            REQUIRE(check_instance_variable(instance_info, ion_ina_expected, "ion_ina"));
+
+            std::vector<double> ion_ika_expected = {0.0, 12.0, 27.0, -5.0, 0.0, 0.0};
+            REQUIRE(check_instance_variable(instance_info, ion_ika_expected, "ion_ika"));
+        }
+    }
+
+    GIVEN("Atomic updates of rhs and d") {
+        std::string nmodl_text = R"(
+            NEURON {
+                POINT_PROCESS test
+                USEION na READ ena WRITE ina
+                USEION ka READ eka WRITE ika
+            }
+
+            STATE { }
+
+            ASSIGNED {
+                v (mV)
+                ena (mV)
+                ina (mA/cm2)
+            }
+
+            BREAKPOINT { }
+
+            DERIVATIVE states { }
+
+            : The atomic update that we want to check is again:
+            :
+            : node_id = mech->node_index[id]
+            : mech->vec_rhs[node_id] -= rhs
+            : mech->vec_d[node_id] -= g
+        )";
+
+
+        NmodlDriver driver;
+        const auto& ast = driver.parse_string(nmodl_text);
+
+        // Run passes on the AST to generate LLVM.
+        SymtabVisitor().visit_program(*ast);
+        NeuronSolveVisitor().visit_program(*ast);
+        SolveBlockVisitor().visit_program(*ast);
+
+        codegen::Platform simd_cpu_platform(/*use_single_precision=*/false,
+                                            /*instruction_width=*/2);
+        codegen::CodegenLLVMVisitor llvm_visitor(/*mod_filename=*/"unknown",
+                                                 /*output_dir=*/".",
+                                                 simd_cpu_platform,
+                                                 /*opt_level_ir=*/0);
+        llvm_visitor.visit_program(*ast);
+        llvm_visitor.wrap_kernel_functions();
+
+        // Create the instance struct data.
+        int num_elements = 6;
+        const auto& generated_instance_struct = llvm_visitor.get_instance_struct_ptr();
+        auto codegen_data = codegen::CodegenDataHelper(generated_instance_struct);
+        auto instance_data = codegen_data.create_data(num_elements, /*seed=*/1);
+
+        // With these indices vec_rhs[1] = -0.2-1.e2/1.5*2-1.e2/3.4*6-1.e2/5.2*10 =
+        // -502.3116138763197.
+        // With these indices vec_rhs[4] =
+        // 0.54-1.e2/2.3*22.0-1.e2/4.1*8.0-1.e2/6.0*12.0 = -1351.103690349947.
+        // vec_d remains the same because the contribution of g each time is 0.0.
+        std::vector<int> node_index = {1, 4, 1, 4, 1, 4};
+        std::vector<double> ina = {1.0, 2.0, 3.0, 4.0, 5.0, 6.0};
+        std::vector<double> ika = {1.0, 20.0, 3.0, 4.0, 5.0, 6.0};
+        std::vector<double> vec_rhs = {0.64, -0.2, 1.1, 0.42, 0.54, -0.36};
+        std::vector<double> vec_d = {1.6, 2.5, 3.4, 4.3, 5.2, 6.1};
+        std::vector<int> node_area_index = {0, 1, 2, 3, 4, 5};
+        std::vector<double> node_area = {1.5, 2.3, 3.4, 4.1, 5.2, 6.0};
+
+        InstanceTestInfo instance_info{&instance_data,
+                                       llvm_visitor.get_instance_var_helper(),
+                                       num_elements};
+
+        initialise_instance_variable(instance_info, node_index, "node_index");
+        initialise_instance_variable(instance_info, ina, "ina");
+        initialise_instance_variable(instance_info, ika, "ika");
+        initialise_instance_variable(instance_info, vec_rhs, "vec_rhs");
+        initialise_instance_variable(instance_info, vec_d, "vec_d");
+        initialise_instance_variable(instance_info, node_area_index, "node_area_index");
+        initialise_instance_variable(instance_info, node_area, "node_area");
+
+        // Set up the JIT runner.
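+        // Note: the expected values in the THEN below are consistent with
+        // rhs = 1.e2 / node_area[node_area_index[id]] * (ina[id] + ika[id]),
+        // i.e. the point-process current scaled to a density before the
+        // vec_rhs subtraction (matching the hand-computed comments above).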
+        // Set up the JIT runner.
+        std::unique_ptr<llvm::Module> module = llvm_visitor.get_module();
+        TestRunner runner(std::move(module));
+        runner.initialize_driver();
+
+        THEN("Atomic updates are correct") {
+            runner.run_with_argument<int, void*>("__nrn_cur_test_wrapper", instance_data.base_ptr);
+            // Recall:
+            //   node_id = mech->node_index[id]
+            //   mech->vec_rhs[node_id] -= rhs
+            //   mech->vec_d[node_id] -= g
+            std::vector<double> vec_rhs_expected = {
+                0.64, -502.3116138763197, 1.1, 0.42, -1351.103690349947, -0.36};
+            REQUIRE(check_instance_variable(instance_info, vec_rhs_expected, "vec_rhs"));
+
+            std::vector<double> vec_d_expected = {1.6, 2.5, 3.4, 4.3, 5.2, 6.1};
+            REQUIRE(check_instance_variable(instance_info, vec_d_expected, "vec_d"));
+        }
+    }
+}
diff --git a/test/unit/codegen/codegen_llvm_ir.cpp b/test/unit/codegen/codegen_llvm_ir.cpp
index 3b810edbbe..1d310d3294 100644
--- a/test/unit/codegen/codegen_llvm_ir.cpp
+++ b/test/unit/codegen/codegen_llvm_ir.cpp
@@ -1841,4 +1841,110 @@ SCENARIO("GPU kernel body IR generation", "[visitor][llvm][gpu]") {
             REQUIRE(std::regex_search(module_string, m, add));
         }
     }
+
+    GIVEN("For current update with atomic addition") {
+        std::string nmodl_text = R"(
+            NEURON {
+                SUFFIX test
+                USEION na READ ena WRITE ina
+            }
+
+            STATE { }
+
+            ASSIGNED {
+                v (mV)
+                ena (mV)
+                ina (mA/cm2)
+            }
+
+            BREAKPOINT {
+                SOLVE states METHOD cnexp
+            }
+
+            DERIVATIVE states { }
+        )";
+
+        THEN("corresponding LLVM atomic instruction is generated") {
+            std::string module_string = run_gpu_llvm_visitor(nmodl_text,
+                                                             /*opt_level=*/0,
+                                                             /*use_single_precision=*/false);
+            std::smatch m;
+
+            // Check for atomic addition.
+            std::regex add(R"(atomicrmw fadd double\* %.*, double %.* seq_cst)");
+            REQUIRE(std::regex_search(module_string, m, add));
+        }
+    }
+}
+
+//=============================================================================
+// Atomics for vectorised kernel
+//=============================================================================
+
+SCENARIO("A simple kernel with atomic current updates", "[visitor][llvm]") {
+    GIVEN("A simple atomic update") {
+        std::string nmodl_text = R"(
+            NEURON {
+                SUFFIX test
+                USEION na READ ena WRITE ina
+            }
+
+            STATE { }
+
+            ASSIGNED {
+                v (mV)
+                ena (mV)
+                ina (mA/cm2)
+            }
+
+            BREAKPOINT { }
+
+            DERIVATIVE states { }
+        )";
+
+        THEN("an atomic loop is created") {
+            std::string module_string = run_llvm_visitor(nmodl_text,
+                                                         /*opt_level=*/0,
+                                                         /*use_single_precision=*/true,
+                                                         /*vector_width=*/4);
+            std::smatch m;
+
+            // Check for correct %ptrs calculation and bitcast to an array.
+            std::regex ptrtoint(R"(ptrtoint float\* %.* to i64)");
+            std::regex insertelement(R"(insertelement <4 x i64> undef, i64 %.*, i32 0)");
+            std::regex shufflevector(
+                R"(shufflevector <4 x i64> %.*, <4 x i64> undef, <4 x i32> zeroinitializer)");
+            std::regex bitcast(R"(bitcast <4 x i64>\* %ptrs to \[4 x float\*\]\*)");
+            REQUIRE(std::regex_search(module_string, m, ptrtoint));
+            REQUIRE(std::regex_search(module_string, m, insertelement));
+            REQUIRE(std::regex_search(module_string, m, shufflevector));
+            REQUIRE(std::regex_search(module_string, m, bitcast));
+
+            // Check for %ptrs store and branch to atomic update block.
+            std::regex ptrs_store(R"(store <4 x i64> %.*, <4 x i64>\* %ptrs)");
+            std::regex atomic_branch(R"(br label %atomic\.update)");
+            REQUIRE(std::regex_search(module_string, m, ptrs_store));
+            REQUIRE(std::regex_search(module_string, m, atomic_branch));
+
+            // Check the scalar loop for atomic update is implemented correctly.
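+            // The atomic.update block matched below iterates over the active
+            // lanes of a bitmask (the phi starts at 15 = 0b1111 for vector
+            // width 4): cttz selects the lowest set lane, shl/xor/and clears
+            // that bit, the lane's pointer is loaded from %ptrs, the scalar
+            // fadd result is stored back, and the loop exits once the mask
+            // reaches zero.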
+            std::regex atomic_update(
+                " %.* = phi i64 \\[ 15, %for\\.body \\], \\[ %.*, %atomic\\.update \\]\n"
+                " %.* = call i64 @llvm\\.cttz\\.i64\\(i64 %.*, i1 false\\)\n"
+                " %.* = shl i64 1, %.*\n"
+                " %.* = xor i64 %.*, -1\n"
+                " %.* = and i64 %.*, %.*\n"
+                " %.* = getelementptr \\[4 x float\\*\\], \\[4 x float\\*\\]\\* %.*, i64 0, i64 "
+                "%.*\n"
+                " %.* = load float\\*, float\\*\\* %.*, align 8\n"
+                " %.* = load float, float\\* %.*, align 4\n"
+                " %.* = extractelement <4 x float> %.*, i64 %.*\n"
+                " %.* = fadd float %.*, %.*\n"
+                " store float %.*, float\\* %.*, align 4\n"
+                " %.* = icmp eq i64 %.*, 0\n");
+            std::regex remaining(
+                R"(br i1 %.*, label %for\.body\.remaining, label %atomic\.update)");
+            REQUIRE(std::regex_search(module_string, m, atomic_update));
+            REQUIRE(std::regex_search(module_string, m, remaining));
+        }
+    }
 }

From 8d06bbf80729b382e184944263d0a64bfd7ea52a Mon Sep 17 00:00:00 2001
From: Ioannis Magkanaris
Date: Mon, 26 Dec 2022 20:40:14 +0100
Subject: [PATCH 323/331] Mention llvm branch in PAD and Dockerfiles

---
 docs/CC2023/PAD.md                   | 2 +-
 test/benchmark/cpu_docker/Dockerfile | 2 +-
 test/benchmark/gpu_docker/Dockerfile | 2 +-
 3 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/docs/CC2023/PAD.md b/docs/CC2023/PAD.md
index 384bb8af83..9467f3ba5c 100644
--- a/docs/CC2023/PAD.md
+++ b/docs/CC2023/PAD.md
@@ -18,7 +18,7 @@ MOD2IR is implemented as a code generation backend inside the NMODL Framework an
 use of the LLVM IR and compilation passes. Most of the relevant code of the described work can be
 found [here](https://github.com/BlueBrain/nmodl/tree/llvm/src/codegen/llvm) and
 [here](https://github.com/BlueBrain/nmodl/tree/llvm/test/benchmark). The intructions to reproduce
-the results can be found [here](https://github.com/BlueBrain/nmodl/blob/3365551b332829699c1af3bea3c0fbe820a30800/docs/CC2023/README.md).
+the results can be found [here](https://github.com/BlueBrain/nmodl/blob/llvm/docs/CC2023/README.md).
 
 ### Badge
 
diff --git a/test/benchmark/cpu_docker/Dockerfile b/test/benchmark/cpu_docker/Dockerfile
index 5e11e32006..3c37c55705 100644
--- a/test/benchmark/cpu_docker/Dockerfile
+++ b/test/benchmark/cpu_docker/Dockerfile
@@ -64,7 +64,7 @@ RUN apt install -y git cmake flex bison python3-pip
 RUN pip install Jinja2 PyYAML sympy pytest
 
 # Clone NMODL branch for benchmarking LLVM
-RUN git clone --recursive -b magkanar/python_benchmark https://github.com/BlueBrain/nmodl.git
+RUN git clone --recursive -b llvm https://github.com/BlueBrain/nmodl.git
 
 # Setup Intel compiler specific variables to the environment
 RUN echo ". /opt/intel/oneapi/setvars.sh" >> $HOME/.bashrc
diff --git a/test/benchmark/gpu_docker/Dockerfile b/test/benchmark/gpu_docker/Dockerfile
index e302827633..7b024038b3 100644
--- a/test/benchmark/gpu_docker/Dockerfile
+++ b/test/benchmark/gpu_docker/Dockerfile
@@ -64,7 +64,7 @@ RUN apt install -y git cmake flex bison python3-pip
 RUN pip install Jinja2 PyYAML sympy pytest
 
 # Clone NMODL branch for benchmarking LLVM
-RUN git clone --recursive -b magkanar/python_benchmark https://github.com/BlueBrain/nmodl.git
+RUN git clone --recursive -b llvm https://github.com/BlueBrain/nmodl.git
 
 # Setup Intel compiler specific variables to the environment
 RUN echo ". /opt/intel/oneapi/setvars.sh" >> $HOME/.bashrc

From 591923a0968abe0f04911c95b9e974c987ce47b1 Mon Sep 17 00:00:00 2001
From: Ioannis Magkanaris
Date: Mon, 26 Dec 2022 21:44:37 +0100
Subject: [PATCH 324/331] Disable debug symbols in nvptx64 JIT runs due to PTX ISA error

---
 test/benchmark/benchmark_script.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/test/benchmark/benchmark_script.py b/test/benchmark/benchmark_script.py
index 865964b5ef..66246fa18b 100644
--- a/test/benchmark/benchmark_script.py
+++ b/test/benchmark/benchmark_script.py
@@ -244,6 +244,7 @@ def run_external_kernel(
         cfg.shared_lib_paths = [self.compiler_config.libdevice_lib]
         cfg.llvm_gpu_name = "nvptx64"
         cfg.llvm_gpu_target_architecture = gpu_target_architecture
+        cfg.llvm_no_debug = True
         cfg.output_dir = str((Path(self.benchmark_config.output_directory)
                               / modname
                               / compiler
@@ -279,6 +280,7 @@ def run_JIT_kernels(
             else:
                 cfg.llvm_gpu_name = "nvptx64"
                 cfg.llvm_gpu_target_architecture = gpu_target_architecture
+                cfg.llvm_no_debug = True
             if architecture == "skylake-avx512":
                 cfg.llvm_vector_width = 8
             elif architecture == "broadwell":

From 475a571bc9a476320d7ec2b61a544c5302f1576a Mon Sep 17 00:00:00 2001
From: Ioannis Magkanaris
Date: Tue, 27 Dec 2022 08:44:39 +0100
Subject: [PATCH 325/331] Added which branch to clone in the readme for the gpu docker runtime installation

---
 docs/CC2023/README.md | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/docs/CC2023/README.md b/docs/CC2023/README.md
index e9f1fc22d8..8b176b3937 100644
--- a/docs/CC2023/README.md
+++ b/docs/CC2023/README.md
@@ -20,7 +20,8 @@ The image that targets both CPU and GPU can be found in `test/benchmark/gpu_dock
 To launch the Docker image you can execute the following:
 
 ```
-cd test/benchnark/gpu_docker # Enter the directory that contains the Dockerfile
+git clone -b llvm https://github.com/BlueBrain/nmodl.git
+cd nmodl/test/benchnark/gpu_docker # Enter the directory that contains the Dockerfile
 bash install_gpu_docker_env.sh # Installs docker and NVIDIA docker runtime (needs sudo permission)
 docker run -it -v $PWD:/opt/mount --gpus all bluebrain/nmodl:mod2ir-gpu-benchmark # Execute docker image (~16GB)
 ```

From 23eb5e839f84d3bfcb6cdb77dccf191d2e029c6d Mon Sep 17 00:00:00 2001
From: Ioannis Magkanaris
Date: Tue, 27 Dec 2022 08:50:21 +0100
Subject: [PATCH 326/331] Added some small changes in the PAD text

---
 docs/CC2023/PAD.md | 16 ++++++++--------
 1 file changed, 8 insertions(+), 8 deletions(-)

diff --git a/docs/CC2023/PAD.md b/docs/CC2023/PAD.md
index 9467f3ba5c..c5f473d2c7 100644
--- a/docs/CC2023/PAD.md
+++ b/docs/CC2023/PAD.md
@@ -32,11 +32,11 @@ qualitatively the same as the ones we have presented.
 
 ### Hardware requisites
 
 The provided artifact can in theory be run on any x86 hardware platform. For the prupose of closely
-reproducing our benchmark results we recommend using a workstation (or cloud instance) with Intel Xeon
-Skylake (or newer) CPU and an NVIDIA Volta V100 (or newer) GPU. All benchmark runs are single-core
-and have relatively low memory-requirement. For building the Docker image (and more specifically the
-NMODL Framework) we, however, recommend a system with plenty of cores and at least 32GB of RAM
-available and 20 GB of disk space.
+reproducing our benchmark results, a workstation (or cloud instance) with an Intel Xeon
+Skylake (or newer) CPU that supports AVX-512 instructions and an NVIDIA Volta V100 (or newer) GPU
+is required. All benchmark runs are single-core and have relatively low memory requirements.
+For building or running the Docker image (and more specifically the NMODL Framework) we, however,
+recommend a system with plenty of cores, at least 32GB of RAM available, and 20 GB of disk space.
 
 ### Software requisites
 
@@ -46,6 +46,6 @@ reproduced, an up-to-date CUDA (11.0 or newer) should be present.
 
 ### Expectations
 
-We expect that all setup and benchmarks can be completed within one working day. The expected time for building
-the docker image is around 10 minutes using a modern multicore system with a stable internet connection.
-The expected runtime of the benchmarks is around 4 hours.
+We expect that all setup and benchmarks can be completed within one working day. The expected time for
+building or pulling the docker image is around 10 minutes using a modern multicore system with a stable
+internet connection. The expected runtime of the benchmarks is around 5 hours.

From 896ffd002163ceb7c78f592456f31f66d314763a Mon Sep 17 00:00:00 2001
From: Ioannis Magkanaris
Date: Tue, 27 Dec 2022 08:55:57 +0100
Subject: [PATCH 327/331] Small explanation regarding install.sh

---
 docs/CC2023/README.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/docs/CC2023/README.md b/docs/CC2023/README.md
index 8b176b3937..4acf24422a 100644
--- a/docs/CC2023/README.md
+++ b/docs/CC2023/README.md
@@ -21,7 +21,7 @@ To launch the Docker image you can execute the following:
 
 ```
 git clone -b llvm https://github.com/BlueBrain/nmodl.git
-cd nmodl/test/benchnark/gpu_docker # Enter the directory that contains the Dockerfile
+cd nmodl/test/benchnark/gpu_docker # Enter the directory that contains the Dockerfile (based on Ubuntu 22.04, but with small changes it should be supported by any Ubuntu version or other Linux distributions)
 bash install_gpu_docker_env.sh # Installs docker and NVIDIA docker runtime (needs sudo permission)
 docker run -it -v $PWD:/opt/mount --gpus all bluebrain/nmodl:mod2ir-gpu-benchmark # Execute docker image (~16GB)
 ```

From 2b8fc56e09feb276bc7ee7ef2dba5de52901922c Mon Sep 17 00:00:00 2001
From: iomaganaris
Date: Mon, 2 Jan 2023 16:12:46 +0100
Subject: [PATCH 328/331] Fix GPU regression

- Add noalias attribute to the GPU kernels
- Avoid adding the wrappers in JIT benchmarks
---
 src/codegen/llvm/annotation.cpp    | 5 ++++-
 src/pybind/pynmodl.cpp             | 2 +-
 test/benchmark/benchmark_script.py | 5 +++--
 3 files changed, 8 insertions(+), 4 deletions(-)

diff --git a/src/codegen/llvm/annotation.cpp b/src/codegen/llvm/annotation.cpp
index 6bf437e309..4bc1160bf9 100644
--- a/src/codegen/llvm/annotation.cpp
+++ b/src/codegen/llvm/annotation.cpp
@@ -59,7 +59,10 @@ void DefaultCPUAnnotator::annotate(llvm::Function& function) const {
     function.addParamAttr(0, llvm::Attribute::ReadOnly);
 }
 
-void CUDAAnnotator::annotate(llvm::Function& function) const {
+void CUDAAnnotator::annotate(llvm::Function& function) const {
+    // Add the `noalias` attribute similarly to the DefaultCPUAnnotator
+    function.addParamAttr(0, llvm::Attribute::NoAlias);
+
     llvm::LLVMContext& context = function.getContext();
     llvm::Module* m = function.getParent();
 
diff --git a/src/pybind/pynmodl.cpp b/src/pybind/pynmodl.cpp
index 7e3d62de99..0c58f2b0ed 100644
--- a/src/pybind/pynmodl.cpp
+++ b/src/pybind/pynmodl.cpp
@@ -211,7 +211,7 @@ class JitDriver {
                                              0,
                                              !cfg.llvm_no_debug,
                                              cfg.llvm_fast_math_flags,
-                                             true);
+                                             false);
         visitor.visit_program(*new_node);
         const GPUExecutionParameters gpu_execution_parameters{cuda_grid_dim_x, cuda_block_dim_x};
         nmodl::benchmark::LLVMBenchmark benchmark(visitor,
diff --git a/test/benchmark/benchmark_script.py b/test/benchmark/benchmark_script.py
index 66246fa18b..6241e77638 100644
--- a/test/benchmark/benchmark_script.py
+++ b/test/benchmark/benchmark_script.py
@@ -274,7 +274,8 @@ def run_JIT_kernels(
             cfg.llvm_ir = True
             cfg.llvm_opt_level_ir = 3
             cfg.llvm_math_library = math_lib
-            cfg.llvm_fast_math_flags = fast_math_flags
+            if fast_math_flags is not None:
+                cfg.llvm_fast_math_flags = fast_math_flags
             if architecture != "nvptx64":
                 cfg.llvm_cpu_name = architecture
             else:
@@ -388,7 +389,7 @@ def run_benchmark(self):
                 fast_math_flags = self.benchmark_config.llvm_fast_math_flags
                 fast_math_name = "nnancontractafn"
             else:
-                fast_math_flags = [""]
+                fast_math_flags = None
                 fast_math_name = "nonfastmath"
             if architecture != "nvptx64":
                 for math_lib in self.benchmark_config.math_libraries:

From 70d0dc00827874910f5ca1c051041acf0e0fb43f Mon Sep 17 00:00:00 2001
From: Ioannis Magkanaris
Date: Mon, 2 Jan 2023 16:23:22 +0100
Subject: [PATCH 329/331] Updated readme

---
 docs/CC2023/README.md | 24 +++++++++++++++++++++---
 1 file changed, 21 insertions(+), 3 deletions(-)

diff --git a/docs/CC2023/README.md b/docs/CC2023/README.md
index 4acf24422a..599ad3f33f 100644
--- a/docs/CC2023/README.md
+++ b/docs/CC2023/README.md
@@ -1,7 +1,26 @@
 # MOD2IR: High-Performance Code Generation for a Biophysically Detailed Neuronal Simulation DSL
 
-Please refer to the PAD.md file for an overview and necessary prerequisites.
+## Artifact Description
+
+MOD2IR is implemented as a code generation backend inside the NMODL Framework and it makes heavy
+use of the LLVM IR and compilation passes. Most of the relevant code of the described work can be
+found [here](https://github.com/BlueBrain/nmodl/tree/llvm/src/codegen/llvm) and
+[here](https://github.com/BlueBrain/nmodl/tree/llvm/test/benchmark).
+
+### Hardware Requirements
+
+The provided artifact can in theory be run on any x86 hardware platform. For the purpose of closely
+reproducing our benchmark results, a workstation (or cloud instance) with an Intel Xeon
+Skylake (or newer) CPU that supports AVX-512 instructions and an NVIDIA Volta V100 (or newer) GPU
+is required. All benchmark runs are single-core and have relatively low memory requirements.
+For building or running the Docker image (and more specifically the NMODL Framework) we, however,
+recommend a system with plenty of cores, at least 32GB of RAM available, and 20 GB of disk space.
+
+### Software Requirements
+
+Any reasonably up-to-date Linux system with Docker should be sufficient. If GPU results are to be
+reproduced, an up-to-date CUDA (11.0 or newer) should be present.
 
 ## Benchmarking Instructions
 
@@ -21,7 +40,7 @@ To launch the Docker image you can execute the following:
 
 ```
 git clone -b llvm https://github.com/BlueBrain/nmodl.git
-cd nmodl/test/benchnark/gpu_docker # Enter the directory that contains the Dockerfile (based on Ubuntu 22.04, but with small changes it should be supported by any Ubuntu version or other Linux distributions)
+cd nmodl/test/benchmark/gpu_docker # Enter the directory that contains the Dockerfile (based on Ubuntu 22.04, but with small changes it should be supported by any Ubuntu version or other Linux distributions)
 bash install_gpu_docker_env.sh # Installs docker and NVIDIA docker runtime (needs sudo permission)
 docker run -it -v $PWD:/opt/mount --gpus all bluebrain/nmodl:mod2ir-gpu-benchmark # Execute docker image (~16GB)
 ```
@@ -50,7 +69,6 @@ CPU only container.
 To do this you need to:
 
 ```
-cd test/benchnark/cpu_docker # Enter the directory that contains the Dockerfile
 docker run -it -v $PWD:/opt/mount bluebrain/nmodl:mod2ir-cpu-benchmark # Execute docker image (~16GB)
 ```

From f60ac4e78783f019abef70327a77eab8a344db51 Mon Sep 17 00:00:00 2001
From: Ioannis Magkanaris
Date: Mon, 2 Jan 2023 19:40:55 +0100
Subject: [PATCH 330/331] Updated PAD file according to the submission

---
 docs/CC2023/PAD.md | 25 +++----------------------
 1 file changed, 3 insertions(+), 22 deletions(-)

diff --git a/docs/CC2023/PAD.md b/docs/CC2023/PAD.md
index c5f473d2c7..22967b1fb7 100644
--- a/docs/CC2023/PAD.md
+++ b/docs/CC2023/PAD.md
@@ -5,20 +5,7 @@
 
 ### Broad Description
 
-This artifact provides all the necessary code, scripts and results to compile the NMODL transpiler
-with the MOD2IR extension and run all benchmarks described in the manuscript. To simplify the
-evaluation process we provide along with the instructions a Dockerfile that will setup a viable
-system for the benchmarks. The driver script compiles the membrane mechanism model `hh.mod` and the
-synapse mechanism model `expsyn.mod` with various compile-time configurations and then runs the
-generated binaries comparing their runtimes. More specifically the benchmark compares the execution
-runtime of the binaries generated via the two-step compilation process MOD-C++-binary using various
-open-source and commercial compiler frameworks with the one-step ahead-of-time and just-in-time
-processes of MOD2IR.
-MOD2IR is implemented as a code generation backend inside the NMODL Framework and it makes heavy
-use of the LLVM IR and compilation passes. Most of the relevant code of the described work can be
-found [here](https://github.com/BlueBrain/nmodl/tree/llvm/src/codegen/llvm) and
-[here](https://github.com/BlueBrain/nmodl/tree/llvm/test/benchmark). The intructions to reproduce
-the results can be found [here](https://github.com/BlueBrain/nmodl/blob/llvm/docs/CC2023/README.md).
+This artifact provides all the necessary code, scripts and results to compile the NMODL transpiler with the MOD2IR extension and run all benchmarks described in the manuscript. To simplify the evaluation process we provide along with the instructions a Dockerfile that will set up a viable system for the benchmarks. The driver script compiles the membrane mechanism model `hh.mod` and the synapse mechanism model `expsyn.mod` with various compile-time configurations and then runs the generated binaries comparing their runtimes. More specifically, the benchmark compares the execution runtime of the binaries generated via the two-step compilation process MOD-C++-binary using various open-source and commercial compiler frameworks with the one-step ahead-of-time and just-in-time processes of MOD2IR. MOD2IR is implemented as a code generation backend inside the NMODL Framework and it makes heavy use of the LLVM IR and compilation passes. Most of the relevant code of the described work can be found [here](https://github.com/BlueBrain/nmodl/tree/llvm/src/codegen/llvm) and [here](https://github.com/BlueBrain/nmodl/tree/llvm/test/benchmark). The instructions to reproduce the results can be found [here](https://github.com/BlueBrain/nmodl/blob/896ffd002163ceb7c78f592456f31f66d314763a/docs/CC2023/README.md).
 
 ### Badge
 
@@ -31,17 +18,11 @@ qualitatively the same as the ones we have presented.
 
 ### Hardware requisites
 
-The provided artifact can in theory be run on any x86 hardware platform. For the prupose of closely
-reproducing our benchmark results, a workstation (or cloud instance) with an Intel Xeon
-Skylake (or newer) CPU that supports AVX-512 instructions and an NVIDIA Volta V100 (or newer) GPU
-is required. All benchmark runs are single-core and have relatively low memory requirements.
-For building or running the Docker image (and more specifically the NMODL Framework) we, however,
-recommend a system with plenty of cores, at least 32GB of RAM available, and 20 GB of disk space.
+The provided artifact can in theory be run on any x86 hardware platform. For the purpose of closely reproducing our benchmark results, a workstation (or cloud instance) with an Intel Xeon Skylake (or newer) CPU that supports AVX-512 instructions and an NVIDIA Volta V100 (or newer) GPU is required. All benchmark runs are single-core and have relatively low memory requirements. For building or running the Docker image (and more specifically the NMODL Framework) we, however, recommend a system with plenty of cores, at least 32GB of RAM available, and 20 GB of disk space.
 
 ### Software requisites
 
-Any reasonably up-to-date Linux system with Docker should be sufficient. If GPU results are to be
-reproduced, an up-to-date CUDA (11.0 or newer) should be present.
+Any reasonably up-to-date Linux system with Docker should be sufficient. If GPU results are to be reproduced, an up-to-date CUDA (11.0 or newer) should be present.
 
 ### Expectations
 

From e5e010b9e7767245ec29a3d7f2814b5842d63c39 Mon Sep 17 00:00:00 2001
From: Ioannis Magkanaris
Date: Mon, 2 Jan 2023 19:57:14 +0100
Subject: [PATCH 331/331] Small fixes in the readme file

---
 docs/CC2023/README.md | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/docs/CC2023/README.md b/docs/CC2023/README.md
index 599ad3f33f..14d9c3a51e 100644
--- a/docs/CC2023/README.md
+++ b/docs/CC2023/README.md
@@ -40,8 +40,8 @@ To launch the Docker image you can execute the following:
 
 ```
 git clone -b llvm https://github.com/BlueBrain/nmodl.git
-cd nmodl/test/benchmark/gpu_docker # Enter the directory that contains the Dockerfile (based on Ubuntu 22.04, but with small changes it should be supported by any Ubuntu version or other Linux distributions)
-bash install_gpu_docker_env.sh # Installs docker and NVIDIA docker runtime (needs sudo permission)
+cd nmodl/test/benchmark/gpu_docker # Enter the directory that contains the NVIDIA docker runtime installation script
+bash install_gpu_docker_env.sh # Installs docker and NVIDIA docker runtime (needs sudo permission and is based on Ubuntu 22.04, but with small changes it should be supported by any Ubuntu version or other Linux distributions)
 docker run -it -v $PWD:/opt/mount --gpus all bluebrain/nmodl:mod2ir-gpu-benchmark # Execute docker image (~16GB)
 ```