[LLVM] Basic scalable vector support draft #676

georgemitenkov · 2021-05-31T14:05:56Z

This is a draft PR that adds SVE support for the LLVM code generation backend. To use scalable vectors, a new --scalable option is added.

Currently, we support basic code generation, including all points mentioned in #637. Note that this has not been properly checked (predication was definitely not checked), and even valid LLVM IR may not generate correct assembly (due to LLVM limitations such as the maximum scalable vector alignment being 16, etc.).

Example using a constant (https://godbolt.org/z/nK9j3sdM6):

// test.mod
NEURON {
    SUFFIX test
    RANGE x, y
}

ASSIGNED { x y }

STATE { m }

BREAKPOINT {
    SOLVE states METHOD cnexp
}

DERIVATIVE states {
   m = y + 2
}

;bin/nmodl -o llvm  ../../nmodl/test/integration/mod/test.mod llvm --ir --single-precision --vector-width 2 --scalable --disable-debug-info
; ModuleID = 'test'
source_filename = "test"

%test__instance_var__type = type { float*, float*, float*, float*, float*, float*, float*, i32*, float, float, float, i32, i32 }

; Function Attrs: nofree nounwind
define void @nrn_state_test(%test__instance_var__type* noalias nocapture readonly %mech1) #0 {
  %mech = alloca %test__instance_var__type*, align 8
  %id = alloca i32, align 4
  %node_id = alloca <vscale x 2 x i32>, align 8
  %v = alloca <vscale x 2 x float>, align 8
  %epilogue_node_id = alloca i32, align 4
  %epilogue_v = alloca float, align 4
  store %test__instance_var__type* %mech1, %test__instance_var__type** %mech, align 8
  store i32 0, i32* %id, align 4
  br label %for.cond

for.cond:                                         ; preds = %for.inc, %0
  %1 = call i32 @llvm.vscale.i32()
  %2 = mul i32 %1, 2
  %3 = sub i32 %2, 1
  %4 = load %test__instance_var__type*, %test__instance_var__type** %mech, align 8
  %5 = getelementptr inbounds %test__instance_var__type, %test__instance_var__type* %4, i32 0, i32 12
  %6 = load i32, i32* %5, align 4
  %7 = sub i32 %6, %3
  %8 = load i32, i32* %id, align 4
  %9 = icmp slt i32 %8, %7
  br i1 %9, label %for.body, label %for.exit

for.body:                                         ; preds = %for.cond
  %10 = load %test__instance_var__type*, %test__instance_var__type** %mech, align 8
  %11 = getelementptr inbounds %test__instance_var__type, %test__instance_var__type* %10, i32 0, i32 7
  %12 = load i32, i32* %id, align 4
  %13 = sext i32 %12 to i64
  %14 = load i32*, i32** %11, align 8
  %15 = getelementptr inbounds i32, i32* %14, i64 %13
  %16 = bitcast i32* %15 to <vscale x 2 x i32>*
  %17 = load <vscale x 2 x i32>, <vscale x 2 x i32>* %16, align 8
  store <vscale x 2 x i32> %17, <vscale x 2 x i32>* %node_id, align 8
  %18 = load %test__instance_var__type*, %test__instance_var__type** %mech, align 8
  %19 = getelementptr inbounds %test__instance_var__type, %test__instance_var__type* %18, i32 0, i32 6
  %20 = load <vscale x 2 x i32>, <vscale x 2 x i32>* %node_id, align 8
  %21 = sext <vscale x 2 x i32> %20 to <vscale x 2 x i64>
  %22 = load float*, float** %19, align 8
  %23 = getelementptr inbounds float, float* %22, <vscale x 2 x i64> %21
  %24 = call <vscale x 2 x float> @llvm.masked.gather.nxv2f32.nxv2p0f32(<vscale x 2 x float*> %23, i32 1, <vscale x 2 x i1> shufflevector (<vscale x 2 x i1> insertelement (<vscale x 2 x i1> undef, i1 true, i32 0), <vscale x 2 x i1> undef, <vscale x 2 x i32> zeroinitializer), <vscale x 2 x float> undef)
  store <vscale x 2 x float> %24, <vscale x 2 x float>* %v, align 8
  %25 = load %test__instance_var__type*, %test__instance_var__type** %mech, align 8
  %26 = getelementptr inbounds %test__instance_var__type, %test__instance_var__type* %25, i32 0, i32 1
  %27 = load i32, i32* %id, align 4
  %28 = sext i32 %27 to i64
  %29 = load float*, float** %26, align 8
  %30 = getelementptr inbounds float, float* %29, i64 %28
  %31 = bitcast float* %30 to <vscale x 2 x float>*
  %32 = load <vscale x 2 x float>, <vscale x 2 x float>* %31, align 8
  %33 = fadd <vscale x 2 x float> %32, shufflevector (<vscale x 2 x float> insertelement (<vscale x 2 x float> undef, float 2.000000e+00, i32 0), <vscale x 2 x float> undef, <vscale x 2 x i32> zeroinitializer)
  %34 = load %test__instance_var__type*, %test__instance_var__type** %mech, align 8
  %35 = getelementptr inbounds %test__instance_var__type, %test__instance_var__type* %34, i32 0, i32 2
  %36 = load i32, i32* %id, align 4
  %37 = sext i32 %36 to i64
  %38 = load float*, float** %35, align 8
  %39 = getelementptr inbounds float, float* %38, i64 %37
  %40 = bitcast float* %39 to <vscale x 2 x float>*
  store <vscale x 2 x float> %33, <vscale x 2 x float>* %40, align 8
  br label %for.inc

for.inc:                                          ; preds = %for.body
  %41 = call i32 @llvm.vscale.i32()
  %42 = mul i32 %41, 2
  %43 = load i32, i32* %id, align 4
  %44 = add i32 %43, %42
  store i32 %44, i32* %id, align 4
  br label %for.cond

for.exit:                                         ; preds = %for.cond
  br label %for.cond2

for.cond2:                                        ; preds = %for.inc4, %for.exit
  %45 = load %test__instance_var__type*, %test__instance_var__type** %mech, align 8
  %46 = getelementptr inbounds %test__instance_var__type, %test__instance_var__type* %45, i32 0, i32 12
  %47 = load i32, i32* %46, align 4
  %48 = load i32, i32* %id, align 4
  %49 = icmp slt i32 %48, %47
  br i1 %49, label %for.body3, label %for.exit5

for.body3:                                        ; preds = %for.cond2
  %50 = load %test__instance_var__type*, %test__instance_var__type** %mech, align 8
  %51 = getelementptr inbounds %test__instance_var__type, %test__instance_var__type* %50, i32 0, i32 7
  %52 = load i32, i32* %id, align 4
  %53 = sext i32 %52 to i64
  %54 = load i32*, i32** %51, align 8
  %55 = getelementptr inbounds i32, i32* %54, i64 %53
  %56 = load i32, i32* %55, align 4
  store i32 %56, i32* %epilogue_node_id, align 4
  %57 = load %test__instance_var__type*, %test__instance_var__type** %mech, align 8
  %58 = getelementptr inbounds %test__instance_var__type, %test__instance_var__type* %57, i32 0, i32 6
  %59 = load i32, i32* %epilogue_node_id, align 4
  %60 = sext i32 %59 to i64
  %61 = load float*, float** %58, align 8
  %62 = getelementptr inbounds float, float* %61, i64 %60
  %63 = load float, float* %62, align 4
  store float %63, float* %epilogue_v, align 4
  %64 = load %test__instance_var__type*, %test__instance_var__type** %mech, align 8
  %65 = getelementptr inbounds %test__instance_var__type, %test__instance_var__type* %64, i32 0, i32 1
  %66 = load i32, i32* %id, align 4
  %67 = sext i32 %66 to i64
  %68 = load float*, float** %65, align 8
  %69 = getelementptr inbounds float, float* %68, i64 %67
  %70 = load float, float* %69, align 4
  %71 = fadd float %70, 2.000000e+00
  %72 = load %test__instance_var__type*, %test__instance_var__type** %mech, align 8
  %73 = getelementptr inbounds %test__instance_var__type, %test__instance_var__type* %72, i32 0, i32 2
  %74 = load i32, i32* %id, align 4
  %75 = sext i32 %74 to i64
  %76 = load float*, float** %73, align 8
  %77 = getelementptr inbounds float, float* %76, i64 %75
  store float %71, float* %77, align 4
  br label %for.inc4

for.inc4:                                         ; preds = %for.body3
  %78 = load i32, i32* %id, align 4
  %79 = add i32 %78, 1
  store i32 %79, i32* %id, align 4
  br label %for.cond2

for.exit5:                                        ; preds = %for.cond2
  ret void
}

; Function Attrs: nofree nosync nounwind readnone willreturn
declare i32 @llvm.vscale.i32() #1

; Function Attrs: nofree nosync nounwind readonly willreturn
declare <vscale x 2 x float> @llvm.masked.gather.nxv2f32.nxv2p0f32(<vscale x 2 x float*>, i32 immarg, <vscale x 2 x i1>, <vscale x 2 x float>) #2

attributes #0 = { nofree nounwind "target-features"="+sve,+sve"}
; we actually generate attributes #0 = { nofree nounwind }, see todos
attributes #1 = { nofree nosync nounwind readnone willreturn }
attributes #2 = { nofree nosync nounwind readonly willreturn }

; llc -O3 -mtriple=aarch64--linux-gnu
nrn_state_test:                         // @nrn_state_test
        str     x29, [sp, #-16]!                // 8-byte Folded Spill
        addvl   sp, sp, #-1
        sub     sp, sp, #16                     // =16
        addvl   x8, sp, #1
        str     x0, [x8, #24]
        cntd    x8
        ptrue   p0.d
        neg     x9, x8
        fmov    z0.s, #2.00000000
        str     wzr, [sp, #12]
.LBB0_1:                                // %for.cond
        addvl   x10, sp, #1
        ldr     x10, [x10, #24]
        ldr     w11, [sp, #12]
        ldr     w10, [x10, #80]
        add     w10, w10, w9
        add     w10, w10, #1                    // =1
        cmp     w11, w10
        b.ge    .LBB0_3
        addvl   x10, sp, #1
        ldr     x10, [x10, #24]
        ldrsw   x11, [sp, #12]
        ldp     x10, x12, [x10, #48]
        ld1sw   { z1.d }, p0/z, [x12, x11, lsl #2]
        add     x11, sp, #16                    // =16
        st1w    { z1.d }, p0, [x11, #1, mul vl]
        ld1w    { z1.d }, p0/z, [x10, z1.d, lsl #2]
        addvl   x10, sp, #1
        ldr     x10, [x10, #24]
        ldrsw   x12, [sp, #12]
        ldr     w13, [sp, #12]
        ldp     x11, x10, [x10, #8]
        ld1w    { z2.d }, p0/z, [x11, x12, lsl #2]
        add     x11, sp, #16                    // =16
        st1w    { z1.d }, p0, [x11]
        add     w11, w13, w8
        movprfx z1, z2
        fadd    z1.s, p0/m, z1.s, z0.s
        st1w    { z1.d }, p0, [x10, x12, lsl #2]
        str     w11, [sp, #12]
        b       .LBB0_1
.LBB0_3:
        fmov    s0, #2.00000000
        addvl   x8, sp, #1
        ldr     x8, [x8, #24]
        ldr     w9, [sp, #12]
        ldr     w8, [x8, #80]
        cmp     w9, w8
        b.ge    .LBB0_5
.LBB0_4:                                // %for.body3
        addvl   x9, sp, #1
        ldr     x9, [x9, #24]
        ldrsw   x8, [sp, #12]
        ldp     x13, x11, [x9, #48]
        lsl     x10, x8, #2
        ldp     x12, x9, [x9, #8]
        add     w8, w8, #1                      // =1
        ldrsw   x11, [x11, x10]
        ldr     s1, [x12, x10]
        ldr     s2, [x13, x11, lsl #2]
        stp     w11, w8, [sp, #8]
        fadd    s1, s1, s0
        str     s1, [x9, x10]
        str     s2, [sp, #4]
        addvl   x8, sp, #1
        ldr     x8, [x8, #24]
        ldr     w9, [sp, #12]
        ldr     w8, [x8, #80]
        cmp     w9, w8
        b.lt    .LBB0_4
.LBB0_5:                                // %for.exit5
        addvl   sp, sp, #1
        add     sp, sp, #16                     // =16
        ldr     x29, [sp], #16                  // 8-byte Folded Reload
        ret

Problems
If we use a simple math function such as exp in the kernel (see https://godbolt.org/z/18jEeP65G), then llc complains with

ERROR: Invalid size request on a scalable vector.
PLEASE submit a bug report to https://bugs.llvm.org/ and include the crash backtrace.
Stack dump:
0.	Program arguments: /opt/compiler-explorer/clang-trunk/bin/llc -o /app/output.s -x86-asm-syntax=intel -O3 -mtriple=aarch64--linux-gnu <source>
1.	Running pass 'Function Pass Manager' on module '<source>'.
2.	Running pass 'AArch64 Instruction Selection' on function '@nrn_state_test'
 #0 0x000055d6059b7a0c llvm::sys::PrintStackTrace(llvm::raw_ostream&, int) (/opt/compiler-explorer/clang-trunk/bin/llc+0x2900a0c)
 #1 0x000055d6059b5994 llvm::sys::RunSignalHandlers() (/opt/compiler-explorer/clang-trunk/bin/llc+0x28fe994)
 #2 0x000055d6059b5b03 SignalHandler(int) Signals.cpp:0:0
 #3 0x00007feef253a3c0 __restore_rt (/lib/x86_64-linux-gnu/libpthread.so.0+0x153c0)
 #4 0x00007feef200a18b raise (/lib/x86_64-linux-gnu/libc.so.6+0x4618b)
 #5 0x00007feef1fe9859 abort (/lib/x86_64-linux-gnu/libc.so.6+0x25859)
 #6 0x000055d605924536 llvm::report_fatal_error(llvm::Twine const&, bool) (/opt/compiler-explorer/clang-trunk/bin/llc+0x286d536)
 #7 0x000055d605924668 (/opt/compiler-explorer/clang-trunk/bin/llc+0x286d668)
 #8 0x000055d60596a06d (/opt/compiler-explorer/clang-trunk/bin/llc+0x28b306d)
 #9 0x000055d6057eefdd llvm::SelectionDAG::UnrollVectorOp(llvm::SDNode*, unsigned int) (/opt/compiler-explorer/clang-trunk/bin/llc+0x2737fdd)
#10 0x000055d60587106d (anonymous namespace)::VectorLegalizer::Expand(llvm::SDNode*, llvm::SmallVectorImpl<llvm::SDValue>&) LegalizeVectorOps.cpp:0:0
#11 0x000055d605875b44 (anonymous namespace)::VectorLegalizer::LegalizeOp(llvm::SDValue) LegalizeVectorOps.cpp:0:0
#12 0x000055d6058778bb llvm::SelectionDAG::LegalizeVectors() (/opt/compiler-explorer/clang-trunk/bin/llc+0x27c08bb)
#13 0x000055d605803f7b llvm::SelectionDAGISel::CodeGenAndEmitDAG() (/opt/compiler-explorer/clang-trunk/bin/llc+0x274cf7b)
#14 0x000055d6058075e4 llvm::SelectionDAGISel::SelectAllBasicBlocks(llvm::Function const&) (/opt/compiler-explorer/clang-trunk/bin/llc+0x27505e4)
#15 0x000055d605809602 llvm::SelectionDAGISel::runOnMachineFunction(llvm::MachineFunction&) (.part.859) SelectionDAGISel.cpp:0:0
#16 0x000055d604ea1578 llvm::MachineFunctionPass::runOnFunction(llvm::Function&) (/opt/compiler-explorer/clang-trunk/bin/llc+0x1dea578)
#17 0x000055d60529ae37 llvm::FPPassManager::runOnFunction(llvm::Function&) (/opt/compiler-explorer/clang-trunk/bin/llc+0x21e3e37)
#18 0x000055d60529b551 llvm::FPPassManager::runOnModule(llvm::Module&) (/opt/compiler-explorer/clang-trunk/bin/llc+0x21e4551)
#19 0x000055d60529a08f llvm::legacy::PassManagerImpl::run(llvm::Module&) (/opt/compiler-explorer/clang-trunk/bin/llc+0x21e308f)
#20 0x000055d603919ab1 compileModule(char**, llvm::LLVMContext&) llc.cpp:0:0
#21 0x000055d603848446 main (/opt/compiler-explorer/clang-trunk/bin/llc+0x791446)
#22 0x00007feef1feb0b3 __libc_start_main (/lib/x86_64-linux-gnu/libc.so.6+0x270b3)
#23 0x000055d60391186a _start (/opt/compiler-explorer/clang-trunk/bin/llc+0x85a86a)
Compiler returned: 139

This is likely a bug on the LLVM side (or exp simply should not be used with scalable vectors). If the exponential call is replaced with some other function call (i.e. llvm.exp.nxv2f32 -> something that takes a <vscale x 2 x float> argument), then no error occurs.

Plan:

Implementation & Testing

Support ScalableVectorType
Support induction variable increments with llvm.vscale.i32() call
Support scalable constants
Add target features (+sve) [See pramodk/llvm-scalable-vectors]
Implement IR tests
Implement execution tests (if using SVE platform)
Remove hardcoded llvm.vscale.i32() from LLVM helper visitor

Questions

Problem: math intrinsics do not work with scalable vectors
How do we call SIMD math functions if the vector width is unknown at compile time?

- while working on NMODL + LLVM, we don't worry that much about Python bindings by default - so lets disable them by default

* added NMODL_ENABLE_LLVM option to enable/disable llvm support in nmodl * LLVMHelper.cmake added to help with linking LLVM libraries - clang might need to use libstdc++ or libc++ linking - on BB5, using GCC with LLVM libraries is fine. But using clang results in lots of link errors. Adding -stdlib=libstdc++ solves the issue - use check_cxx_source_compiles to find out which cxx flag is needed

- added llvm dir under codegen where LLVM code generation work will live - llvm codegen visitor created that can be used as template for initial work - cmake adapted to enable llvm codegen based on CMake option - simple procedure.mod added that can be initial target for testing - new CLI option --llvm that runs LLVM codegen visitor - Enable CXX 14 because new LLVM versions require it

- install llvm via brew - set LLVM_DIR variable so that CMake can find llvm-config

- print table with different build options, flags and paths used that can be helpful for debugging - fix git revision date for older git version - update INSTALL.md with correct brew paths for flex and bison

- test/unit/codegen/llvm.cpp added for unit testing LLVM code generation visitor - ./bin/testcodegen binary can be used to launch LLVM codegen specific tests - multiple llvm_map_components_to_libnames removed - update procedure.mod with simple examples for IR generation

* Added LLVM code generation for `ProcedureBlock`. * Added code generation routines for double, integer and boolean variable types. * Added binary and unary operator code generation: - Supported binary operators: +, -, *, /. - Supported unary operators: -. - Assignment (=) is also supported. * Added regex matching unit tests for LLVM code generation. * Fixed Travis CI/builds. fixes #451, fixes #452, fixes #456 Co-authored-by: Pramod Kumbhar <[email protected]>

* LLVM code generation for `FunctionBlock` is now supported. * Terminators in function or procedure blocks are enforced: - Every procedure must have `ret void` instruction. - Every function returns a double, specified by `ret_<function_name>`. * For local symbol table, code generation now uses LLVM's builtin `llvm::ValueSymbolTable`. fixes #454, fixes #469

* Add option to run LLVM optimisation passes - update CLI argument from --llvm to llvm --ir --opt - --ir runs CodegenLLVMVisitor and emits LLVM IR - if --opt is passed, we run basic LLVM optimisation passes - update simple test to check optimisation passes * Add function example in procedure.mod * Add test for LLVM optimisation passes and dead code removal

This patch adds support for function call code generation, particularly: - User-defined procedures and functions can now be lowered to LLVM IR. - A framework for external method calls (e.g. sin, exp, etc.) has been created, currently `exp` and `pow` are supported. - Corresponding tests added. fixes #472

LLVM code generation for `IndexedName`s. - Added code generation for initialising arrays in LOCAL blocks (with both integer constants and macros). - Added support for indexing arrays. fixes #467

NMODL AST needs various transformations to generate C++ code or LLVM IR. This PR is the beginning of AST transformations to simplify code generation backends. * New CodegenLLVMHelperVisitor to perform various AST transformations to simplify code generation for various backends and simulators. * CodegenLLVMHelperVisitor is currently limited to LLVM backend to simplify initial implementation and keep C++ based backends working. * CodegenLLVMHelperVisitor now handles FUNCTIONS and PROCEDURES blocks - Replace LocalListStatement with CodegenVarStatement - Added new AST types for code generation - CodegenVar to represent variable used for code generation - CodegenVarType to represent codegen variable - CodegenVarListStatement to represent list of CodegenVar - CodegenStruct will be used in future to represent struct like NrnThread or Mechanism class See #474

* Added new BinaryOp for += and -= * Added string_to_binaryop function * Added Void node type to represent void return type * Added CodegenAtomicStatement for ion write statements * llvm helper started handling visit_nrn_state_block - NrnStateBlock is being converted into CodegenFunction - for loop body with solution blocks created - voltage and node index initialization code added - read and write ion statements are handled * Some of the functions are now moved into CodegenInfo Co-authored-by: Ioannis Magkanaris <[email protected]>

This commit introduces a functionality to execute functions from MOD file via LLVM jit. For that, there is now: - `JITDriver` class that, given a LLVM IR module, set ups the JIT compiler and is able to look up a function and execute it. - `Runner` class that wraps around JIT driver. It helps to initialise JIT with LLVM IR module only once, and then run multiple functions from it. To execute functions, `nmodl_llvm_runner` executable is used. It takes a single mod file and a specified entry-point function, and runs it via LLVM code generation pipeline and JIT driver. Only functions with double result types are supported at the moment. For example, for MOD file `foo.mod`: ``` FUNCTION one() { one = 1 } FUNCTION bar() { bar = one() + exp(1) } ``` running `nmodl_llvm_runner -f foo.mod -e bar` gives ``` Result: 3.718282 ``` Tests for execution of generated IR have been added as well. fixes #482 Co-authored-by: Pramod Kumbhar <[email protected]>

* Added more bin ops and refactored code - Now, there are code generation functions for all comparison and logical operators. - Code generation functions are now split based on the expression "type" (assignment, arithmetic, comparison, logical). Moreover, the lhs and rhs expression results can be both double and integer. This is important for control flow code generation and for the new AST node CodegenVarType. * Added support for NOT op * Added default type flag to switch between float and double * Added tests for single precision * Renames LLVM test file to codegen_llvm_ir.cpp to follow convention. * NOTE : Tests for new operators will be added when the first control flow node (most likely FOR node) will land. fixes #453

* visit_statement_block of all FUNCTION and PROCEDURE blocks was called resulting in changing LOCAL statement to DOUBLE statement * As statement block doesn't need to be visited for this purpose, rename function to convert_local_statement * Call convert_local_statement only when required i.e. only when codegen function creation time. fixes #491

* Handle CodegenVarType type in JSON printer - As AstNodeType is enum type and node itself, we need to print that explicitly * Indent json visitor jinja template - initially template was not indented as code generated was not looking good - now all generated code is automatically clang-formatted so it's less of a concern. Readability is important. fixes #493

* LLVM Helper visitor now can return a vector of `CodegenFunction`s. * LLVM Helper visitor has been integrated into LLVM visitor: - The type of variables is still double by default, but can also be inferred from `CodegenVarType` node. - Procedure's return type changed to int (so that error codes can be returned in the future). - New visitor functions added: for `CodegenReturn`, `CodegenFunction`, `CodegenVarList` and `CodegenVarType`.

* Added a new code generation function for conditional statements (`if`, `else if`, `else` and their nested variations). * Added tests for the new code generation: - IR unit tests. - Execution tests. * Fixed FP and integer comparison ordering in macros. fixes #468

Added error handling when a non-scope value is looked up. Before, such a lookup would yield a nullptr, therefore leading to a segmentation fault. This PR adds a lookup function that wraps around value symbol lookup, and throws an error with a message if nullptr is returned.

Added support for WHILE statement code generation. Corresponding tests for IR generation and execution were also added. Additional visitor for StatementBlock was added to reduce code duplication. fixes #500

* Moved info related function to codegen_info - Moved get_float_variables, codegen_int_variables, codegen_global_variables, codegen_shadow_variables into CodegenHelper - Move small utility functions from CodegenCVisitor to codegen_utils * Add proper variables to the mech_Instance * Adding LLVMStructBlock * Added test and visitor * Fix llvm codegen tests with x[0-9].*

- Added support for string function arguments. These are converted into global `i8` array values. - Added support for `printf` function call with variable number of arguments. - Refactored function/procedure call argument processing into a separate function. fixes #510

* Move code gen specific InstanceStruct node to codegen.yaml - nmodl.yaml file is more for language constructs - InstanceStruct is specific for code generation and hence move it to codegen.yaml * Update CI scripts * fix cmake-format with v==0.6.13

- instance structure now contains all global variables - instance structure now contains index variables for ions - nrn_state kernel now has all variables converted to instance - InstanceVarHelper added to query variable and its location * Support for codegen variable with type * Add nmodl_to_json helper added in main.cpp * Added --vector-width CLI option * Add instance struct argument to nrn_state_hh * Add comments as TODOs to support LLVM IR generation Note that this commit and next commit (Part II) are required to make LLVM IR code generation working. Vector IR generation is working except indirect indexes. See comment in #531.

- remove undefined visit_codegen_instance_var - Improved member creation for instance struct - Instance struct type generation for kernel arguments - Proper integration of instance struct - Added scalar code generation for the kernel - Removed instance test since it is not created explicitly anymore - Fixed ordering for precision and width in LLVM Visitor - Added vector induction variable - Vectorised code for compute with direct loads fully functional - Instance naming fixed - (LLVM IR) Fixed compute vector code generation types - refactoring : improve conversion of double to int for the loop expressions

This PR adds a unit test to check LLVM instructions generated for the scalar kernel, particularly: - FOR loop blocks. - Induction variable increments and comparisons. - Correct loads through GEPs from the struct. Test for vectorised code generation would be added in a separate PR or when full vectorisation support (indirect indexing) would land.

Improved index code generation within the LLVM pipeline. The following issues were addressed: Array indices are i64 per LLVM's addressing convention. This means that if the value is not a constant, an additional sext instruction must be created. Bounds check is removed since it requires a certain analysis on the index value. This can be addressed in a separate PR. `IndexedName` code generation is separated into 2 functions The first, `get_array_length()` is responsible for array initialisation, the second, `get_array_index()`, for indexing. In latter case, we support the following cases: ``` ... // Indexing with an integer constant k[0] = ... // Indexing with an integer expression k[10 - 10] // Indexing with a `Name` AST node that is an integer // (in our case a FOR loop induction variable or a variable // with `CodegenVarType` == `Integer` k[id] = ... k[ena_id] = ... ``` Note that the case: ``` // id := loop integer induction variable k[id + 1] = ... ``` is not supported for 2 reasons: On the AST level, as per #545 the expression would contain a Name and not VarName node that fails the code generation. The case only arises in the kernel functions like state_update, where indexing is "artificially" created with indexing by a Name only. fixes #541

* CodegenLLVMHelperVisitor improved without hardcoded parameters * Added get_instance_struct_ptr to get instance structure for variable information * test/unit/codegen/codegen_data_helper.cpp : first draft implementation of codegen data helper * Added test for typecasting to the proper struct type Co-authored-by: Pramod Kumbhar <[email protected]>

@proc

Added debug support to LLVM code generation pipeline. Currently, only basic support was added: 1. Debug information about functions (name) 2. Debug information about module **What has been changed and added** 1. A new class `DebugBuilder` was created. It is used as a wrapper around LLVM's `DIBuilder` and holds important information such as `LLVMContext`, debug file and compile unit. It also wraps `DIBuilder`'s functionality into a more suitable API. 2. A temporary `Location` struct has been added. It encapsulates the location of the source AST construct and reflects `ModToken` on LLVM code generation level. It is only used if the location of the source NMODL function is known. 3. LLVM visitor now takes an extra `add_debug_information` flag and handles debug information creation. For readability, `IRBuilder` was renamed to `ir_builder`. 4. JIT runner is now able to listen for GDB, perf (build LLVM with `-DLLVM_USE_PERF=ON`) and VTune (build LLVM with `-DLLVM_USE_INTEL_JITEVENTS=ON`) events. 5. Necessary cmake changes were added to optionally support JIT event listeners (`-DNMODL_HAVE_JIT_EVENT_LISTENERS`). **How to generate debug information** Debug information is attached to every function, procedure or artificially created kernel (and corresponding wrappers). Debug information is enabled by default, so to turn it off use ` --disable-debug-info` flag. For example, the given NMODL ```nmodl 1 FUNCTION func(x) { 2 func = x 3 } 4 5 PROCEDURE proc() {} ``` is transformed (running `./bin/nmodl <filename>.mod llvm --ir`) into ```llvm define double @func(double %x1) !dbg !4 { ; ... } define i32 @proc() !dbg !6 { ; ... 
} !llvm.dbg.cu = !{!0} !llvm.module.flags = !{!3} !0 = distinct !DICompileUnit(language: DW_LANG_C, file: !1, producer: "NMODL-LLVM", isOptimized: false, runtimeVersion: 0, emissionKind: FullDebug, enums: !2) !1 = !DIFile(filename: "foo", directory: ".") !2 = !{} !3 = !{i32 2, !"Debug Version", i32 3} !4 = distinct !DISubprogram(name: "func", linkageName: "func", scope: null, file: !1, line: 1, type: !5, scopeLine: 1, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0, retainedNodes: !2) !5 = !DISubroutineType(types: !2) !6 = distinct !DISubprogram(name: "proc", linkageName: "proc", scope: null, file: !1, line: 5, type: !5, scopeLine: 5, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0, retainedNodes: !2) ``` fixes #592 #612 Co-authored-by: Pramod Kumbhar <[email protected]>

* in the new deployment, pgi module is changed to nvhpc * fix gitlab CI script accordingly

* Fix `append_statements_from_block` function in LLVM helper visitor. * Before, if nonspecific current was not specified, the whole `BREAKPOINT` block would be added to the kernel body. * This led to cases when `SOLVE` block was together with the actual solution to `DERIVATIVE`

Improves the code structure for the LLVM code generation pipeline The following changes were added: 1. New IR builder class. Before, LLVM visitor just simply used `llvm::IRBuilder<>` class to generate instructions. Recently, this (as well as adding the functionality to the visitor on the go) had led to code duplication and it became hard to introduce new features nicely. Hence, a special `IRBuilder` class is now used. This class is a wrapper around `llvm::IRBuilder<>` that keeps track of certain IR generation specific fields (that are unrelated to the visitor), defines an API that the visitor can use to generate LLVM IR. Also, this IR builder has been designed to be nearly fully-independent from NMODL AST nodes. this allows it to be more generic and to be more extensible. 2. Visitor clean-up LLVM visitor has been refactored to take the new IR builder class into account. Also, the functions were reordered, refactored and renamed to better reflect the intended use and provide encapsulation. 3. Scatter preparation The functionality of the generating code for `CodegenInstanceVar` node has been extended with `read_from_or_write_to_instance(...)` function. Now, an optional `value_to_store` is passed to indicate whether the code needs to be generated for reading the instance variable or writing to it. fixes #538

* Fixed CodegenAtomicStatement initialisation * Removed unused variable and changed comment

* Instance data structure initialization had the following bug - instance struct has int member variables which act as offsets to other vectors (e.g. node_index, na_ion_index) - these variables were initialized from 1 to N where N was incremented always without considering the upper bound on the offset. * With this fix - index / integer variables are always initialized from 0 to N-1. - Variables are initialised with 1e-5 precision so that we have reasonably bigger values - Update tests to check offset from 0 to N-1

Added basic support to transform indirect writes into `llvm.masked.scatter` intrinsic. Currently, the scatter functionality is limited to non-atomic writes and assignment (e.g. `+=` operator is not yet supported). Hence, a warning is logged to the console indicating all limitations. Corresponding IR and execution tests were also added. fixes #539

* Move benchmark + JIT related code from src/codegen/llvm to test/benchmark * Common execution of CodegenLLVMVisitor for llvm --ir and benchmark option. With this, ast transformed for LLVM code generation is dumped to file. * Previous object file is removed (if exist) so that output file name is same / deterministic * Benchmark output is always printed to stdout via common logger object * Remove unnecessary LLVMBuildInfo struct

Previously, there was no metadata and attributes associated with the instance struct pointer, compute kernels or loops. This commit fixes this. - New instance struct attributes Since all pointers contained in the instance struct do not alias, we add a `noalias` (LLVM's `__restrict` alternative) attribute to it. In addition, we add `nocapture` (No capturing occurs in the function) and `readonly` (Struct pointer is not written to) attributes. This means that some load instructions can be moved out from the loop body. Example: ```llvm ; BEFORE for.body.lr.ph: ; preds = %0 %5 = getelementptr inbounds %avx__instance_var__type, %avx__instance_var__type* %mech1, i64 0, i32 1 br label %for.body for.body: ; preds = %for.body.lr.ph, %for.body %15 = load double*, double** %5, align 8 ; ... ; AFTER for.body.lr.ph: ; preds = %0 %5 = getelementptr inbounds %avx__instance_var__type, %avx__instance_var__type* %mech1, i64 0, i32 1 %6 = load double*, double** %5, align 8 br label %for.body ``` - New function attributes Now, compute kernels are marked with `nofree` and `nounwind` attributes. - Loop metadata Also, loop metadata is added to scalar kernels, specifying that no vectorization is needed. The reason for this is because we want to benchmark truly scalar kernels, and disable LLVM's vectorization if necessary. Note that for vector loop epilogue there is no metadata that disables vectorization. fixes #607

- fixes the case, where loaded value was taken from the stack, but was never actually put there

Added support for vector predication. Currently, we support a very basic predication pattern (that will be extended in the future): ```c++ IF (/*condition*/) { // code here, no nested conditionals } ELSE { // code here, no nested conditionals } ``` **What has been changed and added** 1. Removed vectorization check Before, in the `FOR` statement visitor we were checking whether the code can be vectorized. After refactoring `llvm::IRBuilder<>` into a separate class, there is no interface to reset the builder's vector width. Hence, this check leads to visitor having scalar vector width of 1, and builder having the same vector width. ```c++ if (!can_vectorize(node, sym_tab)) { vector_width = 1; ir_builder.generate_scalar_code(); } ``` In order to avoid any issues, this check is simply removed and will be added in the separate PR. 2. Predication support - `can_vectorize` has been changed to support a single `IF` or `IF/ELSE` pair. - A special vectorized `IF` AST node visitor has been added. - If generating code within `IF` AST node, instructions are masked. 3. Added execution and IR tests fixes #539

* Improved cmake versioning of LLVM * Added ^ support * Added more math functions intrinsics with tests * Added compute time variance and min/max times in benchmarking output

* With this PR alloca instructions are always inserted in the beginning of the function entry block. This is done to avoid them in the while or for loops, where allocations per iteration cause stack overflow (if the IR is not optimized). * Insertion point for allocas is the entry block now See #653

@fma

Added support for fast math flags in LLVM backend. Currently, the user can specify them via command-line (this approach was chosen for easier benchmarking). The specified flags are named exactly the same as in LLVM. This feature is useful to enable previously unsafe FP-math optimizations. For example, fused-multiply-add instructions can now be generated when lowering LLVM IR to assembly or executing via JIT. Example: ```c++ // fma.mod FUNCTION fma(a, b, c) { fma = (a * b) + c } ``` ```bash $ ./nmodl fma.mod --verbose debug llvm --ir --fmf nnan contract afn --opt ``` ```llvm define double @fma(double %a, double %b, double %c) { %1 = fmul nnan contract afn double %a, %b %2 = fadd nnan contract afn double %1, %c ret double %2 } ```

- NMODL parser uses VarName on the LHS of assignment expression - Inline visitor was using Name on the LHS of assignment expression Related to #667

* Added support for `libsystem_m` and `SLEEF` vector libraries. The first one is supported by LLVM internally, so it comes for free with LLVM 13. For `SLEEF`, basic support was added for AArch64 and x86 architectures. Currently, we support - `exp` - `pow` * Added corresponding IR checks for `libsystem_m` and `SLEEF` (both AArch64 and x86). * Updated LLVM binaries for MAC OS CI, as well as for latest LLVM 13 (trunk) to fix link errors for Darwin vector library. Co-authored-by: Pramod Kumbhar <[email protected]>

bbpbuildbot · 2021-05-31T14:05:59Z

Can one of the admins verify this patch?

castigli · 2021-06-08T10:47:15Z

If I understand correctly, <n x m x type> means that it is an unknown multiple (n times) of m x type, where m is the minimum number of elements. Since the minimum for SVE is 128 bits, shouldn't it be <vscale x 4 x float> or <vscale x 2 x double>?
If the above is correct, we should make this automatic, so you should be able to specify either --vector-width <m> or --scalable and in the latter case you select the minimum width based on the type.

pramodk and others added 30 commits May 8, 2021 09:03

Disable python bindings for faster build

16d3cab

- while working on NMODL + LLVM, we don't worry that much about Python bindings by default - so let's disable them by default

Azure CI fixes for LLVM build and README update

51987dc

- install llvm via brew - set LLVM_DIR variable so that CMake can find llvm-config

Print build status after cmake configure stage

ae07ce4

- print table with different build options, flags and paths used that can be helpful for debugging - fix git revision date for older git version - update INSTALL.md with correct brew paths for flex and bison

Support for IndexedName codegen (#478)

7884de8

LLVM code generation for `IndexedName`s. - Added code generation for initialising arrays in LOCAL blocks (with both integer constants and macros). - Added support for indexing arrays. fixes #467

Added support for WHILE statement (#501)

a32f76b

Added support for WHILE statement code generation. Corresponding tests for IR generation and execution were also added. Additional visitor for StatementBlock was added to reduce code duplication. fixes #500

Fix issue error: ‘runtime_error’ is not a member of ‘std’ (#512)

a561c97

georgemitenkov and others added 24 commits May 10, 2021 23:49

Fixed using benchmarking_info in TestRunner (#631)

1802b74

Fixes to run CI with NVHPC/PGI compiler

3359ea3

* in the new deployment, pgi module is changed to nvhpc * fix gitlab CI script accordingly

Fixed initialisation of CodegenAtomicStatement (#642)

454a18f

* Fixed CodegenAtomicStatement initialisation * Removed unused variable and changed comment

Added loaded value to the stack (#655)

ee8bbdb

- fixes the case, where loaded value was taken from the stack, but was never actually put there

Improvements for LLVM code generation and benchmarking (#661)

8bee7de

* Improved cmake versioning of LLVM * Added ^ support * Added more math functions intrinsics with tests * Added compute time variance and min/max times in benchmarking output

Avoid generating LLVM IR for Functions and Procedures if inlined (#664)

2ca85e5

Fixed typo in benchmarking metrics (#665)

7fdbb4f

Remove only inlined blocks from AST based on symtab properties (#668)

4c585f3

Use VarName on the RHS of assignment expression (#669)

f0a3afc

- NMODL parser uses VarName on the LHS of assignment expression - Inline visitor was using Name on the LHS of assignment expression Related to #667

Added scalable type support

7300eb9

Clang format

77881fa

Added scalable induction variable support

885fd55

Added scalable vector constant support

86aa20d

pramodk force-pushed the llvm branch 2 times, most recently from bc5a4c4 to c80a44d Compare March 9, 2022 12:04

iomaganaris force-pushed the llvm branch from 265a02b to c8b626a Compare May 12, 2022 14:57

iomaganaris added the llvm label Sep 19, 2022

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

[LLVM] Basic scalable vector support draft #676

[LLVM] Basic scalable vector support draft #676

georgemitenkov commented May 31, 2021 •

edited

Loading

bbpbuildbot commented May 31, 2021

castigli commented Jun 8, 2021

[LLVM] Basic scalable vector support draft #676

Are you sure you want to change the base?

[LLVM] Basic scalable vector support draft #676

Conversation

georgemitenkov commented May 31, 2021 • edited Loading

bbpbuildbot commented May 31, 2021

castigli commented Jun 8, 2021

georgemitenkov commented May 31, 2021 •

edited

Loading