Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

backend: (risc-v) Lowering NsNet2 IREE kernels #2740

Open
4 of 6 tasks
zero9178 opened this issue Jun 17, 2024 · 0 comments
Open
4 of 6 tasks

backend: (risc-v) Lowering NsNet2 IREE kernels #2740

zero9178 opened this issue Jun 17, 2024 · 0 comments
Labels
backend Compiler backend in xDSL

Comments

@zero9178
Copy link
Contributor

zero9178 commented Jun 17, 2024

NsNet2, as processed by our IREE backend (https://github.com/opencompl/Quidditch) currently produces 6 different kernels.
This epic documents the state of compiling each kernel, together with its input IR. The kernels are ordered by the percentage of total cycles they account for when executed with the LLVM backend.

  • main$async_dispatch_1_matmul_transpose_b_1x1200x400_f32 (48.71% of all cycles)
IR
// Kernel 1: computes %arg4 = (%arg1 x %arg2^T) + %arg3 as three linalg.generic
// ops over memrefs: (1) zero-fill the 1x1200 accumulator %arg0, (2) a
// matmul-transpose-b reduction into %arg0, (3) an elementwise add into %arg4.
func.func @main$async_dispatch_1_matmul_transpose_b_1x1200x400_f64$iree_to_xdsl$xDSL_kernel(%arg0: memref<1x1200xf64>, %arg1: memref<1x400xf64, strided<[400, 1], offset: ?>>, %arg2: memref<1200x400xf64, strided<[400, 1], offset: ?>>, %arg3: memref<1x1200xf64, strided<[1200, 1], offset: ?>>, %arg4: memref<1x1200xf64, strided<[1200, 1], offset: ?>>) attributes {llvm.bareptr, xdsl_generated} {
  // Zero value used to initialize the accumulator.
  %cst = arith.constant 0.000000e+00 : f64
  // (1) Broadcast-fill: the scalar input map "() -> ()" splats %cst over %arg0.
  linalg.generic {indexing_maps = [affine_map<(d0, d1) -> ()>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%cst : f64) outs(%arg0 : memref<1x1200xf64>) {
  ^bb0(%in: f64, %out: f64):
    linalg.yield %in : f64
  }
  // (2) Accumulate %arg0[d0,d1] += %arg1[d0,d2] * %arg2[d1,d2]; the shared
  // trailing dimension d2 ("reduction") makes this A * B^T.
  linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"]} ins(%arg1, %arg2 : memref<1x400xf64, strided<[400, 1], offset: ?>>, memref<1200x400xf64, strided<[400, 1], offset: ?>>) outs(%arg0 : memref<1x1200xf64>) {
  ^bb0(%in: f64, %in_0: f64, %out: f64):
    %0 = arith.mulf %in, %in_0 : f64
    // %out carries the running sum read back from %arg0.
    %1 = arith.addf %out, %0 : f64
    linalg.yield %1 : f64
  }
  // (3) Elementwise %arg4 = %arg0 + %arg3 (presumably a bias add — confirm with caller).
  linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%arg0, %arg3 : memref<1x1200xf64>, memref<1x1200xf64, strided<[1200, 1], offset: ?>>) outs(%arg4 : memref<1x1200xf64, strided<[1200, 1], offset: ?>>) {
  ^bb0(%in: f64, %in_0: f64, %out: f64):
    %0 = arith.addf %in, %in_0 : f64
    linalg.yield %0 : f64
  }
  return
}
  • main$async_dispatch_9_matmul_transpose_b_1x161x600_f32 (27.50% of all cycles)
    Needs support for: math.exp
IR
// Kernel 9: %arg4 = sigmoid((%arg1 x %arg2^T) + %arg3). Three linalg.generic
// ops: zero-fill of the 1x161 accumulator, matmul-transpose-b reduction, then
// a fused add + logistic-sigmoid epilogue. Note the static memref offsets:
// all operands live at fixed positions inside one backing buffer.
func.func @main$async_dispatch_9_matmul_transpose_b_1x161x600_f64$iree_to_xdsl$xDSL_kernel(%arg0: memref<1x161xf64>, %arg1: memref<1x600xf64, strided<[600, 1], offset: 600>>, %arg2: memref<161x600xf64, strided<[600, 1], offset: 2590800>>, %arg3: memref<1x161xf64, strided<[161, 1], offset: 2687400>>, %arg4: memref<1x161xf64>) attributes {llvm.bareptr, xdsl_generated} {
  // 0.0 for the accumulator init; 1.0 for the sigmoid 1/(1+e^-x).
  %cst = arith.constant 0.000000e+00 : f64
  %cst_0 = arith.constant 1.000000e+00 : f64
  // (1) Splat 0.0 over the accumulator %arg0.
  linalg.generic {indexing_maps = [affine_map<(d0, d1) -> ()>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%cst : f64) outs(%arg0 : memref<1x161xf64>) {
  ^bb0(%in: f64, %out: f64):
    linalg.yield %in : f64
  }
  // (2) %arg0[d0,d1] += %arg1[d0,d2] * %arg2[d1,d2] — A * B^T over d2.
  linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"]} ins(%arg1, %arg2 : memref<1x600xf64, strided<[600, 1], offset: 600>>, memref<161x600xf64, strided<[600, 1], offset: 2590800>>) outs(%arg0 : memref<1x161xf64>) {
  ^bb0(%in: f64, %in_1: f64, %out: f64):
    %0 = arith.mulf %in, %in_1 : f64
    %1 = arith.addf %out, %0 : f64
    linalg.yield %1 : f64
  }
  // (3) %arg4 = 1 / (1 + exp(-(%arg0 + %arg3))), i.e. the logistic sigmoid of
  // the biased matmul result. The math.exp here is what blocks lowering.
  linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%arg0, %arg3 : memref<1x161xf64>, memref<1x161xf64, strided<[161, 1], offset: 2687400>>) outs(%arg4 : memref<1x161xf64>) {
  ^bb0(%in: f64, %in_1: f64, %out: f64):
    %0 = arith.addf %in, %in_1 : f64
    %1 = arith.negf %0 : f64
    %2 = math.exp %1 : f64
    %3 = arith.addf %2, %cst_0 : f64
    %4 = arith.divf %cst_0, %3 : f64
    linalg.yield %4 : f64
  }
  return
}
  • main$async_dispatch_8_matmul_transpose_b_1x600x600_f32 (8.85% of all cycles)
IR
// Kernel 8: %arg4 = max((%arg1 x %arg2^T) + %arg3, 0). Zero-fill,
// matmul-transpose-b reduction, then a fused add + ReLU (max with 0) epilogue.
func.func @main$async_dispatch_8_matmul_transpose_b_1x600x600_f64$iree_to_xdsl$xDSL_kernel(%arg0: memref<1x600xf64>, %arg1: memref<1x600xf64>, %arg2: memref<600x600xf64, strided<[600, 1], offset: 2230200>>, %arg3: memref<1x600xf64, strided<[600, 1], offset: 2590200>>, %arg4: memref<1x600xf64, strided<[600, 1], offset: 600>>) attributes {llvm.bareptr, xdsl_generated} {
  // 0.0 serves double duty: accumulator init and the ReLU clamp threshold.
  %cst = arith.constant 0.000000e+00 : f64
  // (1) Splat 0.0 over the accumulator %arg0.
  linalg.generic {indexing_maps = [affine_map<(d0, d1) -> ()>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%cst : f64) outs(%arg0 : memref<1x600xf64>) {
  ^bb0(%in: f64, %out: f64):
    linalg.yield %in : f64
  }
  // (2) %arg0[d0,d1] += %arg1[d0,d2] * %arg2[d1,d2] — A * B^T over d2.
  linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"]} ins(%arg1, %arg2 : memref<1x600xf64>, memref<600x600xf64, strided<[600, 1], offset: 2230200>>) outs(%arg0 : memref<1x600xf64>) {
  ^bb0(%in: f64, %in_0: f64, %out: f64):
    %0 = arith.mulf %in, %in_0 : f64
    %1 = arith.addf %out, %0 : f64
    linalg.yield %1 : f64
  }
  // (3) %arg4 = max(%arg0 + %arg3, 0.0) — bias add followed by ReLU.
  linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%arg0, %arg3 : memref<1x600xf64>, memref<1x600xf64, strided<[600, 1], offset: 2590200>>) outs(%arg4 : memref<1x600xf64, strided<[600, 1], offset: 600>>) {
  ^bb0(%in: f64, %in_0: f64, %out: f64):
    %0 = arith.addf %in, %in_0 : f64
    %1 = arith.maximumf %0, %cst : f64
    linalg.yield %1 : f64
  }
  return
}
  • main$async_dispatch_7_matmul_transpose_b_1x600x400_f32 (5.89% of all cycles)
IR
// Kernel 7: %arg4 = max((%arg1 x %arg2^T) + %arg3, 0). Same structure as
// dispatch_8 (zero-fill, matmul-transpose-b, add + ReLU), different shapes.
func.func @main$async_dispatch_7_matmul_transpose_b_1x600x400_f64$iree_to_xdsl$xDSL_kernel(%arg0: memref<1x600xf64>, %arg1: memref<1x400xf64, strided<[400, 1], offset: 400>>, %arg2: memref<600x400xf64, strided<[400, 1], offset: 1989600>>, %arg3: memref<1x600xf64, strided<[600, 1], offset: 2229600>>, %arg4: memref<1x600xf64>) attributes {llvm.bareptr, xdsl_generated} {
  // 0.0: accumulator init and ReLU threshold.
  %cst = arith.constant 0.000000e+00 : f64
  // (1) Splat 0.0 over the accumulator %arg0.
  linalg.generic {indexing_maps = [affine_map<(d0, d1) -> ()>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%cst : f64) outs(%arg0 : memref<1x600xf64>) {
  ^bb0(%in: f64, %out: f64):
    linalg.yield %in : f64
  }
  // (2) %arg0[d0,d1] += %arg1[d0,d2] * %arg2[d1,d2] — A * B^T over d2.
  linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"]} ins(%arg1, %arg2 : memref<1x400xf64, strided<[400, 1], offset: 400>>, memref<600x400xf64, strided<[400, 1], offset: 1989600>>) outs(%arg0 : memref<1x600xf64>) {
  ^bb0(%in: f64, %in_0: f64, %out: f64):
    %0 = arith.mulf %in, %in_0 : f64
    %1 = arith.addf %out, %0 : f64
    linalg.yield %1 : f64
  }
  // (3) %arg4 = max(%arg0 + %arg3, 0.0) — bias add followed by ReLU.
  linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%arg0, %arg3 : memref<1x600xf64>, memref<1x600xf64, strided<[600, 1], offset: 2229600>>) outs(%arg4 : memref<1x600xf64>) {
  ^bb0(%in: f64, %in_0: f64, %out: f64):
    %0 = arith.addf %in, %in_0 : f64
    %1 = arith.maximumf %0, %cst : f64
    linalg.yield %1 : f64
  }
  return
}
  • main$async_dispatch_0_matmul_transpose_b_1x400x161_f32 (1.62% of all cycles)
IR
// Dispatch 0, stage 1 of 3: fill %arg0 (the 1x50 accumulator) with 0.0.
// Unlike the fused dispatches above, dispatch 0 is split into one xDSL
// kernel per linalg.generic op.
func.func @main$async_dispatch_0_matmul_transpose_b_1x400x161_f64$xdsl_kernel0(%arg0: memref<1x50xf64>) {
  %cst = arith.constant 0.000000e+00 : f64
  // Scalar map "() -> ()" broadcasts %cst to every element of %arg0.
  linalg.generic {indexing_maps = [affine_map<(d0, d1) -> ()>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%cst : f64) outs(%arg0 : memref<1x50xf64>) {
  ^bb0(%in: f64, %out: f64):
    linalg.yield %in : f64
  }
  return
}

// Dispatch 0, stage 2 of 3: %arg2[d0,d1] += %arg0[d0,d2] * %arg1[d1,d2],
// i.e. the matmul-transpose-b reduction (A * B^T over d2) into the
// accumulator zero-filled by kernel0.
func.func @main$async_dispatch_0_matmul_transpose_b_1x400x161_f64$xdsl_kernel1(%arg0: memref<1x161xf64>, %arg1: memref<50x161xf64>, %arg2: memref<1x50xf64>) {
  linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"]} ins(%arg0, %arg1 : memref<1x161xf64>, memref<50x161xf64>) outs(%arg2 : memref<1x50xf64>) {
  ^bb0(%in: f64, %in_0: f64, %out: f64):
    %0 = arith.mulf %in, %in_0 : f64
    // %out is the running sum read back from %arg2.
    %1 = arith.addf %out, %0 : f64
    linalg.yield %1 : f64
  }
  return
}

// Dispatch 0, stage 3 of 3: elementwise %arg2 = %arg0 + %arg1 over 1x50
// (presumably the bias add for the matmul result — confirm with caller).
func.func @main$async_dispatch_0_matmul_transpose_b_1x400x161_f64$xdsl_kernel2(%arg0: memref<1x50xf64>, %arg1: memref<1x50xf64>, %arg2: memref<1x50xf64>) {
  linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%arg0, %arg1 : memref<1x50xf64>, memref<1x50xf64>) outs(%arg2 : memref<1x50xf64>) {
  ^bb0(%in: f64, %in_0: f64, %out: f64):
    %0 = arith.addf %in, %in_0 : f64
    linalg.yield %0 : f64
  }
  return
}
  • main$async_dispatch_3_elementwise_400_f32 (1.26% of all cycles)
    Needs support for: Dynamic offsets in MemRef, math.exp and math.tanh
IR
// Kernel 3: a single fused 1-D elementwise op over 400 elements with seven
// inputs. Per element it computes
//   t = tanh(%in_0 + %in_1 * sigmoid(%in_2 + %in_3))
//   z = sigmoid(%in_4 + %in_5)
//   %out = (%in - t) * z + t            [ = z*%in + (1-z)*t ]
// NOTE(review): the shape matches a GRU-style gated update — inferred from
// the op structure, confirm against the model. All memrefs use dynamic
// offsets (offset: ?), which — with math.exp/math.tanh — is what blocks
// lowering.
func.func @main$async_dispatch_3_elementwise_400_f64$iree_to_xdsl$xDSL_kernel(%arg0: memref<400xf64, strided<[1], offset: ?>>, %arg1: memref<400xf64, strided<[1], offset: ?>>, %arg2: memref<400xf64, strided<[1], offset: ?>>, %arg3: memref<400xf64, strided<[1], offset: ?>>, %arg4: memref<400xf64, strided<[1], offset: ?>>, %arg5: memref<400xf64, strided<[1], offset: ?>>, %arg6: memref<400xf64, strided<[1], offset: ?>>, %arg7: memref<400xf64, strided<[1], offset: ?>>) attributes {llvm.bareptr, xdsl_generated} {
  // 1.0 used by both sigmoid evaluations: 1/(1+e^-x).
  %cst = arith.constant 1.000000e+00 : f64
  linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%arg0, %arg1, %arg2, %arg3, %arg4, %arg5, %arg6 : memref<400xf64, strided<[1], offset: ?>>, memref<400xf64, strided<[1], offset: ?>>, memref<400xf64, strided<[1], offset: ?>>, memref<400xf64, strided<[1], offset: ?>>, memref<400xf64, strided<[1], offset: ?>>, memref<400xf64, strided<[1], offset: ?>>, memref<400xf64, strided<[1], offset: ?>>) outs(%arg7 : memref<400xf64, strided<[1], offset: ?>>) {
  ^bb0(%in: f64, %in_0: f64, %in_1: f64, %in_2: f64, %in_3: f64, %in_4: f64, %in_5: f64, %out: f64):
    // %0 feeds the second sigmoid (gate z) computed at %9-%12 below.
    %0 = arith.addf %in_4, %in_5 : f64
    // %1-%5: sigmoid(%in_2 + %in_3) = 1/(1+exp(-(%in_2+%in_3))).
    %1 = arith.addf %in_2, %in_3 : f64
    %2 = arith.negf %1 : f64
    %3 = math.exp %2 : f64
    %4 = arith.addf %3, %cst : f64
    %5 = arith.divf %cst, %4 : f64
    // %6-%8: t = tanh(%in_0 + %in_1 * sigmoid(...)).
    %6 = arith.mulf %in_1, %5 : f64
    %7 = arith.addf %in_0, %6 : f64
    %8 = math.tanh %7 : f64
    // %9-%12: z = sigmoid(%0).
    %9 = arith.negf %0 : f64
    %10 = math.exp %9 : f64
    %11 = arith.addf %10, %cst : f64
    %12 = arith.divf %cst, %11 : f64
    // %13-%15: interpolate between t and %in by z: (%in - t)*z + t.
    %13 = arith.subf %in, %8 : f64
    %14 = arith.mulf %13, %12 : f64
    %15 = arith.addf %14, %8 : f64
    linalg.yield %15 : f64
  }
  return
}
@superlopuh superlopuh changed the title epic, backend: (risc-v) Lowering NsNet2 IREE kernels backend: (risc-v) Lowering NsNet2 IREE kernels Nov 20, 2024
@superlopuh superlopuh added the backend Compiler backend in xDSL label Nov 20, 2024
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
Labels
backend Compiler backend in xDSL
Projects
None yet
Development

No branches or pull requests

2 participants