Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

backend: (risc-v) Lowering NsNet2 IREE kernels #2740

Open
4 of 6 tasks
zero9178 opened this issue Jun 17, 2024 · 0 comments
Open
4 of 6 tasks

backend: (risc-v) Lowering NsNet2 IREE kernels #2740

zero9178 opened this issue Jun 17, 2024 · 0 comments
Labels
backend Compiler backend in xDSL

Comments

@zero9178
Copy link
Contributor

zero9178 commented Jun 17, 2024

NsNet2, as processed by our IREE backend (https://github.com/opencompl/Quidditch) currently produces 6 different kernels.
This epic documents the state of compiling each kernel, together with its input IR. The kernels are ordered by the percentage of total cycles they account for when executed with the LLVM backend.

  • main$async_dispatch_1_matmul_transpose_b_1x1200x400_f32 (48.71% of all cycles)
IR
// Kernel 1: computes %arg4 = (%arg1 x %arg2^T) + %arg3 as three linalg.generic
// ops over memrefs: (1) zero-fill the 1x1200 accumulator %arg0, (2) a
// matmul-transpose-b reduction into %arg0, (3) an elementwise add into %arg4.
func.func @main$async_dispatch_1_matmul_transpose_b_1x1200x400_f64$iree_to_xdsl$xDSL_kernel(%arg0: memref<1x1200xf64>, %arg1: memref<1x400xf64, strided<[400, 1], offset: ?>>, %arg2: memref<1200x400xf64, strided<[400, 1], offset: ?>>, %arg3: memref<1x1200xf64, strided<[1200, 1], offset: ?>>, %arg4: memref<1x1200xf64, strided<[1200, 1], offset: ?>>) attributes {llvm.bareptr, xdsl_generated} {
  // Zero value used to initialize the accumulator.
  %cst = arith.constant 0.000000e+00 : f64
  // (1) Broadcast-fill: the scalar input map "() -> ()" splats %cst over %arg0.
  linalg.generic {indexing_maps = [affine_map<(d0, d1) -> ()>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%cst : f64) outs(%arg0 : memref<1x1200xf64>) {
  ^bb0(%in: f64, %out: f64):
    linalg.yield %in : f64
  }
  // (2) Accumulate %arg0[d0,d1] += %arg1[d0,d2] * %arg2[d1,d2]; the shared
  // trailing dimension d2 ("reduction") makes this A * B^T.
  linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"]} ins(%arg1, %arg2 : memref<1x400xf64, strided<[400, 1], offset: ?>>, memref<1200x400xf64, strided<[400, 1], offset: ?>>) outs(%arg0 : memref<1x1200xf64>) {
  ^bb0(%in: f64, %in_0: f64, %out: f64):
    %0 = arith.mulf %in, %in_0 : f64
    // %out carries the running sum read back from %arg0.
    %1 = arith.addf %out, %0 : f64
    linalg.yield %1 : f64
  }
  // (3) Elementwise %arg4 = %arg0 + %arg3 (presumably a bias add — confirm with caller).
  linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%arg0, %arg3 : memref<1x1200xf64>, memref<1x1200xf64, strided<[1200, 1], offset: ?>>) outs(%arg4 : memref<1x1200xf64, strided<[1200, 1], offset: ?>>) {
  ^bb0(%in: f64, %in_0: f64, %out: f64):
    %0 = arith.addf %in, %in_0 : f64
    linalg.yield %0 : f64
  }
  return
}
  • main$async_dispatch_9_matmul_transpose_b_1x161x600_f32 (27.50% of all cycles)
    Needs support for: math.exp
IR
// Kernel 9: %arg4 = sigmoid((%arg1 x %arg2^T) + %arg3). Three linalg.generic
// ops: zero-fill of the 1x161 accumulator, matmul-transpose-b reduction, then
// a fused add + logistic-sigmoid epilogue. Note the static memref offsets:
// all operands live at fixed positions inside one backing buffer.
func.func @main$async_dispatch_9_matmul_transpose_b_1x161x600_f64$iree_to_xdsl$xDSL_kernel(%arg0: memref<1x161xf64>, %arg1: memref<1x600xf64, strided<[600, 1], offset: 600>>, %arg2: memref<161x600xf64, strided<[600, 1], offset: 2590800>>, %arg3: memref<1x161xf64, strided<[161, 1], offset: 2687400>>, %arg4: memref<1x161xf64>) attributes {llvm.bareptr, xdsl_generated} {
  // 0.0 for the accumulator init; 1.0 for the sigmoid 1/(1+e^-x).
  %cst = arith.constant 0.000000e+00 : f64
  %cst_0 = arith.constant 1.000000e+00 : f64
  // (1) Splat 0.0 over the accumulator %arg0.
  linalg.generic {indexing_maps = [affine_map<(d0, d1) -> ()>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%cst : f64) outs(%arg0 : memref<1x161xf64>) {
  ^bb0(%in: f64, %out: f64):
    linalg.yield %in : f64
  }
  // (2) %arg0[d0,d1] += %arg1[d0,d2] * %arg2[d1,d2] — A * B^T over d2.
  linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"]} ins(%arg1, %arg2 : memref<1x600xf64, strided<[600, 1], offset: 600>>, memref<161x600xf64, strided<[600, 1], offset: 2590800>>) outs(%arg0 : memref<1x161xf64>) {
  ^bb0(%in: f64, %in_1: f64, %out: f64):
    %0 = arith.mulf %in, %in_1 : f64
    %1 = arith.addf %out, %0 : f64
    linalg.yield %1 : f64
  }
  // (3) %arg4 = 1 / (1 + exp(-(%arg0 + %arg3))), i.e. the logistic sigmoid of
  // the biased matmul result. The math.exp here is what blocks lowering.
  linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%arg0, %arg3 : memref<1x161xf64>, memref<1x161xf64, strided<[161, 1], offset: 2687400>>) outs(%arg4 : memref<1x161xf64>) {
  ^bb0(%in: f64, %in_1: f64, %out: f64):
    %0 = arith.addf %in, %in_1 : f64
    %1 = arith.negf %0 : f64
    %2 = math.exp %1 : f64
    %3 = arith.addf %2, %cst_0 : f64
    %4 = arith.divf %cst_0, %3 : f64
    linalg.yield %4 : f64
  }
  return
}
  • main$async_dispatch_8_matmul_transpose_b_1x600x600_f32 (8.85% of all cycles)
IR
// Kernel 8: %arg4 = max((%arg1 x %arg2^T) + %arg3, 0). Zero-fill,
// matmul-transpose-b reduction, then a fused add + ReLU (max with 0) epilogue.
func.func @main$async_dispatch_8_matmul_transpose_b_1x600x600_f64$iree_to_xdsl$xDSL_kernel(%arg0: memref<1x600xf64>, %arg1: memref<1x600xf64>, %arg2: memref<600x600xf64, strided<[600, 1], offset: 2230200>>, %arg3: memref<1x600xf64, strided<[600, 1], offset: 2590200>>, %arg4: memref<1x600xf64, strided<[600, 1], offset: 600>>) attributes {llvm.bareptr, xdsl_generated} {
  // 0.0 serves double duty: accumulator init and the ReLU clamp threshold.
  %cst = arith.constant 0.000000e+00 : f64
  // (1) Splat 0.0 over the accumulator %arg0.
  linalg.generic {indexing_maps = [affine_map<(d0, d1) -> ()>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%cst : f64) outs(%arg0 : memref<1x600xf64>) {
  ^bb0(%in: f64, %out: f64):
    linalg.yield %in : f64
  }
  // (2) %arg0[d0,d1] += %arg1[d0,d2] * %arg2[d1,d2] — A * B^T over d2.
  linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"]} ins(%arg1, %arg2 : memref<1x600xf64>, memref<600x600xf64, strided<[600, 1], offset: 2230200>>) outs(%arg0 : memref<1x600xf64>) {
  ^bb0(%in: f64, %in_0: f64, %out: f64):
    %0 = arith.mulf %in, %in_0 : f64
    %1 = arith.addf %out, %0 : f64
    linalg.yield %1 : f64
  }
  // (3) %arg4 = max(%arg0 + %arg3, 0.0) — bias add followed by ReLU.
  linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%arg0, %arg3 : memref<1x600xf64>, memref<1x600xf64, strided<[600, 1], offset: 2590200>>) outs(%arg4 : memref<1x600xf64, strided<[600, 1], offset: 600>>) {
  ^bb0(%in: f64, %in_0: f64, %out: f64):
    %0 = arith.addf %in, %in_0 : f64
    %1 = arith.maximumf %0, %cst : f64
    linalg.yield %1 : f64
  }
  return
}
  • main$async_dispatch_7_matmul_transpose_b_1x600x400_f32 (5.89% of all cycles)
IR
// Kernel 7: %arg4 = max((%arg1 x %arg2^T) + %arg3, 0). Same structure as
// dispatch_8 (zero-fill, matmul-transpose-b, add + ReLU), different shapes.
func.func @main$async_dispatch_7_matmul_transpose_b_1x600x400_f64$iree_to_xdsl$xDSL_kernel(%arg0: memref<1x600xf64>, %arg1: memref<1x400xf64, strided<[400, 1], offset: 400>>, %arg2: memref<600x400xf64, strided<[400, 1], offset: 1989600>>, %arg3: memref<1x600xf64, strided<[600, 1], offset: 2229600>>, %arg4: memref<1x600xf64>) attributes {llvm.bareptr, xdsl_generated} {
  // 0.0: accumulator init and ReLU threshold.
  %cst = arith.constant 0.000000e+00 : f64
  // (1) Splat 0.0 over the accumulator %arg0.
  linalg.generic {indexing_maps = [affine_map<(d0, d1) -> ()>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%cst : f64) outs(%arg0 : memref<1x600xf64>) {
  ^bb0(%in: f64, %out: f64):
    linalg.yield %in : f64
  }
  // (2) %arg0[d0,d1] += %arg1[d0,d2] * %arg2[d1,d2] — A * B^T over d2.
  linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"]} ins(%arg1, %arg2 : memref<1x400xf64, strided<[400, 1], offset: 400>>, memref<600x400xf64, strided<[400, 1], offset: 1989600>>) outs(%arg0 : memref<1x600xf64>) {
  ^bb0(%in: f64, %in_0: f64, %out: f64):
    %0 = arith.mulf %in, %in_0 : f64
    %1 = arith.addf %out, %0 : f64
    linalg.yield %1 : f64
  }
  // (3) %arg4 = max(%arg0 + %arg3, 0.0) — bias add followed by ReLU.
  linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%arg0, %arg3 : memref<1x600xf64>, memref<1x600xf64, strided<[600, 1], offset: 2229600>>) outs(%arg4 : memref<1x600xf64>) {
  ^bb0(%in: f64, %in_0: f64, %out: f64):
    %0 = arith.addf %in, %in_0 : f64
    %1 = arith.maximumf %0, %cst : f64
    linalg.yield %1 : f64
  }
  return
}
  • main$async_dispatch_0_matmul_transpose_b_1x400x161_f32 (1.62% of all cycles)
IR
// Dispatch 0, stage 1 of 3: fill %arg0 (the 1x50 accumulator) with 0.0.
// Unlike the fused dispatches above, dispatch 0 is split into one xDSL
// kernel per linalg.generic op.
func.func @main$async_dispatch_0_matmul_transpose_b_1x400x161_f64$xdsl_kernel0(%arg0: memref<1x50xf64>) {
  %cst = arith.constant 0.000000e+00 : f64
  // Scalar map "() -> ()" broadcasts %cst to every element of %arg0.
  linalg.generic {indexing_maps = [affine_map<(d0, d1) -> ()>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%cst : f64) outs(%arg0 : memref<1x50xf64>) {
  ^bb0(%in: f64, %out: f64):
    linalg.yield %in : f64
  }
  return
}

// Dispatch 0, stage 2 of 3: %arg2[d0,d1] += %arg0[d0,d2] * %arg1[d1,d2],
// i.e. the matmul-transpose-b reduction (A * B^T over d2) into the
// accumulator zero-filled by kernel0.
func.func @main$async_dispatch_0_matmul_transpose_b_1x400x161_f64$xdsl_kernel1(%arg0: memref<1x161xf64>, %arg1: memref<50x161xf64>, %arg2: memref<1x50xf64>) {
  linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"]} ins(%arg0, %arg1 : memref<1x161xf64>, memref<50x161xf64>) outs(%arg2 : memref<1x50xf64>) {
  ^bb0(%in: f64, %in_0: f64, %out: f64):
    %0 = arith.mulf %in, %in_0 : f64
    // %out is the running sum read back from %arg2.
    %1 = arith.addf %out, %0 : f64
    linalg.yield %1 : f64
  }
  return
}

// Dispatch 0, stage 3 of 3: elementwise %arg2 = %arg0 + %arg1 over 1x50
// (presumably the bias add for the matmul result — confirm with caller).
func.func @main$async_dispatch_0_matmul_transpose_b_1x400x161_f64$xdsl_kernel2(%arg0: memref<1x50xf64>, %arg1: memref<1x50xf64>, %arg2: memref<1x50xf64>) {
  linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%arg0, %arg1 : memref<1x50xf64>, memref<1x50xf64>) outs(%arg2 : memref<1x50xf64>) {
  ^bb0(%in: f64, %in_0: f64, %out: f64):
    %0 = arith.addf %in, %in_0 : f64
    linalg.yield %0 : f64
  }
  return
}
  • main$async_dispatch_3_elementwise_400_f32 (1.26% of all cycles)
    Needs support for: Dynamic offsets in MemRef, math.exp and math.tanh
IR
// Kernel 3: a single fused 1-D elementwise op over 400 elements with seven
// inputs. Per element it computes
//   t = tanh(%in_0 + %in_1 * sigmoid(%in_2 + %in_3))
//   z = sigmoid(%in_4 + %in_5)
//   %out = (%in - t) * z + t            [ = z*%in + (1-z)*t ]
// NOTE(review): the shape matches a GRU-style gated update — inferred from
// the op structure, confirm against the model. All memrefs use dynamic
// offsets (offset: ?), which — with math.exp/math.tanh — is what blocks
// lowering.
func.func @main$async_dispatch_3_elementwise_400_f64$iree_to_xdsl$xDSL_kernel(%arg0: memref<400xf64, strided<[1], offset: ?>>, %arg1: memref<400xf64, strided<[1], offset: ?>>, %arg2: memref<400xf64, strided<[1], offset: ?>>, %arg3: memref<400xf64, strided<[1], offset: ?>>, %arg4: memref<400xf64, strided<[1], offset: ?>>, %arg5: memref<400xf64, strided<[1], offset: ?>>, %arg6: memref<400xf64, strided<[1], offset: ?>>, %arg7: memref<400xf64, strided<[1], offset: ?>>) attributes {llvm.bareptr, xdsl_generated} {
  // 1.0 used by both sigmoid evaluations: 1/(1+e^-x).
  %cst = arith.constant 1.000000e+00 : f64
  linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%arg0, %arg1, %arg2, %arg3, %arg4, %arg5, %arg6 : memref<400xf64, strided<[1], offset: ?>>, memref<400xf64, strided<[1], offset: ?>>, memref<400xf64, strided<[1], offset: ?>>, memref<400xf64, strided<[1], offset: ?>>, memref<400xf64, strided<[1], offset: ?>>, memref<400xf64, strided<[1], offset: ?>>, memref<400xf64, strided<[1], offset: ?>>) outs(%arg7 : memref<400xf64, strided<[1], offset: ?>>) {
  ^bb0(%in: f64, %in_0: f64, %in_1: f64, %in_2: f64, %in_3: f64, %in_4: f64, %in_5: f64, %out: f64):
    // %0 feeds the second sigmoid (gate z) computed at %9-%12 below.
    %0 = arith.addf %in_4, %in_5 : f64
    // %1-%5: sigmoid(%in_2 + %in_3) = 1/(1+exp(-(%in_2+%in_3))).
    %1 = arith.addf %in_2, %in_3 : f64
    %2 = arith.negf %1 : f64
    %3 = math.exp %2 : f64
    %4 = arith.addf %3, %cst : f64
    %5 = arith.divf %cst, %4 : f64
    // %6-%8: t = tanh(%in_0 + %in_1 * sigmoid(...)).
    %6 = arith.mulf %in_1, %5 : f64
    %7 = arith.addf %in_0, %6 : f64
    %8 = math.tanh %7 : f64
    // %9-%12: z = sigmoid(%0).
    %9 = arith.negf %0 : f64
    %10 = math.exp %9 : f64
    %11 = arith.addf %10, %cst : f64
    %12 = arith.divf %cst, %11 : f64
    // %13-%15: interpolate between t and %in by z: (%in - t)*z + t.
    %13 = arith.subf %in, %8 : f64
    %14 = arith.mulf %13, %12 : f64
    %15 = arith.addf %14, %8 : f64
    linalg.yield %15 : f64
  }
  return
}
@superlopuh superlopuh changed the title epic, backend: (risc-v) Lowering NsNet2 IREE kernels backend: (risc-v) Lowering NsNet2 IREE kernels Nov 20, 2024
@superlopuh superlopuh added the backend Compiler backend in xDSL label Nov 20, 2024
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
Labels
backend Compiler backend in xDSL
Projects
None yet
Development

No branches or pull requests

2 participants