script

cornell-zhang · Jan 6, 2024 · 66654df · 66654df
1 parent 80e41d3
commit 66654df
Show file tree

Hide file tree

Showing 7 changed files with 288 additions and 7 deletions.
diff --git a/evaluation/CPU/UniSparse/sparlay_bdia_csr_spmv.mlir b/evaluation/CPU/UniSparse/sparlay_bdia_csr_spmv.mlir
@@ -0,0 +1,164 @@
+// sparlay-opt ./sparlay_bdia_csr_spmv.mlir -lower-struct-convert -lower-struct -dce -sparlay-codegen -lower-format-conversion | \
+// mlir-opt -one-shot-bufferize="bufferize-function-boundaries=1 allow-return-allocs unknown-type-conversion=identity-layout-map function-boundary-type-conversion=identity-layout-map" \
+// -finalizing-bufferize -convert-linalg-to-loops -convert-vector-to-scf -convert-scf-to-cf -lower-affine \
+// -convert-vector-to-llvm -convert-memref-to-llvm -convert-complex-to-standard -convert-math-to-llvm \
+// -convert-math-to-libm -convert-complex-to-libm -convert-complex-to-llvm -convert-func-to-llvm \
+// -reconcile-unrealized-casts  | mlir-translate -mlir-to-llvmir | opt -O3 -S | llc -O3 -relocation-model=pic -filetype=obj -o bdia_spmv.o
+
+// clang++ bdia_spmv.o -L$SPLHOME/build/lib -lmlir_sparlay_runner_utils \
+//     -L$LLVMHOME/build/lib -lmlir_runner_utils -lmlir_c_runner_utils -o bdia_spmv
+
+// ./bdia_spmv
+
+// RUN: sparlay-opt %s -lower-struct-convert -lower-struct -dce -lower-format-conversion | FileCheck %s
+
+
+// Pointer type used for the input file name: it is returned by
+// @getTensorFilename and consumed by sparlay.fromFile in @main.
+!Filename = !llvm.ptr<i8>
+
+// COO: coordinates are kept in their original (i, j) order (identity crdMap).
+// NOTE(review): trim(0,1) presumably keeps explicit coordinates for levels
+// 0 through 1 -- confirm against the sparlay dialect documentation.
+#COO = #sparlay.encoding<{
+  crdMap = #sparlay.crd<(i,j)->(i,j)>,
+  compressMap = #sparlay.compress<trim(0,1)>
+}>
+
+// CSR: same (i, j) coordinate order as #COO; the compressMap differs by
+// applying fuse(0) and trimming only level 1.
+// NOTE(review): fuse(0) presumably collapses repeated row coordinates into a
+// row-pointer array -- confirm against the sparlay dialect documentation.
+#CSR = #sparlay.encoding<{
+  crdMap = #sparlay.crd<(i,j)->(i,j)>,
+  compressMap = #sparlay.compress<fuse(0), trim(1,1)>
+}>
+
+// BDIA: every (i, j) is remapped to (i floordiv 50, j minus i, i mod 50),
+// i.e. the index of a 50-row block, the offset j - i, and the row inside
+// that block.
+// NOTE(review): @main passes %blockSize = 100 to sparlay.decompose_BDIA while
+// this map hard-codes 50 -- confirm the two values are meant to differ.
+#BDIA = #sparlay.encoding<{
+  crdMap = #sparlay.crd<(i,j)->(i floordiv 50, j minus i, i mod 50)>,
+  compressMap = #sparlay.compress<fuse(0), trim(1,1)>
+}>
+
+// linalg.generic trait for a matrix-vector product X(i) += A(i,j) * B(j):
+//   - operand 0 (A) is indexed by (i, j),
+//   - operand 1 (B) is indexed by (j),
+//   - the output (X) is indexed by (i).
+// i is a parallel dimension; j is the reduction dimension.
+#trait1 = {
+  indexing_maps = [
+    affine_map<(i,j) -> (i, j)>,  // A
+    affine_map<(i,j) -> (j)>,  // B
+    affine_map<(i,j) -> (i)>   // X (out)
+  ],
+  iterator_types = ["parallel", "reduction"],
+  doc = "X(i) += A(i,j) * B(j)"
+}
+
+module {
+  func.func private @rtclock() -> f64
+  func.func private @getTensorFilename(index) -> (!Filename)
+  // CSR sparse matrix-vector product driven by #trait1.
+  //   %arg0 : sparse matrix A in #CSR encoding
+  //   %arg1 : dense input vector B
+  //   %argx : dense vector X that the products are accumulated into
+  // Returns the tensor holding X(i) + sum over j of A(i,j) * B(j).
+  func.func @kernel_csr_spmv(%arg0: tensor<?x?xf32, #CSR>,
+                             %arg1: tensor<?xf32>,
+                             %argx: tensor<?xf32>) -> tensor<?xf32> {
+    %spmv = linalg.generic #trait1
+      ins(%arg0, %arg1 : tensor<?x?xf32, #CSR>, tensor<?xf32>)
+      outs(%argx : tensor<?xf32>) {
+      ^bb0(%mat_val: f32, %vec_val: f32, %acc: f32):
+        // Multiply one matrix entry by the matching vector entry and add the
+        // product to the running value of the output element.
+        %prod = arith.mulf %mat_val, %vec_val : f32
+        %sum = arith.addf %acc, %prod : f32
+        linalg.yield %sum : f32
+    } -> tensor<?xf32>
+    return %spmv : tensor<?xf32>
+  }
+
+  // Benchmark driver. It
+  //   1. loads a matrix in #COO form from the file named by
+  //      @getTensorFilename(0),
+  //   2. splits it with sparlay.decompose_BDIA into a struct holding a #COO
+  //      tensor and a #BDIA tensor, and prints the time that took,
+  //   3. converts the #COO member to #CSR,
+  //   4. runs sparlay.bdia_spmv on the two parts and prints the time that took,
+  //   5. prints the first four entries of the result and releases the inputs.
+  // The commented-out lines are earlier experiment variants (a separate CSR
+  // kernel call plus an element-wise combine) kept for reference.
+  func.func @main() {
+    %c0 = arith.constant 0: index
+    %c1 = arith.constant 1 : index
+    %f0 = arith.constant 0.0: f32
+    // %f05 is only referenced from commented-out code below; %i1 is not
+    // referenced anywhere in this function.
+    %f05 = arith.constant 0.5: f32
+    %i1 = arith.constant 1: i32
+    // Arguments forwarded to sparlay.decompose_BDIA.
+    // NOTE(review): the #BDIA crdMap uses 50 as its block factor while this
+    // passes 100 -- confirm that is intentional.
+    %blockSize = arith.constant 100: i32
+    %thres_1 = arith.constant 0.5: f32
+
+    // Load the input matrix and query its two dimensions.
+    %fileName = call @getTensorFilename(%c0) : (index) -> (!Filename)
+    %A_1 = sparlay.fromFile (%fileName): !llvm.ptr<i8> to tensor<?x?xf32, #COO>
+    %dim1 = tensor.dim %A_1, %c1 : tensor<?x?xf32, #COO>
+    %dim0 = tensor.dim %A_1, %c0 : tensor<?x?xf32, #COO>
+    // %thres_1 = arith.constant dense<[0.5]>: tensor<1xf32>
+    // %thres_2 = bufferization.alloc_tensor () copy(%thres_1): tensor<1xf32>
+    // %thres = bufferization.to_memref %thres_2: memref<1xf32>
+
+    // Timed section 1: decomposition. %t_0 is the difference of the two
+    // @rtclock readings (presumably seconds -- confirm in the runtime library).
+    %t_start0 = call @rtclock() : () -> f64
+    %S_1 = sparlay.decompose_BDIA %A_1, %blockSize, %thres_1 : tensor<?x?xf32, #COO>, i32, f32 to 
+          !sparlay.struct< tensor<?x?xf32,#COO>, tensor<?x?xf32,#BDIA> >
+    %t_end0 = call @rtclock() : () -> f64
+    %t_0 = arith.subf %t_end0, %t_start0: f64
+    vector.print %t_0 : f64
+
+    // Pull the two members out of the struct: [0] is the #COO tensor,
+    // [1] is the #BDIA tensor.
+    %B_0 = sparlay.struct_access %S_1[0]: 
+              !sparlay.struct< tensor<?x?xf32,#COO>, tensor<?x?xf32,#BDIA> >
+          to  tensor<?x?xf32, #COO>
+    %B_1 = sparlay.struct_access %S_1[1]:
+              !sparlay.struct< tensor<?x?xf32,#COO>, tensor<?x?xf32,#BDIA> >
+          to  tensor<?x?xf32, #BDIA>
+
+    // Re-encode the #COO member as #CSR for the SpMV below.
+    %D_0 = sparlay.convert(%B_0) : tensor<?x?xf32, #COO> to tensor<?x?xf32, #CSR>
+
+    // %init_256_4 = bufferization.alloc_tensor(%dim1) : tensor<?xf32>
+    // %b = scf.for %i = %c0 to %dim1 step %c1 iter_args(%t = %init_256_4) -> tensor<?xf32> {
+    //   %k0 = arith.muli %i, %c1 : index
+    //   %k1 = arith.index_cast %k0 : index to i32
+    //   %k = arith.sitofp %k1 : i32 to f32
+    //   %t3 = tensor.insert %k into %t[%i] : tensor<?xf32>
+    //   scf.yield %t3 : tensor<?xf32>
+    // }
+    // Input vector %b of length %dim1 with b[i] = float(i).
+    %init_256_4 = bufferization.alloc_tensor(%dim1) : tensor<?xf32>
+    // %tensor_B = tensor.insert %f05 into %init_256_4[%c0] : tensor<?xf32>
+    // %dim1_1 = arith.subi %dim1, %c1 : index
+    // %i_dim1_1 = arith.index_cast %dim1_1 : index to i32
+    // %f_dim1_1 = arith.sitofp %i_dim1_1 : i32 to f32
+    // %elm = arith.divf %f05, %f_dim1_1 : f32
+    // %b = scf.for %i = %c1 to %dim1 step %c1 iter_args(%t = %tensor_B) -> tensor<?xf32> {
+    %b = scf.for %i = %c0 to %dim1 step %c1 iter_args(%t = %init_256_4) -> tensor<?xf32> {
+      %k1 = arith.index_cast %i : index to i32
+      %k = arith.sitofp %k1 : i32 to f32
+      %t3 = tensor.insert %k into %t[%i] : tensor<?xf32>
+      scf.yield %t3 : tensor<?xf32>
+    }
+
+    // %o0 = bufferization.alloc_tensor(%dim0) : tensor<?xf32>
+    // %o00 = scf.for %i = %c0 to %dim0 step %c1 iter_args(%t = %o0) -> tensor<?xf32> {
+    //   %t3 = tensor.insert %f0 into %t[%i] : tensor<?xf32>
+    //   scf.yield %t3 : tensor<?xf32>
+    // }
+    // Output vector of length %dim0; the loop yields a zero-filled tensor %o11.
+    // NOTE(review): %o11 is never used afterwards -- sparlay.bdia_spmv below is
+    // given %o1 (the tensor before the zero fill). Confirm whether %o11 was
+    // intended as the output operand.
+    %o1 = bufferization.alloc_tensor(%dim0) : tensor<?xf32>
+    %o11 = scf.for %i = %c0 to %dim0 step %c1 iter_args(%t = %o1) -> tensor<?xf32> {
+      %t3 = tensor.insert %f0 into %t[%i] : tensor<?xf32>
+      scf.yield %t3 : tensor<?xf32>
+    }
+    // %o2 = bufferization.alloc_tensor(%dim0) : tensor<?xf32>
+    // %o22 = scf.for %i = %c0 to %dim0 step %c1 iter_args(%t = %o2) -> tensor<?xf32> {
+    //   %t3 = tensor.insert %f0 into %t[%i] : tensor<?xf32>
+    //   scf.yield %t3 : tensor<?xf32>
+    // }
+
+    // Timed section 2: the SpMV itself. Only the combined sparlay.bdia_spmv
+    // call is active; it takes the #CSR part, the #BDIA part, the input vector
+    // and the output tensor, and returns a memref.
+    %t_start4 = call @rtclock() : () -> f64
+    // CSR SpMV
+    // %result0 = call @kernel_csr_spmv(%D_0, %b, %o00) : (tensor<?x?xf32, #CSR>, tensor<?xf32>, tensor<?xf32>) -> tensor<?xf32>
+    // %t_end1 = call @rtclock() : () -> f64
+    // block DIA SpMV
+    %result1 = sparlay.bdia_spmv %D_0, %B_1, %b, %o1: 
+      tensor<?x?xf32, #CSR>, tensor<?x?xf32,#BDIA>, tensor<?xf32>, tensor<?xf32> to memref<?xf32>
+    // %t_end2 = call @rtclock() : () -> f64
+    // %output = linalg.elemwise_binary ins(%result0, %result1: tensor<?xf32>, tensor<?xf32>)
+    //                           outs(%o2: tensor<?xf32>) -> tensor<?xf32>
+    %t_end4 = call @rtclock() : () -> f64
+    // %t_1 = arith.subf %t_end1, %t_start4: f64
+    // %t_2 = arith.subf %t_end2, %t_end1: f64
+    // %t_4 = arith.subf %t_end4, %t_end2: f64
+    %t_5 = arith.subf %t_end4, %t_start4: f64
+    // vector.print %t_1 : f64
+    // vector.print %t_2 : f64
+    // vector.print %t_4 : f64
+    vector.print %t_5 : f64
+    // %v0 = vector.transfer_read %result0[%c0], %f0: tensor<?xf32>, vector<4xf32>
+    // vector.print %v0 : vector<4xf32>
+    // Print the first four result entries; %f0 is the padding value used when
+    // the read runs past the end of the memref.
+    %v1 = vector.transfer_read %result1[%c0], %f0: memref<?xf32>, vector<4xf32>
+    vector.print %v1 : vector<4xf32>
+    // %v2 = vector.transfer_read %output[%c0], %f0: tensor<?xf32>, vector<4xf32>
+    // vector.print %v2 : vector<4xf32>
+    // Cleanup. %D_0, %b, %o1 and %result1 are not released here.
+    // NOTE(review): %B_1 was obtained from %S_1; confirm that deallocating it
+    // and then releasing %S_1 does not free the same storage twice.
+    bufferization.dealloc_tensor %A_1 : tensor<?x?xf32, #COO>
+    bufferization.dealloc_tensor %B_1 : tensor<?x?xf32, #BDIA>
+    sparlay.release %S_1: !sparlay.struct< tensor<?x?xf32,#COO>, tensor<?x?xf32,#BDIA> >
+    // bufferization.dealloc_tensor %B_0 : tensor<?x?xf32, #COO>
+    // bufferization.dealloc_tensor %o1 : tensor<?xf32>
+    // bufferization.dealloc_tensor %result0 : tensor<?xf32>
+    // bufferization.dealloc_tensor %output : tensor<?xf32>
+
+    return
+  }
+}
diff --git a/evaluation/CPU/UniSparse/unisparse_bdia_csr_spmv.mlir b/evaluation/CPU/UniSparse/unisparse_bdia_csr_spmv.mlir
@@ -0,0 +1,119 @@
+// sparlay-opt ./unisparse_bdia_csr_spmv.mlir -lower-struct-convert -lower-struct -dce -sparlay-codegen -lower-format-conversion | \
+// mlir-opt -one-shot-bufferize="bufferize-function-boundaries=1 allow-return-allocs unknown-type-conversion=identity-layout-map function-boundary-type-conversion=identity-layout-map" \
+// -finalizing-bufferize -convert-linalg-to-loops -convert-vector-to-scf -convert-scf-to-cf -lower-affine \
+// -convert-vector-to-llvm -convert-memref-to-llvm -convert-complex-to-standard -convert-math-to-llvm \
+// -convert-math-to-libm -convert-complex-to-libm -convert-complex-to-llvm -convert-func-to-llvm \
+// -reconcile-unrealized-casts  | mlir-translate -mlir-to-llvmir | opt -O3 -S | llc -O3 -relocation-model=pic -filetype=obj -o bdia_spmv.o
+
+// clang++ bdia_spmv.o -L$SPLHOME/build/lib -lmlir_sparlay_runner_utils \
+//     -L$LLVMHOME/build/lib -lmlir_runner_utils -lmlir_c_runner_utils -o bdia_spmv
+
+// ./bdia_spmv
+
+// RUN: sparlay-opt %s -lower-struct-convert -lower-struct -dce -lower-format-conversion | FileCheck %s
+
+
+// Pointer type used for the input file name: it is returned by
+// @getTensorFilename and consumed by sparlay.fromFile in @main.
+!Filename = !llvm.ptr<i8>
+
+// COO: coordinates are kept in their original (i, j) order (identity crdMap).
+// NOTE(review): trim(0,1) presumably keeps explicit coordinates for levels
+// 0 through 1 -- confirm against the sparlay dialect documentation.
+#COO = #sparlay.encoding<{
+  crdMap = #sparlay.crd<(i,j)->(i,j)>,
+  compressMap = #sparlay.compress<trim(0,1)>
+}>
+
+// CSR: same (i, j) coordinate order as #COO; the compressMap differs by
+// applying fuse(0) and trimming only level 1.
+// NOTE(review): fuse(0) presumably collapses repeated row coordinates into a
+// row-pointer array -- confirm against the sparlay dialect documentation.
+#CSR = #sparlay.encoding<{
+  crdMap = #sparlay.crd<(i,j)->(i,j)>,
+  compressMap = #sparlay.compress<fuse(0), trim(1,1)>
+}>
+
+// BDIA: every (i, j) is remapped to (i floordiv 50, j minus i, i mod 50),
+// i.e. the index of a 50-row block, the offset j - i, and the row inside
+// that block.
+// NOTE(review): @main passes %blockSize = 100 to sparlay.decompose_BDIA while
+// this map hard-codes 50 -- confirm the two values are meant to differ.
+#BDIA = #sparlay.encoding<{
+  crdMap = #sparlay.crd<(i,j)->(i floordiv 50, j minus i, i mod 50)>,
+  compressMap = #sparlay.compress<fuse(0), trim(1,1)>
+}>
+
+// linalg.generic trait for a matrix-vector product X(i) += A(i,j) * B(j):
+//   - operand 0 (A) is indexed by (i, j),
+//   - operand 1 (B) is indexed by (j),
+//   - the output (X) is indexed by (i).
+// i is a parallel dimension; j is the reduction dimension.
+#trait1 = {
+  indexing_maps = [
+    affine_map<(i,j) -> (i, j)>,  // A
+    affine_map<(i,j) -> (j)>,  // B
+    affine_map<(i,j) -> (i)>   // X (out)
+  ],
+  iterator_types = ["parallel", "reduction"],
+  doc = "X(i) += A(i,j) * B(j)"
+}
+
+module {
+  func.func private @rtclock() -> f64
+  func.func private @getTensorFilename(index) -> (!Filename)
+  // CSR sparse matrix-vector product driven by #trait1.
+  //   %arg0 : sparse matrix A in #CSR encoding
+  //   %arg1 : dense input vector B
+  //   %argx : dense vector X that the products are accumulated into
+  // Returns the tensor holding X(i) + sum over j of A(i,j) * B(j).
+  func.func @kernel_csr_spmv(%arg0: tensor<?x?xf32, #CSR>,
+                             %arg1: tensor<?xf32>,
+                             %argx: tensor<?xf32>) -> tensor<?xf32> {
+    %spmv = linalg.generic #trait1
+      ins(%arg0, %arg1 : tensor<?x?xf32, #CSR>, tensor<?xf32>)
+      outs(%argx : tensor<?xf32>) {
+      ^bb0(%mat_val: f32, %vec_val: f32, %acc: f32):
+        // Multiply one matrix entry by the matching vector entry and add the
+        // product to the running value of the output element.
+        %prod = arith.mulf %mat_val, %vec_val : f32
+        %sum = arith.addf %acc, %prod : f32
+        linalg.yield %sum : f32
+    } -> tensor<?xf32>
+    return %spmv : tensor<?xf32>
+  }
+
+  // Benchmark driver. It
+  //   1. loads a matrix in #COO form from the file named by
+  //      @getTensorFilename(0),
+  //   2. splits it with sparlay.decompose_BDIA into a struct holding a #COO
+  //      tensor and a #BDIA tensor, and prints the time that took,
+  //   3. converts the #COO member to #CSR,
+  //   4. runs sparlay.bdia_spmv on the two parts and prints the time that took,
+  //   5. prints the first four entries of the result and releases the inputs.
+  func.func @main() {
+    %c0 = arith.constant 0: index
+    %c1 = arith.constant 1 : index
+    %f0 = arith.constant 0.0: f32
+    // Arguments forwarded to sparlay.decompose_BDIA.
+    // NOTE(review): the #BDIA crdMap uses 50 as its block factor while this
+    // passes 100 -- confirm that is intentional.
+    %blockSize = arith.constant 100: i32
+    %thres_1 = arith.constant 0.3: f32
+
+    // Load the input matrix and query its two dimensions.
+    %fileName = call @getTensorFilename(%c0) : (index) -> (!Filename)
+    %A_1 = sparlay.fromFile (%fileName): !llvm.ptr<i8> to tensor<?x?xf32, #COO>
+    %dim1 = tensor.dim %A_1, %c1 : tensor<?x?xf32, #COO>
+    %dim0 = tensor.dim %A_1, %c0 : tensor<?x?xf32, #COO>
+
+    // Timed section 1: decomposition. %t_0 is the difference of the two
+    // @rtclock readings (presumably seconds -- confirm in the runtime library).
+    %t_start0 = call @rtclock() : () -> f64
+    %S_1 = sparlay.decompose_BDIA %A_1, %blockSize, %thres_1 : tensor<?x?xf32, #COO>, i32, f32 to
+          !sparlay.struct< tensor<?x?xf32,#COO>, tensor<?x?xf32,#BDIA> >
+    %t_end0 = call @rtclock() : () -> f64
+    %t_0 = arith.subf %t_end0, %t_start0: f64
+    vector.print %t_0 : f64
+
+    // Pull the two members out of the struct: [0] is the #COO tensor,
+    // [1] is the #BDIA tensor.
+    %B_0 = sparlay.struct_access %S_1[0]:
+              !sparlay.struct< tensor<?x?xf32,#COO>, tensor<?x?xf32,#BDIA> >
+          to  tensor<?x?xf32, #COO>
+    %B_1 = sparlay.struct_access %S_1[1]:
+              !sparlay.struct< tensor<?x?xf32,#COO>, tensor<?x?xf32,#BDIA> >
+          to  tensor<?x?xf32, #BDIA>
+
+    // Re-encode the #COO member as #CSR for the SpMV below.
+    %D_0 = sparlay.convert(%B_0) : tensor<?x?xf32, #COO> to tensor<?x?xf32, #CSR>
+
+    // Input vector %b of length %dim1 with b[i] = float(i).
+    %init_256_4 = bufferization.alloc_tensor(%dim1) : tensor<?xf32>
+    %b = scf.for %i = %c0 to %dim1 step %c1 iter_args(%t = %init_256_4) -> tensor<?xf32> {
+      %k1 = arith.index_cast %i : index to i32
+      %k = arith.sitofp %k1 : i32 to f32
+      %t3 = tensor.insert %k into %t[%i] : tensor<?xf32>
+      scf.yield %t3 : tensor<?xf32>
+    }
+
+    // Output vector of length %dim0; the loop yields a zero-filled tensor %o11.
+    // NOTE(review): %o11 is never used afterwards -- sparlay.bdia_spmv below is
+    // given %o1 (the tensor before the zero fill). Confirm whether %o11 was
+    // intended as the output operand.
+    %o1 = bufferization.alloc_tensor(%dim0) : tensor<?xf32>
+    %o11 = scf.for %i = %c0 to %dim0 step %c1 iter_args(%t = %o1) -> tensor<?xf32> {
+      %t3 = tensor.insert %f0 into %t[%i] : tensor<?xf32>
+      scf.yield %t3 : tensor<?xf32>
+    }
+
+    // Timed section 2: the SpMV itself. sparlay.bdia_spmv takes the #CSR part,
+    // the #BDIA part, the input vector and the output tensor, and returns a
+    // memref.
+    %t_start4 = call @rtclock() : () -> f64
+    // block DIA SpMV
+    %result1 = sparlay.bdia_spmv %D_0, %B_1, %b, %o1:
+      tensor<?x?xf32, #CSR>, tensor<?x?xf32,#BDIA>, tensor<?xf32>, tensor<?xf32> to memref<?xf32>
+    %t_end4 = call @rtclock() : () -> f64
+    %t_5 = arith.subf %t_end4, %t_start4: f64
+    vector.print %t_5 : f64
+
+    // Print the first four result entries; %f0 is the padding value used when
+    // the read runs past the end of the memref.
+    %v1 = vector.transfer_read %result1[%c0], %f0: memref<?xf32>, vector<4xf32>
+    vector.print %v1 : vector<4xf32>
+
+    // Cleanup. %D_0, %b, %o1 and %result1 are not released here.
+    // NOTE(review): %B_1 was obtained from %S_1; confirm that deallocating it
+    // and then releasing %S_1 does not free the same storage twice.
+    bufferization.dealloc_tensor %A_1 : tensor<?x?xf32, #COO>
+    bufferization.dealloc_tensor %B_1 : tensor<?x?xf32, #BDIA>
+    sparlay.release %S_1: !sparlay.struct< tensor<?x?xf32,#COO>, tensor<?x?xf32,#BDIA> >
+
+    return
+  }
+}
diff --git a/evaluation/FormatConversion/UniSparse/executables/coo_cisr b/evaluation/FormatConversion/UniSparse/executables/coo_cisr
diff --git a/evaluation/FormatConversion/UniSparse/executables/coo_cisr.o b/evaluation/FormatConversion/UniSparse/executables/coo_cisr.o
diff --git a/...onversion/UniSparse/temp_coo_to_cisr.mlir → ...version/UniSparse/unisparse_coo_cisr.mlir b/...onversion/UniSparse/temp_coo_to_cisr.mlir → ...version/UniSparse/unisparse_coo_cisr.mlir
@@ -2,18 +2,11 @@ module {
   func.func private @delUniSparseTensorF32(!llvm.ptr<i8>)
   func.func private @sptCheckF32(!llvm.ptr<i8>, !llvm.ptr<i8>) attributes {llvm.emit_c_interface}
   func.func private @sptMoveF32(!llvm.ptr<i8>, i32, i32) -> !llvm.ptr<i8> attributes {llvm.emit_c_interface}
-  func.func private @sptSwapF32(!llvm.ptr<i8>, i32, i32) -> !llvm.ptr<i8> attributes {llvm.emit_c_interface}
-  func.func private @sptTileSplitF32(!llvm.ptr<i8>, i32, i32) -> !llvm.ptr<i8> attributes {llvm.emit_c_interface}
-  func.func private @sptSeparateF32(!llvm.ptr<i8>, i32) -> !llvm.ptr<i8> attributes {llvm.emit_c_interface}
-  func.func private @sptTrimF32(!llvm.ptr<i8>, i32) -> !llvm.ptr<i8> attributes {llvm.emit_c_interface}
-  func.func private @sptGrowF32(!llvm.ptr<i8>, i32) -> !llvm.ptr<i8> attributes {llvm.emit_c_interface}
   func.func private @sptFuseF32(!llvm.ptr<i8>, i32) -> !llvm.ptr<i8> attributes {llvm.emit_c_interface}
   func.func private @sptSumF32(!llvm.ptr<i8>, i32) -> !llvm.ptr<i8> attributes {llvm.emit_c_interface}
-  func.func private @sptEnumerateF32(!llvm.ptr<i8>, i32, i32) -> !llvm.ptr<i8> attributes {llvm.emit_c_interface}
   func.func private @sptScheduleF32(!llvm.ptr<i8>, i32, i32, i32) -> !llvm.ptr<i8> attributes {llvm.emit_c_interface}
   func.func private @sptPadF32(!llvm.ptr<i8>, i32, i32) -> !llvm.ptr<i8> attributes {llvm.emit_c_interface}
   func.func private @sptReorderF32(!llvm.ptr<i8>, i32, i32) -> !llvm.ptr<i8> attributes {llvm.emit_c_interface}
-  func.func private @sptCustTrimF32(!llvm.ptr<i8>, i32) -> !llvm.ptr<i8> attributes {llvm.emit_c_interface}
   func.func private @sptCopyF32(!llvm.ptr<i8>) -> !llvm.ptr<i8> attributes {llvm.emit_c_interface}
   func.func private @sptFromFileF32(!llvm.ptr<i8>) -> !llvm.ptr<i8> attributes {llvm.emit_c_interface}
   func.func private @rtclock() -> f64

diff --git a/evaluation/FormatConversion/coo_cisr.o b/evaluation/FormatConversion/coo_cisr.o
diff --git a/evaluation/FormatConversion/run.sh b/evaluation/FormatConversion/run.sh
@@ -19,6 +19,8 @@ mlir-opt ./UniSparse/unisparse_csb_dia_v.mlir -one-shot-bufferize="bufferize-fun
 clang++ csb_dia_v.o -L$SPLHOME/build/lib -lmlir_unisparse_runner_utils -L$LLVM_ROOT/build/lib -lmlir_runner_utils -lmlir_c_runner_utils -o csb_dia_v
 mlir-opt ./UniSparse/unisparse_coo_c2sr.mlir -one-shot-bufferize="bufferize-function-boundaries=1 allow-return-allocs unknown-type-conversion=identity-layout-map function-boundary-type-conversion=identity-layout-map" -finalizing-bufferize -convert-linalg-to-loops -convert-vector-to-scf -convert-scf-to-cf -lower-affine -convert-vector-to-llvm -convert-memref-to-llvm -convert-complex-to-standard -convert-math-to-llvm -convert-math-to-libm -convert-complex-to-libm -convert-complex-to-llvm -convert-func-to-llvm -reconcile-unrealized-casts  | mlir-translate -mlir-to-llvmir | opt -O3 -S | llc -O3 -relocation-model=pic -filetype=obj -o coo_c2sr.o
 clang++ coo_c2sr.o -L$SPLHOME/build/lib -lmlir_unisparse_runner_utils -L$LLVM_ROOT/build/lib -lmlir_runner_utils -lmlir_c_runner_utils -o coo_c2sr
+mlir-opt ./UniSparse/unisparse_coo_cisr.mlir -one-shot-bufferize="bufferize-function-boundaries=1 allow-return-allocs unknown-type-conversion=identity-layout-map function-boundary-type-conversion=identity-layout-map" -finalizing-bufferize -convert-linalg-to-loops -convert-vector-to-scf -convert-scf-to-cf -lower-affine -convert-vector-to-llvm -convert-memref-to-llvm -convert-complex-to-standard -convert-math-to-llvm -convert-math-to-libm -convert-complex-to-libm -convert-complex-to-llvm -convert-func-to-llvm -reconcile-unrealized-casts  | mlir-translate -mlir-to-llvmir | opt -O3 -S | llc -O3 -relocation-model=pic -filetype=obj -o coo_cisr.o
+clang++ coo_cisr.o -L$SPLHOME/build/lib -lmlir_unisparse_runner_utils -L$LLVM_ROOT/build/lib -lmlir_runner_utils -lmlir_c_runner_utils -o coo_cisr
 
 mlir-opt ./sparse_tensor_dialect/sparse_tensor_csr_to_csc.mlir -sparse-compiler | mlir-translate -mlir-to-llvmir | opt -O3 -S | llc -O3 -relocation-model=pic -filetype=obj -o sparse_tensor_csr_csc.o
 clang++ sparse_tensor_csr_csc.o -L$LLVM_ROOT/build/lib -lmlir_runner_utils -lmlir_c_runner_utils -o sparse_tensor_csr_csc
@@ -56,4 +58,7 @@ do
 
     echo COO_C2SR UniSparse 
     ./coo_c2sr
+
+    echo COO_CISR UniSparse 
+    ./coo_cisr
 done