diff --git a/.gitignore b/.gitignore index bd318e0..7390008 100644 --- a/.gitignore +++ b/.gitignore @@ -1,7 +1,7 @@ build/* /build* -# /scripts* -# scripts/* +/scripts* +scripts/* evaluation/CPU/build* storage-src/1 /eigen \ No newline at end of file diff --git a/.vscode/settings.json b/.vscode/settings.json new file mode 100644 index 0000000..fdb714d --- /dev/null +++ b/.vscode/settings.json @@ -0,0 +1,8 @@ +{ + "files.associations": { + "random": "cpp", + "cmath": "cpp", + "iostream": "cpp", + "ostream": "cpp" + } +} \ No newline at end of file diff --git a/README.pdf b/README.pdf new file mode 100644 index 0000000..8c99e54 Binary files /dev/null and b/README.pdf differ diff --git a/evaluation/FormatCustomization/PageRank/Intel_MKL/Makefile b/evaluation/FormatCustomization/PageRank/Intel_MKL/Makefile new file mode 100644 index 0000000..a0acc8d --- /dev/null +++ b/evaluation/FormatCustomization/PageRank/Intel_MKL/Makefile @@ -0,0 +1,87 @@ +# TACO_INCLUDE_DIR=/home/ubuntu/efs/software/taco/include +# TACO_LIBRARY_DIR=/home/ubuntu/efs/software/taco/build/lib + +# GBLAS_INCLUDE_DIR=/home/ubuntu/project/GraphBLAS/include +# GBLAS_LIBRARY_DIR=/home/ubuntu/project/GraphBLAS/lib + +#MKL_INCLUDE_DIR=/work/shared/common/CAD_tool/Intel/mkl/mkl/include +#MKL_LIBRARY_DIR=/work/shared/common/CAD_tool/Intel/mkl/mkl/lib/intel64 + +MKL_INCLUDE_DIR=/opt/intel/oneapi/mkl/latest/include +MKL_LIBRARY_DIR=/opt/intel/oneapi/mkl/latest/lib/intel64 + +# CNPY_INCLUDE_DIR=/work/shared/common/project_build/graphblas/software/cnpy +# CNPY_LIBRARY_DIR=/work/shared/common/project_build/graphblas/software/cnpy/build + +BUILD_DIR=./build + +# taco_sddmm: taco_sddmm.cpp +# mkdir -p $(BUILD_DIR) +# g++ -std=c++14 -O3 -I${TACO_INCLUDE_DIR} -L${TACO_LIBRARY_DIR} taco_sddmm.cpp -o $(BUILD_DIR)/taco_sddmm -ltaco + +# graphblas_spmm: graphblas_spmm.cpp +# mkdir -p $(BUILD_DIR) +# g++ -std=c++11 -O3 -I${GBLAS_INCLUDE_DIR} -Wl,-rpath,${GBLAS_LIBRARY_DIR} -L${GBLAS_LIBRARY_DIR} \ +# graphblas_spmm.cpp -o 
$(BUILD_DIR)/graphblas_spmm -llagraph -lgraphblas -lm -lcnpy + +mkl_spmm: mkl_spmm.cpp + mkdir -p $(BUILD_DIR) + g++ -std=c++11 -O3 -I${MKL_INCLUDE_DIR} \ + ${CXXFLAGS} \ + -Wl,--no-as-needed \ + mkl_spmm.cpp -o $(BUILD_DIR)/mkl_spmm \ + -Wl,--start-group ${MKL_LIBRARY_DIR}/libmkl_intel_lp64.a \ + ${MKL_LIBRARY_DIR}/libmkl_core.a ${MKL_LIBRARY_DIR}/libmkl_gnu_thread.a -Wl,--end-group \ + -fopenmp -lpthread -lm -ldl + +mkl_spmv: mkl_spmv.cpp + mkdir -p $(BUILD_DIR) + g++ -std=c++11 -O3 -I${MKL_INCLUDE_DIR} \ + ${CXXFLAGS} \ + -Wl,--no-as-needed \ + mkl_spmv.cpp -o $(BUILD_DIR)/mkl_spmv \ + -Wl,--start-group ${MKL_LIBRARY_DIR}/libmkl_intel_lp64.a \ + ${MKL_LIBRARY_DIR}/libmkl_core.a ${MKL_LIBRARY_DIR}/libmkl_gnu_thread.a -Wl,--end-group \ + -fopenmp -lpthread -lm -ldl + +mkl_pagerank: mkl_pagerank.cpp + mkdir -p $(BUILD_DIR) + g++ -std=c++11 -O3 -I${MKL_INCLUDE_DIR} \ + ${CXXFLAGS} \ + -Wl,--no-as-needed \ + mkl_pagerank.cpp -o $(BUILD_DIR)/mkl_pagerank \ + -Wl,--start-group ${MKL_LIBRARY_DIR}/libmkl_intel_lp64.a \ + ${MKL_LIBRARY_DIR}/libmkl_core.a ${MKL_LIBRARY_DIR}/libmkl_gnu_thread.a -Wl,--end-group \ + -fopenmp -lpthread -lm -ldl + +mkl_spgemm: mkl_spgemm.cpp + mkdir -p $(BUILD_DIR) + g++ -std=c++11 -O3 -I${MKL_INCLUDE_DIR} \ + ${CXXFLAGS} \ + -Wl,--no-as-needed \ + mkl_spgemm.cpp -o $(BUILD_DIR)/mkl_spgemm \ + -Wl,--start-group ${MKL_LIBRARY_DIR}/libmkl_intel_lp64.a \ + ${MKL_LIBRARY_DIR}/libmkl_core.a ${MKL_LIBRARY_DIR}/libmkl_gnu_thread.a -Wl,--end-group \ + -fopenmp -lpthread -lm -ldl + +mkl_spmspv: mkl_spmspv.cpp + mkdir -p $(BUILD_DIR) + g++ -std=c++11 -O3 -I${MKL_INCLUDE_DIR} \ + -Wl,--no-as-needed \ + mkl_spmspv.cpp -o $(BUILD_DIR)/mkl_spmspv \ + -Wl,--start-group ${MKL_LIBRARY_DIR}/libmkl_intel_lp64.a \ + ${MKL_LIBRARY_DIR}/libmkl_core.a ${MKL_LIBRARY_DIR}/libmkl_gnu_thread.a -Wl,--end-group \ + -fopenmp -lpthread -lm -ldl + +mkl_spgemm_lp: mkl_spgemm_lp.cpp + mkdir -p $(BUILD_DIR) + g++ -std=c++11 -O3 -I${MKL_INCLUDE_DIR} \ + ${CXXFLAGS} \ + 
-Wl,--no-as-needed \ + mkl_spgemm_lp.cpp -o $(BUILD_DIR)/mkl_spgemm_lp \ + -Wl,--start-group ${MKL_LIBRARY_DIR}/libmkl_intel_lp64.a \ + ${MKL_LIBRARY_DIR}/libmkl_core.a ${MKL_LIBRARY_DIR}/libmkl_gnu_thread.a -Wl,--end-group \ + -fopenmp -lpthread -lm -ldl + +clean: + rm -rf ${BUILD_DIR} diff --git a/evaluation/FormatCustomization/PageRank/Intel_MKL/build/mkl_pagerank b/evaluation/FormatCustomization/PageRank/Intel_MKL/build/mkl_pagerank new file mode 100755 index 0000000..811a85e Binary files /dev/null and b/evaluation/FormatCustomization/PageRank/Intel_MKL/build/mkl_pagerank differ diff --git a/evaluation/FormatCustomization/PageRank/Intel_MKL/mkl_pagerank.cpp b/evaluation/FormatCustomization/PageRank/Intel_MKL/mkl_pagerank.cpp new file mode 100644 index 0000000..d0aa0e7 --- /dev/null +++ b/evaluation/FormatCustomization/PageRank/Intel_MKL/mkl_pagerank.cpp @@ -0,0 +1,136 @@ +#include +#include +#include +#include +#include +#include + +#include "mtx_read.h" +#include "mkl_spblas.h" + +using namespace std; + +//#define VAR var +typedef double scalar_t; +float test_spmv(sparse_matrix_t* AdjMatrix, struct matrix_descr descrAdjMatrix, + int num_src_vertices, int num_dst_vertices) { + scalar_t* Vector = (scalar_t*)malloc(sizeof(scalar_t) * num_src_vertices); + for (int i = 0; i < num_src_vertices; i++) { + Vector[i] = 1.0/num_src_vertices; + } + scalar_t* Out = (scalar_t*)malloc(sizeof(scalar_t) * num_dst_vertices); + for (int i = 0; i < num_dst_vertices; i++) { + Out[i] = 0.0; + } + scalar_t* PrevOut = (scalar_t*)malloc(sizeof(scalar_t) * num_dst_vertices); + for (int i = 0; i < num_dst_vertices; i++) { + PrevOut[i] = 0.0; + } + + scalar_t alpha = 1.0; + scalar_t beta = 0; + int num_runs = 0; + scalar_t sum; + scalar_t max_diff; + + auto t1 = std::chrono::high_resolution_clock::now(); + // for (int i = 0; i < num_runs; i++) { + do { + mkl_sparse_d_mv(SPARSE_OPERATION_NON_TRANSPOSE, + alpha, + *AdjMatrix, + descrAdjMatrix, + Vector, + beta, + Out); + max_diff = 0; + 
sum = 0; + for (int j = 0; j < num_dst_vertices; j++) { + max_diff = max(max_diff, abs(Out[j]-PrevOut[j])); + PrevOut[j] = Out[j]; + // cout << Vector[j] << " "; + Vector[j] = Out[j]; + sum = Out[j] + sum; + } + // cout << endl; + if (abs(sum-1)>1e-2) + cout << sum< input(file_name); + + int num_dst_vertices = input.num_cols; + int num_src_vertices = input.num_rows; + + sparse_matrix_t AdjMatrix; + mkl_sparse_d_create_csc(&AdjMatrix, + SPARSE_INDEX_BASE_ONE, + input.num_rows, + input.num_cols, + input.cscColPtr, + input.cscColPtr + 1, + input.cscRowInd, + input.cscValue); +*/ + parse_CSR input(file_name); + + int num_dst_vertices = input.num_rows; + int num_src_vertices = input.num_cols; + + sparse_matrix_t AdjMatrix; + mkl_sparse_d_create_csr(&AdjMatrix, + SPARSE_INDEX_BASE_ZERO, + input.num_rows, + input.num_cols, + input.csrRowPtr, + input.csrRowPtr + 1, + input.csrColInd, + input.csrValue); + + + // printf("cscColPtr: \n"); + // for (unsigned i = 0; i < input.num_cols + 1; i++) { + // printf("%d ", *(input.cscColPtr + i)); + // } + // printf("\n"); + +// parse_COO input(file_name); + +// int num_dst_vertices = input.num_rows; +// int num_src_vertices = input.num_cols; + +// sparse_matrix_t AdjMatrix; +// mkl_sparse_d_create_coo(&AdjMatrix, +// SPARSE_INDEX_BASE_ONE, +// input.num_rows, +// input.num_cols, +// input.num_nnz, +// input.cooRowInd, +// input.cooColInd, +// input.cooValue); + + mkl_sparse_optimize(AdjMatrix); + + struct matrix_descr descrAdjMatrix; + descrAdjMatrix.type = SPARSE_MATRIX_TYPE_GENERAL; + + float average_time_in_sec = test_spmv(&AdjMatrix, descrAdjMatrix, num_src_vertices, num_dst_vertices); + std::cout << "average_time = " << average_time_in_sec * 1000 << " ms" << std::endl; + float throughput = input.num_nnz * 2 / average_time_in_sec / 1000 / 1000 / 1000; + std::cout << "THROUGHPUT = " << throughput << " GOPS" << std::endl; + + return 0; +} diff --git a/evaluation/FormatCustomization/PageRank/Intel_MKL/mkl_spgemm.cpp 
b/evaluation/FormatCustomization/PageRank/Intel_MKL/mkl_spgemm.cpp new file mode 100644 index 0000000..96fdba2 --- /dev/null +++ b/evaluation/FormatCustomization/PageRank/Intel_MKL/mkl_spgemm.cpp @@ -0,0 +1,107 @@ +#include +#include +#include +#include +#include + +#include "mtx_read.h" +#include "mkl_spblas.h" +#include "mkl.h" + +typedef double scalar_t; + + +int test_ops(MKL_INT rows, MKL_INT cols, MKL_INT *row_ptr, MKL_INT *col_idx) { + int nnz = 0; + for(int i = 0; i < rows; i++) { +// std::cout << "Finish initialize" << std::endl; + for(int j = row_ptr[i]; j < row_ptr[i+1]; j++) { + int idx = col_idx[j]; + nnz = nnz + 2* (row_ptr[idx+1] - row_ptr[idx]); + } + } + return nnz; +} + +int main(int argc, char* argv[]) { + char *file_name = argv[1]; + +/* + parse_CSC input(file_name); + + int num_dst_vertices = input.num_cols; + int num_src_vertices = input.num_rows; + + sparse_matrix_t AdjMatrix; + mkl_sparse_d_create_csc(&AdjMatrix, + SPARSE_INDEX_BASE_ONE, + input.num_rows, + input.num_cols, + input.cscColPtr, + input.cscColPtr + 1, + input.cscRowInd, + input.cscValue); +*/ + parse_CSR input0(file_name); + sparse_matrix_t matA; + mkl_sparse_d_create_csr(&matA, + SPARSE_INDEX_BASE_ZERO, + input0.num_rows, + input0.num_cols, + input0.csrRowPtr, + input0.csrRowPtr + 1, + input0.csrColInd, + input0.csrValue); + mkl_sparse_optimize(matA); + + parse_CSR input1(file_name); + sparse_matrix_t matB; + mkl_sparse_d_create_csr(&matB, + SPARSE_INDEX_BASE_ZERO, + input1.num_rows, + input1.num_cols, + input1.csrRowPtr, + input1.csrRowPtr + 1, + input1.csrColInd, + input1.csrValue); + mkl_sparse_optimize(matB); + + int num_runs = var; + + auto t1 = std::chrono::high_resolution_clock::now(); + for (int i = 0; i < num_runs; i++) { + sparse_matrix_t matC = NULL; + mkl_sparse_spmm(SPARSE_OPERATION_NON_TRANSPOSE, matA, matB, &matC); + sparse_index_base_t indexing; + MKL_INT rows; + MKL_INT cols; + MKL_INT *pointerB_C; + MKL_INT *pointerE_C; + MKL_INT *columns_C; + scalar_t 
*values_C; + mkl_sparse_d_export_csr(matC, &indexing, &rows, &cols, &pointerB_C, &pointerE_C, &columns_C, &values_C); + mkl_free_buffers(); + mkl_free(matC); + mkl_free(pointerB_C); +// mkl_free(pointerE_C); + mkl_free(columns_C); + mkl_free(values_C); + } + auto t2 = std::chrono::high_resolution_clock::now(); + float total_time = float(std::chrono::duration_cast(t2 - t1).count()) / 1000000; + printf("total time: %fs\n", total_time); + float average_time_in_sec = total_time / num_runs; + std::cout << "average_time = " << average_time_in_sec * 1000 << " ms" << std::endl; + // int ops = test_ops(input0.num_rows, input0.num_cols, input0.csrRowPtr, input0.csrColInd); + // std::cout << "The tested ops is " << ops << std::endl; + // std::cout << "output nnz is " << pointerB_C[rows] << std::endl; + +// for(int i = 0; i < rows+1; i++) { +// std::cout << "intput row_pointer[" << i << "] is " << input0.csrRowPtr[i] << std::endl; +// std::cout << "input col_index[" << i << "] is " << input0.csrColInd[i] << std::endl; +// std::cout << "row_pointer[" << i << "] is " << pointerB_C[i] << std::endl; +// std::cout << "Column_idxp[" << i << "] is " << columns_C[i] << std::endl; +// } + + return 0; +} diff --git a/evaluation/FormatCustomization/PageRank/Intel_MKL/mkl_spmm.cpp b/evaluation/FormatCustomization/PageRank/Intel_MKL/mkl_spmm.cpp new file mode 100644 index 0000000..aa25a31 --- /dev/null +++ b/evaluation/FormatCustomization/PageRank/Intel_MKL/mkl_spmm.cpp @@ -0,0 +1,101 @@ +#include +#include +#include +#include + +#include "mtx_read.h" +#include "mkl_spblas.h" +//#define VAR var + +typedef double scalar_t; +double test_spmm(sparse_matrix_t* AdjMatrix, struct matrix_descr descrAdjMatrix, + int cols_A, int rows_A, int cols_B, int num_runs) { + scalar_t* MatrixB = (scalar_t*)malloc(sizeof(scalar_t) * cols_A * cols_B); + for (int i = 0; i < cols_A * cols_B; i++) { + MatrixB[i] = 1.0; + } + scalar_t* OutMatrix = (scalar_t*)malloc(sizeof(scalar_t) * rows_A * cols_B); + for (int i 
= 0; i < rows_A * cols_B; i++) { + OutMatrix[i] = 0.0; + } + + scalar_t alpha = 1.0; + scalar_t beta = 0; + // mkl_sparse_s_mm(SPARSE_OPERATION_NON_TRANSPOSE, + // alpha, + // *AdjMatrix, + // descrAdjMatrix, + // SPARSE_LAYOUT_ROW_MAJOR, + // MatrixB, + // feat_len, + // feat_len, + // beta, + // OutMatrix, + // feat_len); + auto t1 = std::chrono::high_resolution_clock::now(); + for (int i = 0; i < num_runs; i++) { + mkl_sparse_d_mm(SPARSE_OPERATION_NON_TRANSPOSE, + alpha, + *AdjMatrix, + descrAdjMatrix, + SPARSE_LAYOUT_COLUMN_MAJOR, + MatrixB, + cols_B, + cols_B, + beta, + OutMatrix, + cols_B); + } + auto t2 = std::chrono::high_resolution_clock::now(); + // std::cout << "average time of " << num_runs << " runs: " + // << float(std::chrono::duration_cast(t2 - t1).count()) / 1000000 / num_runs + // << " seconds" << std::endl; + double total_time = double(std::chrono::duration_cast(t2 - t1).count()) / 1000000; + printf("total time: %fs\n", total_time); + double average_time_in_sec = total_time / num_runs; + return average_time_in_sec; +} + + +int main(int argc, char* argv[]) { + char *file_name = argv[1]; + + parse_CSR input(file_name); + + // int num_rows = npy_shape.data()[0]; + // int num_cols = npy_shape.data()[2]; + // int num_dst_vertices = num_rows; + // int num_src_vertices = num_cols; + + // float* csrVal = npy_data.data(); + // MKL_INT* csrRowPtr = npy_indptr.data(); + // MKL_INT* csrColInd = npy_indices.data(); + + sparse_matrix_t AdjMatrix; + mkl_sparse_d_create_csr(&AdjMatrix, + SPARSE_INDEX_BASE_ZERO, + input.num_rows, + input.num_cols, + input.csrRowPtr, + input.csrRowPtr + 1, + input.csrColInd, + input.csrValue); + mkl_sparse_optimize(AdjMatrix); + + struct matrix_descr descrAdjMatrix; + descrAdjMatrix.type = SPARSE_MATRIX_TYPE_GENERAL; + + // std::vector feat_len_values{32, 64, 128, 256, 512}; + int num_runs = var; + int num_cols_b = 40; + + // for (int feat_len : feat_len_values) { + // std::cout << "\nfeat_len is: " << feat_len << std::endl; + 
double average_time_in_sec = test_spmm(&AdjMatrix, descrAdjMatrix, input.num_cols, input.num_rows, num_cols_b, num_runs); + // } + std::cout << "average_time = " << average_time_in_sec * 1000 << " ms" << std::endl; + double throughput = double(input.num_nnz) * double(num_cols_b * 2) / average_time_in_sec / 1000 / 1000 / 1000; + std::cout << "THROUGHPUT = " << throughput << " GOPS" << std::endl; + + return 0; +} diff --git a/evaluation/FormatCustomization/PageRank/Intel_MKL/mkl_spmspv.cpp b/evaluation/FormatCustomization/PageRank/Intel_MKL/mkl_spmspv.cpp new file mode 100644 index 0000000..f3aa425 --- /dev/null +++ b/evaluation/FormatCustomization/PageRank/Intel_MKL/mkl_spmspv.cpp @@ -0,0 +1,101 @@ +#include +#include +#include +#include +#include + +#include "mtx_read.h" +#include "mkl_spblas.h" + +typedef double scalar_t; + + +int test_ops(MKL_INT cols, MKL_INT *col_ptr, MKL_INT *row_ptr, MKL_INT *col_idx) { + int nnz = 0; + for(int i = 0; i < cols; i++) { +// std::cout << "Finish initialize" << std::endl; + if(row_ptr[i] != row_ptr[i+1]) { + nnz = nnz + 2 * (col_ptr[i+1] - col_ptr[i]); + } + } + return nnz; +} + +int main(int argc, char* argv[]) { + char *file_name = argv[1]; + char *file_name1 = argv[2]; + char *file_name2 = argv[3]; +/* + parse_CSC input(file_name); + + int num_dst_vertices = input.num_cols; + int num_src_vertices = input.num_rows; + + sparse_matrix_t AdjMatrix; + mkl_sparse_d_create_csc(&AdjMatrix, + SPARSE_INDEX_BASE_ONE, + input.num_rows, + input.num_cols, + input.cscColPtr, + input.cscColPtr + 1, + input.cscRowInd, + input.cscValue); +*/ + parse_CSR input0(file_name); + sparse_matrix_t matA; + mkl_sparse_d_create_csr(&matA, + SPARSE_INDEX_BASE_ZERO, + input0.num_rows, + input0.num_cols, + input0.csrRowPtr, + input0.csrRowPtr + 1, + input0.csrColInd, + input0.csrValue); + mkl_sparse_optimize(matA); + + parse_CSR input1(file_name1); + sparse_matrix_t matB; + mkl_sparse_d_create_csr(&matB, + SPARSE_INDEX_BASE_ZERO, + input1.num_rows, + 
input1.num_cols, + input1.csrRowPtr, + input1.csrRowPtr + 1, + input1.csrColInd, + input1.csrValue); + mkl_sparse_optimize(matB); + + sparse_matrix_t matC = NULL; + + int num_runs = 60; + auto t1 = std::chrono::high_resolution_clock::now(); + for (int i = 0; i < num_runs; i++) { + mkl_sparse_spmm(SPARSE_OPERATION_NON_TRANSPOSE, matA, matB, &matC); + } + sparse_index_base_t indexing; + MKL_INT rows; + MKL_INT cols; + MKL_INT *pointerB_C; + MKL_INT *pointerE_C; + MKL_INT *columns_C; + scalar_t *values_C; + mkl_sparse_d_export_csr(matC, &indexing, &rows, &cols, &pointerB_C, &pointerE_C, &columns_C, &values_C); + auto t2 = std::chrono::high_resolution_clock::now(); + parse_CSC input2(file_name2); + int ops = test_ops(input0.num_cols, input2.cscColPtr, input1.csrRowPtr, input1.csrColInd); + float total_time = float(std::chrono::duration_cast(t2 - t1).count()) / 1000000; + printf("total time: %fs\n", total_time); + float average_time_in_sec = total_time / num_runs; + std::cout << "average_time = " << average_time_in_sec * 1000 << " ms" << std::endl; + std::cout << "The tested ops is " << ops << std::endl; + std::cout << "output nnz is " << pointerB_C[rows] << std::endl; + +// for(int i = 0; i < rows+1; i++) { +// std::cout << "intput row_pointer[" << i << "] is " << input0.csrRowPtr[i] << std::endl; +// std::cout << "input col_index[" << i << "] is " << input0.csrColInd[i] << std::endl; +// std::cout << "row_pointer[" << i << "] is " << pointerB_C[i] << std::endl; +// std::cout << "Column_idxp[" << i << "] is " << columns_C[i] << std::endl; +// } + + return 0; +} diff --git a/evaluation/FormatCustomization/PageRank/Intel_MKL/mkl_spmv.cpp b/evaluation/FormatCustomization/PageRank/Intel_MKL/mkl_spmv.cpp new file mode 100644 index 0000000..a0ef185 --- /dev/null +++ b/evaluation/FormatCustomization/PageRank/Intel_MKL/mkl_spmv.cpp @@ -0,0 +1,110 @@ +#include +#include +#include +#include + +#include "mtx_read.h" +#include "mkl_spblas.h" + +//#define VAR var +typedef double 
scalar_t; +float test_spmv(sparse_matrix_t* AdjMatrix, struct matrix_descr descrAdjMatrix, + int num_src_vertices, int num_dst_vertices, int num_runs) { + scalar_t* Vector = (scalar_t*)malloc(sizeof(scalar_t) * num_src_vertices); + for (int i = 0; i < num_src_vertices; i++) { + Vector[i] = 1.0; + } + scalar_t* Out = (scalar_t*)malloc(sizeof(scalar_t) * num_dst_vertices); + for (int i = 0; i < num_dst_vertices; i++) { + Out[i] = 0.0; + } + + scalar_t alpha = 1.0; + scalar_t beta = 0; + + auto t1 = std::chrono::high_resolution_clock::now(); + for (int i = 0; i < num_runs; i++) { + mkl_sparse_d_mv(SPARSE_OPERATION_NON_TRANSPOSE, + alpha, + *AdjMatrix, + descrAdjMatrix, + Vector, + beta, + Out); + } + auto t2 = std::chrono::high_resolution_clock::now(); + float total_time = float(std::chrono::duration_cast(t2 - t1).count()) / 1000000; + printf("total time: %fs\n", total_time); + float average_time_in_sec = total_time / num_runs; + return average_time_in_sec; +} + + +int main(int argc, char* argv[]) { + char *file_name = argv[1]; +/* + parse_CSC input(file_name); + + int num_dst_vertices = input.num_cols; + int num_src_vertices = input.num_rows; + + sparse_matrix_t AdjMatrix; + mkl_sparse_d_create_csc(&AdjMatrix, + SPARSE_INDEX_BASE_ONE, + input.num_rows, + input.num_cols, + input.cscColPtr, + input.cscColPtr + 1, + input.cscRowInd, + input.cscValue); +*/ + parse_CSR input(file_name); + + int num_dst_vertices = input.num_rows; + int num_src_vertices = input.num_cols; + + sparse_matrix_t AdjMatrix; + mkl_sparse_d_create_csr(&AdjMatrix, + SPARSE_INDEX_BASE_ZERO, + input.num_rows, + input.num_cols, + input.csrRowPtr, + input.csrRowPtr + 1, + input.csrColInd, + input.csrValue); + + + // printf("cscColPtr: \n"); + // for (unsigned i = 0; i < input.num_cols + 1; i++) { + // printf("%d ", *(input.cscColPtr + i)); + // } + // printf("\n"); + +// parse_COO input(file_name); + +// int num_dst_vertices = input.num_rows; +// int num_src_vertices = input.num_cols; + +// 
sparse_matrix_t AdjMatrix; +// mkl_sparse_d_create_coo(&AdjMatrix, +// SPARSE_INDEX_BASE_ONE, +// input.num_rows, +// input.num_cols, +// input.num_nnz, +// input.cooRowInd, +// input.cooColInd, +// input.cooValue); + + mkl_sparse_optimize(AdjMatrix); + + struct matrix_descr descrAdjMatrix; + descrAdjMatrix.type = SPARSE_MATRIX_TYPE_GENERAL; + + int num_runs = 10000; + float average_time_in_sec = test_spmv(&AdjMatrix, descrAdjMatrix, num_src_vertices, num_dst_vertices, num_runs); + std::cout << "average_time = " << average_time_in_sec * 1000 << " ms" << std::endl; + float throughput = input.num_nnz * 2 / average_time_in_sec / 1000 / 1000 / 1000; + std::cout << "THROUGHPUT = " << throughput << " GOPS" << std::endl; + + return 0; +} diff --git a/evaluation/FormatCustomization/PageRank/Intel_MKL/mtx_read.h b/evaluation/FormatCustomization/PageRank/Intel_MKL/mtx_read.h new file mode 100644 index 0000000..7441a90 --- /dev/null +++ b/evaluation/FormatCustomization/PageRank/Intel_MKL/mtx_read.h @@ -0,0 +1,345 @@ +/******************************************************************************* +* Read .mtx and .tns file format +*******************************************************************************/ + +#ifndef _MTX_READ_H_ +#define _MTX_READ_H_ + +#include +#include +#include +#include +#include +#include +#include + +#include "mkl_spblas.h" + +//using namespace std; + +static char *toLower(char *token) { + for (char *c = token; *c; c++) + *c = tolower(*c); + return token; +} + +static void readMTXHeader(FILE* file, char* fileName, uint64_t* metaData, char* field, char* symmetry) { + char line[1025]; + char header[64]; + char object[64]; + char format[64]; + + // Read header line. + printf("read MTX filename %s\n", fileName); + if (fscanf(file, "%63s %63s %63s %63s %63s\n", header, object, format, field, + symmetry) != 5) { + fprintf(stderr, "Corrupt header in %s\n", fileName); + exit(1); + } + // Make sure this is a general sparse matrix. 
+ if (strcmp(toLower(header), "%%matrixmarket") || + strcmp(toLower(object), "matrix") || + strcmp(toLower(format), "coordinate")) { + fprintf(stderr, + "Cannot find a coordinate format sparse matrix in %s\n", fileName); + exit(1); + } + // if (strcmp(toLower(field), "pattern")) + // strcmp(toLower(symmetry), "general") + + // Skip comments. + while (1) { + if (!fgets(line, 1025, file)) { + fprintf(stderr, "Cannot find data in %s\n", fileName); + exit(1); + } + if (line[0] != '%') + break; + } + // Next line contains M N NNZ. + metaData[0] = 2; // rank + if (sscanf(line, "%" PRIu64 "%" PRIu64 "%" PRIu64 "\n", metaData + 2, metaData + 3, + metaData + 1) != 3) { + fprintf(stderr, "Cannot find size in %s\n", fileName); + exit(1); + } +} + +static void readFROSTTHeader(FILE* file, char* fileName, uint64_t* metaData) { + +} + +template +class parse_CSC { +public: + parse_CSC(char* fileName) { + FILE *file = fopen(fileName, "r"); + printf("filename %s\n", fileName); + if (!file) { + fprintf(stderr, "Cannot find %s\n", fileName); + exit(1); + } + + uint64_t metaData[512]; + char field[64]; + char symmetry[64]; + if (strstr(fileName, ".mtx")) { + readMTXHeader(file, fileName, metaData, field, symmetry); + } else if (strstr(fileName, ".tns")) { + readFROSTTHeader(file, fileName, metaData); + } else { + fprintf(stderr, "Unknown format %s\n", fileName); + exit(1); + } + + // printf("in getTensorIndices :\n"); + // for (unsigned i = 0; i < 4; i++) + // printf("metaData[%u] = %lu \n", i, metaData[i]); + + num_nnz = metaData[1]; + num_rows = metaData[2]; + num_cols = metaData[3]; + + cscColPtr = (MKL_INT*)malloc((num_cols + 1) * sizeof(MKL_INT)); + cscRowInd = (MKL_INT*)malloc(num_nnz * sizeof(MKL_INT)); + cscValue = (valueTp*)malloc(num_nnz * sizeof(valueTp)); + + bool isFieldPattern = strcmp(toLower(field), "pattern"); + + if (!strcmp(toLower(field), "complex")) { + fprintf(stderr, "Complex data type not yet supported.\n"); + exit(1); + } + + if (strcmp(toLower(symmetry), 
"general") && strcmp(toLower(symmetry), "symmetric")) { + fprintf(stderr, "Non general matrix structure not yet supported.\n"); + exit(1); + } + + MKL_INT lastRowInd = 0; + // cscColPtr[0] = 0; + for (unsigned i = 0; i < num_nnz; i++) { + MKL_INT rowInd = -1; + MKL_INT colInd = -1; + if (fscanf(file, "%d", &rowInd) != 1) { + fprintf(stderr, "Cannot find next row index at line %u\n", i); + exit(1); + } + cscRowInd[i] = rowInd - 1; + if (fscanf(file, "%d", &colInd) != 1) { + fprintf(stderr, "Cannot find next col index at line %u\n", i); + exit(1); + } + while (colInd > lastRowInd) { + cscColPtr[lastRowInd++] = i; + } + if (!isFieldPattern) { + // Field is Pattern + cscValue[i] = 1; + } else { + valueTp value; + if (fscanf(file, "%lf", &value) != 1) { + fprintf(stderr, "Cannot find next value at line %u\n", i); + exit(1); + } + cscValue[i] = value; + } + } + for(int i = lastRowInd; i <= num_cols; i++) + cscColPtr[i] = num_nnz; + } + + ~parse_CSC() { + free(cscColPtr); + free(cscRowInd); + free(cscValue); + } + + int num_rows, num_cols, num_nnz; + MKL_INT* cscColPtr; + MKL_INT* cscRowInd; + valueTp* cscValue; +}; + +template +class parse_CSR { +public: + parse_CSR(char* fileName) { + FILE *file = fopen(fileName, "r"); + printf("filename %s\n", fileName); + if (!file) { + fprintf(stderr, "Cannot find %s\n", fileName); + exit(1); + } + + uint64_t metaData[512]; + char field[64]; + char symmetry[64]; + if (strstr(fileName, ".mtx")) { + readMTXHeader(file, fileName, metaData, field, symmetry); + } else if (strstr(fileName, ".tns")) { + readFROSTTHeader(file, fileName, metaData); + } else { + fprintf(stderr, "Unknown format %s\n", fileName); + exit(1); + } + + // printf("in getTensorIndices :\n"); + // for (unsigned i = 0; i < 4; i++) + // printf("metaData[%u] = %lu \n", i, metaData[i]); + + num_nnz = metaData[1]; + num_rows = metaData[2]; + num_cols = metaData[3]; + + csrRowPtr = (MKL_INT*)malloc((num_rows + 1) * sizeof(MKL_INT)); + csrColInd = (MKL_INT*)malloc(num_nnz * 
sizeof(MKL_INT)); + csrValue = (valueTp*)malloc(num_nnz * sizeof(valueTp)); + + bool isFieldPattern = strcmp(toLower(field), "pattern"); + + if (!strcmp(toLower(field), "complex")) { + fprintf(stderr, "Complex data type not yet supported.\n"); + exit(1); + } + + if (strcmp(toLower(symmetry), "general")) { + fprintf(stderr, "Non general matrix structure not yet supported.\n"); + exit(1); + } + + MKL_INT lastRowInd = 0; + // csrRowPtr[0] = 0; + for (unsigned i = 0; i < num_nnz; i++) { + MKL_INT rowInd = -1; + MKL_INT colInd = -1; + if (fscanf(file, "%d", &rowInd) != 1) { + fprintf(stderr, "Cannot find next row index at line %u\n", i); + exit(1); + } + while (rowInd > lastRowInd) { + csrRowPtr[lastRowInd] = i; + lastRowInd = lastRowInd + 1; + } + if (fscanf(file, "%d", &colInd) != 1) { + fprintf(stderr, "Cannot find next col index at line %u\n", i); + exit(1); + } + csrColInd[i] = colInd - 1; + + if (!isFieldPattern) { + // Field is Pattern + csrValue[i] = 1; + } else { + valueTp value; + if (fscanf(file, "%lf", &value) != 1) { + fprintf(stderr, "Cannot find next value at line %u\n", i); + exit(1); + } + csrValue[i] = value; + } + } + for (unsigned i = lastRowInd; i <= num_rows; i++ ) + csrRowPtr[i] = num_nnz; + } + + ~parse_CSR() { + free(csrRowPtr); + free(csrColInd); + free(csrValue); + } + + int num_rows, num_cols, num_nnz; + MKL_INT* csrRowPtr; + MKL_INT* csrColInd; + valueTp* csrValue; +}; + +template +class parse_COO { +public: + parse_COO(char* fileName) { + FILE *file = fopen(fileName, "r"); + printf("filename %s\n", fileName); + if (!file) { + fprintf(stderr, "Cannot find %s\n", fileName); + exit(1); + } + + uint64_t metaData[512]; + char field[64]; + char symmetry[64]; + if (strstr(fileName, ".mtx")) { + readMTXHeader(file, fileName, metaData, field, symmetry); + } else if (strstr(fileName, ".tns")) { + readFROSTTHeader(file, fileName, metaData); + } else { + fprintf(stderr, "Unknown format %s\n", fileName); + exit(1); + } + + // printf("in getTensorIndices 
:\n"); + // for (unsigned i = 0; i < 4; i++) + // printf("metaData[%u] = %lu \n", i, metaData[i]); + + num_nnz = metaData[1]; + num_rows = metaData[2]; + num_cols = metaData[3]; + + cooRowInd = (MKL_INT*)malloc(num_nnz * sizeof(MKL_INT)); + cooColInd = (MKL_INT*)malloc(num_nnz * sizeof(MKL_INT)); + cooValue = (valueTp*)malloc(num_nnz * sizeof(valueTp)); + + bool isFieldPattern = strcmp(toLower(field), "pattern"); + + if (!strcmp(toLower(field), "complex")) { + fprintf(stderr, "Complex data type not yet supported.\n"); + exit(1); + } + + if (strcmp(toLower(symmetry), "general")) { + fprintf(stderr, "Non general matrix structure not yet supported.\n"); + exit(1); + } + + for (unsigned i = 0; i < num_nnz; i++) { + MKL_INT rowInd = -1; + MKL_INT colInd = -1; + if (fscanf(file, "%d", &rowInd) != 1) { + fprintf(stderr, "Cannot find next row index at line %u\n", i); + exit(1); + } + cooRowInd[i] = rowInd; + if (fscanf(file, "%d", &colInd) != 1) { + fprintf(stderr, "Cannot find next col index at line %u\n", i); + exit(1); + } + cooColInd[i] = colInd; + if (!isFieldPattern) { + // Field is Pattern + cooValue[i] = 1; + } else { + valueTp value; + if (fscanf(file, "%lf", &value) != 1) { + fprintf(stderr, "Cannot find next value at line %u\n", i); + exit(1); + } + cooValue[i] = value; + } + } + } + + ~parse_COO() { + free(cooRowInd); + free(cooColInd); + free(cooValue); + } + + int num_rows, num_cols; + int num_nnz; + MKL_INT* cooRowInd; + MKL_INT* cooColInd; + valueTp* cooValue; +}; + +#endif diff --git a/evaluation/FormatCustomization/PageRank/Intel_MKL/out.log b/evaluation/FormatCustomization/PageRank/Intel_MKL/out.log new file mode 100644 index 0000000..1a5e5e3 --- /dev/null +++ b/evaluation/FormatCustomization/PageRank/Intel_MKL/out.log @@ -0,0 +1,115 @@ +mkdir -p ./build +g++ -std=c++11 -O3 -I/opt/intel/oneapi/mkl/latest/include \ + \ + -Wl,--no-as-needed \ + mkl_pagerank.cpp -o ./build/mkl_pagerank \ + -Wl,--start-group 
/opt/intel/oneapi/mkl/latest/lib/intel64/libmkl_intel_lp64.a \ + /opt/intel/oneapi/mkl/latest/lib/intel64/libmkl_core.a /opt/intel/oneapi/mkl/latest/lib/intel64/libmkl_gnu_thread.a -Wl,--end-group \ + -fopenmp -lpthread -lm -ldl +./build/mkl_pagerank email-Eu-core_col_norm.mtx +filename /work/shared/common/datasets/UniSparse_dataset/email-Eu-core_col_norm.mtx +read MTX filename /work/shared/common/datasets/UniSparse_dataset/email-Eu-core_col_norm.mtx +max_diff = 0.477613 +max_diff = 0.521485 +max_diff = 0.000873562 +max_diff = 2.72988e-05 +max_diff = 8.53088e-07 +max_diff = 2.6659e-08 +max_diff = 8.33094e-10 +num_runs = 7 +total time: 0.119094 +average_time = 17.0134 ms +THROUGHPUT = 0.00465997 GOPS +./build/mkl_pagerank amazon0312_row_major_col_norm.mtx +filename /work/shared/common/datasets/UniSparse_dataset/amazon0312_row_major_col_norm.mtx +read MTX filename /work/shared/common/datasets/UniSparse_dataset/amazon0312_row_major_col_norm.mtx +max_diff = 0.985002 +max_diff = 0.0149889 +max_diff = 4.57502e-06 +max_diff = 3.32729e-06 +max_diff = 8.31821e-07 +max_diff = 0 +num_runs = 6 +total time: 0.114899 +average_time = 19.1498 ms +THROUGHPUT = 0.334253 GOPS +./build/mkl_pagerank p2p-Gnutella31_row_major_col_norm.mtx +filename /work/shared/common/datasets/UniSparse_dataset/p2p-Gnutella31_row_major_col_norm.mtx +read MTX filename /work/shared/common/datasets/UniSparse_dataset/p2p-Gnutella31_row_major_col_norm.mtx +max_diff = 0.999457 +max_diff = 0.000543252 +max_diff = 0 +num_runs = 3 +total time: 0.252962 +average_time = 84.3207 ms +THROUGHPUT = 0.453304 GOPS +./build/mkl_pagerank soc-Slashdot0811_col_norm.mtx +filename /work/shared/common/datasets/UniSparse_dataset/soc-Slashdot0811_col_norm.mtx +read MTX filename /work/shared/common/datasets/UniSparse_dataset/soc-Slashdot0811_col_norm.mtx +max_diff = 0.919567 +max_diff = 0.922795 +max_diff = 0.988043 +max_diff = 0.00643886 +max_diff = 1.14182e-05 +max_diff = 4.76752e-08 +max_diff = 1.99036e-10 +num_runs = 7 +total 
time: 0.096358 +average_time = 13.7654 ms +THROUGHPUT = 0.131557 GOPS +./build/mkl_pagerank email-Enron_col_norm.mtx +filename /work/shared/common/datasets/UniSparse_dataset/email-Enron_col_norm.mtx +read MTX filename /work/shared/common/datasets/UniSparse_dataset/email-Enron_col_norm.mtx +max_diff = 0.776423 +max_diff = 0.223474 +max_diff = 7.52208e-05 +max_diff = 2.72383e-05 +max_diff = 1.08993e-06 +max_diff = 1.55705e-08 +max_diff = 2.22435e-10 +num_runs = 7 +total time: 0.126683 +average_time = 18.0976 ms +THROUGHPUT = 0.0406311 GOPS +./build/mkl_pagerank ca-CondMat_col_norm.mtx +filename /work/shared/common/datasets/UniSparse_dataset/ca-CondMat_col_norm.mtx +read MTX filename /work/shared/common/datasets/UniSparse_dataset/ca-CondMat_col_norm.mtx +max_diff = 0.50435 +max_diff = 0.619213 +max_diff = 2.88189e-05 +max_diff = 0 +num_runs = 4 +total time: 0.05815 +average_time = 14.5375 ms +THROUGHPUT = 0.0257178 GOPS +./build/mkl_pagerank ca-HepTh_col_norm.mtx +filename /work/shared/common/datasets/UniSparse_dataset/ca-HepTh_col_norm.mtx +read MTX filename /work/shared/common/datasets/UniSparse_dataset/ca-HepTh_col_norm.mtx +max_diff = 0.596145 +max_diff = 0.403787 +max_diff = 6.74969e-05 +max_diff = 0 +num_runs = 4 +total time: 0.083615 +average_time = 20.9037 ms +THROUGHPUT = 0.00497241 GOPS +./build/mkl_pagerank loc-Brightkite_col_norm.mtx +filename /work/shared/common/datasets/UniSparse_dataset/loc-Brightkite_col_norm.mtx +read MTX filename /work/shared/common/datasets/UniSparse_dataset/loc-Brightkite_col_norm.mtx +max_diff = 0.00156438 +max_diff = 0.00413032 +max_diff = 0.0216047 +max_diff = 0.0923027 +max_diff = 0.170001 +max_diff = 0.176793 +max_diff = 0.290136 +max_diff = 0.378285 +max_diff = 0.451992 +max_diff = 0.0610746 +max_diff = 0.000729923 +max_diff = 5.98949e-06 +max_diff = 0 +num_runs = 13 +total time: 0.157434 +average_time = 12.1103 ms +THROUGHPUT = 0.0707093 GOPS diff --git a/evaluation/FormatCustomization/PageRank/Intel_MKL/output.log 
b/evaluation/FormatCustomization/PageRank/Intel_MKL/output.log new file mode 100644 index 0000000..d957074 --- /dev/null +++ b/evaluation/FormatCustomization/PageRank/Intel_MKL/output.log @@ -0,0 +1,113 @@ +./build/mkl_pagerank email-Eu-core_col_norm.mtx +filename /work/shared/common/datasets/UniSparse_dataset/email-Eu-core_col_norm.mtx +read MTX filename /work/shared/common/datasets/UniSparse_dataset/email-Eu-core_col_norm.mtx +num_runs = 8363 +total time: 68.4676 +average_time = 8.18696 ms +THROUGHPUT = 0.00624676 GOPS +./build/mkl_pagerank gplus_108K_13M_csr_float32_row_major_col_norm.mtx +filename /work/shared/common/datasets/UniSparse_dataset/gplus_108K_13M_csr_float32_row_major_col_norm.mtx +read MTX filename /work/shared/common/datasets/UniSparse_dataset/gplus_108K_13M_csr_float32_row_major_col_norm.mtx +num_runs = 4 +total time: 0.159106 +average_time = 39.7765 ms +THROUGHPUT = 0.687514 GOPS +./build/mkl_pagerank pokec_1633K_31M_csr_float32_row_major_col_norm.mtx +filename /work/shared/common/datasets/UniSparse_dataset/pokec_1633K_31M_csr_float32_row_major_col_norm.mtx +read MTX filename /work/shared/common/datasets/UniSparse_dataset/pokec_1633K_31M_csr_float32_row_major_col_norm.mtx +num_runs = 5 +total time: 0.252179 +average_time = 50.4358 ms +THROUGHPUT = 1.21432 GOPS +./build/mkl_pagerank live_journal_row_major_col_norm.mtx +filename /work/shared/common/datasets/UniSparse_dataset/live_journal_row_major_col_norm.mtx +read MTX filename /work/shared/common/datasets/UniSparse_dataset/live_journal_row_major_col_norm.mtx +num_runs = 6 +total time: 0.573081 +average_time = 95.5135 ms +THROUGHPUT = 1.43384 GOPS +./build/mkl_pagerank wiki-Vote_row_major_col_norm.mtx +filename /work/shared/common/datasets/UniSparse_dataset/wiki-Vote_row_major_col_norm.mtx +read MTX filename /work/shared/common/datasets/UniSparse_dataset/wiki-Vote_row_major_col_norm.mtx +num_runs = 23 +total time: 0.230467 +average_time = 10.0203 ms +THROUGHPUT = 0.0206958 GOPS 
+./build/mkl_pagerank web-Google_row_major_col_norm.mtx +filename /work/shared/common/datasets/UniSparse_dataset/web-Google_row_major_col_norm.mtx +read MTX filename /work/shared/common/datasets/UniSparse_dataset/web-Google_row_major_col_norm.mtx +num_runs = 4 +total time: 0.099081 +average_time = 24.7703 ms +THROUGHPUT = 0.412191 GOPS +./build/mkl_pagerank amazon0312_row_major_col_norm.mtx +filename /work/shared/common/datasets/UniSparse_dataset/amazon0312_row_major_col_norm.mtx +read MTX filename /work/shared/common/datasets/UniSparse_dataset/amazon0312_row_major_col_norm.mtx +num_runs = 6 +total time: 0.116176 +average_time = 19.3627 ms +THROUGHPUT = 0.330578 GOPS +./build/mkl_pagerank p2p-Gnutella31_row_major_col_norm.mtx +filename /work/shared/common/datasets/UniSparse_dataset/p2p-Gnutella31_row_major_col_norm.mtx +read MTX filename /work/shared/common/datasets/UniSparse_dataset/p2p-Gnutella31_row_major_col_norm.mtx +num_runs = 3 +total time: 0.054864 +average_time = 18.288 ms +THROUGHPUT = 0.0161737 GOPS + +./build/mkl_pagerank ML_Geer_row_major_col_norm.mtx +filename /work/shared/common/datasets/UniSparse_dataset/ML_Geer_row_major_col_norm.mtx +read MTX filename /work/shared/common/datasets/UniSparse_dataset/ML_Geer_row_major_col_norm.mtx +num_runs = 18638 +total time: 205.94 +average_time = 11.0495 ms +THROUGHPUT = 20.0698 GOPS +./build/mkl_pagerank ss_row_major_col_norm.mtx +filename /work/shared/common/datasets/UniSparse_dataset/ss_row_major_col_norm.mtx +read MTX filename /work/shared/common/datasets/UniSparse_dataset/ss_row_major_col_norm.mtx +num_runs = 220 +total time: 3.33963 +average_time = 15.1801 ms +THROUGHPUT = 4.57882 GOPS +./build/mkl_pagerank ML_Laplace_row_major_col_norm.mtx +filename /work/shared/common/datasets/UniSparse_dataset/ML_Laplace_row_major_col_norm.mtx +read MTX filename /work/shared/common/datasets/UniSparse_dataset/ML_Laplace_row_major_col_norm.mtx +num_runs = 4629 +total time: 14.1679 +average_time = 3.06068 ms +THROUGHPUT = 
18.094 GOPS +./build/mkl_pagerank Transport_row_major_col_norm.mtx +filename /work/shared/common/datasets/UniSparse_dataset/Transport_row_major_col_norm.mtx +read MTX filename /work/shared/common/datasets/UniSparse_dataset/Transport_row_major_col_norm.mtx +num_runs = 33 +total time: 0.324893 +average_time = 9.84524 ms +THROUGHPUT = 4.77403 GOPS +./build/mkl_pagerank rajat31_row_major_col_norm.mtx +filename /work/shared/common/datasets/UniSparse_dataset/rajat31_row_major_col_norm.mtx +read MTX filename /work/shared/common/datasets/UniSparse_dataset/rajat31_row_major_col_norm.mtx +num_runs = 5 +total time: 0.284035 +average_time = 56.807 ms +THROUGHPUT = 0.715273 GOPS +./build/mkl_pagerank TSOPF_RS_b2383_row_major_col_norm.mtx +filename /work/shared/common/datasets/UniSparse_dataset/TSOPF_RS_b2383_row_major_col_norm.mtx +read MTX filename /work/shared/common/datasets/UniSparse_dataset/TSOPF_RS_b2383_row_major_col_norm.mtx +num_runs = 35 +total time: 0.620586 +average_time = 17.731 ms +THROUGHPUT = 1.82405 GOPS +./build/mkl_pagerank memchip_row_major_col_norm.mtx +filename /work/shared/common/datasets/UniSparse_dataset/memchip_row_major_col_norm.mtx +read MTX filename /work/shared/common/datasets/UniSparse_dataset/memchip_row_major_col_norm.mtx +num_runs = 6 +total time: 0.181731 +average_time = 30.2885 ms +THROUGHPUT = 0.977942 GOPS +./build/mkl_pagerank vas_stokes_1M_row_major_col_norm.mtx +filename /work/shared/common/datasets/UniSparse_dataset/vas_stokes_1M_row_major_col_norm.mtx +read MTX filename /work/shared/common/datasets/UniSparse_dataset/vas_stokes_1M_row_major_col_norm.mtx +num_runs = 32 +total time: 1.76579 +average_time = 55.1811 ms +THROUGHPUT = 1.26011 GOPS \ No newline at end of file diff --git a/evaluation/FormatCustomization/PageRank/Intel_MKL/run_label_propgation.sh b/evaluation/FormatCustomization/PageRank/Intel_MKL/run_label_propgation.sh new file mode 100644 index 0000000..fc7875b --- /dev/null +++ 
b/evaluation/FormatCustomization/PageRank/Intel_MKL/run_label_propgation.sh @@ -0,0 +1,26 @@ +make mkl_spgemm_lp CXXFLAGS=-Dvar=1000 +make mkl_spmm CXXFLAGS=-Dvar=1000 + +export OMP_NUM_THREADS=48 + +DATASET_PATH=/work/shared/common/datasets/versatile_sparse_xcel/lp + +PREFIX=arxiv +SPMCSR=_adj_matrix_row_major.mtx +DATASETS=( "_label_matrix_1.mtx" + "_label_matrix_2.mtx" + "_label_matrix_3.mtx" + "_label_matrix_4.mtx") + + +BUILD_DIR=./build + +for dataset in "${DATASETS[@]}" +do + echo ${BUILD_DIR}/mkl_spgemm_lp $PREFIX$dataset + ${BUILD_DIR}/mkl_spgemm_lp $DATASET_PATH/$PREFIX$SPMCSR $DATASET_PATH/$PREFIX$dataset +done + +echo ${BUILD_DIR}/mkl_spmm $PREFIX$SPMCSR +${BUILD_DIR}/mkl_spmm $DATASET_PATH/$PREFIX$SPMCSR + diff --git a/evaluation/FormatCustomization/PageRank/Intel_MKL/run_pagerank.sh b/evaluation/FormatCustomization/PageRank/Intel_MKL/run_pagerank.sh new file mode 100644 index 0000000..b1bb1a2 --- /dev/null +++ b/evaluation/FormatCustomization/PageRank/Intel_MKL/run_pagerank.sh @@ -0,0 +1,48 @@ +make mkl_pagerank + +export OMP_NUM_THREADS=48 + +DATASET_PATH=/work/shared/common/datasets/UniSparse_dataset + +DATASETS=( + "email-Eu-core" + # "wiki-Vote_row_major" + "amazon0312_row_major" + "p2p-Gnutella31_row_major" + # "cit-Patents" + # "web-Stanford" + # "cit-HepTh" + "soc-Slashdot0811" + # "soc-Epinions1" + "email-Enron" + "ca-CondMat" + # "as-735" + "ca-HepTh" + "loc-Brightkite" + # # "test_col_norm.mtx" + # # "gplus_108K_13M_csr_float32_row_major_col_norm.mtx" + # # "pokec_1633K_31M_csr_float32_row_major_col_norm.mtx" + # # "live_journal_row_major_col_norm.mtx" + # "wiki-Vote_row_major_col_norm.mtx" + # # "web-Google_row_major_col_norm.mtx" + # "amazon0312_row_major_col_norm.mtx" + # "p2p-Gnutella31_row_major_col_norm.mtx" + + # "ML_Geer_row_major_col_norm.mtx" + # "ss_row_major_col_norm.mtx" + # "ML_Laplace_row_major_col_norm.mtx" + # "Transport_row_major_col_norm.mtx" + # "rajat31_row_major_col_norm.mtx" + # "TSOPF_RS_b2383_row_major_col_norm.mtx" 
+ # "memchip_row_major_col_norm.mtx" + # "vas_stokes_1M_row_major_col_norm.mtx" + ) + +SUFFIX="_col_norm.mtx" +BUILD_DIR=./build + +for dataset in "${DATASETS[@]}" +do + echo ${BUILD_DIR}/mkl_pagerank $dataset$SUFFIX + ${BUILD_DIR}/mkl_pagerank $DATASET_PATH/$dataset$SUFFIX +done diff --git a/evaluation/FormatCustomization/PageRank/Intel_MKL/run_spgemm.sh b/evaluation/FormatCustomization/PageRank/Intel_MKL/run_spgemm.sh new file mode 100644 index 0000000..f881faf --- /dev/null +++ b/evaluation/FormatCustomization/PageRank/Intel_MKL/run_spgemm.sh @@ -0,0 +1,35 @@ +make mkl_spgemm + +export OMP_NUM_THREADS=48 + +DATASET_PATH=/work/shared/common/datasets/versatile_sparse_xcel + +# DATASETS=( "ss1_col_major.mtx" +# "stomach_col_major.mtx" +# "scircuit_col_major.mtx" +# "Hamrle3_col_major.mtx" +# "Transport_col_major.mtx") + +DATASETS=( "web-Google_row_major.mtx" + "mario002_row_major.mtx" + "amazon0312_row_major.mtx" + "m133-b3_row_major.mtx" + "scircuit_row_major.mtx" + "p2p-Gnutella31_row_major.mtx" + "offshore_row_major.mtx" + "cage12_row_major.mtx" + "2cubes_sphere_row_major.mtx" + "filter3D_row_major.mtx" + "ca-CondMat_row_major.mtx" + "wiki-Vote_row_major.mtx" + "poisson3Da_row_major.mtx" + "CollegeMsg_row_major.mtx" + "email-Eu-core_row_major.mtx" ) + +BUILD_DIR=./build + +for dataset in "${DATASETS[@]}" +do + echo ${BUILD_DIR}/mkl_spgemm $dataset + ${BUILD_DIR}/mkl_spgemm $DATASET_PATH/$dataset +done diff --git a/evaluation/FormatCustomization/PageRank/Intel_MKL/run_spmm.sh b/evaluation/FormatCustomization/PageRank/Intel_MKL/run_spmm.sh new file mode 100644 index 0000000..be97aae --- /dev/null +++ b/evaluation/FormatCustomization/PageRank/Intel_MKL/run_spmm.sh @@ -0,0 +1,29 @@ +make mkl_spmm + +export OMP_NUM_THREADS=48 + +DATASET_PATH=/work/shared/common/datasets/versatile_sparse_xcel + +DATASETS=( "2cubes_sphere_row_major.mtx" + "cage12_row_major.mtx" + "email-Eu-core_row_major.mtx" + "mario002_row_major.mtx" + "poisson3Da_row_major.mtx" + 
"ca-CondMat_row_major.mtx" + "CollegeMsg_row_major.mtx" + "filter3D_row_major.mtx" + "offshore_row_major.mtx" + "scircuit_row_major.mtx" + "wiki-Vote_row_major.mtx" + "amazon0312_row_major.mtx" + "web-Google_row_major.mtx" + "p2p-Gnutella31_row_major.mtx" + "m133-b3_row_major.mtx") + +BUILD_DIR=./build + +for dataset in "${DATASETS[@]}" +do + echo ${BUILD_DIR}/mkl_spmm $dataset + ${BUILD_DIR}/mkl_spmm $DATASET_PATH/$dataset +done diff --git a/evaluation/FormatCustomization/PageRank/Intel_MKL/run_spmspv.sh b/evaluation/FormatCustomization/PageRank/Intel_MKL/run_spmspv.sh new file mode 100644 index 0000000..afd4158 --- /dev/null +++ b/evaluation/FormatCustomization/PageRank/Intel_MKL/run_spmspv.sh @@ -0,0 +1,42 @@ +make mkl_spmspv + +export OMP_NUM_THREADS=48 + +export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/work/shared/common/datasets/versatile_sparse_xcel/cnpy/build +DATASET_PATH=/work/shared/common/datasets/versatile_sparse_xcel +SP_VEC0=_0.01 +SP_VEC1=_0.1 +SP_VEC2=_0.5 +SMCSRPOST=_row_major.mtx +SMCSCPOST=_col_major.mtx +SVPOST=.mtx + +DATSETS=(2cubes_sphere + CollegeMsg + amazon0312 + ca-CondMat + cage12 + email-Eu-core + filter3D + m133-b3 + mario002 + offshore + p2p-Gnutella31 + poisson3Da + scircuit + web-Google + wiki-Vote) + +BUILD_DIR=./build + +for dataset in ${DATSETS[@]} +do + echo ${BUILD_DIR}/mkl_spmspv $dataset$SP_VEC0$SVPOST + ${BUILD_DIR}/mkl_spmspv $DATASET_PATH/$dataset$SMCSRPOST $DATASET_PATH/sparse_vec/$dataset$SP_VEC0$SVPOST $DATASET_PATH/$dataset$SMCSCPOST + + echo ${BUILD_DIR}/mkl_spmspv $dataset$SP_VEC1$SVPOST + ${BUILD_DIR}/mkl_spmspv $DATASET_PATH/$dataset$SMCSRPOST $DATASET_PATH/sparse_vec/$dataset$SP_VEC1$SVPOST $DATASET_PATH/$dataset$SMCSCPOST + + echo ${BUILD_DIR}/mkl_spmspv $dataset$SP_VEC2$SVPOST + ${BUILD_DIR}/mkl_spmspv $DATASET_PATH/$dataset$SMCSRPOST $DATASET_PATH/sparse_vec/$dataset$SP_VEC2$SVPOST $DATASET_PATH/$dataset$SMCSCPOST +done diff --git a/evaluation/FormatCustomization/PageRank/Intel_MKL/run_spmv.sh 
b/evaluation/FormatCustomization/PageRank/Intel_MKL/run_spmv.sh new file mode 100644 index 0000000..ddb1d0b --- /dev/null +++ b/evaluation/FormatCustomization/PageRank/Intel_MKL/run_spmv.sh @@ -0,0 +1,35 @@ +make mkl_spmv + +export OMP_NUM_THREADS=48 + +DATASET_PATH=/work/shared/common/datasets/versatile_sparse_xcel + +# DATASETS=( "ss1_col_major.mtx" +# "stomach_col_major.mtx" +# "scircuit_col_major.mtx" +# "Hamrle3_col_major.mtx" +# "Transport_col_major.mtx") + +DATASETS=( "2cubes_sphere_row_major.mtx" + "cage12_row_major.mtx" + "email-Eu-core_row_major.mtx" + "mario002_row_major.mtx" + "poisson3Da_row_major.mtx" + "ca-CondMat_row_major.mtx" + "CollegeMsg_row_major.mtx" + "filter3D_row_major.mtx" + "offshore_row_major.mtx" + "scircuit_row_major.mtx" + "wiki-Vote_row_major.mtx" + "amazon0312_row_major.mtx" + "web-Google_row_major.mtx" + "p2p-Gnutella31_row_major.mtx" + "m133-b3_row_major.mtx") + +BUILD_DIR=./build + +for dataset in "${DATASETS[@]}" +do + echo ${BUILD_DIR}/mkl_spmv $dataset + ${BUILD_DIR}/mkl_spmv $DATASET_PATH/$dataset +done diff --git a/include/IR/UniSparseOps.td b/include/IR/UniSparseOps.td index 3365f58..6f5aed5 100644 --- a/include/IR/UniSparseOps.td +++ b/include/IR/UniSparseOps.td @@ -477,7 +477,7 @@ def UniSparse_COOSpMMOp: UniSparse_Op<"coo_spmm", [NoSideEffect]> { def UniSparse_DecomposeBDIAOp: UniSparse_Op<"decompose_BDIA", [NoSideEffect]> { let summary = "A COO SpMM runtime"; let description = [{ - Example: %0 = unisparse.coo_spmm %tensor, %matrix : , memref to memref + Example: %0 = unisparse.decompose_BDIA %tensor, %matrix : , memref to memref }]; let arguments = (ins AnyTensor: $tensor, AnyType: $blockSize, AnyType: $thres); let results = (outs UniSparse_StructType: $output); @@ -489,7 +489,7 @@ def UniSparse_DecomposeBDIAOp: UniSparse_Op<"decompose_BDIA", [NoSideEffect]> { def UniSparse_DecomposeBELLOp: UniSparse_Op<"decompose_BELL", [NoSideEffect]> { let summary = "A COO SpMM runtime"; let description = [{ - Example: %0 = 
unisparse.coo_spmm %tensor, %matrix : , memref to memref + Example: %0 = unisparse.decompose_BELL %tensor, %matrix : , memref to memref }]; let arguments = (ins AnyTensor: $tensor, AnyType: $blockSize, AnyType: $block_thres, AnyType: $col_thres); let results = (outs UniSparse_StructType: $output); @@ -501,7 +501,7 @@ def UniSparse_DecomposeBELLOp: UniSparse_Op<"decompose_BELL", [NoSideEffect]> { def UniSparse_BDIASpMVOp: UniSparse_Op<"bdia_spmv", [NoSideEffect]> { let summary = "A COO SpMM runtime"; let description = [{ - Example: %0 = unisparse.coo_spmm %tensor, %matrix : , memref to memref + Example: %0 = unisparse.bdia_spmv %tensor, %matrix : , memref to memref }]; let arguments = (ins AnyTensor: $tensor_CSR, AnyTensor: $tensor_BDIA, AnyTensor: $vector, AnyTensor: $out_vec); let results = (outs AnyStridedMemRefOfRank<1>: $result); @@ -513,7 +513,7 @@ def UniSparse_BDIASpMVOp: UniSparse_Op<"bdia_spmv", [NoSideEffect]> { def UniSparse_BDIASpMMOp: UniSparse_Op<"bdia_spmm", [NoSideEffect]> { let summary = "A COO SpMM runtime"; let description = [{ - Example: %0 = unisparse.coo_spmm %tensor, %matrix : , memref to memref + Example: %0 = unisparse.bdia_spmm %tensor, %matrix : , memref to memref }]; let arguments = (ins AnyTensor: $tensor_CSR, AnyTensor: $tensor_BDIA, AnyTensor: $in_mat, AnyTensor: $out_mat); let results = (outs AnyStridedMemRefOfRank<2>: $result); @@ -525,7 +525,7 @@ def UniSparse_BDIASpMMOp: UniSparse_Op<"bdia_spmm", [NoSideEffect]> { def UniSparse_ReleaseOp: UniSparse_Op<"release"> { let summary = "A COO SpMM runtime"; let description = [{ - Example: %0 = unisparse.coo_spmm %tensor, %matrix : , memref to memref + Example: %0 = unisparse.release %tensor, %matrix : , memref to memref }]; let arguments = (ins AnyType: $tensor); let assemblyFormat = [{ diff --git a/lib/Runtime/CMakeLists.txt b/lib/Runtime/CMakeLists.txt index b928eca..a562b83 100644 --- a/lib/Runtime/CMakeLists.txt +++ b/lib/Runtime/CMakeLists.txt @@ -29,5 +29,6 @@ 
add_mlir_library(mlir_unisparse_runner_utils set_property(TARGET mlir_unisparse_runner_utils PROPERTY CXX_STANDARD 14) target_compile_definitions(mlir_unisparse_runner_utils PRIVATE mlir_unisparse_runner_utils_EXPORTS) +target_link_libraries(mlir_unisparse_runner_utils PRIVATE OpenMP::OpenMP_CXX) # target_include_directories(mlir_unisparse_runner_utils PRIVATE ${CMAKE_CUDA_TOOLKIT_INCLUDE_DIRECTORIES}) # target_link_libraries(mlir_unisparse_runner_utils PRIVATE OpenMP::OpenMP_CXX PRIVATE ${CUDA_LIBRARIES} PRIVATE ${CUDA_cusparse_LIBRARY}) diff --git a/lib/Runtime/UniSparseUtils.cpp b/lib/Runtime/UniSparseUtils.cpp index 97573d7..dc677ec 100644 --- a/lib/Runtime/UniSparseUtils.cpp +++ b/lib/Runtime/UniSparseUtils.cpp @@ -18,6 +18,8 @@ // #define DEBUG #define PRINT false //#define PARALLEL +#define DataType float + #include #include @@ -3157,31 +3159,547 @@ FOREVERY_V(IMPL_NEWUNISPARSETENSOR) // delete static_cast *>(tensor); // } - using index_type = uint64_t; - index_type getTensorDim(void* ptr, index_type dim) { - char* fileName = static_cast(ptr); - char field[64]; - char symmetry[64]; - - FILE *file = fopen(fileName, "r"); - printf("filename %s\n", fileName); - if (!file) { - fprintf(stderr, "Cannot find %s\n", fileName); - exit(1); + using index_type = uint64_t; + index_type getTensorDim(void* ptr, index_type dim) { + char* fileName = static_cast(ptr); + char field[64]; + char symmetry[64]; + + FILE *file = fopen(fileName, "r"); + printf("filename %s\n", fileName); + if (!file) { + fprintf(stderr, "Cannot find %s\n", fileName); + exit(1); + } + + index_type metaData[512]; + if (strstr(fileName, ".mtx")) { + readMTXHeader(file, fileName, metaData, field, symmetry); + } else if (strstr(fileName, ".tns")) { + readFROSTTHeader(file, fileName, metaData); + } else { + fprintf(stderr, "Unknown format %s\n", fileName); + exit(1); + } + + index_type request_dim = dim + 2; + return metaData[request_dim]; + } + + + // Vanilla DIA SpMM + void 
_mlir_ciface_kernel_dia_spmm(StridedMemRefType *outC, + void* inA, + StridedMemRefType *inB, + StridedMemRefType *inC) { + printf("enter in kernel_dia_spmm\n"); + UniSparseStorage* spA = (UniSparseStorage*)inA; + // printf("spA->vLevel.size = %zu \n", spA->vLevel.size()); + std::shared_ptr spA_dim0 = spA->vLevel[0]; + std::shared_ptr spA_dim1 = spA->vLevel[1]; + // std::vector spA_data = spA->valueArray; + std::vector< std::vector > spA_vector = spA->vectorArray; + // printf("spA_data.size = %zu\n", spA_data.size()); + // printf("spA_vector.size = %zu\n", spA_vector.size()); + int64_t iSize = inC->sizes[0]; + int64_t jSize = inB->sizes[0]; + int64_t kSize = inC->sizes[1]; + printf("iSize = %ld, jSize = %ld, kSize = %ld\n", iSize, jSize, kSize); + // std::vector spA_dim0_crd = spA_dim0->crd; + std::vector spA_dim1_crd = spA_dim1->crd; + // // int spA_dim0_size = spA_dim0->size; + // // int spA_dim1_size = spA_dim1->size; + // printf("spA_dim0_crd = "); + // for (auto elm: spA_dim0_crd) { + // printf("%d ", elm); + // } + // printf("\n"); + // printf("spA_dim1_crd = "); + // for (auto elm: spA_dim1_crd) { + // printf("%d ", elm); + // } + // printf("\n"); + // printf("spA_dim0_size = %d, spA_dim1_size = %d \n",spA_dim0_size,spA_dim1_size); + + // printf("spA_vector = \n"); + // for (auto v: spA_vector) { + // for (auto elm: v) { + // printf("%f ", elm); + // } + // printf("\n"); + // } + // printf("\n"); + + // A*B + C + outC->basePtr = outC->data = inC->data; + outC->offset = inC->offset; + outC->strides[0] = outC->strides[1] = 1; + outC->sizes[0] = inC->sizes[0]; + outC->sizes[1] = inC->sizes[1]; + // printf("inB_data = \n"); + // for (unsigned j=0; j < jSize; j++) { + // for (unsigned k = 0; k < kSize; k++) + // printf("%f ", inB->data[j*kSize+k]); + // printf("\n"); + // } + // printf("outC_data = \n"); + // for (unsigned i=0; i < iSize; i++) { + // for (unsigned k = 0; k < kSize; k++) + // printf("%f ", outC->data[i*iSize+k]); + // printf("\n"); + // } + for 
(unsigned diag = 0; diag < spA_dim1_crd.size(); diag++) { + for (int i = 0; i < iSize; i++) { + int j = spA_dim1_crd[diag] + i; + if (j >=0 && j < jSize) { + for (int k = 0; k < kSize; k++) { + outC->data[i*kSize+k] += spA_vector[diag][i] * inB->data[j*kSize+k]; + } + } + } } + // printf("outC_data = \n"); + // for (unsigned i=0; i < iSize; i++) { + // for (unsigned k = 0; k < kSize; k++) + // printf("%f ", outC->data[i*kSize+k]); + // printf("\n"); + // } + // printf("\n"); + } + + void _mlir_ciface_kernel_dia_spmv(StridedMemRefType *outC, + void* inA, + StridedMemRefType *inB, + StridedMemRefType *inC) { + UniSparseStorage* spA = (UniSparseStorage*)inA; + int32_t* spA_dim0_crd = spA->vLevel[1]->crd.data(); + uint64_t spA_dim0_size = spA->vLevel[1]->crd.size(); + std::vector< std::vector > spA_vector = spA->vectorArray; + int64_t iSize = inC->sizes[0]; + int64_t jSize = inB->sizes[0]; + + // A*B + C + outC->basePtr = outC->data = inC->data; + outC->offset = inC->offset; + outC->strides[0] = 1; + outC->sizes[0] = inC->sizes[0]; + + uint64_t diag; + int i, j; + float sum; + double start = omp_get_wtime(); + for (unsigned time = 0; time < 10000; time++) { + #pragma omp parallel for private(diag,i,j,sum) + for (diag = 0; diag < spA_dim0_size; diag++) { + sum=0; + #pragma omp simd reduction(+:sum) + for (i = 0; i < iSize; i++) { + j = spA_dim0_crd[diag] + i; + if (j >=0 && j < jSize) { + sum += spA_vector[diag][i] * inB->data[j]; + } + } + outC->data[i]=sum; + } + } + double end = omp_get_wtime(); + std::cout << "omp time = " << end-start << " s"<< std::endl; + std::cout << "avg time = " << (end-start)*1000/10000 << " ms"<< std::endl; + } + + void _mlir_ciface_calculateCOOSpMV(StridedMemRefType *out, void *ptr, + StridedMemRefType *input, StridedMemRefType *ref) { + UniSparseStorage* sparT = (UniSparseStorage*)(ptr); + int32_t *row_crd = sparT->vLevel[1]->crd.data(); + int32_t *col_crd = sparT->vLevel[2]->crd.data(); + float *values = sparT->valueArray.data(); + 
uint64_t nnz = sparT->vLevel[2]->crd.size(); + std::cout << "nnz is " << nnz << std::endl; + std::cout << input->data << std::endl; + std::cout << ref->data << std::endl; + for(uint64_t i = 0; i < nnz; i++) { + int32_t rowInd =row_crd[i]; + int32_t colInd = col_crd[i]; + ref->data[rowInd] += values[i] * input->data[colInd]; + } + std::cout << "End loop " << std::endl; + out->data = ref->data; + out->basePtr = ref->data; + out->offset = 0; + out->strides[0] = 1; + } + + void _mlir_ciface_calculateCOOSpMM(StridedMemRefType *out, void *ptr, + StridedMemRefType *input, StridedMemRefType *ref) { + UniSparseStorage* sparT = (UniSparseStorage*)(ptr); + int32_t *row_crd = sparT->vLevel[1]->crd.data(); + int32_t *col_crd = sparT->vLevel[2]->crd.data(); + float *values = sparT->valueArray.data(); + uint64_t nnz = sparT->vLevel[2]->crd.size(); + uint64_t kSize = input->sizes[1]; + std::cout << "nnz is " << nnz << std::endl; + std::cout << input->data << std::endl; + std::cout << ref->data << std::endl; + for(uint64_t i = 0; i < nnz; i++) { + for(uint64_t k = 0; k < kSize; k++) { + int32_t rowInd =row_crd[i]; + int32_t colInd = col_crd[i]; + ref->data[rowInd*kSize + k] += values[i] * input->data[colInd*kSize + k]; + } + } + std::cout << "End loop " << std::endl; + out->data = ref->data; + out->basePtr = ref->data; + out->offset = 0; + out->strides[0] = 1; + out->strides[1] = 1; + } + + #pragma omp declare simd uniform(x, y) linear(i : 1) aligned(x, y : 32) notinbranch + void xpy(float* x, float* y, int i) { + y[i] = x[i] + y[i]; + } + + void _mlir_ciface_kernel_bdia_spmv_iter(StridedMemRefType *outC, + void* inA_CSR, + void* inA_BDIA, + StridedMemRefType *inB, + StridedMemRefType *inC) { + + int ib, i, k, diag, is, ie; + DataType sum; + UniSparseStorage* spA_CSR = (UniSparseStorage*)inA_CSR; + UniSparseStorage* spA_BDIA = (UniSparseStorage*)inA_BDIA; + int32_t* BDIA_dim1_ptr = spA_BDIA->vLevel[1]->ptr.data(); + int n_blocks = spA_BDIA->vLevel[1]->ptr.size(); + int32_t* 
BDIA_dim2_crd = spA_BDIA->vLevel[2]->crd.data(); + int32_t* CSR_dim1_ptr = spA_CSR->vLevel[1]->ptr.data(); + int32_t* CSR_dim2_crd = spA_CSR->vLevel[2]->crd.data(); + DataType* CSR_value = spA_CSR->valueArray.data(); + + // float* inB_data = inB->data; + // float* inC_data = inC->data; + int blockSize = spA_BDIA->vLevel[3]->size; + std::vector BDIA_vector = spA_BDIA->vector_1d; + int64_t iSize = inC->sizes[0]; + // int64_t jSize = inB->sizes[0]; + // double csr_time = 0.0; + // double bdia_time = 0.0; + unsigned runs = 1; + // std::cout << inC->data[0] << " " << inC->data[1] << " " << inC->data[2] << " " << inC->data[3] << std::endl; + + double start0 = omp_get_wtime(); + for (unsigned time = 0; time < runs; time++) { + #pragma omp parallel for private(ib,i,k,sum,diag,is,ie) + for (ib = 0; ib < n_blocks-1; ib++) { + for (i = ib*blockSize; i < std::min((ib+1)*blockSize, (int)iSize); i++) { + sum=0; + #pragma omp simd reduction(+:sum) + for(k=CSR_dim1_ptr[i]; kdata[CSR_dim2_crd[k]]); + // if(i==0) { + // std::cout << "i="<data="<data[CSR_dim2_crd[k]]<data[i] = sum; + // if(i==0) + // std::cout<<"sum="<< sum<<", inC->data["<data[i]<data="<< inB->data[i+diag] << ", inC->data["<data[i]<=0) && (i+ib*blockSize+diag < jSize)) + // inC->data[i+ib*blockSize] += BDIA_vector[k*blockSize+i] * inB->data[i+ib*blockSize+diag]; + // } + } + } + } + double end0 = omp_get_wtime(); + std::cout << "Hybrid total time = " << (end0-start0) << " s"<< std::endl; + std::cout << "Hybrid avg time = " << (end0-start0)*1000/runs << " ms"<< std::endl; + // std::cout << (end0-start0)*1000/runs << std::endl; + + + outC->data = inC->data; + outC->basePtr = inC->basePtr; + outC->offset = inC->offset; + outC->sizes[0] = inC->sizes[0]; + outC->strides[0] = inC->strides[0]; + // for(unsigned i = 0; i <4; i++ ) + // std::cout <<"outC->data["<data[i]< *outC, + void* inA_CSR, + void* inA_BDIA, + StridedMemRefType *inB, + StridedMemRefType *inC) { + + int ib, i, k, j, diag, is, ie; + + UniSparseStorage* 
spA_CSR = (UniSparseStorage*)inA_CSR; + UniSparseStorage* spA_BDIA = (UniSparseStorage*)inA_BDIA; + int32_t* BDIA_dim1_ptr = spA_BDIA->vLevel[1]->ptr.data(); + int n_blocks = spA_BDIA->vLevel[1]->ptr.size(); + int32_t* BDIA_dim2_crd = spA_BDIA->vLevel[2]->crd.data(); + int32_t* CSR_dim1_ptr = spA_CSR->vLevel[1]->ptr.data(); + int32_t* CSR_dim2_crd = spA_CSR->vLevel[2]->crd.data(); + DataType* CSR_value = spA_CSR->valueArray.data(); + + // float* inB_data = inB->data; + // float* inC_data = inC->data; + int blockSize = spA_BDIA->vLevel[3]->size; + std::vector BDIA_vector = spA_BDIA->vector_1d; + int64_t iSize = inC->sizes[0]; + // int64_t jSize = inB->sizes[0]; + int64_t kSize = inB->sizes[1]; + assert(kSize == inC->sizes[1]); + DataType *sum; + double start = omp_get_wtime(); + + for (unsigned time = 0; time < 1000; time++) { + #pragma omp parallel for private(ib,i,k,j,sum,diag,is,ie) + for (ib = 0; ib < n_blocks-1; ib++) { + for (i = ib*blockSize; i < std::min((ib+1)*blockSize, (int)iSize); i++) { + sum=new DataType[kSize](); + // sum=0; + + for (j=0;jdata[CSR_dim2_crd[k]]=" + // <data[CSR_dim2_crd[k]]<data[CSR_dim2_crd[k]*kSize+j]); + } + // if(i==0) { + // std::cout << "i="<data="<data[CSR_dim2_crd[k]]<data[i*kSize+j] = sum[j]; + } + // for (j=0;jdata[i*kSize+j] = sum[j]; + // } + delete[] sum; + // if(i==0) + // std::cout<<"sum="<< sum<<", inC->data["<data[i]<data="<< inB->data[i+diag] << ", inC->data["<data[i]<=0) && (i+ib*blockSize+diag < jSize)) + // inC->data[i+ib*blockSize] += BDIA_vector[k*blockSize+i] * inB->data[i+ib*blockSize+diag]; + // } + } + // Robot motion planing + // for (i = ib*blockSize; i < std::min((ib+1)*blockSize, (int)iSize); i++) { + // for(j=0; jdata[i*kSize+j] = inC->data[i*kSize+j]; + // } + // } + } + } + + double end = omp_get_wtime(); + std::cout << "omp time = " << end-start << " s"<< std::endl; + std::cout << "avg time = " << (end-start)*1000/1000 << " ms"<< std::endl; + + outC->data = inC->data; + outC->basePtr = inC->basePtr; + 
outC->offset = inC->offset; + outC->sizes[0] = inC->sizes[0]; + outC->sizes[1] = inC->sizes[1]; + outC->strides[0] = inC->strides[0]; + outC->strides[1] = inC->strides[1]; + for(unsigned i = 0; i <4; i++ ) { + for(unsigned j = 0; j <4; j++ ) + std::cout <data[i*kSize+j]<<" "; + std::cout << std::endl; } - index_type request_dim = dim + 2; - return metaData[request_dim]; + } + + + void _mlir_ciface_kernel_bdia_spmv(StridedMemRefType *outC, + void* inA_CSR, + void* inA_BDIA, + StridedMemRefType *inB, + StridedMemRefType *inC) { + + // std::cout << "inB = "; + // for (unsigned i = 0; i < jSize; i++) + // std::cout << inB->data[i] << " "; + // std::cout << std::endl; + + + + // A*B + C + // outC->basePtr = outC->data = inC->data; + // outC->offset = inC->offset; + // outC->strides[0] = 1; + // outC->sizes[0] = inC->sizes[0]; + // printf("inC_data = \n"); + // for (unsigned i=0; i < 4; i++) + // printf("%f ", inC->data[i]); + // printf("\n"); + int ib, i, k, diag, is, ie; + DataType sum; + UniSparseStorage* spA_CSR = (UniSparseStorage*)inA_CSR; + UniSparseStorage* spA_BDIA = (UniSparseStorage*)inA_BDIA; + int32_t* BDIA_dim1_ptr = spA_BDIA->vLevel[1]->ptr.data(); + int n_blocks = spA_BDIA->vLevel[1]->ptr.size(); + int32_t* BDIA_dim2_crd = spA_BDIA->vLevel[2]->crd.data(); + int32_t* CSR_dim1_ptr = spA_CSR->vLevel[1]->ptr.data(); + int32_t* CSR_dim2_crd = spA_CSR->vLevel[2]->crd.data(); + DataType* CSR_value = spA_CSR->valueArray.data(); + + // float* inB_data = inB->data; + // float* inC_data = inC->data; + int blockSize = spA_BDIA->vLevel[3]->size; + std::vector BDIA_vector = spA_BDIA->vector_1d; + int64_t iSize = inC->sizes[0]; + // int64_t jSize = inB->sizes[0]; + double start = omp_get_wtime(); + // std::cout << "n_blocks = " << n_blocks << std::endl; + // // std::cout << "BDIA_dim1_ptr_size = " << BDIA_dim1_ptr_size << std::endl; + // // std::cout << "spA->vLevel[3]->size = " << blockSize << std::endl; + // std::cout << "spA_BDIA->vLevel[1]->ptr = "; + // for (auto 
x: spA_BDIA->vLevel[1]->ptr) + // std::cout << x << " "; + // std::cout << std::endl; + // std::cout << "spA_BDIA->vLevel[2]->crd = "; + // for (auto x: spA_BDIA->vLevel[2]->crd) + // std::cout << x << " "; + // std::cout << std::endl; + // std::cout << "spA_BDIA->vectorArray = "; + // // for (auto i:spA_vector) { + // // for (auto j: i) { + // // std::cout << j << " "; + // // } + // // std::cout << std::endl; + // // } + // for (unsigned i = 0; i < BDIA_vector.size(); i++) { + // // for (unsigned j = 0; j < BDIA_vector[i].size(); j++) + // std::cout << BDIA_vector[i] << " "; + // std::cout << std::endl; + // } + // std::cout << "CSR_dim1_ptr size = " << spA_CSR->vLevel[1]->ptr.size() << std::endl; + // std::cout << "spA_CSR->vLevel[1]->ptr = "; + // for (auto j: spA_CSR->vLevel[1]->ptr) { + // std::cout << j << " "; + // } + // std::cout << std::endl; + // std::cout << "CSR_dim2_crd size = " << spA_CSR->vLevel[2]->crd.size() << std::endl; + // std::cout << "spA_CSR->vLevel[2]->crd = "; + // for (auto j: spA_CSR->vLevel[2]->crd) { + // std::cout << j << " "; + // } + // std::cout << std::endl; + // std::cout << "CSR_value size = " << spA_CSR->valueArray.size() << std::endl; + // std::cout << "spA_CSR->valueArray = "; + // for (auto j: spA_CSR->valueArray) { + // std::cout << j << " "; + // } + // std::cout << std::endl; + for (unsigned time = 0; time < 10000; time++) { + #pragma omp parallel for private(ib,i,k,sum,diag,is,ie) + for (ib = 0; ib < n_blocks-1; ib++) { + for (i = ib*blockSize; i < std::min((ib+1)*blockSize, (int)iSize); i++) { + sum=0; + #pragma omp simd reduction(+:sum) + for(k=CSR_dim1_ptr[i]; kdata[CSR_dim2_crd[k]]=" + // <data[CSR_dim2_crd[k]]<data[CSR_dim2_crd[k]]); + // if(i==0) { + // std::cout << "i="<data="<data[CSR_dim2_crd[k]]<data[i] = sum; + // if(i==0) + // std::cout<<"sum="<< sum<<", inC->data["<data[i]<data="<< inB->data[i+diag] << ", inC->data["<data[i]<=0) && (i+ib*blockSize+diag < jSize)) + // inC->data[i+ib*blockSize] += 
BDIA_vector[k*blockSize+i] * inB->data[i+ib*blockSize+diag]; + // } + } + } + } + + double end = omp_get_wtime(); + std::cout << "omp time = " << end-start << " s"<< std::endl; + std::cout << "avg time = " << (end-start)*1000/10000 << " ms"<< std::endl; + + outC->data = inC->data; + outC->basePtr = inC->basePtr; + outC->offset = inC->offset; + outC->sizes[0] = inC->sizes[0]; + outC->strides[0] = inC->strides[0]; + for(unsigned i = 0; i <4; i++ ) + std::cout <<"outC->data["<data[i]<data[0]=" <data[0] << ", outC->data[0]=" << outC->data[0]<vectorArray = " << std::endl; + // // for (auto i:spA_vector) { + // // for (auto j: i) { + // // std::cout << j << " "; + // // } + // // std::cout << std::endl; + // // } + // for (unsigned i = 0; i < spA_vector.size(); i++) { + // for (unsigned j = 0; j < spA_vector[i].size(); j++) + // std::cout << spA_vector[i][j] << " "; + // std::cout << std::endl; + // } + // printf("outC_data = \n"); + // for (unsigned i=0; i < iSize; i++) + // printf("%f ", outC->data[i]); + // printf("\n"); } void output_header(std::ofstream& outfile, int row_size, int col_size, int nnz) { @@ -3193,6 +3711,832 @@ FOREVERY_V(IMPL_NEWUNISPARSETENSOR) outfile << row_size << " " << col_size << " " << nnz << "\n"; } + void* _mlir_ciface_decompose_BDIA(void* ptr, int32_t blockSize, float thres) { + UniSparseStorage* sparT = (UniSparseStorage*)ptr; + // int32_t *row_crd = sparT->vLevel[1]->crd.data(); + // int32_t *col_crd = sparT->vLevel[2]->crd.data(); + + uint64_t row_size = sparT->dimSizes.data()[0]; + uint64_t col_size = sparT->dimSizes.data()[1]; + // float *values = sparT->valueArray.data(); + uint64_t nnz = sparT->vLevel[2]->crd.size(); +// std::vector row_crd(sparT->vLevel[1]->crd); +// std::vector col_crd(sparT->vLevel[2]->crd); +// std::vector values(sparT->valueArray); + sparT->vLevel[0]->ptr.pop_back(); + sparT->vLevel[1]->crd.clear(); + sparT->vLevel[1]->same_path.clear(); + sparT->vLevel[1]->same_path.push_back(0); + 
sparT->vLevel[2]->crd.clear(); + sparT->vLevel[2]->same_path.clear(); + sparT->vLevel[2]->same_path.push_back(0); + sparT->valueArray.clear(); + // for (unsigned i = 0; i < nnz; i++) { + // std::cout << "row = " << row_crd[i] << "col = "<< col_crd[i] << std::endl; + // } + std::cout << "blockSize = " << blockSize << ", thres = " << thres << std::endl; + // std::cout << "row_size = " << row_size << ", col_size = " << col_size << ", nnz = " << nnz << std::endl; + assert(col_size >= row_size); + // bool *root_same_path = sparT->vLevel[0]->same_path.data(); + // bool *row_same_path = sparT->vLevel[1]->same_path.data(); + // bool *col_same_path = sparT->vLevel[2]->same_path.data(); + // std::cout << "root_same_path size= " << sparT->vLevel[0]->same_path.size() << std::endl; + // std::cout << "row_same_path size= " << sparT->vLevel[1]->same_path.size() << std::endl; + // std::cout << "col_same_path size= " << sparT->vLevel[2]->same_path.size() << std::endl; + // std::cout << "root_ptr size= " << sparT->vLevel[0]->ptr.size() << std::endl; + // std::cout << "root_ptr[1] = " << sparT->vLevel[0]->ptr[1] << std::endl; + + int** diag_nnz = new int *[((row_size-1)/blockSize)+1]; + for (unsigned i = 0; i < ((row_size-1)/blockSize)+1; i++) + diag_nnz[i] = new int[row_size+col_size-1]; + for (unsigned i = 0; i < ((row_size-1)/blockSize)+1; i++) + for (unsigned j = 0; j < row_size+col_size-1; j++) + diag_nnz[i][j] = 0; + for(uint64_t i = 0; i < nnz; i++) { + // if (values[i] == 0) + // continue; + int new_dim0 = sparT->vLevel[1]->crd[i]/blockSize; + int new_dim1 = sparT->vLevel[2]->crd[i]-sparT->vLevel[1]->crd[i]; + diag_nnz[new_dim0][new_dim1+col_size-1] += 1; + } + // std::cout << "diag_nnz:" << std::endl; + // for (unsigned i = 0; i < ((row_size-1)/blockSize)+1; i++) { + // for (unsigned j = 0; j < row_size+col_size-1; j++) + // std::cout << diag_nnz[i][j] << " "; + // std::cout << std::endl; + // } + // split the matrix + // step 1: initialize vectorArray + auto T_BDIA = new 
UniSparseStorage(); + for (unsigned i = 0; i <= 3; i++) + T_BDIA->vLevel.push_back(std::shared_ptr(new LevelStorage)); + T_BDIA->vLevel[1]->type = LVFUSE ; + T_BDIA->vLevel[1]->ptr.push_back(0); + T_BDIA->vLevel[2]->type = LVTRIM ; + T_BDIA->vLevel[3]->size = blockSize; + T_BDIA->dimSizes.push_back(row_size); + T_BDIA->dimSizes.push_back(col_size); + // UniSparseStorage* T_COO = new UniSparseStorage; + // T_COO->initCOO(row_size,col_size); + + + int row_diag_count = 0; + for (unsigned i = 0; i < ((row_size-1)/blockSize)+1; i++) { + for (unsigned j = 0; j < row_size+col_size-1; j++) { + if (diag_nnz[i][j] > blockSize*thres) { + row_diag_count++; + T_BDIA->vLevel[2]->crd.push_back(j-col_size+1); + // std::vector new_vec(blockSize, 0.0); + // T_BDIA->vectorArray.push_back(new_vec); + for (int k = 0; k < blockSize; k++) + T_BDIA->vector_1d.push_back(0); + } + } + T_BDIA->vLevel[1]->ptr.push_back(row_diag_count); + } + // std::cout << "T_BDIA->vLevel[1]->ptr = "; + // for (auto elm: T_BDIA->vLevel[1]->ptr) + // std::cout << elm << " "; + // std::cout << std::endl; + // std::cout << "T_BDIA->vLevel[2]->crd = "; + // for (auto elm: T_BDIA->vLevel[2]->crd) + // std::cout << elm << " "; + // std::cout << std::endl; + // std::cout << "T_BDIA->vectorArray.size = " << T_BDIA->vectorArray.size() << std::endl; + + //step 2: distribute values + int* dim1_ptr = T_BDIA->vLevel[1]->ptr.data(); + int* dim2_crd = T_BDIA->vLevel[2]->crd.data(); + // std::vector punch_pos; + int dia_nnz_count = 0; + std::string output_file_path = "/work/shared/users/staff/zz546/Sparse_Layout_Dialect/test/Data/output_matrix_market.mtx"; + std::ofstream outfile(output_file_path); + output_header(outfile, row_size, col_size, nnz); + for(unsigned i = 0; i < nnz; i++) { + // if (values[i] == 0) + // continue; + int new_dim1 = sparT->vLevel[1]->crd[i]/blockSize; + int new_dim2 = sparT->vLevel[2]->crd[i]-sparT->vLevel[1]->crd[i]; + int new_dim3 = sparT->vLevel[1]->crd[i]%blockSize; + if 
(diag_nnz[new_dim1][new_dim2+col_size-1] > blockSize*thres) { + outfile << sparT->vLevel[1]->crd[i]+1 << " " << sparT->vLevel[2]->crd[i]+1 << " " << std::scientific << std::setprecision(3) << sparT->valueArray[i] << "\n"; + // if (row_crd[i] == 0) + // std::cout << "col = "<< col_crd[i] << ", values =" << values[i] << std::endl; + // BDIA + int diag_block; + for (diag_block = dim1_ptr[new_dim1]; diag_block < dim1_ptr[new_dim1+1]; diag_block++) + if (dim2_crd[diag_block] == new_dim2) + break; + // T_BDIA->vectorArray[diag_block][new_dim3] = values[i]; + T_BDIA->vector_1d[diag_block*blockSize+new_dim3] = sparT->valueArray[i]; + dia_nnz_count++; + // punch_pos.push_back(i); + } + else { + if (sparT->valueArray.size() > 0) { + sparT->vLevel[1]->same_path.push_back(sparT->vLevel[1]->crd[i] == sparT->vLevel[1]->crd.back()); + sparT->vLevel[2]->same_path.push_back( + (sparT->vLevel[1]->crd[i] == sparT->vLevel[1]->crd.back()) && (sparT->vLevel[2]->crd[i] == sparT->vLevel[2]->crd.back())); + } + sparT->vLevel[1]->crd.push_back(sparT->vLevel[1]->crd[i]); + sparT->vLevel[2]->crd.push_back(sparT->vLevel[2]->crd[i]); + sparT->valueArray.push_back(sparT->valueArray[i]); + // T_COO->vLevel[1]->crd.push_back(row_crd[i]); + // T_COO->vLevel[1]->same_path.push_back(row_crd[i]== T_COO->vLevel[1]->crd.back()); + // T_COO->vLevel[2]->crd.push_back(col_crd[i]); + // T_COO->valueArray.push_back(values[i]); + } + } + outfile.seekp(0); + output_header(outfile, row_size, col_size, dia_nnz_count); + outfile.close(); + + + // for (auto pos: punch_pos) { + // sparT->valueArray[pos]=0; + // } + sparT->vLevel[0]->ptr.push_back(sparT->vLevel[1]->crd.size()); + + // std::cout << "row_same_path size= " << sparT->vLevel[1]->same_path.size() << std::endl; + // std::cout << "col_same_path size= " << sparT->vLevel[2]->same_path.size() << std::endl; + // std::cout << "root_ptr size= " << sparT->vLevel[0]->ptr.size() << std::endl; + std::cout << "root_ptr[1] = " << sparT->vLevel[0]->ptr[1] << std::endl; 
+ std::cout << "diag_nnz_count = " << dia_nnz_count << std::endl; + // std::cout << "T_BDIA->vectorArray = " << std::endl; + // for (auto i=dim1_ptr[0]; i < dim1_ptr[1]; i++) { + // std::cout << "diag=" << dim2_crd[i] << ", "<< T_BDIA->vectorArray[i][0] << " "<valueArray = " << std::endl; + // for (unsigned x = 0; x < T_COO->vLevel[1]->crd.size(); x++) { + // if (T_COO->vLevel[1]->crd[x]==0) + // std::cout <vLevel[1]->crd[x]<<", "<vLevel[2]->crd[x]<<", " <valueArray[x] << " "<finalizeCOO(); + UniSparseStruct* ret = new UniSparseStruct; + ret->vec.push_back((void*)sparT); + ret->vec.push_back((void*)T_BDIA); + return (void*) ret; + } + + void* _mlir_ciface_decompose_BDIA_opt(void* ptr, int32_t blockSize, float thres) { + UniSparseStorage* sparT = (UniSparseStorage*)ptr; + + uint64_t row_size = sparT->dimSizes.data()[0]; + uint64_t col_size = sparT->dimSizes.data()[1]; + uint64_t nnz = sparT->vLevel[2]->crd.size(); +// std::vector row_crd(sparT->vLevel[1]->crd); +// std::vector col_crd(sparT->vLevel[2]->crd); +// std::vector values(sparT->valueArray); + sparT->vLevel[0]->ptr.pop_back(); + sparT->vLevel[1]->crd.clear(); + sparT->vLevel[1]->same_path.clear(); + sparT->vLevel[2]->crd.clear(); + sparT->vLevel[2]->same_path.clear(); + sparT->valueArray.clear(); + std::cout << "blockSize = " << blockSize << ", thres = " << thres << std::endl; + assert(col_size >= row_size); + + // step 1: initialize vectorArray + auto T_BDIA = new UniSparseStorage(); + for (unsigned i = 0; i <= 3; i++) + T_BDIA->vLevel.push_back(std::shared_ptr(new LevelStorage)); + T_BDIA->vLevel[1]->type = LVFUSE ; + T_BDIA->vLevel[1]->ptr.push_back(0); + T_BDIA->vLevel[2]->type = LVTRIM ; + T_BDIA->vLevel[3]->size = blockSize; + T_BDIA->dimSizes.push_back(row_size); + T_BDIA->dimSizes.push_back(col_size); + + // assume read-in data is in row-major order + double start = omp_get_wtime(); + + uint64_t diag_block_count = 0; + uint64_t diag_nnz_count = 0; + std::vector row_ptr; + std::vector dia_row_ptr; + 
row_ptr.push_back(0); + dia_row_ptr.push_back(0); + int* diag_nnz = new int[blockSize+col_size-1]; + for(unsigned i = 0; i < blockSize+col_size-1; i++) + diag_nnz[i] = 0; + int prev_row_block = sparT->vLevel[1]->crd[0]/blockSize; + + for(uint64_t i = 0; i < nnz; i++) { + int new_dim1 = sparT->vLevel[1]->crd[i] / blockSize; + int new_dim2 = sparT->vLevel[2]->crd[i] - sparT->vLevel[1]->crd[i]; + if (new_dim1 == prev_row_block) { + diag_nnz[new_dim2+(new_dim1+1)*blockSize-1] += 1; + } else { + for (uint64_t j = 0; j < blockSize+col_size-1; j++) { + if (diag_nnz[j]> blockSize*thres) { + diag_block_count++; + diag_nnz_count += diag_nnz[j]; + int64_t offset=j-(prev_row_block+1)*blockSize+1; + // std::cout <<"row="<vec.push_back((void*)sparT); + ret->vec.push_back((void*)T_BDIA); + return (void*) ret; + } + + void* _mlir_ciface_decompose_BDIA_opt2(void* ptr, int32_t blockSize, float thres) { + UniSparseStorage* sparT = (UniSparseStorage*)ptr; + + uint64_t row_size = sparT->dimSizes.data()[0]; + uint64_t col_size = sparT->dimSizes.data()[1]; + uint64_t nnz = sparT->vLevel[2]->crd.size(); + std::vector row_crd(sparT->vLevel[1]->crd); + std::vector col_crd(sparT->vLevel[2]->crd); + std::vector values(sparT->valueArray); + sparT->vLevel[0]->ptr.pop_back(); + sparT->vLevel[1]->crd.clear(); + sparT->vLevel[1]->same_path.clear(); + sparT->vLevel[2]->crd.clear(); + sparT->vLevel[2]->same_path.clear(); + sparT->valueArray.clear(); + std::cout << "blockSize = " << blockSize << ", thres = " << thres << std::endl; + assert(col_size >= row_size); + + // step 1: initialize vectorArray + auto T_BDIA = new UniSparseStorage(); + for (unsigned i = 0; i <= 3; i++) + T_BDIA->vLevel.push_back(std::shared_ptr(new LevelStorage)); + T_BDIA->vLevel[1]->type = LVFUSE ; + T_BDIA->vLevel[1]->ptr.resize(((row_size-1)/blockSize)+2, 0); + T_BDIA->vLevel[2]->type = LVTRIM ; + T_BDIA->vLevel[3]->size = blockSize; + T_BDIA->dimSizes.push_back(row_size); + T_BDIA->dimSizes.push_back(col_size); + + // 
assume read-in data is in row-major order + double start = omp_get_wtime(); + + std::vector diag_block_count(((row_size-1)/blockSize)+1, 0); + // std::cout<<"diag_block_count size="< row_ptr(((row_size-1)/blockSize)+2, 0); + // row_ptr.push_back(0); + double mem_1 = omp_get_wtime(); + std::vector dia_row_ptr(((row_size-1)/blockSize)+2, 0); + double mem_2 = omp_get_wtime(); + + double mem_3 = omp_get_wtime(); + // std::vector diag_nnz((((row_size-1)/blockSize)+1)*(blockSize+col_size-1), 0); + std::vector> diag_off(((row_size-1)/blockSize)+1); + + double mem_4 = omp_get_wtime(); + + int first_dim1 = row_crd[0]/blockSize; + for (int m = 0; m < first_dim1; m++) + row_ptr[m+1]=0; + int prev_dim1, new_dim1, init_j; + unsigned init_i; + double end_0 = omp_get_wtime(); + #pragma omp parallel for private(prev_dim1, new_dim1, init_j, init_i) + for(init_i = 1; init_i < nnz; init_i++) { + prev_dim1 = row_crd[init_i-1]/blockSize; + new_dim1 = row_crd[init_i]/blockSize; + if (new_dim1 != prev_dim1) { + for (init_j = prev_dim1; init_j < new_dim1; init_j++) + row_ptr[init_j+1]=init_i; + } + } + // std::cout << "new_dim1 = "< diag_nnz; + for(unsigned time = 0; time < 1; time++) { + #pragma omp parallel for private(diag_nnz, iter1_i, iter1_pos, iter1_j, iter1_dim2) + for (iter1_i = 0; iter1_i < ((row_size-1)/blockSize)+1; iter1_i++) { + diag_nnz.clear(); + diag_nnz.resize(blockSize+col_size-1, 0); + diag_block_count[iter1_i] = 0; + dia_row_ptr[iter1_i+1] = 0; + std::vector().swap(diag_off[iter1_i]); + for(iter1_pos = row_ptr[iter1_i]; iter1_pos < row_ptr[iter1_i+1]; iter1_pos++) { + // iter1_dim1 = row_crd[iter1_pos]/blockSize; + iter1_dim2 = col_crd[iter1_pos]-row_crd[iter1_pos]; + // iter1_dim3 = row_crd[iter1_pos]%blockSize; + diag_nnz[iter1_dim2+(iter1_i+1)*blockSize-1] += 1; + } + // std::cout<<"row block="< blockSize*thres) { + diag_block_count[iter1_i] += 1; + dia_row_ptr[iter1_i+1] += diag_nnz[iter1_j]; + int offset = (int)iter1_j-(iter1_i+1)*blockSize+1; + 
diag_off[iter1_i].push_back(offset); + // if (iter1_i==0) + // std::cout<<"dia_row_ptr["<vLevel[1]->ptr["<vLevel[1]->ptr[init+1]<vLevel[1]->ptr.size()="<vLevel[1]->ptr.size()<vLevel[2]->crd.reserve(total_dia_block); + for (unsigned init = 0; init < ((row_size-1)/blockSize)+1; init++) { + for (auto elm: diag_off[init]) + T_BDIA->vLevel[2]->crd.push_back(elm); + std::vector().swap(diag_off[init]); + } + std::vector>().swap(diag_off); + std::vector().swap(diag_block_count); + + + assert(T_BDIA->vLevel[2]->crd.size()==(unsigned)total_dia_block); + T_BDIA->vector_1d.resize(total_dia_block*blockSize, 0.0); + + // unsigned iter3_i, iter3_j; + // #pragma omp parallel for private(iter3_i, iter3_j) + // for (iter3_i = 0; iter3_i < ((row_size-1)/blockSize)+1; iter3_i++) { + // for (iter3_j = 0; iter3_j < blockSize+col_size-1; iter3_j++) { + // if (diag_nnz[iter3_i*(blockSize+col_size-1) + iter3_j] > blockSize*thres) { + // int64_t offset = iter3_j-(iter3_i+1)*blockSize+1; + // unsigned pos = T_BDIA->vLevel[1]->ptr[iter3_i]; + // T_BDIA->vLevel[2]->crd[pos] = offset; + // } + // } + // } + + // parallelize + int* dim1_ptr = T_BDIA->vLevel[1]->ptr.data(); + int* dim2_crd = T_BDIA->vLevel[2]->crd.data(); + // std::cout << "T_BDIA->vLevel[2]->crd:"<vLevel[2]->crd) + // std::cout << elm <<" "; + // std::cout<vLevel[1]->crd.maxsize="<< + // sparT->vLevel[1]->crd.max_size()<vLevel[1]->crd.resize(nnz-diag_nnz_count); + sparT->vLevel[1]->same_path.resize(nnz-diag_nnz_count); + sparT->vLevel[1]->same_path[0]=0; + sparT->vLevel[2]->crd.resize(nnz-diag_nnz_count); + sparT->vLevel[2]->same_path.resize(nnz-diag_nnz_count); + sparT->vLevel[2]->same_path[0]=0; + sparT->valueArray.resize(nnz-diag_nnz_count); + sparT->vLevel[0]->ptr.push_back(nnz-diag_nnz_count); + unsigned i, pos; + int iter2_dim1, iter2_dim2, iter2_dim3, start_pos, end_pos, insert_pos; + unsigned COO_pos; + bool is_BDIA; + double end_3 = omp_get_wtime(); + for (unsigned time = 0; time < 1; time++) { + #pragma omp parallel 
for private(i, pos,iter2_dim1, iter2_dim2, \ + iter2_dim3, start_pos, end_pos, insert_pos, COO_pos, is_BDIA) + for (i = 0; i < ((row_size-1)/blockSize)+1; i++) { + COO_pos=row_ptr[i]-dia_row_ptr[i]; + for(pos = row_ptr[i]; pos < row_ptr[i+1]; pos++) { + iter2_dim1 = row_crd[pos]/blockSize; + iter2_dim2 = col_crd[pos]-row_crd[pos]; + iter2_dim3 = row_crd[pos]%blockSize; + start_pos = dim1_ptr[iter2_dim1]; + end_pos = dim1_ptr[iter2_dim1+1]; + // std::cout<<"start_pos="<vector_1d["< 0) { + sparT->vLevel[1]->same_path[COO_pos]=(row_crd[pos] == sparT->vLevel[1]->crd[COO_pos-1]); + sparT->vLevel[2]->same_path[COO_pos]=( + (row_crd[pos] == sparT->vLevel[1]->crd[COO_pos-1]) && (col_crd[pos] == sparT->vLevel[2]->crd[COO_pos-1])); + } + sparT->vLevel[1]->crd[COO_pos]=(row_crd[pos]); + sparT->vLevel[2]->crd[COO_pos]=(col_crd[pos]); + sparT->valueArray[COO_pos]=(values[pos]); + // std::cout<<"COO_pos="<vLevel[1]->crd.size() = " << sparT->vLevel[1]->crd.size() << std::endl; + // std::cout << "sparT->vLevel[0]->ptr[1] = " << sparT->vLevel[0]->ptr[1] << std::endl; + // std::cout << "sparT->vLevel[1]->same_path: "; + // for (auto elm: sparT->vLevel[1]->same_path) + // std::cout<vLevel[2]->same_path: "; + // for (auto elm: sparT->vLevel[2]->same_path) + // std::cout<vLevel[1]->crd: "; + // for (auto elm: sparT->vLevel[1]->crd) + // std::cout<vLevel[2]->crd: "; + // for (auto elm: sparT->vLevel[2]->crd) + // std::cout<valueArray: "; + // for (auto elm: sparT->valueArray) + // std::cout< col_blocks(((row_size-1)/blockSize)+1, 0); + // std::vector col_block_nnz(((col_size-1)/blockSize)+1, 0); + std::vector> col_block_crd(((row_size-1)/blockSize)+1); + #pragma omp parallel for private(iter1_i, iter1_pos, iter1_j, col_block_id) + for (iter1_i = 0; iter1_i < ((row_size-1)/blockSize)+1; iter1_i++) { + std::vector col_block_nnz(((col_size-1)/blockSize)+1, 0); + for (iter1_pos = row_block_ptr[iter1_i]; iter1_pos < row_block_ptr[iter1_i+1]; iter1_pos++) { + col_block_id = 
col_crd[iter1_pos]/blockSize; + col_block_nnz[col_block_id] += 1; + } + // std::cout << "col_block_nnz = "; + // for (unsigned n = 0; n < col_block_nnz.size(); n++) { + // std::cout << col_block_nnz[n] << " "; + // } + // std::cout << "\n"; + for (iter1_j = 0; iter1_j < ((col_size-1)/blockSize)+1; iter1_j++) { + if (col_block_nnz[iter1_j] > blockSize*blockSize*block_thres) { + col_blocks[iter1_i] += 1; + col_block_crd[iter1_i].push_back(iter1_j); + } + } + } + unsigned max_nnz = *std::max_element(col_blocks.begin(), col_blocks.end()); + unsigned level1_size = unsigned(std::ceil(max_nnz * col_thres)); + for (unsigned i = 0; i < level1_size; i++) + T_BELL->vLevel[1]->crd.push_back(i); + // std::cout << "col_blocks = "; + // for (unsigned n = 0; n < col_blocks.size(); n++) { + // std::cout << col_blocks[n] << " "; + // } + // std::cout << "\n"; + // std::cout << "col_block_crd = "; + // for (unsigned n = 0; n < col_block_crd.size(); n++) { + // std::cout << "\n"; + // for (unsigned m = 0; m < col_block_crd[n].size(); m++) + // std::cout << col_block_crd[n][m] << " "; + // } + // std::cout << "\n"; + + // step 3: compute level 3 crd. 
Only after the crd order is determined can values be dispatched + T_BELL->vLevel[3]->crd.resize(T_BELL->vLevel[2]->size * level1_size, 0); + unsigned iter2_i, iter2_j, pad_val; + #pragma omp parallel for private(iter2_i, iter2_j, pad_val) + for (iter2_i = 0; iter2_i < ((row_size-1)/blockSize)+1; iter2_i++) { + for (iter2_j = 0; iter2_j < std::min(level1_size, (unsigned)col_block_crd[iter2_i].size()); iter2_j++) { + T_BELL->vLevel[3]->crd[iter2_i*level1_size + iter2_j] = col_block_crd[iter2_i][iter2_j]; + } + pad_val = 0; + while(iter2_j < level1_size) { + if (std::count(col_block_crd[iter2_i].begin(), col_block_crd[iter2_i].end(), pad_val)) { + pad_val += 1; + continue; + } else { + T_BELL->vLevel[3]->crd[iter2_i*level1_size + iter2_j] = pad_val; + pad_val += 1; + iter2_j += 1; + } + } + } + // std::cout << "level3_crd = "; + // for (unsigned n = 0; n < T_BELL->vLevel[3]->crd.size(); n++) { + // std::cout << T_BELL->vLevel[3]->crd[n] << " "; + // } + // std::cout << "\n"; + + // step 4: compute nnz, and row insert boundaries + unsigned iter3_i, iter3_pos; + unsigned inner_row_id, inner_col_id; + bool is_BELL; + unsigned find_val_id; + std::vector> COO_row_crd(((row_size-1)/blockSize)+1); + std::vector> COO_col_crd(((row_size-1)/blockSize)+1); + std::vector> COO_val(((row_size-1)/blockSize)+1); + T_BELL->vectorArray.resize(T_BELL->vLevel[2]->size * level1_size, std::vector(blockSize*blockSize, 0)); + #pragma omp parallel for private(iter3_i, iter3_pos, col_block_id, inner_row_id, inner_col_id, is_BELL) + for (iter3_i = 0; iter3_i < ((row_size-1)/blockSize)+1; iter3_i++) { + for (iter3_pos = row_block_ptr[iter3_i]; iter3_pos < row_block_ptr[iter3_i+1]; iter3_pos++) { + col_block_id = col_crd[iter3_pos]/blockSize; + inner_row_id = row_crd[iter3_pos]%blockSize; + inner_col_id = col_crd[iter3_pos]%blockSize; + // check if col_block_id is in BELL + is_BELL = false; + for (find_val_id = iter3_i*level1_size; + find_val_id < (iter3_i + 1)*level1_size; + find_val_id ++) { + 
if (T_BELL->vLevel[3]->crd[find_val_id] == col_block_id) { + is_BELL = true; + break; + } + } + if (is_BELL) { + unsigned correct_col_id = find_val_id%level1_size; + // std::cout << "correct_col_id = " << correct_col_id << "\n"; + T_BELL->vectorArray[iter3_i*level1_size+correct_col_id][inner_row_id*blockSize+inner_col_id] = values[iter3_pos]; + } else { + COO_row_crd[iter3_i].push_back(row_crd[iter3_pos]); + COO_col_crd[iter3_i].push_back(col_crd[iter3_pos]); + COO_val[iter3_i].push_back(values[iter3_pos]); + } + } + } + // std::cout << "vectorArray = "; + // for (unsigned n = 0; n < T_BELL->vectorArray.size(); n++) { + // std::cout << "\n"; + // for (unsigned m = 0; m < T_BELL->vectorArray[n].size(); m++) + // std::cout << T_BELL->vectorArray[n][m] << " "; + // } + // std::cout << "\n"; + // std::cout << "COO_row_crd, size = " << COO_row_crd.size(); + // for (unsigned n = 0; n < COO_row_crd.size(); n++) { + // std::cout << "\n"; + // for (unsigned m = 0; m < COO_row_crd[n].size(); m++) + // std::cout << COO_row_crd[n][m] << " "; + // } + // std::cout << "\n"; + // std::cout << "COO_col_crd, size = " << COO_col_crd.size(); + // for (unsigned n = 0; n < COO_col_crd.size(); n++) { + // std::cout << "\n"; + // for (unsigned m = 0; m < COO_col_crd[n].size(); m++) + // std::cout << COO_col_crd[n][m] << " "; + // } + // std::cout << "\n"; + // std::cout << "COO_val, size = " << COO_val.size(); + // for (unsigned n = 0; n < COO_val.size(); n++) { + // std::cout << "\n"; + // for (unsigned m = 0; m < COO_val[n].size(); m++) + // std::cout << COO_val[n][m] << " "; + // } + // std::cout << "\n"; + + // step 5: compute valueArray. Dispatch the values to COO and BELL. 
+ unsigned COO_nnz = 0; + unsigned iter4_i, iter4_j; + sparT->vLevel[1]->same_path.push_back(0); + sparT->vLevel[2]->same_path.push_back(0); + for (iter4_i = 0; iter4_i < ((row_size-1)/blockSize)+1; iter4_i++) { + for (iter4_j = 0; iter4_j < COO_row_crd[iter4_i].size(); iter4_j++) { + sparT->vLevel[1]->crd.push_back(COO_row_crd[iter4_i][iter4_j]); + sparT->vLevel[2]->crd.push_back(COO_col_crd[iter4_i][iter4_j]); + sparT->valueArray.push_back(COO_val[iter4_i][iter4_j]); + if (COO_nnz > 0) { + bool same_row = (sparT->vLevel[1]->crd[COO_nnz] == sparT->vLevel[1]->crd[COO_nnz-1]); + bool same_col = (sparT->vLevel[2]->crd[COO_nnz] == sparT->vLevel[2]->crd[COO_nnz-1]); + sparT->vLevel[1]->same_path.push_back(same_row); + sparT->vLevel[2]->same_path.push_back(same_row && same_col); + } + COO_nnz += 1; + } + } + sparT->vLevel[1]->ptr.push_back(COO_nnz); + // std::cout << "COO level1 crd = "; + // for (unsigned n = 0; n < sparT->vLevel[1]->crd.size(); n++) { + // std::cout << sparT->vLevel[1]->crd[n] << " "; + // } + // std::cout << "\n"; + // std::cout << "COO level2 crd = "; + // for (unsigned n = 0; n < sparT->vLevel[2]->crd.size(); n++) { + // std::cout << sparT->vLevel[2]->crd[n] << " "; + // } + // std::cout << "\n"; + // std::cout << "COO value = "; + // for (unsigned n = 0; n < sparT->valueArray.size(); n++) { + // std::cout << sparT->valueArray[n] << " "; + // } + // std::cout << "\n"; + + UniSparseStruct* ret = new UniSparseStruct; + ret->vec.push_back((void*)sparT); + ret->vec.push_back((void*)T_BELL); + return (void*) ret; + } + + void release(void *tensor) { delete static_cast(tensor); } diff --git a/lib/Transforms/LowerFormatConversionPass.cpp b/lib/Transforms/LowerFormatConversionPass.cpp index 1d7ffed..2f60f80 100644 --- a/lib/Transforms/LowerFormatConversionPass.cpp +++ b/lib/Transforms/LowerFormatConversionPass.cpp @@ -1490,9 +1490,13 @@ class BDIASpMVOpLowering: public OpConversionPattern { std::vector params = {inputTensor_CSR, inputTensor_BDIA, 
mem_input_B, mem_input_C}; // auto out_tp = MemRefType::get(outputType.getShape(), outputType.getElementType()); auto callOp = rewriter.create(loc, outputType, - getFunc(op, "kernel_hetero_bdia_spmv_iter", outputType, params, true), + getFunc(op, "kernel_bdia_spmv_iter", outputType, params, true), params ); + // auto callOp = rewriter.create(loc, outputType, + // getFunc(op, "kernel_hetero_bdia_spmv_iter", outputType, params, true), + // params + // ); rewriter.replaceOp(op, callOp.getResult(0)); // rewriter.replaceOpWithNewOp(op, outputType, // ValueRange({callOp.getResult(0)})); @@ -1548,9 +1552,13 @@ class DecomposeBDIAOpLowering: public OpConversionPattern params = {inputTensor, blockSize, thres}; auto callOp = rewriter.create(loc, outputType, - getFunc(op, "decompose_BDIA", outputType, params, true), + getFunc(op, "decompose_BDIA_opt2", outputType, params, true), params ); + // auto callOp = rewriter.create(loc, outputType, + // getFunc(op, "decompose_BDIA", outputType, params, true), + // params + // ); auto ret = callOp.getResult(0); rewriter.replaceOp(op, ret); return success(); diff --git a/scripts/build.sh b/scripts/build.sh index 228b0da..ea27008 100644 --- a/scripts/build.sh +++ b/scripts/build.sh @@ -1,10 +1,10 @@ -export LLVM_ROOT=/install/llvm-project +export LLVM_ROOT=/work/shared/common/llvm-project-15.0.0 export PATH=$LLVM_ROOT/build/bin:$PATH export LD_LIBRARY_PATH=$LLVM_ROOT/build/lib:$LD_LIBRARY_PATH export CPATH=$HOME/eigen-3.4.0:$HOME/llvm-project/mlir/lib:$HOME/llvm-project/mlir/include/mlir:$CPATH export CPATH=$LLVM_ROOT/openmp/build/runtime/src:$CPATH export LD_LIBRARY_PATH=$LLVM_ROOT/openmp/build/runtime/src:$LD_LIBRARY_PATH -export LD_LIBRARY_PATH=/install/taco/build/lib:$LD_LIBRARY_PATH +# export LD_LIBRARY_PATH=/install/taco/build/lib:$LD_LIBRARY_PATH mkdir -p build && cd build cmake .. \ @@ -13,9 +13,9 @@ cmake .. \ -DCMAKE_C_COMPILER=clang -DCMAKE_CXX_COMPILER=clang++ && \ cmake --build . 
-export SPLHOME=/install/UniSparse -export LD_LIBRARY_PATH=/install/UniSparse/build/lib:$LD_LIBRARY_PATH -export PATH=/install/UniSparse/build/bin:$PATH +export SPLHOME=/work/shared/users/phd/jl3952/workspace/MLIR_dialect/unisparse +export LD_LIBRARY_PATH=$SPLHOME/build/lib:$LD_LIBRARY_PATH +export PATH=$SPLHOME/build/bin:$PATH cd .. diff --git a/test/UniSparse/KernelGen/CPU/unisparse_bdia_csr_spmm.mlir b/test/UniSparse/KernelGen/CPU/unisparse_bdia_csr_spmm.mlir new file mode 100644 index 0000000..7794b23 --- /dev/null +++ b/test/UniSparse/KernelGen/CPU/unisparse_bdia_csr_spmm.mlir @@ -0,0 +1,181 @@ +// unisparse-opt ./decompose-BDIA.mlir -lower-struct-convert -lower-struct -dce -unisparse-codegen -lower-format-conversion | \ +// mlir-opt -one-shot-bufferize="bufferize-function-boundaries=1 allow-return-allocs unknown-type-conversion=identity-layout-map function-boundary-type-conversion=identity-layout-map" \ +// -finalizing-bufferize -convert-linalg-to-loops -convert-vector-to-scf -convert-scf-to-cf -lower-affine \ +// -convert-vector-to-llvm -convert-memref-to-llvm -convert-complex-to-standard -convert-math-to-llvm \ +// -convert-math-to-libm -convert-complex-to-libm -convert-complex-to-llvm -convert-func-to-llvm \ +// -reconcile-unrealized-casts | mlir-translate -mlir-to-llvmir | opt -O3 -S | llc -O3 -relocation-model=pic -filetype=obj -o bdia_spmv.o + +// clang++ bdia_spmv.o -L$SPLHOME/build/lib -lmlir_unisparse_runner_utils \ +// -L$LLVMHOME/build/lib -lmlir_runner_utils -lmlir_c_runner_utils -o bdia_spmv + +// ./bdia_spmv + +// RUN: unisparse-opt %s -lower-struct-convert -lower-struct -dce -lower-format-conversion | FileCheck %s + + +!Filename = !llvm.ptr + +#COO = #unisparse.encoding<{ + crdMap = #unisparse.crd<(i,j)->(i,j)>, + compressMap = #unisparse.compress +}> + +#CSR = #unisparse.encoding<{ + crdMap = #unisparse.crd<(i,j)->(i,j)>, + compressMap = #unisparse.compress +}> + +#BDIA = #unisparse.encoding<{ + crdMap = #unisparse.crd<(i,j)->(i floordiv 50, j 
minus i, i mod 50)>, + compressMap = #unisparse.compress +}> + +#amap_0 = affine_map<(i,j) -> (i, j)> + +#trait1 = { +indexing_maps = [ + #amap_0, // A + affine_map<(i,j) -> (j)>, // B + affine_map<(i,j) -> (i)> // X (out) + ], + iterator_types = ["parallel", "reduction"], + doc = "X(i) =+ A(i,j) * B(j)" +} + +module { + func.func private @rtclock() -> f64 + func.func private @getTensorFilename(index) -> (!Filename) + // func.func @kernel_csr_spmv(%arg0: tensor, %arg1: tensor, %argx: tensor) -> tensor { + // %0 = linalg.generic #trait1 + // ins(%arg0, %arg1 : tensor, tensor) + // outs(%argx: tensor) { + // ^bb0(%a: f32, %b: f32, %x: f32): + // %2 = arith.mulf %a, %b : f32 + // %3 = arith.addf %x, %2 : f32 + // linalg.yield %3 : f32 + // } -> tensor + // return %0 : tensor + // } + + func.func @main() { + %c0 = arith.constant 0: index + %c1 = arith.constant 1 : index + %f0 = arith.constant 0.0: f32 + %f05 = arith.constant 0.5: f32 + %i1 = arith.constant 1: i32 + %blockSize = arith.constant 100: i32 + %thres_1 = arith.constant 0.5: f32 + %c1000 = arith.constant 1000 : index + + %fileName = call @getTensorFilename(%c0) : (index) -> (!Filename) + %A_1 = unisparse.fromFile (%fileName): !llvm.ptr to tensor + %dim1 = tensor.dim %A_1, %c1 : tensor + %dim0 = tensor.dim %A_1, %c0 : tensor + // %thres_1 = arith.constant dense<[0.5]>: tensor<1xf32> + // %thres_2 = bufferization.alloc_tensor () copy(%thres_1): tensor<1xf32> + // %thres = bufferization.to_memref %thres_2: memref<1xf32> + + %t_start0 = call @rtclock() : () -> f64 + %S_1 = unisparse.decompose_BDIA %A_1, %blockSize, %thres_1 : tensor, i32, f32 to + !unisparse.struct< tensor, tensor > + %t_end0 = call @rtclock() : () -> f64 + %t_0 = arith.subf %t_end0, %t_start0: f64 + vector.print %t_0 : f64 + + %B_0 = unisparse.struct_access %S_1[0]: + !unisparse.struct< tensor, tensor > + to tensor + %B_1 = unisparse.struct_access %S_1[1]: + !unisparse.struct< tensor, tensor > + to tensor + + %D_0 = unisparse.convert(%B_0) : 
tensor to tensor + + %init_256_4 = bufferization.alloc_tensor(%dim1, %c1000) : tensor + %b = scf.for %i = %c0 to %dim1 step %c1 iter_args(%t = %init_256_4) -> tensor { + %b2 = scf.for %j = %c0 to %c1000 step %c1 iter_args(%t2 = %t) -> tensor { + %k0 = arith.muli %i, %c1000 : index + %k1 = arith.addi %j, %k0 : index + %k2 = arith.index_cast %k1 : index to i32 + %k = arith.sitofp %k2 : i32 to f32 + %t3 = tensor.insert %k into %t2[%i, %j] : tensor + scf.yield %t3 : tensor + } + scf.yield %b2 : tensor + } + // %init_256_4 = bufferization.alloc_tensor(%dim1, %c1000) : tensor + // %tensor_B = tensor.insert %f05 into %init_256_4[%c0] : tensor + // %dim1_1 = arith.subi %dim1, %c1 : index + // %i_dim1_1 = arith.index_cast %dim1_1 : index to i32 + // %f_dim1_1 = arith.sitofp %i_dim1_1 : i32 to f32 + // %elm = arith.divf %f05, %f_dim1_1 : f32 + // %b = scf.for %i = %c1 to %dim1 step %c1 iter_args(%t = %tensor_B) -> tensor { + // %b2 = scf.for %j = %c0 to %c1000 step %c1 iter_args(%t2 = %t) -> memref { + // %t3 = tensor.insert %elm into %t[%i] : tensor + // scf.yield %t3 : tensor + // } + // } + + // %o0 = bufferization.alloc_tensor(%dim0) : tensor + // %o00 = scf.for %i = %c0 to %dim0 step %c1 iter_args(%t = %o0) -> tensor { + // %t3 = tensor.insert %f0 into %t[%i] : tensor + // scf.yield %t3 : tensor + // } + // %o0_4_4 = memref.alloc(%dim0, %c4) : memref + // %o0 = scf.for %i = %c0 to %dim0 step %c1 iter_args(%t = %o0_4_4) -> memref { + // %x2 = scf.for %j = %c0 to %c4 step %c1 iter_args(%t2 = %t) -> memref { + // memref.store %i0, %t2[%i, %j] : memref + // scf.yield %t2 : memref + // } + // scf.yield %x2 : memref + // } + %o1 = bufferization.alloc_tensor(%dim0, %c1000) : tensor + %o11 = scf.for %i = %c0 to %dim0 step %c1 iter_args(%t = %o1) -> tensor { + %x2 = scf.for %j = %c0 to %c1000 step %c1 iter_args(%t2 = %t) -> tensor { + %t3 = tensor.insert %f0 into %t[%i, %j] : tensor + scf.yield %t3 : tensor + } + scf.yield %x2 : tensor + } + // %o2 = 
bufferization.alloc_tensor(%dim0) : tensor + // %o22 = scf.for %i = %c0 to %dim0 step %c1 iter_args(%t = %o2) -> tensor { + // %t3 = tensor.insert %f0 into %t[%i] : tensor + // scf.yield %t3 : tensor + // } + + %t_start4 = call @rtclock() : () -> f64 + // CSR SpMV + // %result0 = call @kernel_csr_spmv(%D_0, %b, %o00) : (tensor, tensor, tensor) -> tensor + // %t_end1 = call @rtclock() : () -> f64 + // block DIA SpMV + %result1 = unisparse.bdia_spmm %D_0, %B_1, %b, %o1: + tensor, tensor, tensor, tensor to memref + // %t_end2 = call @rtclock() : () -> f64 + // %output = linalg.elemwise_binary ins(%result0, %result1: tensor, tensor) + // outs(%o2: tensor) -> tensor + %t_end4 = call @rtclock() : () -> f64 + // %t_1 = arith.subf %t_end1, %t_start4: f64 + // %t_2 = arith.subf %t_end2, %t_end1: f64 + // %t_4 = arith.subf %t_end4, %t_end2: f64 + %t_5 = arith.subf %t_end4, %t_start4: f64 + // vector.print %t_1 : f64 + // vector.print %t_2 : f64 + // vector.print %t_4 : f64 + vector.print %t_5 : f64 + // %v0 = vector.transfer_read %result0[%c0], %f0: tensor, vector<4xf32> + // vector.print %v0 : vector<4xf32> + %v1 = vector.transfer_read %result1[%c0, %c0], %f0: memref, vector<4x4xf32> + vector.print %v1 : vector<4x4xf32> + // %v2 = vector.transfer_read %output[%c0], %f0: tensor, vector<4xf32> + // vector.print %v2 : vector<4xf32> + bufferization.dealloc_tensor %A_1 : tensor + bufferization.dealloc_tensor %B_1 : tensor + unisparse.release %S_1: !unisparse.struct< tensor, tensor > + // bufferization.dealloc_tensor %B_0 : tensor + // bufferization.dealloc_tensor %o1 : tensor + // bufferization.dealloc_tensor %result0 : tensor + // bufferization.dealloc_tensor %output : tensor + + return + } +} \ No newline at end of file diff --git a/test/UniSparse/KernelGen/CPU/unisparse_bdia_csr_spmv.mlir b/test/UniSparse/KernelGen/CPU/unisparse_bdia_csr_spmv.mlir new file mode 100644 index 0000000..f7e4b1c --- /dev/null +++ b/test/UniSparse/KernelGen/CPU/unisparse_bdia_csr_spmv.mlir @@ -0,0 
// Test: block-DIA (BDIA) + CSR decomposition SpMV on CPU.
// Build/run recipe (manual):
// unisparse-opt ./unisparse_bdia_csr_spmv.mlir -lower-struct-convert -lower-struct -dce -unisparse-codegen -lower-format-conversion | \
// mlir-opt -one-shot-bufferize="bufferize-function-boundaries=1 allow-return-allocs unknown-type-conversion=identity-layout-map function-boundary-type-conversion=identity-layout-map" \
// -finalizing-bufferize -convert-linalg-to-loops -convert-vector-to-scf -convert-scf-to-cf -lower-affine \
// -convert-vector-to-llvm -convert-memref-to-llvm -convert-complex-to-standard -convert-math-to-llvm \
// -convert-math-to-libm -convert-complex-to-libm -convert-complex-to-llvm -convert-func-to-llvm \
// -reconcile-unrealized-casts | mlir-translate -mlir-to-llvmir | opt -O3 -S | llc -O3 -relocation-model=pic -filetype=obj -o bdia_spmv.o

// clang++ bdia_spmv.o -L$SPLHOME/build/lib -lmlir_unisparse_runner_utils \
// -L$LLVM_ROOT/build/lib -lmlir_runner_utils -lmlir_c_runner_utils -o bdia_spmv

// ./bdia_spmv

// RUN: unisparse-opt %s -lower-struct-convert -lower-struct -dce -lower-format-conversion | FileCheck %s

// NOTE(review): the angle-bracket type parameters in this file (e.g. the
// element types / encodings of `tensor`, the shape of `memref`, and the
// arguments of `#unisparse.compress`) appear to have been stripped by text
// extraction — `vector<4xf32>` survived but `tensor<...>` did not. They must
// be restored from the original file (compare the companion
// unisparse_bdia_spmm test) before this is valid MLIR. TODO: confirm.

!Filename = !llvm.ptr

// Input matrix as read from file (coordinate format).
#COO = #unisparse.encoding<{
  crdMap = #unisparse.crd<(i,j)->(i,j)>,
  compressMap = #unisparse.compress
}>

// Remainder (non-diagonal-dominant) part of the decomposition.
#CSR = #unisparse.encoding<{
  crdMap = #unisparse.crd<(i,j)->(i,j)>,
  compressMap = #unisparse.compress
}>

// Block-diagonal layout: 50-row blocks, coordinates remapped to
// (block id, diagonal offset, row-within-block).
#BDIA = #unisparse.encoding<{
  crdMap = #unisparse.crd<(i,j)->(i floordiv 50, j minus i, i mod 50)>,
  compressMap = #unisparse.compress
}>

// Reduction trait for SpMV: X(i) accumulates A(i,j) * B(j) over j.
#trait1 = {
indexing_maps = [
    affine_map<(i,j) -> (i, j)>,  // A
    affine_map<(i,j) -> (j)>,     // B
    affine_map<(i,j) -> (i)>      // X (out)
  ],
  iterator_types = ["parallel", "reduction"],
  doc = "X(i) =+ A(i,j) * B(j)"
}

module {
  // Wall-clock timer and tensor-file locator from the runner utils library.
  func.func private @rtclock() -> f64
  func.func private @getTensorFilename(index) -> (!Filename)

  // Reference CSR SpMV kernel (currently only called from the commented-out
  // verification path in @main).
  func.func @kernel_csr_spmv(%arg0: tensor, %arg1: tensor, %argx: tensor) -> tensor {
    %0 = linalg.generic #trait1
      ins(%arg0, %arg1 : tensor, tensor)
      outs(%argx: tensor) {
    ^bb0(%a: f32, %b: f32, %x: f32):
      %2 = arith.mulf %a, %b : f32
      %3 = arith.addf %x, %2 : f32
      linalg.yield %3 : f32
    } -> tensor
    return %0 : tensor
  }

  func.func @main() {
    %c0 = arith.constant 0: index
    %c1 = arith.constant 1 : index
    %f0 = arith.constant 0.0: f32
    %f1 = arith.constant 1.0: f32
    %f05 = arith.constant 0.5: f32
    %i1 = arith.constant 1: i32
    %blockSize = arith.constant 100: i32  // BDIA decomposition block size
    %thres_1 = arith.constant 0.5: f32    // diagonal-density threshold

    // Load the input matrix (path comes from the command line / env).
    %fileName = call @getTensorFilename(%c0) : (index) -> (!Filename)
    %A_1 = unisparse.fromFile (%fileName): !llvm.ptr to tensor
    %dim1 = tensor.dim %A_1, %c1 : tensor
    %dim0 = tensor.dim %A_1, %c0 : tensor
    // %thres_1 = arith.constant dense<[0.5]>: tensor<1xf32>
    // %thres_2 = bufferization.alloc_tensor () copy(%thres_1): tensor<1xf32>
    // %thres = bufferization.to_memref %thres_2: memref<1xf32>

    // Split A into a CSR remainder (slot 0) and a BDIA part (slot 1);
    // the decomposition itself is timed separately from the SpMV.
    %t_start0 = call @rtclock() : () -> f64
    %S_1 = unisparse.decompose_BDIA %A_1, %blockSize, %thres_1 : tensor, i32, f32 to
      !unisparse.struct< tensor, tensor >
    %t_end0 = call @rtclock() : () -> f64
    %t_0 = arith.subf %t_end0, %t_start0: f64
    vector.print %t_0 : f64

    %B_0 = unisparse.struct_access %S_1[0]:
      !unisparse.struct< tensor, tensor >
      to tensor
    %B_1 = unisparse.struct_access %S_1[1]:
      !unisparse.struct< tensor, tensor >
      to tensor

    %D_0 = unisparse.convert(%B_0) : tensor to tensor

    // Dense input vector b: every entry is 1/dim1 so the row sums are easy
    // to sanity-check in the printed output.
    // %init_256_4 = bufferization.alloc_tensor(%dim1) : tensor
    // %b = scf.for %i = %c0 to %dim1 step %c1 iter_args(%t = %init_256_4) -> tensor {
    //   %k0 = arith.muli %i, %c1 : index
    //   %k1 = arith.index_cast %k0 : index to i32
    //   %k = arith.sitofp %k1 : i32 to f32
    //   %t3 = tensor.insert %k into %t[%i] : tensor
    //   scf.yield %t3 : tensor
    // }
    %init_256_4 = bufferization.alloc_tensor(%dim1) : tensor
    // %tensor_B = tensor.insert %f05 into %init_256_4[%c0] : tensor
    // %dim1_1 = arith.subi %dim1, %c1 : index
    // %i_dim1_1 = arith.index_cast %dim1_1 : index to i32
    // %f_dim1_1 = arith.sitofp %i_dim1_1 : i32 to f32
    // %elm = arith.divf %f05, %f_dim1_1 : f32
    // %b = scf.for %i = %c1 to %dim1 step %c1 iter_args(%t = %tensor_B) -> tensor {
    %ts_dim_i = arith.index_cast %dim1 : index to i32
    %ts_dim = arith.sitofp %ts_dim_i : i32 to f32
    %elm = arith.divf %f1, %ts_dim : f32
    // vector.print %elm : f32
    %b = scf.for %i = %c0 to %dim1 step %c1 iter_args(%t = %init_256_4) -> tensor {
      // %k1 = arith.index_cast %i : index to i32
      // %k = arith.sitofp %k1 : i32 to f32
      %t3 = tensor.insert %elm into %t[%i] : tensor
      scf.yield %t3 : tensor
    }

    // Output accumulator, zero-filled before the SpMV.
    // %o0 = bufferization.alloc_tensor(%dim0) : tensor
    // %o00 = scf.for %i = %c0 to %dim0 step %c1 iter_args(%t = %o0) -> tensor {
    //   %t3 = tensor.insert %f0 into %t[%i] : tensor
    //   scf.yield %t3 : tensor
    // }
    %o1 = bufferization.alloc_tensor(%dim0) : tensor
    %o11 = scf.for %i = %c0 to %dim0 step %c1 iter_args(%t = %o1) -> tensor {
      %t3 = tensor.insert %f0 into %t[%i] : tensor
      scf.yield %t3 : tensor
    }
    // %o2 = bufferization.alloc_tensor(%dim0) : tensor
    // %o22 = scf.for %i = %c0 to %dim0 step %c1 iter_args(%t = %o2) -> tensor {
    //   %t3 = tensor.insert %f0 into %t[%i] : tensor
    //   scf.yield %t3 : tensor
    // }

    %t_start4 = call @rtclock() : () -> f64
    // CSR SpMV
    // %result0 = call @kernel_csr_spmv(%D_0, %b, %o00) : (tensor, tensor, tensor) -> tensor
    // %t_end1 = call @rtclock() : () -> f64
    // block DIA SpMV
    // BUGFIX: pass the zero-initialized %o11, not the uninitialized %o1 —
    // previously the init loop above was dead and the accumulator held
    // garbage (the commented-out CSR path correctly uses its %o00).
    %result1 = unisparse.bdia_spmv %D_0, %B_1, %b, %o11:
      tensor, tensor, tensor, tensor to memref
    // %t_end2 = call @rtclock() : () -> f64
    // %output = linalg.elemwise_binary ins(%result0, %result1: tensor, tensor)
    //   outs(%o2: tensor) -> tensor
    %t_end4 = call @rtclock() : () -> f64
    // %t_1 = arith.subf %t_end1, %t_start4: f64
    // %t_2 = arith.subf %t_end2, %t_end1: f64
    // %t_4 = arith.subf %t_end4, %t_end2: f64
    %t_5 = arith.subf %t_end4, %t_start4: f64
    // vector.print %t_1 : f64
    // vector.print %t_2 : f64
    // vector.print %t_4 : f64
    vector.print %t_5 : f64

    // Print the first four output entries for eyeballing correctness.
    // %v0 = vector.transfer_read %result0[%c0], %f0: tensor, vector<4xf32>
    // vector.print %v0 : vector<4xf32>
    %v1 = vector.transfer_read %result1[%c0], %f0: memref, vector<4xf32>
    vector.print %v1 : vector<4xf32>
    // %v2 = vector.transfer_read %output[%c0], %f0: tensor, vector<4xf32>
    // vector.print %v2 : vector<4xf32>

    // Release everything we allocated or read from file.
    bufferization.dealloc_tensor %A_1 : tensor
    bufferization.dealloc_tensor %B_1 : tensor
    unisparse.release %S_1: !unisparse.struct< tensor, tensor >
    // bufferization.dealloc_tensor %B_0 : tensor
    // bufferization.dealloc_tensor %o1 : tensor
    // bufferization.dealloc_tensor %result0 : tensor
    // bufferization.dealloc_tensor %output : tensor

    return
  }
}