diff --git a/csrc/sparse/cusparselt/binding.py b/csrc/sparse/cusparselt/binding.py deleted file mode 100644 index 035c18abd312a..0000000000000 --- a/csrc/sparse/cusparselt/binding.py +++ /dev/null @@ -1,47 +0,0 @@ -from torch.utils.cpp_extension import load -import os -import torch - -base_path = __file__.replace("spmm.py", "") - -if not os.path.exists(f"{base_path}/build"): - os.makedirs(f"{base_path}/build") - -if not os.path.exists(base_path + "/libcusparse_lt"): - os.system( - "wget https://developer.download.nvidia.com/compute/cusparselt/redist/libcusparse_lt/linux-x86_64/libcusparse_lt-linux-x86_64-0.5.1.1-archive.tar.xz") - os.system("tar -xf libcusparse_lt-linux-x86_64-0.5.1.1-archive.tar.xz") - os.system(f"mv libcusparse_lt-linux-x86_64-0.5.1.1-archive {base_path}/libcusparse_lt") - os.system("rm libcusparse_lt-linux-x86_64-0.5.1.1-archive.tar.xz") - -pruner = load(name='pruner', - sources=[f'{base_path}/spmm_backend.cpp', - f'{base_path}/spmm_backend.cu', - ], - extra_cflags=[ - f'-L{base_path}/libcusparse_lt/lib', - '-lcusparse', - '-lcusparseLt', - '-ldl' - ], - extra_cuda_cflags=[ - f'-L{base_path}/libcusparse_lt/lib', - '-lcusparse', - '-lcusparseLt', - '-ldl' - ], - extra_ldflags=[ - f'-L{base_path}/libcusparse_lt/lib', - '-lcusparse', - '-lcusparseLt', - '-ldl' - ], - extra_include_paths=[ - base_path + '/libcusparse_lt/include' - ], - build_directory=f'{base_path}/build', - with_cuda=True, - verbose=False) - -init_flag = pruner.init_cusparse_lt() -assert init_flag == 0, "Failed to initialize CuSparseLT" \ No newline at end of file diff --git a/csrc/sparse/cusparselt/cusparselt_mm.cu b/csrc/sparse/cusparselt/cusparselt_mm.cu deleted file mode 100644 index 0e088b35c7b87..0000000000000 --- a/csrc/sparse/cusparselt/cusparselt_mm.cu +++ /dev/null @@ -1,1077 +0,0 @@ -/* - * Copyright 1993-2023 NVIDIA Corporation. All rights reserved. - * - * NOTICE TO LICENSEE: - * - * This source code and/or documentation ("Licensed Deliverables") are - * subject to NVIDIA intellectual property rights under U.S. and - * international Copyright laws. - * - * These Licensed Deliverables contained herein is PROPRIETARY and - * CONFIDENTIAL to NVIDIA and is being provided under the terms and - * conditions of a form of NVIDIA software license agreement by and - * between NVIDIA and Licensee ("License Agreement") or electronically - * accepted by Licensee. Notwithstanding any terms or conditions to - * the contrary in the License Agreement, reproduction or disclosure - * of the Licensed Deliverables to any third party without the express - * written consent of NVIDIA is prohibited. - * - * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE - * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE - * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS - * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND. - * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED - * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY, - * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE. - * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE - * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY - * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY - * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, - * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS - * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE - * OF THESE LICENSED DELIVERABLES. - * - * U.S. 
Government End Users. These Licensed Deliverables are a - * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT - * 1995), consisting of "commercial computer software" and "commercial - * computer software documentation" as such terms are used in 48 - * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government - * only as a commercial end item. Consistent with 48 C.F.R.12.212 and - * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all - * U.S. Government End Users acquire the Licensed Deliverables with - * only those rights set forth herein. - * - * Any use of the Licensed Deliverables in individual and commercial - * software must include, in the user documentation and internal - * comments to the code, the above Disclaimer and U.S. Government End - * Users Notice. - */ -#include // cudaMalloc, cudaMemcpy, etc. -#include // cusparseLt header -#include // printf -#include // std::rand -#include // std::vector -#include -#include - - -#define INT8_OUTPUT_TYPE int32_t //at::Half //int8_t -#define INT8_OUTPUT_TYPE_CUDA CUDA_R_8I //CUDA_R_32I -#define INT8_OUTPUT_TYPE_TORCH torch::kInt32 //torch::kInt32 - - -#define MAX(a, b) ((abs(a) > abs(b) ? (a) : (b))) -#define MIN(a, b) ((abs(a) < abs(b) ? (a) : (b))) - - -#define CHECK_CUDA(func) \ -{ \ - cudaError_t status = (func); \ - if (status != cudaSuccess) { \ - printf("CUDA API failed at line %d with error: %s (%d)\n", \ - __LINE__, cudaGetErrorString(status), status); \ - return EXIT_FAILURE; \ - } \ -} - - -#define CHECK_CUDA_TORCH(func) \ -{ \ - cudaError_t status = (func); \ - if (status != cudaSuccess) { \ - printf("CUDA API failed at line %d with error: %s (%d)\n", \ - __LINE__, cudaGetErrorString(status), status); \ - return torch::ones(1); \ - } \ -} - - -#define CHECK_CUSPARSE(func) \ -{ \ - cusparseStatus_t status = (func); \ - if (status != CUSPARSE_STATUS_SUCCESS) { \ - printf("CUSPARSE API failed at line %d with error: %s (%d)\n", \ - __LINE__, cusparseGetErrorString(status), status); \ - return EXIT_FAILURE; \ - } \ -} - - -#define CHECK_CUSPARSE_TORCH(func) \ -{ \ - cusparseStatus_t status = (func); \ - if (status != CUSPARSE_STATUS_SUCCESS) { \ - printf("CUSPARSE API failed at line %d with error: %s (%d)\n", \ - __LINE__, cusparseGetErrorString(status), status); \ - return torch::ones(1); \ - } \ -} - -constexpr int EXIT_UNSUPPORTED = 2; - -cusparseLtHandle_t handle; - -float alpha = 1.0; -float beta = 0.0; - - -typedef struct { - at::Half data; - int index; -} indexed_half; - - -int init_cusparse_lt_cuda() -{ - int major_cc, minor_cc; - CHECK_CUDA( cudaDeviceGetAttribute(&major_cc, - cudaDevAttrComputeCapabilityMajor, 0) ) - CHECK_CUDA( cudaDeviceGetAttribute(&minor_cc, - cudaDevAttrComputeCapabilityMinor, 0) ) - if (!(major_cc == 8 && minor_cc == 0) && - !(major_cc == 8 && minor_cc == 6) && - !(major_cc == 8 && minor_cc == 9)) { - std::printf("\ncusparseLt is supported only on GPU devices with" - " compute capability == 8.0, 8.6, 8.9 current: %d.%d\n\n", - major_cc, minor_cc); - return EXIT_UNSUPPORTED; - } - CHECK_CUSPARSE( cusparseLtInit(&handle) ) - - return EXIT_SUCCESS; -} - - -typedef struct cusparseLtMatmulArgs_t { - cusparseLtMatmulPlan_t* plan; - cusparseLtMatmulDescriptor_t* matmul; - cusparseLtMatmulAlgSelection_t* alg_sel; - cudaStream_t* streams; - int num_streams; - cudaStream_t stream; - size_t workspace_size; -// void* d_workspace; - void *dCompressed; - int m; - int n; -// torch::Tensor grad; - - cusparseLtMatmulArgs_t() - { - plan = new cusparseLtMatmulPlan_t; - matmul = new 
cusparseLtMatmulDescriptor_t; - alg_sel = new cusparseLtMatmulAlgSelection_t; - streams = nullptr; - num_streams = 0; - stream = nullptr; - m = 0; - n = 0; - dCompressed = nullptr; - } - - ~cusparseLtMatmulArgs_t() - { - cusparseLtMatmulPlanDestroy(plan); -// cudaFree(d_workspace); - } -} cusparseLtMatmulArgs ; - - -std::vector matmul_args; - - -template -int setup_prune_matmul( const int m, - const int n, - const int k, - T *dSparse, - T *dDense, - int *index, - const bool transpose_A=false, - const bool transpose_B=false, - const bool sparseA=true, - const bool transposable_mask=false, - const bool is_sparse_pruned=false, - const bool check_sparsity=false, - cudaDataType_t input_type=CUDA_R_16F, - cudaDataType_t output_type=CUDA_R_16F, - cusparseComputeType compute_type=CUSPARSE_COMPUTE_16F) -{ - matmul_args.push_back(new cusparseLtMatmulArgs_t); - *index = matmul_args.size() - 1; - - auto args = matmul_args.back(); - args->m = m; - args->n = n; - - // Host problem definition, row-major order - // bigger sizes may require dynamic allocations - auto order = CUSPARSE_ORDER_ROW; - auto opA = transpose_A ? CUSPARSE_OPERATION_TRANSPOSE : CUSPARSE_OPERATION_NON_TRANSPOSE; - auto opB = transpose_B ? CUSPARSE_OPERATION_TRANSPOSE : CUSPARSE_OPERATION_NON_TRANSPOSE; - - bool is_rowmajor = (order == CUSPARSE_ORDER_ROW); - bool isA_transposed = (opA != CUSPARSE_OPERATION_NON_TRANSPOSE); - bool isB_transposed = (opB != CUSPARSE_OPERATION_NON_TRANSPOSE); - auto num_A_rows = (isA_transposed) ? k : m; - auto num_A_cols = (isA_transposed) ? m : k; - auto num_B_rows = (isB_transposed) ? n : k; - auto num_B_cols = (isB_transposed) ? k : n; - auto num_C_rows = m; - auto num_C_cols = n; - unsigned alignment = 16; - auto lda = (is_rowmajor) ? num_A_cols : num_A_rows; - auto ldb = (is_rowmajor) ? num_B_cols : num_B_rows; - auto ldc = (is_rowmajor) ? num_C_cols : num_C_rows; - auto C_height = (is_rowmajor) ? 
num_C_rows : num_C_cols; - auto C_size = C_height * ldc * sizeof(V); - - - cusparseLtMatDescriptor_t* matA; - cusparseLtMatDescriptor_t* matB; - cusparseLtMatDescriptor_t* matC; - matA = new cusparseLtMatDescriptor_t; - matB = new cusparseLtMatDescriptor_t; - matC = new cusparseLtMatDescriptor_t; - - V *dC, *dD; - CHECK_CUDA( cudaMalloc((void**) &dC, C_size) ) - dD = dC; - - int *d_valid; - CHECK_CUDA( cudaMalloc((void**) &d_valid, sizeof(int)) ) - - // matrix descriptor initialization - if(sparseA) - { - CHECK_CUSPARSE( cusparseLtStructuredDescriptorInit( - &handle, matA, num_A_rows, - num_A_cols, lda, alignment, - input_type, order, - CUSPARSELT_SPARSITY_50_PERCENT) ) - - CHECK_CUSPARSE( cusparseLtDenseDescriptorInit( - &handle, matB, num_B_rows, - num_B_cols, ldb, alignment, - input_type, order) ) - } - else - { - CHECK_CUSPARSE( cusparseLtStructuredDescriptorInit( - &handle, matB, num_B_rows, - num_B_cols, ldb, alignment, - input_type, order, - CUSPARSELT_SPARSITY_50_PERCENT) ) - - CHECK_CUSPARSE( cusparseLtDenseDescriptorInit( - &handle, matA, num_A_rows, - num_A_cols, lda, alignment, - input_type, order) ) - } - CHECK_CUSPARSE( cusparseLtDenseDescriptorInit( - &handle, matC, num_C_rows, - num_C_cols, ldc, alignment, - output_type, order) ) - - // matmul, algorithm selection, and plan initialization - CHECK_CUSPARSE( cusparseLtMatmulDescriptorInit( - &handle, args->matmul, opA, opB, - matA, matB, matC, matC, - compute_type) ) - - CHECK_CUSPARSE( cusparseLtMatmulAlgSelectionInit( - &handle, args->alg_sel, args->matmul, - CUSPARSELT_MATMUL_ALG_DEFAULT) ) - - CHECK_CUSPARSE( cusparseLtMatmulPlanInit(&handle, args->plan, args->matmul, args->alg_sel)) - - //-------------------------------------------------------------------------- - // Prune the A matrix (in-place) and check the correctness - if (!is_sparse_pruned){ - cusparseLtPruneAlg_t prune_alg = transposable_mask ? CUSPARSELT_PRUNE_SPMMA_TILE : CUSPARSELT_PRUNE_SPMMA_STRIP; - CHECK_CUSPARSE( cusparseLtSpMMAPrune(&handle, args->matmul, dSparse, dSparse, - prune_alg, args->stream) ) - } - if (check_sparsity) - { - CHECK_CUSPARSE( cusparseLtSpMMAPruneCheck(&handle, args->matmul, dSparse, d_valid, args->stream) ) - int is_valid; - CHECK_CUDA( cudaMemcpyAsync(&is_valid, d_valid, sizeof(int), cudaMemcpyDeviceToHost, args->stream) ) - CHECK_CUDA( cudaStreamSynchronize(args->stream) ) - if (is_valid != 0) { - std::printf("!!!! The matrix does not conform to the SpMMA sparsity pattern. " - "cusparseLtMatmul does not provide correct results\n"); - return EXIT_FAILURE; - } - } - - -// int *d_valid; -// CHECK_CUDA( cudaMalloc((void**) &d_valid, sizeof(int)) ) -// CHECK_CUSPARSE( cusparseLtSpMMAPruneCheck2( &handle, -// sparseA ? matA : matB, -// sparseA, -// sparseA ? opA : opB, -// dSparse, -// d_valid, -// args->stream) ) - -// int is_valid; -// CHECK_CUDA( cudaMemcpyAsync(&is_valid, d_valid, sizeof(int), -// cudaMemcpyDeviceToHost, args->stream) ) -// CHECK_CUDA( cudaStreamSynchronize(args->stream) ) -// if (is_valid != 0) { -// std::printf("!!!! The matrix has been pruned in a wrong way. 
" -// "cusparseLtMatmul will not provide correct results\n"); -// return EXIT_FAILURE; -// } - CHECK_CUDA( cudaFree(d_valid) ) - - //-------------------------------------------------------------------------- - // Compress the A matrix - size_t compressed_size, compressed_buffer_size; - void* dCompressedBuffer; - CHECK_CUSPARSE( cusparseLtSpMMACompressedSize(&handle, - args->plan, - &compressed_size, - &compressed_buffer_size) ) - - CHECK_CUDA( cudaMalloc((void**) &args->dCompressed, compressed_size) ) - CHECK_CUDA( cudaMalloc((void**) &dCompressedBuffer, - compressed_buffer_size) ) - - CHECK_CUSPARSE( cusparseLtSpMMACompress(&handle, - args->plan, - dSparse, - (T *) args->dCompressed, - dCompressedBuffer, - args->stream) ) - CHECK_CUDA( cudaFree(dCompressedBuffer) ) - - //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - // Search the best kernel - if(sparseA) - { -// printf("%f, %f, %f, %f, %f, %f\n", alpha, beta, *dDense,0.,0.,0.);// , dDense[0], beta, dC[0], dD[0]); - CHECK_CUSPARSE( cusparseLtMatmulSearch(&handle, args->plan, &alpha, - (T*) args->dCompressed, dDense, &beta, - dC, dD, nullptr, - args->streams, args->num_streams) ) - } else { - CHECK_CUSPARSE( cusparseLtMatmulSearch(&handle, args->plan, &alpha, - dDense, (T*) args->dCompressed, &beta, - dC, dD, nullptr, - args->streams, args->num_streams) ) - } -// // otherwise, it is possible to set it directly: -// int alg = 0; -// CHECK_CUSPARSE( cusparseLtMatmulAlgSetAttribute( -// &handle, args->alg_sel, -// CUSPARSELT_MATMUL_ALG_CONFIG_ID, -// &alg, sizeof(alg))) - - - //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - CHECK_CUSPARSE( cusparseLtMatmulPlanInit(&handle, args->plan, args->matmul, args->alg_sel)) - - CHECK_CUSPARSE( cusparseLtMatmulGetWorkspace(&handle, args->plan, - &args->workspace_size)) - -// printf("workspace_size: %lu (MB)\n", args->workspace_size / 1024 / 1024); - CHECK_CUDA( cudaFree(dC) ) - cusparseLtMatDescriptorDestroy(matA); - cusparseLtMatDescriptorDestroy(matB); - cusparseLtMatDescriptorDestroy(matC); - - return EXIT_SUCCESS; -} - -int destroy_cusparse_matmul_cuda(int index){ - if (index > matmul_args.size() - 1) - throw std::runtime_error("Index out of range of matmul_args"); - - auto args = matmul_args[index]; - cusparseLtMatmulPlanDestroy(args->plan); - CHECK_CUDA(cudaFree(args->streams)); - CHECK_CUDA(cudaFree(args->dCompressed)); - matmul_args.erase(matmul_args.begin() + index); - - return EXIT_SUCCESS; -} - -torch::Tensor setup_spmatmul_cuda(torch::Tensor A, - torch::Tensor B, - const bool transpose_A=false, - const bool transpose_B=false, - const bool sparseA=true, - const bool transposable_mask=false, - const bool is_sparse_pruned=false, - const bool check_sparsity=false) { - auto index = torch::zeros({1}, torch::kInt32); - int result; - int m, k, n; - if(transpose_A && transpose_B) - { - m = A.size(1); - k = A.size(0); - n = B.size(0); - } else if(transpose_A) - { - m = A.size(1); - k = A.size(0); - n = B.size(1); - } else if(transpose_B) - { - m = A.size(0); - k = A.size(1); - n = B.size(0); - } else { - m = A.size(0); - k = A.size(1); - n = B.size(1); - } - switch (A.type().scalarType()) { - case torch::ScalarType::Half: - { - auto sparse_mat = sparseA ? A.data_ptr() : B.data_ptr(); - auto dense_mat = sparseA ? 
B.data_ptr() : A.data_ptr(); - at::Half *dCompressed; - result = setup_prune_matmul( m, - n, - k, - sparse_mat, - dense_mat, - index.data_ptr(), - transpose_A, - transpose_B, - sparseA, - transposable_mask, - is_sparse_pruned, - check_sparsity, - CUDA_R_16F, - CUDA_R_16F, - CUSPARSE_COMPUTE_16F); - break; - } - case torch::ScalarType::Char: - { - auto sparse_mat = sparseA ? A.data_ptr() : B.data_ptr(); - auto dense_mat = sparseA ? B.data_ptr() : A.data_ptr(); - int8_t *dCompressed; - result = setup_prune_matmul( m, - n, - k, - sparse_mat, - dense_mat, - index.data_ptr(), - transpose_A, - transpose_B, - sparseA, - transposable_mask, - is_sparse_pruned, - check_sparsity, - CUDA_R_8I, - INT8_OUTPUT_TYPE_CUDA, - CUSPARSE_COMPUTE_32I); - break;} - default: - { - std::cout << A.type().scalarType() << std::endl; - throw std::runtime_error("Unsupported data type"); - } - } - if(result == EXIT_SUCCESS) { - return index; - } else { - return -torch::ones({1}, torch::kInt32); - } -} - - -template -torch::Tensor matmul( T* dDense, - int index, - bool sparseA, - int m, - torch::TensorOptions options=torch::TensorOptions() - ) -{ - auto args = matmul_args[index]; - - torch::Tensor C = torch::zeros({m, args->n}, options); - auto dC = C.data_ptr(); - auto dD = dC; - auto dA = sparseA ? (T*) args->dCompressed : dDense; - auto dB = sparseA ? dDense : (T*) args->dCompressed; - void *d_workspace; - CHECK_CUDA_TORCH( cudaMalloc((void**) &d_workspace, args->workspace_size) ) - // Perform the matrix multiplication - CHECK_CUSPARSE_TORCH( cusparseLtMatmul(&handle, args->plan, &alpha, dA, dB, - &beta, dC, dD, d_workspace, args->streams, - args->num_streams) ) - CHECK_CUDA_TORCH( cudaFree(d_workspace) ) - return C; -} - - -torch::Tensor spmatmul_cuda(torch::Tensor Dense, - int index, - bool sparseA) -{ - switch (Dense.type().scalarType()) { - case torch::ScalarType::Half: { - auto options = torch::TensorOptions().dtype(torch::kHalf).device(torch::kCUDA); - return matmul(Dense.data_ptr(), index, sparseA, Dense.size(0), options); - } - case torch::ScalarType::Char: { - auto options = torch::TensorOptions().dtype(INT8_OUTPUT_TYPE_TORCH).device(torch::kCUDA); - return matmul(Dense.data_ptr(), index, sparseA, Dense.size(0), options); - } - default: - { - throw std::runtime_error("Unsupported data type"); - } - } -} - - -void save_grad_cuda(torch::Tensor grad, int index) -{ - auto args = matmul_args[index]; -// args->grad = grad.clone().detach(); -} - - -__global__ void prune_kernel( - const float* __restrict__ input, - float* __restrict__ output, - bool* __restrict__ mask, - size_t row_size) { - const int column = 4 * (blockIdx.x * blockDim.x + threadIdx.x); - const int index = blockIdx.y * row_size + column; - if (column < row_size) { - reinterpret_cast(&output[index])[0] = reinterpret_cast(&input[index])[0]; - if(abs(output[index]) > abs(output[index + 1])){ - output[index + 1] = 0.; - mask[index + 1] = true; - } else { - output[index] = 0.; - mask[index] = true; - } - if(abs(output[index + 2]) > abs(output[index + 3])){ - output[index + 3] = 0.; - mask[index + 3] = true; - } else { - output[index + 2] = 0.; - mask[index + 2] = true; - } - } -} - - -__global__ void prune_kernel( - const at::Half* __restrict__ input, - at::Half* __restrict__ output, - bool* __restrict__ mask, - size_t row_size) { - const int column = 8 * (blockIdx.x * blockDim.x + threadIdx.x); - const int index = blockIdx.y * row_size + column; - if (column < row_size) { - reinterpret_cast(&output[index])[0] = reinterpret_cast(&input[index])[0]; - 
at::Half min1, min2; - int min_idx1, min_idx2; - min1 = output[index]; - min_idx1 = index; - if(MIN(min1, output[index + 1]) == output[index + 1]){ - min1 = output[index + 1]; - min_idx1 = index + 1; - } - if(MIN(min1, output[index + 2]) == output[index + 2]){ - min1 = output[index + 2]; - min_idx1 = index + 2; - } - if(MIN(min1, output[index + 3]) == output[index + 3]){ - min1 = output[index + 3]; - min_idx1 = index + 3; - } - min2 = min_idx1 == index ? output[index + 1] : output[index]; - min_idx2 = min_idx1 == index ? index + 1 : index; - if((MIN(min2, output[index + 1]) == output[index + 1]) && min_idx1 != index + 1){ - min2 = output[index + 1]; - min_idx2 = index + 1; - } - if((MIN(min2, output[index + 2]) == output[index + 2]) && min_idx1 != index + 2){ - min2 = output[index + 2]; - min_idx2 = index + 2; - } - if((MIN(min2, output[index + 3]) == output[index + 3]) && min_idx1 != index + 3){ - min2 = output[index + 3]; - min_idx2 = index + 3; - } - output[min_idx1] = 0.; mask[min_idx1] = true; - output[min_idx2] = 0.; mask[min_idx2] = true; - - min1 = output[index + 4]; - min_idx1 = index + 4; - if(MIN(min1, output[index + 5]) == output[index + 5]){ - min1 = output[index + 5]; - min_idx1 = index + 5; - } - if(MIN(min1, output[index + 6]) == output[index + 6]){ - min1 = output[index + 6]; - min_idx1 = index + 6; - } - if(MIN(min1, output[index + 7]) == output[index + 7]){ - min1 = output[index + 7]; - min_idx1 = index + 7; - } - min2 = min_idx1 == index + 4 ? output[index + 5] : output[index + 4]; - min_idx2 = min_idx1 == index + 4 ? index + 5 : index + 4; - if((MIN(min2, output[index + 5]) == output[index + 5]) && min_idx1 != index + 5){ - min2 = output[index + 5]; - min_idx2 = index + 5; - } - if((MIN(min2, output[index + 6]) == output[index + 6]) && min_idx1 != index + 6){ - min2 = output[index + 6]; - min_idx2 = index + 6; - } - if((MIN(min2, output[index + 7]) == output[index + 7]) && min_idx1 != index + 7){ - min2 = output[index + 7]; - min_idx2 = index + 7; - } - - output[min_idx1] = 0.; mask[min_idx1] = true; - output[min_idx2] = 0.; mask[min_idx2] = true; - } -} - - -template -__device__ void find_kth_smallest( - int *smallest_idx, - const T* __restrict__ input, - const int k, - const int M, int index) { - int min_idx = 0; - T min = 6.0e4; - - for(int i = 0; i < M; i++) - { - bool ignore = false; - for(int j = 0; j < k; j++) - { - if(smallest_idx[j] == i) - { - ignore = true; - } - } - if(ignore) - { - continue; - } - if(MIN(min, input[i]) == input[i]){ - min = input[i]; - min_idx = i; - } - } - smallest_idx[k] = min_idx; -} - - -__global__ void prune_kernel( - const at::Half* __restrict__ input, - at::Half* __restrict__ output, - bool* __restrict__ mask, - size_t row_size, - const int N, - const int M) { - - const int column = M * (blockIdx.x * blockDim.x + threadIdx.x); - const int index = blockIdx.y * row_size + column; - if (column < row_size) { - for(int i = 0; i < M / 8; i++) - { - reinterpret_cast(&output[index + 8 * i])[0] = reinterpret_cast(&input[index + 8 * i])[0]; - } - - int min_idx_list[16]; - for(int k = 0; k < (M - N); k++) - { - find_kth_smallest(min_idx_list, &input[index], k, M, index); - } - - for(int i = 0; i < (M - N); i++) - { - output[min_idx_list[i] + index] = 0.; mask[min_idx_list[i] + index] = true; - } - } -} - - -__global__ void prune_kernel( - const float* __restrict__ input, - float* __restrict__ output, - bool* __restrict__ mask, - size_t row_size, - const int N, - const int M) { - - const int column = M * (blockIdx.x * blockDim.x + 
threadIdx.x); - const int index = blockIdx.y * row_size + column; - if (column < row_size) { - for(int i = 0; i < M / 4; i++) - { - reinterpret_cast(&output[index + 4 * i])[0] = reinterpret_cast(&input[index + 4 * i])[0]; - } - - int *min_idx_list; - min_idx_list = (int*)malloc((M - N) * sizeof(int)); - for(int k = 0; k < (M - N); k++) - { - find_kth_smallest(min_idx_list, &input[index], k, M, index); - } - - for(int i = 0; i < (M - N); i++) - { - output[min_idx_list[i] + index] = 0.; mask[min_idx_list[i] + index] = true; - } - } -} - - -template -__global__ void prune_kernel( - const float* __restrict__ input, - float* __restrict__ output, - bool* __restrict__ mask, - size_t row_size) { - - const int column = M * (blockIdx.x * blockDim.x + threadIdx.x); - const int index = blockIdx.y * row_size + column; - if (column < row_size) { - for(int i = 0; i < M / 4; i++) - { - reinterpret_cast(&output[index + 4 * i])[0] = reinterpret_cast(&input[index + 4 * i])[0]; - } - - int min_idx_list[M - N]; - for(int k = 0; k < (M - N); k++) - { - find_kth_smallest(min_idx_list, &input[index], k, M, index); - } - - for(int i = 0; i < (M - N); i++) - { - output[min_idx_list[i] + index] = 0.; mask[min_idx_list[i] + index] = true; - } - } -} - - -template -__global__ void prune_kernel( - const at::Half* __restrict__ input, - at::Half* __restrict__ output, - bool* __restrict__ mask, - size_t row_size) { - - const int column = M * (blockIdx.x * blockDim.x + threadIdx.x); - const int index = blockIdx.y * row_size + column; - if (column < row_size) { - for(int i = 0; i < M / 8; i++) - { - reinterpret_cast(&output[index + 8 * i])[0] = reinterpret_cast(&input[index + 8 * i])[0]; - } - - int min_idx_list[M - N]; - for(int k = 0; k < (M - N); k++) - { - find_kth_smallest(min_idx_list, &input[index], k, M, index); - } - - for(int i = 0; i < (M - N); i++) - { - output[min_idx_list[i] + index] = 0.; mask[min_idx_list[i] + index] = true; - } - } -} - - -std::vector prune_cuda( - torch::Tensor input, const int N, const int M) { - - auto output = torch::zeros_like(input); - auto options = torch::TensorOptions().dtype(torch::kBool); - auto mask = torch::zeros_like(input, options); - - const auto batch_size = input.size(0); - const auto row_size = input.size(1); - - const int threads = 1024; - - if(N == 1 && M == 2) { - switch (input.type().scalarType()) { - case torch::ScalarType::Float: { - const dim3 blocks(((row_size / 4) + threads - 1) / threads, batch_size); - prune_kernel<<>>( - input.data(), - output.data(), - mask.data(), - row_size); - break; - } - case torch::ScalarType::Half: { - throw std::runtime_error("Half precision not supported for N=1, M=2"); - } - } - } - else if(N == 2 && M == 4) - { - switch (input.type().scalarType()) { - case torch::ScalarType::Float: { - throw std::runtime_error("Full precision not supported for N=2, M=4"); - break; - } - case torch::ScalarType::Half: { - const dim3 blocks(((row_size / 8) + threads - 1) / threads, batch_size); - prune_kernel<<>>( - input.data(), - output.data(), - mask.data(), - row_size); - } - } - } - else if((N == 2 && M == 8)) - { - switch (input.type().scalarType()){ - case torch::ScalarType::Float: { - const dim3 blocks(((row_size / M) + threads - 1) / threads, batch_size); - prune_kernel<2, 8><<>>( - input.data(), - output.data(), - mask.data(), - row_size); - break; - } - case torch::ScalarType::Half: { - const dim3 blocks(((row_size / M) + threads - 1) / threads, batch_size); - prune_kernel<2, 8><<>>( - input.data(), - output.data(), - mask.data(), - 
row_size); - } - } - } - else if((N == 2 && M == 16)) - { - switch (input.type().scalarType()){ - case torch::ScalarType::Float: { - const dim3 blocks(((row_size / M) + threads - 1) / threads, batch_size); - prune_kernel<2, 16><<>>( - input.data(), - output.data(), - mask.data(), - row_size); - break; - } - case torch::ScalarType::Half: { - const dim3 blocks(((row_size / M) + threads - 1) / threads, batch_size); - prune_kernel<2, 16><<>>( - input.data(), - output.data(), - mask.data(), - row_size); - } - } - } - else - { - if(M < 8 || M % 8 != 0) - { - throw std::runtime_error("M must be a multiple of 8"); - } - switch (input.type().scalarType()) { - case torch::ScalarType::Float: - { - const dim3 blocks(((row_size / M) + threads - 1) / threads, batch_size); - prune_kernel<<>>( - input.data(), - output.data(), - mask.data(), - row_size, - N, - M); - break; - } - case torch::ScalarType::Half: - { - const dim3 blocks(((row_size / M) + threads - 1) / threads, batch_size); - prune_kernel<<>>( - input.data(), - output.data(), - mask.data(), - row_size, - N, - M); - } - } - } - return {output, mask}; -} - - -__global__ void prune_and_compress_kernel( - const at::Half* __restrict__ input, - at::Half* __restrict__ output, - bool* __restrict__ mask, - size_t row_size) { - const int input_column = 16 * (blockIdx.x * blockDim.x + threadIdx.x); - const int output_column = 8 * (blockIdx.x * blockDim.x + threadIdx.x); - const int input_row = blockIdx.y * row_size; - const int output_row = blockIdx.y * (row_size / 2); - const int input_index = input_row + input_column; - const int output_index = output_row + output_column; - if (input_column < row_size) { - bool local_mask[16]; - reinterpret_cast(local_mask)[0] = reinterpret_cast(&mask[input_index])[0]; - - int local_index = 0; - #pragma unroll (2) - for(int i = 0; i < 2; i++) - { - at::Half local_data[8]; - reinterpret_cast(local_data)[0] = reinterpret_cast(&input[input_index + 8 * i])[0]; - #pragma unroll (8) - for(int j = 0; j < 8; j++) - { - if(local_mask[8 * i + j]) - { - output[local_index + output_index] = local_data[j]; - local_index++; - } - } - } - } -} - - -torch::Tensor prune_and_compress_cuda(torch::Tensor dense, torch::Tensor mask) -{ - auto row_size = dense.size(1); - auto batch_size = dense.size(0); - if(row_size % 16 != 0) - { - throw std::runtime_error("Pruning dimension should be a multiple of 128."); - } - auto options = torch::TensorOptions().dtype(torch::kHalf).device(torch::kCUDA); - torch::Tensor result = torch::zeros({dense.size(0), dense.size(1) / 2}, options); - const int threads = 1024; - switch (dense.type().scalarType()) { - case torch::ScalarType::Float: - { - throw std::runtime_error("Full precision not supported for prune_and_compress"); - } - case torch::ScalarType::Half: - { - const dim3 blocks(((row_size / 16) + threads - 1) / threads, batch_size); - prune_and_compress_kernel<<>>( - dense.data(), - result.data(), - mask.data(), - row_size); - } - } - return result; -} - - -__global__ void sparse_add_kernel( - const at::Half* __restrict__ mat1, - const at::Half* __restrict__ mat2, - const at::Half alpha, - const at::Half beta, - at::Half* __restrict__ output, - size_t row_size) { - const int column = 8 * (blockIdx.x * blockDim.x + threadIdx.x); - const int index = blockIdx.y * row_size + column; - if (column < row_size) { - at::Half mat1_local[8], mat2_local[8]; - reinterpret_cast(&mat1_local)[0] = reinterpret_cast(&mat1[index])[0]; - reinterpret_cast(&mat2_local)[0] = reinterpret_cast(&mat2[index])[0]; - #pragma unroll 
(8) - for(int i = 0; i < 8; i++) - { - output[index + i] = alpha * mat1_local[i] + beta * mat2_local[i]; - } - } - -} - - -torch::Tensor sparse_add_cuda(torch::Tensor dense, torch::Tensor sparse_index, torch::Tensor alpha, torch::Tensor beta) -{ - int row_size = dense.size(1); - int batch_size = dense.size(0); - if(row_size % 8 != 0) - { - throw std::runtime_error("Pruning dimension should be a multiple of 8."); - } - int index = sparse_index.item(); - auto args = matmul_args[index]; - torch::Tensor result = torch::zeros_like(dense); - const int threads = 1024; - switch (dense.type().scalarType()) { - case torch::ScalarType::Float: - { - throw std::runtime_error("Full precision not supported for prune_and_compress"); - } - case torch::ScalarType::Half: - { - const dim3 blocks(((row_size / 8) + threads - 1) / threads, batch_size); - sparse_add_kernel<<>>( - dense.data(), - (at::Half*) args->dCompressed, - alpha.item(), - beta.item(), - result.data(), - row_size); - } - } - return result; -} - - -__global__ void update_sparse_matrix_kernel( - const at::Half* __restrict__ new_data, - at::Half* __restrict__ output, - size_t row_size) { - const int column = 8 * (blockIdx.x * blockDim.x + threadIdx.x); - const int index = blockIdx.y * row_size + column; - if (column < row_size) { - reinterpret_cast(&output[index])[0] = reinterpret_cast(&new_data[index])[0]; - } -} - - -void update_sparse_matrix_cuda(torch::Tensor new_data, torch::Tensor sparse_idx) -{ - auto args = matmul_args[sparse_idx.item()]; - const int threads = 1024; - switch (new_data.type().scalarType()) { - case torch::ScalarType::Float: - { - throw std::runtime_error("Full precision not supported for prune_and_compress"); - } - case torch::ScalarType::Half: - { - cudaMemcpy(args->dCompressed, new_data.data(), new_data.size(0) * new_data.size(1) * sizeof(at::Half), cudaMemcpyDeviceToDevice); - } - } -} - - -// sparse = prune_and_compress(dense, mask) -// result = add_sparse_dense(sparse_idx, dense, alpha, beta) -// update_sparse(data, sparse_idx, sparse_transpose_idx) diff --git a/csrc/sparse/cusparselt/cusparselt_mm_entry.cu b/csrc/sparse/cusparselt/cusparselt_mm_entry.cu deleted file mode 100644 index ddc5ef090ec2b..0000000000000 --- a/csrc/sparse/cusparselt/cusparselt_mm_entry.cu +++ /dev/null @@ -1,135 +0,0 @@ -#include -#include // cusparseLt header -#include - -#define CHECK_CUDA_DEVICE(x) TORCH_CHECK(x.device().is_cuda(), #x " must be a CUDA tensor") -#define CHECK_CONTIGUOUS(x) TORCH_CHECK(x.is_contiguous(), #x " must be contiguous") -#define CHECK_INPUT(x) CHECK_CUDA_DEVICE(x); CHECK_CONTIGUOUS(x) - -int init_cusparse_lt_cuda(); -torch::Tensor setup_spmatmul_cuda(torch::Tensor A, - torch::Tensor B, - const bool transpose_A=false, - const bool transpose_B=false, - const bool sparseA=true, - const bool transposable_mask=false, - const bool is_sparse_pruned=false, - const bool check_sparsity=false); - - -torch::Tensor spmatmul_cuda(torch::Tensor Dense, - int index, - bool sparseA); - -int destroy_cusparse_matmul_cuda(int index); - -void save_grad_cuda(torch::Tensor grad, int index); - - -torch::Tensor init_cusparse_lt() { - int result = init_cusparse_lt_cuda(); - if(result == EXIT_SUCCESS) { - return torch::zeros({1}, torch::kInt32); - } else { - return torch::ones({1}, torch::kInt32); - } -} - - -torch::Tensor setup_spmatmul(torch::Tensor A, - torch::Tensor B, - const bool transpose_A=false, - const bool transpose_B=false, - const bool sparseA=true, - const bool transposable_mask=false, - const bool is_sparse_pruned=false, - 
const bool check_sparsity=false) { - - CHECK_INPUT(A); - CHECK_INPUT(B); - return setup_spmatmul_cuda(A, - B, - transpose_A, - transpose_B, - sparseA, - transposable_mask, - is_sparse_pruned, - check_sparsity); -} - - -torch::Tensor spmatmul( torch::Tensor Dense, - torch::Tensor index, - const bool sparseA=true) { - CHECK_INPUT(Dense); -// std::cout << Dense.data_ptr()[0] << std::endl; - auto result = spmatmul_cuda( Dense, - *index.data_ptr(), - sparseA); - return result; -} - -int destroy_cusparse_matmul(int index){ - return destroy_cusparse_matmul_cuda(index); -} - -torch::Tensor save_grad(torch::Tensor input, torch::Tensor index) { - CHECK_INPUT(input); - save_grad_cuda(input, *index.data_ptr()); -} - - -std::vector prune_cuda(torch::Tensor input, const int N, const int M); - - -std::vector prune( - torch::Tensor input, const int N, const int M) { - CHECK_INPUT(input); - return prune_cuda(input, N, M); -} - - -torch::Tensor prune_and_compress_cuda(torch::Tensor input, torch::Tensor mask); - - -torch::Tensor prune_and_compress( - torch::Tensor input, torch::Tensor mask) { - CHECK_INPUT(input); - return prune_and_compress_cuda(input, mask); -} - - -torch::Tensor sparse_add_cuda(torch::Tensor dense, torch::Tensor sparse_index, torch::Tensor alpha, torch::Tensor beta); - - -torch::Tensor sparse_add( - torch::Tensor dense, torch::Tensor sparse_index, torch::Tensor alpha, torch::Tensor beta) { - CHECK_INPUT(dense); - return sparse_add_cuda(dense, sparse_index, alpha, beta); -} - - -void update_sparse_matrix_cuda(torch::Tensor new_data, torch::Tensor sparse_idx); - - -void update_sparse_matrix( - torch::Tensor new_data, torch::Tensor sparse_idx) { - CHECK_INPUT(new_data); - update_sparse_matrix_cuda(new_data, sparse_idx); -} - - -PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) { - m.def("init_cusparse_lt", &init_cusparse_lt, "Initialize CUSPARSE LT"); - m.def("setup_spmatmul", &setup_spmatmul, "Setup Sparse Matrix Multiplication"); - m.def("destroy_cusparse_matmul", &destroy_cusparse_matmul, "Destroy matmul arguments"); - m.def("spmatmul", &spmatmul, "Sparse Matrix Multiplication"); - m.def("save_grad", &save_grad, "Save Gradient"); - m.def("prune", &prune, "N:M Prune (CUDA)"); - m.def("prune_and_compress", &prune_and_compress, "Prune the dense matrix using the mask and store it in a " - "compressed tensor (CUDA)"); - m.def("sparse_add", &sparse_add, "Add the sparse matrix to the dense matrix and return a " - "compressed dense matrix(CUDA)"); - m.def("update_sparse_matrix", &update_sparse_matrix, "Update the sparse matrix with the new dense matrix " - "data (CUDA)"); -} diff --git a/csrc/sparse/cutlass/example/62_hopper_sparse_gemm.cu b/csrc/sparse/cutlass/example/62_hopper_sparse_gemm.cu deleted file mode 100644 index 5b7361f805098..0000000000000 --- a/csrc/sparse/cutlass/example/62_hopper_sparse_gemm.cu +++ /dev/null @@ -1,596 +0,0 @@ -/*************************************************************************************************** - * Copyright (c) 2024 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. - * SPDX-License-Identifier: BSD-3-Clause - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * - * 1. Redistributions of source code must retain the above copyright notice, this - * list of conditions and the following disclaimer. - * - * 2. 
Redistributions in binary form must reproduce the above copyright notice, - * this list of conditions and the following disclaimer in the documentation - * and/or other materials provided with the distribution. - * - * 3. Neither the name of the copyright holder nor the names of its - * contributors may be used to endorse or promote products derived from - * this software without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" - * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE - * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE - * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL - * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR - * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER - * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, - * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - * - **************************************************************************************************/ - -/*! \file - \brief Hopper Sparse GEMM example. - - This example demonstrates how to construct and run a structured sparse GEMM kernel - on NVIDIA Hopper architecture. - -*/ - -#include - -#include "cutlass/cutlass.h" - -#include "cute/tensor.hpp" -#include "cutlass/tensor_ref.h" -#include "cutlass/epilogue/collective/default_epilogue.hpp" -#include "cutlass/epilogue/thread/linear_combination.h" -#include "cutlass/epilogue/collective/collective_builder.hpp" -#include "cutlass/gemm/dispatch_policy.hpp" -#include "cutlass/gemm/collective/collective_builder.hpp" -#include "cutlass/gemm/device/gemm_universal_adapter.h" -#include "cutlass/gemm/kernel/gemm_universal.hpp" -#include "cutlass/transform/device/transform_universal_adapter.hpp" -#include "cutlass/transform/kernel/sparse_gemm_compressor.hpp" - -#include "util/command_line.h" -#include "util/distribution.h" -#include "util/host_tensor.h" -#include "util/packed_stride.hpp" -#include "util/tensor_view_io.h" -#include "util/reference/device/gemm.h" -#include "util/reference/device/tensor_compare.h" -#include "util/reference/device/tensor_fill.h" - -#include "util/helper.h" - -using namespace cute; - -#if defined(CUTLASS_ARCH_MMA_SPARSE_SM90_SUPPORTED) - -///////////////////////////////////////////////////////////////////////////////////////////////// -/// GEMM kernel configurations -///////////////////////////////////////////////////////////////////////////////////////////////// - -// A matrix configuration -using ElementA = cutlass::half_t; // Element type for A matrix operand -using LayoutTagA = cutlass::layout::RowMajor; // Layout type for A matrix operand -constexpr int AlignmentA = 128 / cutlass::sizeof_bits::value; // Memory access granularity/alignment of A matrix in units of elements (up to 16 bytes) - -// B matrix configuration -using ElementB = cutlass::half_t; // Element type for B matrix operand -using LayoutTagB = cutlass::layout::ColumnMajor; // Layout type for B matrix operand -constexpr int AlignmentB = 128 / cutlass::sizeof_bits::value; // Memory access granularity/alignment of B matrix in units of elements (up to 16 bytes) - -// C/D matrix configuration -using ElementC = float; // Element type for C and D matrix operands 
-using LayoutTagC = cutlass::layout::ColumnMajor; // Layout type for C and D matrix operands -constexpr int AlignmentC = 128 / cutlass::sizeof_bits::value; // Memory access granularity/alignment of C matrix in units of elements (up to 16 bytes) - -// Core kernel configurations -using ElementAccumulator = float; // Element type for internal accumulation -using TileShape = Shape<_128,_128,_128>; // Threadblock-level tile size for sparse kernel -using TileShapeRef = Shape<_128,_128, _64>; // Threadblock-level tile size for reference (dense) kernel -using ClusterShape = Shape<_1,_2,_1>; // Shape of the threadblocks in a cluster -using KernelSchedule = cutlass::gemm::KernelTmaWarpSpecialized; // Kernel schedule policy -using EpilogueSchedule = cutlass::epilogue::TmaWarpSpecialized; // Epilogue schedule policy - -using ProblemShape = Shape; - -// Sparse kernel setup - -using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder< - cutlass::arch::Sm90, cutlass::arch::OpClassTensorOp, - TileShape, ClusterShape, - cutlass::epilogue::collective::EpilogueTileAuto, - ElementAccumulator, ElementAccumulator, - ElementC, LayoutTagC, AlignmentC, - ElementC, LayoutTagC, AlignmentC, - EpilogueSchedule - >::CollectiveOp; - -using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder< - cutlass::arch::Sm90, cutlass::arch::OpClassSparseTensorOp, - ElementA, LayoutTagA, AlignmentA, - ElementB, LayoutTagB, AlignmentB, - ElementAccumulator, - TileShape, ClusterShape, - cutlass::gemm::collective::StageCountAutoCarveout< - static_cast(sizeof(typename CollectiveEpilogue::SharedStorage))>, - KernelSchedule - >::CollectiveOp; - -using GemmKernel = cutlass::gemm::kernel::GemmUniversal< - ProblemShape, - CollectiveMainloop, - CollectiveEpilogue ->; - -using Gemm = cutlass::gemm::device::GemmUniversalAdapter; - -// Reference (dense) kernel setup - -using CollectiveEpilogueRef = typename cutlass::epilogue::collective::CollectiveBuilder< - cutlass::arch::Sm90, cutlass::arch::OpClassTensorOp, - TileShapeRef, ClusterShape, - cutlass::epilogue::collective::EpilogueTileAuto, - ElementAccumulator, ElementAccumulator, - ElementC, LayoutTagC, AlignmentC, - ElementC, LayoutTagC, AlignmentC, - EpilogueSchedule - >::CollectiveOp; - -using CollectiveMainloopRef = typename cutlass::gemm::collective::CollectiveBuilder< - cutlass::arch::Sm90, cutlass::arch::OpClassTensorOp, - ElementA, LayoutTagA, AlignmentA, - ElementB, LayoutTagB, AlignmentB, - ElementAccumulator, - TileShapeRef, ClusterShape, - cutlass::gemm::collective::StageCountAutoCarveout< - static_cast(sizeof(typename CollectiveEpilogue::SharedStorage))>, - KernelSchedule - >::CollectiveOp; - -using GemmKernelRef = cutlass::gemm::kernel::GemmUniversal< - ProblemShape, - CollectiveMainloopRef, - CollectiveEpilogue ->; - -using GemmRef = cutlass::gemm::device::GemmUniversalAdapter; - -// Layouts -using LayoutA = typename Gemm::GemmKernel::CollectiveMainloop::LayoutA; -using LayoutE = typename Gemm::GemmKernel::CollectiveMainloop::LayoutE; -using StrideB = typename Gemm::GemmKernel::StrideB; -using StrideC = typename Gemm::GemmKernel::StrideC; -using StrideD = typename Gemm::GemmKernel::StrideD; - -// Layouts for reference (non-sparse) tensors -using StrideA = cutlass::gemm::TagToStrideA_t; -using StrideE = StrideA; - -using ElementE = typename Gemm::GemmKernel::CollectiveMainloop::ElementE; -using SparseConfig = typename Gemm::GemmKernel::CollectiveMainloop::SparseConfig; - -// Offline compressor kernel -using CompressorUtility = 
cutlass::transform::kernel::StructuredSparseCompressorUtility< - ProblemShape, - ElementA, - LayoutTagA, - SparseConfig>; - -using CompressorKernel = cutlass::transform::kernel::StructuredSparseCompressor< - ProblemShape, - ElementA, - LayoutTagA, - SparseConfig, - cutlass::arch::Sm90>; - -using Compressor = cutlass::transform::device::TransformUniversalAdapter; - -// -// Data members -// - -ProblemShape problem_shape; - -StrideA stride_A; -StrideA stride_A_compressed; -StrideE stride_E; -StrideB stride_B; -StrideC stride_C; -StrideD stride_D; - -LayoutA layout_A; -LayoutE layout_E; - -uint64_t seed; - -cutlass::DeviceAllocation block_A; -cutlass::DeviceAllocation block_A_compressed; -cutlass::DeviceAllocation block_E; -cutlass::DeviceAllocation block_B; -cutlass::DeviceAllocation block_C; -cutlass::DeviceAllocation block_D; -cutlass::DeviceAllocation block_D_ref; - -#endif // defined(CUTLASS_ARCH_MMA_SPARSE_SM90_SUPPORTED) - -///////////////////////////////////////////////////////////////////////////////////////////////// -/// Testbed utility types -///////////////////////////////////////////////////////////////////////////////////////////////// - -// Command line options parsing -struct Options { - - bool help; - - float alpha, beta; - int iterations; - int m, n, k, l; - - Options(): - help(false), - m(5120), n(4096), k(16384), l(1), - alpha(1.f), beta(0.f), - iterations(10) - { } - - // Parses the command line - void parse(int argc, char const **args) { - cutlass::CommandLine cmd(argc, args); - - if (cmd.check_cmd_line_flag("help")) { - help = true; - return; - } - - cmd.get_cmd_line_argument("m", m); - cmd.get_cmd_line_argument("n", n); - cmd.get_cmd_line_argument("k", k); - cmd.get_cmd_line_argument("l", l); - cmd.get_cmd_line_argument("alpha", alpha); - cmd.get_cmd_line_argument("beta", beta); - cmd.get_cmd_line_argument("iterations", iterations); - } - - /// Prints the usage statement. 
- std::ostream & print_usage(std::ostream &out) const { - - out << "62_hopper_sparse_gemm\n\n" - << " Hopper Sparse GEMM example.\n\n" - << "Options:\n\n" - << " --help If specified, displays this usage statement\n\n" - << " --m= Sets the M extent of the GEMM\n" - << " --n= Sets the N extent of the GEMM\n" - << " --k= Sets the K extent of the GEMM\n" - << " --l= Sets the L extent of the GEMM (batch size)\n" - << " --alpha= Epilogue scalar alpha\n" - << " --beta= Epilogue scalar beta\n\n" - << " --iterations= Number of profiling iterations to perform.\n\n"; - - out - << "\n\nExamples:\n\n" - << "$ " << "62_hopper_sparse_gemm" << " --m=4096 --n=5120 --k=8192 --l=1 --alpha=2 --beta=0.707 \n\n"; - - return out; - } - - /// Compute performance in GFLOP/s - double gflops(double runtime_s) const - { - // Two flops per multiply-add - uint64_t flop = uint64_t(2) * m * n * k; - double gflop = double(flop) / double(1.0e9); - return gflop / runtime_s; - } -}; - -#if defined(CUTLASS_ARCH_MMA_SPARSE_SM90_SUPPORTED) - -///////////////////////////////////////////////////////////////////////////////////////////////// -/// GEMM setup and evaluation -///////////////////////////////////////////////////////////////////////////////////////////////// - -/// Helper to initialize a block of device data -template -bool initialize_block( - cutlass::DeviceAllocation& block, - uint64_t seed) { - - Element scope_max, scope_min; - int bits_input = cutlass::sizeof_bits::value; - - if (bits_input == 1) { - scope_max = Element(2); - scope_min = Element(0); - } else if (bits_input <= 8) { - scope_max = Element(2); - scope_min = Element(-2); - } else { - scope_max = Element(8); - scope_min = Element(-8); - } - - cutlass::reference::device::BlockFillRandomUniform( - block.get(), block.size(), seed, scope_max, scope_min, 0); - - return true; -} - -/// Make A structured sparse by replacing elements with 0 and compress it -bool sparsify_and_compress() -{ - auto [M, N, K, L] = problem_shape; - CompressorUtility compressor_utility(problem_shape, stride_A); - - int ME = compressor_utility.get_metadata_m_physical(); - int KE = compressor_utility.get_metadata_k_physical(); - int KC = compressor_utility.get_tensorA_k_physical(); - - block_A_compressed.reset(M * KC * L); - block_E.reset(ME * KE * L); - - stride_A_compressed = cutlass::make_cute_packed_stride(StrideA{}, cute::make_shape(M, KC, L)); - stride_E = cutlass::make_cute_packed_stride(StrideE{}, cute::make_shape(ME, KE, L)); - - // Random sparsification is performed on host - std::vector block_A_host(block_A.size()); - cutlass::device_memory::copy_to_host(block_A_host.data(), block_A.get(), block_A.size()); - compressor_utility.structure_sparse_zero_mask_fill(block_A_host.data(), static_cast(seed + 2024)); - cutlass::device_memory::copy_to_device(block_A.get(), block_A_host.data(), block_A.size()); - - cutlass::KernelHardwareInfo hw_info; - hw_info.device_id = 0; - hw_info.sm_count = cutlass::KernelHardwareInfo::query_device_multiprocessor_count(hw_info.device_id); - typename Compressor::Arguments arguments { - problem_shape, - { block_A.get(), - stride_A, - block_A_compressed.get(), - block_E.get() }, - {hw_info} }; - - Compressor compressor_op; - size_t workspace_size = Compressor::get_workspace_size(arguments); - cutlass::device_memory::allocation workspace(workspace_size); - - CUTLASS_CHECK(compressor_op.can_implement(arguments)); - CUTLASS_CHECK(compressor_op.initialize(arguments, workspace.get())); - CUTLASS_CHECK(compressor_op.run()); - 
CUDA_CHECK(cudaDeviceSynchronize()); - - return true; -} - -/// Initialize operands to be used in the GEMM and reference GEMM -bool initialize(Options const& options) { - - problem_shape = make_tuple(options.m, options.n, options.k, options.l); - auto [M, N, K, L] = problem_shape; - - stride_A = cutlass::make_cute_packed_stride(StrideA{}, cute::make_shape(M, K, L)); - stride_B = cutlass::make_cute_packed_stride(StrideB{}, cute::make_shape(N, K, L)); - stride_C = cutlass::make_cute_packed_stride(StrideC{}, cute::make_shape(M, N, L)); - stride_D = cutlass::make_cute_packed_stride(StrideD{}, cute::make_shape(M, N, L)); - - // Allocate memory for tensors - block_A.reset(M * K * L); - block_B.reset(N * K * L); - block_C.reset(M * N * L); - block_D.reset(M * N * L); - block_D_ref.reset(M * N * L); - - // Fill input tensors with data - initialize_block(block_A, seed + 2021); - initialize_block(block_B, seed + 2022); - initialize_block(block_C, seed + 2023); - - // Replace 0 in A with 1 to avoid metadata changes - std::vector block_A_host(block_A.size()); - cutlass::device_memory::copy_to_host(block_A_host.data(), block_A.get(), block_A.size()); - for (size_t i = 0; i < block_A.size(); ++i) if (block_A_host[i] == ElementA(0)) block_A_host[i] = ElementA(1.0); - cutlass::device_memory::copy_to_device(block_A.get(), block_A_host.data(), block_A.size()); - - if (!sparsify_and_compress()) { - return false; - }; - - // Build the compressed/metadata layouts - layout_A = SparseConfig::fill_layoutA(problem_shape); - layout_E = SparseConfig::fill_layoutE(problem_shape); - - return true; -} - -/// Populates a Gemm::Arguments structure from the given commandline options -typename Gemm::Arguments make_args(Options const& options) -{ - typename Gemm::Arguments arguments{ - cutlass::gemm::GemmUniversalMode::kGemm, - problem_shape, - { block_A_compressed.get(), layout_A, block_B.get(), stride_B, block_E.get(), layout_E }, - { { ElementAccumulator(options.alpha), ElementAccumulator(options.beta) }, - block_C.get(), stride_C, block_D.get(), stride_D } - }; - - return arguments; -} - -typename GemmRef::Arguments make_args_ref(Options const& options) -{ - typename GemmRef::Arguments arguments{ - cutlass::gemm::GemmUniversalMode::kGemm, - problem_shape, - { block_A.get(), stride_A, block_B.get(), stride_B }, - { { ElementAccumulator(options.alpha), ElementAccumulator(options.beta) }, - block_C.get(), stride_C, block_D_ref.get(), stride_D } - }; - - return arguments; -} - -template -void print_device_tensor(cute::Tensor const& t) -{ - // Assumes size = cosize, i.e. 
compact tensor - std::vector data_host(t.size()); - cutlass::device_memory::copy_to_host(data_host.data(), t.data(), t.size()); - auto t_host = cute::make_tensor(data_host.data(), t.layout()); - cute::print_tensor(t_host); -} - -bool verify(Options const& options) { - CUDA_CHECK(cudaDeviceSynchronize()); - - bool passed = cutlass::reference::device::BlockCompareEqual(block_D_ref.get(), block_D.get(), block_D.size()); - -#if 0 - if (!passed) { - auto [M, N, K, L] = problem_shape; - CompressorUtility compressor_utility(problem_shape, stride_A); - int ME = compressor_utility.get_metadata_m_physical(); - int KE = compressor_utility.get_metadata_k_physical(); - int KC = compressor_utility.get_tensorA_k_physical(); - - cute::print("A (original): "); print_device_tensor(make_tensor(block_A.get(), make_shape(M, K, L), stride_A)); - cute::print("A (compressed): "); print_device_tensor(make_tensor(block_A_compressed.get(), make_shape(M, KC, L), stride_A_compressed)); - cute::print("E (physical): "); print_device_tensor(make_tensor(block_E.get(), make_shape(ME, KE, L), stride_E)); - cute::print("E (logical): "); print_device_tensor(make_tensor(block_E.get(), upcast(layout_E))); - cute::print("B: "); print_device_tensor(make_tensor(block_B.get(), make_shape(N, K, L), stride_B)); - cute::print("C: "); print_device_tensor(make_tensor(block_C.get(), make_shape(M, N, L), stride_C)); - cute::print("D reference: "); print_device_tensor(make_tensor(block_D_ref.get(), make_shape(M, N, L), stride_D)); - cute::print("D computed: "); print_device_tensor(make_tensor(block_D.get(), make_shape(M, N, L), stride_D)); - } -#endif - - return passed; -} - -template -struct Runner -{ - using Arguments = typename Gemm::Arguments; - - Runner(Arguments args): arguments(args) { - // Using the arguments, query for extra workspace required for matrix multiplication computation - size_t workspace_size = Gemm::get_workspace_size(arguments); - - // Allocate workspace memory - workspace.reset(workspace_size); - - // Check if the problem size is supported or not - CUTLASS_CHECK(gemm.can_implement(arguments)); - } - - void run() { - CUTLASS_CHECK(gemm.initialize(arguments, workspace.get())); - CUTLASS_CHECK(gemm.run()); - } - - void benchmark(Options const& options) { - if (options.iterations > 0) - { - GpuTimer timer; - timer.start(); - for (int iter = 0; iter < options.iterations; ++iter) { - run(); - } - timer.stop(); - - // Compute average runtime and GFLOPs. - float elapsed_ms = timer.elapsed_millis(); - double avg_runtime_ms = double(elapsed_ms) / double(options.iterations); - double gflops = options.gflops(avg_runtime_ms / 1000.0); - - std::cout << " Avg runtime: " << avg_runtime_ms << " ms" << std::endl; - std::cout << " GFLOPS: " << gflops << std::endl; - } - } - - Gemm gemm; - Arguments arguments; - cutlass::device_memory::allocation workspace; -}; - -/// Execute the example (verification and timing) -void run(Options &options) { - bool init = initialize(options); - if (!init) { - std::cout << "Initialization failure" << std::endl; - exit(EXIT_FAILURE); - } - - Runner gemm(make_args(options)); - Runner gemm_ref(make_args_ref(options)); - - gemm.run(); - gemm_ref.run(); - - bool passed = verify(options); - - std::cout << " Problem Size: " << options.m << 'x' << options.n << 'x' << options.k << std::endl; - std::cout << " Disposition: " << (passed ? 
"Passed" : "Failed") << std::endl; - - if (!passed) { - exit(EXIT_FAILURE); - } - - std::cout << "Sparse GEMM:" << std::endl; - gemm.benchmark(options); - - std::cout << "Dense GEMM:" << std::endl; - gemm_ref.benchmark(options); -} - -#endif // defined(CUTLASS_ARCH_MMA_SPARSE_SM90_SUPPORTED) - -/////////////////////////////////////////////////////////////////////////////////////////////////// - -int main(int argc, char const **args) { - - // CUTLASS must be compiled with CUDA 12.2 Toolkit to run this example - // and must have compute capability at least 90. - if (__CUDACC_VER_MAJOR__ < 12 || (__CUDACC_VER_MAJOR__ == 12 && __CUDACC_VER_MINOR__ < 2)) { - std::cerr << "This example requires CUDA 12.2 or newer.\n"; - // Returning zero so this test passes on older Toolkits. Its actions are no-op. - return 0; - } - - cudaDeviceProp props; - int current_device_id; - CUDA_CHECK(cudaGetDevice(¤t_device_id)); - CUDA_CHECK(cudaGetDeviceProperties(&props, current_device_id)); - cudaError_t error = cudaGetDeviceProperties(&props, 0); - if (props.major < 9) { - std::cerr - << "This example requires a GPU of NVIDIA's Hopper Architecture or " - << "later (compute capability 90 or greater).\n"; - return 0; - } - // - // Parse options - // - - Options options; - - options.parse(argc, args); - - if (options.help) { - options.print_usage(std::cout) << std::endl; - return 0; - } - - // - // Evaluate CUTLASS kernels - // - -#if defined(CUTLASS_ARCH_MMA_SPARSE_SM90_SUPPORTED) - run(options); -#endif - - return EXIT_SUCCESS; -} - -///////////////////////////////////////////////////////////////////////////////////////////////// \ No newline at end of file diff --git a/csrc/sparse/cutlass/example/Makefile b/csrc/sparse/cutlass/example/Makefile deleted file mode 100644 index 7e5eac250d2e3..0000000000000 --- a/csrc/sparse/cutlass/example/Makefile +++ /dev/null @@ -1,68 +0,0 @@ -# Copyright 2023 The FLash-LLM Authors. All rights reserved. -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# http://www.apache.org/licenses/LICENSE-2.0 -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -# host compiler -HOST_COMPILER ?= g++ -CUDA_PATH ?= /usr/local/cuda/ -#below is the path for Narval -#CUDA_PATH ?= /cvmfs/soft.computecanada.ca/easybuild/software/2020/Core/cudacore/11.7.0/ -# CUDA_PATH ?= /cvmfs/soft.computecanada.ca/easybuild/software/2023/x86-64-v3/Core/cudacore/12.2.2/ -NVCC := /usr/local/cuda/bin/nvcc -ccbin $(HOST_COMPILER) - -# internal flags -NVCCFLAGS := -m$(shell getconf LONG_BIT) -CCFLAGS := -fPIC -LDFLAGS := - -ALL_CCFLAGS := -ALL_CCFLAGS += $(NVCCFLAGS) -ALL_CCFLAGS += $(addprefix -Xcompiler ,$(CCFLAGS)) - -ALL_LDFLAGS := -ALL_LDFLAGS += $(ALL_CCFLAGS) -ALL_LDFLAGS += $(addprefix -Xlinker ,$(LDFLAGS)) - -# Common includes and paths for CUDA -INCLUDES := -I/usr/local/cuda/include/ -I /home/ferrar/vllm/.deps/cutlass-src/include -LIBRARIES := -lcublas -lcusparse - -################################################################################ - -# Gencode arguments -SMS ?= 90 -# Generate SASS code for each SM architecture listed in $(SMS) -$(foreach sm,$(SMS),$(eval GENCODE_FLAGS += -gencode arch=compute_$(sm),code=sm_$(sm))) - -ALL_CCFLAGS += --threads 0 --std=c++11 -lineinfo -O3 - -FLASHLLM_CCFLAGS := -maxrregcount=255 -ALL_CCFLAGS += --use_fast_math -ALL_CCFLAGS += --ptxas-options=-v,-warn-lmem-usage,--warn-on-spills -################################################################################ - -HEAD_FILES = ./util/command_line.h \ - ./util/distribution.h \ - ./util/host_tensor.h \ - ./util/packed_stride.hpp \ - ./util/tensor_view_io.h \ - ./util/reference/device/gemm.h \ - ./util/reference/device/tensor_compare.h \ - ./util/reference/device/tensor_fill.h - - -# Target rules -all: example - -example: 62_hopper_sparse_gemm.cu $(HEAD_FILES) - $(EXEC) $(NVCC) $(INCLUDES) $(ALL_CCFLAGS) $(OUR_CCFLAGS) $(GENCODE_FLAGS) $< -o $@ - -clean: - rm -f example \ No newline at end of file diff --git a/csrc/sparse/cutlass/example/util/command_line.h b/csrc/sparse/cutlass/example/util/command_line.h deleted file mode 100644 index 9dc3a1174067a..0000000000000 --- a/csrc/sparse/cutlass/example/util/command_line.h +++ /dev/null @@ -1,313 +0,0 @@ -/****************************************************************************** - * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. - * SPDX-License-Identifier: BSD-3-Clause - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * - * 1. Redistributions of source code must retain the above copyright notice, this - * list of conditions and the following disclaimer. - * - * 2. Redistributions in binary form must reproduce the above copyright notice, - * this list of conditions and the following disclaimer in the documentation - * and/or other materials provided with the distribution. - * - * 3. Neither the name of the copyright holder nor the names of its - * contributors may be used to endorse or promote products derived from - * this software without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" - * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE - * DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE - * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL - * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR - * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER - * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, - * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - * - ******************************************************************************/ - -#pragma once - -/** - * \file - * Utility for parsing command line arguments - */ - -#include -#include -#include -#include -#include - -#include - -#include "cutlass/cutlass.h" - -namespace cutlass { - -/****************************************************************************** - * command_line - ******************************************************************************/ - -/** - * Utility for parsing command line arguments - */ -struct CommandLine { - std::vector keys; - std::vector values; - std::vector args; - - /** - * Constructor - */ - CommandLine(int argc, const char** argv) { - using namespace std; - - for (int i = 1; i < argc; i++) { - string arg = argv[i]; - - if ((arg[0] != '-') || (arg[1] != '-')) { - args.push_back(arg); - continue; - } - - string::size_type pos; - string key, val; - if ((pos = arg.find('=')) == string::npos) { - key = string(arg, 2, arg.length() - 2); - val = ""; - } else { - key = string(arg, 2, pos - 2); - val = string(arg, pos + 1, arg.length() - 1); - } - - keys.push_back(key); - values.push_back(val); - } - } - - /** - * Checks whether a flag "--" is present in the commandline - */ - bool check_cmd_line_flag(const char* arg_name) const { - using namespace std; - - for (int i = 0; i < int(keys.size()); ++i) { - if (keys[i] == string(arg_name)) return true; - } - return false; - } - - /** - * Returns number of naked (non-flag and non-key-value) commandline parameters - */ - size_t num_naked_args() const { - return args.size(); - } - - /** - * Print naked (non-flag and non-key-value) commandline parameters - */ - void print_naked_args(std::ostream &out) const { - for (auto arg : args) { - out << " " << arg <<"\n"; - } - } - - /** - * Returns the commandline parameter for a given index (not including flags) - */ - template - void get_cmd_line_argument(size_t index, value_t& val) const { - using namespace std; - if (index < args.size()) { - istringstream str_stream(args[index]); - str_stream >> val; - } - } - - /** - * Obtains the boolean value specified for a given commandline parameter --= - */ - void get_cmd_line_argument(const char* arg_name, bool& val, bool _default) const { - val = _default; - if (check_cmd_line_flag(arg_name)) { - std::string value; - get_cmd_line_argument(arg_name, value); - - val = !(value == "0" || value == "false"); - } - } - - /** - * Obtains the value specified for a given commandline parameter --= - */ - template - void get_cmd_line_argument(const char* arg_name, - value_t& val) const { - - get_cmd_line_argument(arg_name, val, val); - } - - /** - * Obtains the value specified for a given commandline parameter --= - */ - template - void get_cmd_line_argument(const char* arg_name, - value_t& val, - value_t const& _default) const { - using namespace std; - - val = _default; - - for (int i = 0; i < int(keys.size()); ++i) { - if (keys[i] == string(arg_name)) { - istringstream str_stream(values[i]); - 
str_stream >> val; - } - } - } - - /** - * Returns the values specified for a given commandline parameter --=,* - */ - template - void get_cmd_line_arguments(const char* arg_name, - std::vector& vals, - char sep = ',') const { - using namespace std; - - if (check_cmd_line_flag(arg_name)) { - // Clear any default values - vals.clear(); - - // Recover from multi-value string - for (size_t i = 0; i < keys.size(); ++i) { - if (keys[i] == string(arg_name)) { - string val_string(values[i]); - separate_string(val_string, vals, sep); - } - } - } - } - - /** - * Returns the values specified for a given commandline parameter - * --=,* - */ - void get_cmd_line_argument_pairs(const char* arg_name, - std::vector >& tokens, - char delim = ',', - char sep = ':') const { - if (check_cmd_line_flag(arg_name)) { - std::string value; - get_cmd_line_argument(arg_name, value); - - tokenize(tokens, value, delim, sep); - } - } - - /** - * Returns a list of ranges specified for a given commandline parameter - * --=,* - */ - void get_cmd_line_argument_ranges(const char* arg_name, - std::vector >& vals, - char delim = ',', - char sep = ':') const { - std::vector ranges; - get_cmd_line_arguments(arg_name, ranges, delim); - - for (std::vector::const_iterator range = ranges.begin(); - range != ranges.end(); ++range) { - - std::vector range_vals; - separate_string(*range, range_vals, sep); - vals.push_back(range_vals); - } - } - - /** - * The number of pairs parsed - */ - int parsed_argc() const { return (int)keys.size(); } - - //------------------------------------------------------------------------- - // Utility functions - //------------------------------------------------------------------------- - - /// Tokenizes a comma-delimited list of string pairs delimited by ':' - static void tokenize(std::vector >& tokens, - std::string const& str, - char delim = ',', - char sep = ':') { - // Home-built to avoid Boost dependency - size_t s_idx = 0; - size_t d_idx = std::string::npos; - while (s_idx < str.size()) { - d_idx = str.find_first_of(delim, s_idx); - - size_t end_idx = (d_idx != std::string::npos ? 
d_idx : str.size()); - size_t sep_idx = str.find_first_of(sep, s_idx); - size_t offset = 1; - if (sep_idx == std::string::npos || sep_idx >= end_idx) { - sep_idx = end_idx; - offset = 0; - } - - std::pair item( - str.substr(s_idx, sep_idx - s_idx), - str.substr(sep_idx + offset, end_idx - sep_idx - offset)); - - tokens.push_back(item); - s_idx = end_idx + 1; - } - } - - /// Tokenizes a comma-delimited list of string pairs delimited by ':' - static void tokenize(std::vector& tokens, - std::string const& str, - char delim = ',', - char sep = ':') { - typedef std::vector > TokenVector; - typedef TokenVector::const_iterator token_iterator; - - std::vector > token_pairs; - tokenize(token_pairs, str, delim, sep); - for (token_iterator tok = token_pairs.begin(); tok != token_pairs.end(); ++tok) { - tokens.push_back(tok->first); - } - } - - template - static void separate_string(std::string const& str, - std::vector& vals, - char sep = ',') { - std::istringstream str_stream(str); - std::string::size_type old_pos = 0; - std::string::size_type new_pos = 0; - - // Iterate -delimited values - value_t val; - while ((new_pos = str.find(sep, old_pos)) != std::string::npos) { - if (new_pos != old_pos) { - str_stream.width(new_pos - old_pos); - str_stream >> val; - vals.push_back(val); - } - - // skip over delimiter - str_stream.ignore(1); - old_pos = new_pos + 1; - } - - // Read last value - str_stream >> val; - vals.push_back(val); - } -}; - -} // namespace cutlass diff --git a/csrc/sparse/cutlass/example/util/distribution.h b/csrc/sparse/cutlass/example/util/distribution.h deleted file mode 100644 index 649a573603ff5..0000000000000 --- a/csrc/sparse/cutlass/example/util/distribution.h +++ /dev/null @@ -1,154 +0,0 @@ -/*************************************************************************************************** - * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. - * SPDX-License-Identifier: BSD-3-Clause - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * - * 1. Redistributions of source code must retain the above copyright notice, this - * list of conditions and the following disclaimer. - * - * 2. Redistributions in binary form must reproduce the above copyright notice, - * this list of conditions and the following disclaimer in the documentation - * and/or other materials provided with the distribution. - * - * 3. Neither the name of the copyright holder nor the names of its - * contributors may be used to endorse or promote products derived from - * this software without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" - * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE - * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE - * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL - * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR - * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER - * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, - * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
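A minimal usage sketch of the CommandLine helper above; the option names, defaults, and include path are illustrative rather than taken verbatim from the example.

#include <iostream>
#include "util/command_line.h"

// Parses invocations such as:  ./example --m=4096 --n=4096 --k=2048 --iterations=10
int main(int argc, char const** argv) {
  cutlass::CommandLine cmd(argc, argv);

  int m = 5120, n = 4096, k = 16384, iterations = 1000;  // illustrative defaults
  cmd.get_cmd_line_argument("m", m);
  cmd.get_cmd_line_argument("n", n);
  cmd.get_cmd_line_argument("k", k);
  cmd.get_cmd_line_argument("iterations", iterations);

  if (cmd.check_cmd_line_flag("help")) {
    std::cout << "usage: example [--m=<int>] [--n=<int>] [--k=<int>] [--iterations=<int>]\n";
    return 0;
  }

  std::cout << "m=" << m << " n=" << n << " k=" << k
            << " iterations=" << iterations << std::endl;
  return 0;
}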
- * - **************************************************************************************************/ -#pragma once - -/*! \file - \brief This header contains a class to parametrize a statistical distribution function. -*/ - -#include - -namespace cutlass { - -//////////////////////////////////////////////////////////////////////////////////////////////////// - -/// Distribution type -struct Distribution { - /// Variant types - enum Kind { Invalid, Uniform, Gaussian, Identity, Sequential, AllZeros, AllOnes }; - - /// Distribution state - union { - /// Uniform distribution - struct { - double min; - double max; - // Percent elements set to NaN - double pnan; - } uniform; - - /// Gaussian distribution - struct { - double mean; - double stddev; - double pnz; - double pnzA; - double pnzB; - double pnzC; - } gaussian; - - /// Elements are linear combination of row and column index - struct { - double start; - double delta; - } sequential; - }; - - /// Active variant kind - Kind kind; - - /// Random values are cast to integer after scaling by this power of two - int int_scale; - - // - // Methods - // - - Distribution() : kind(Invalid), int_scale(0) {} - -/// Configures distribution as uniform random - Distribution &set_uniform(double _min, double _max, int _int_scale = 0, double _pnan = 0) { - kind = Uniform; - uniform.min = _min; - uniform.max = _max; - int_scale = _int_scale; - uniform.pnan = _pnan; - return *this; - } - - /// Configures distribution as Gaussian distribution - Distribution &set_gaussian(double _mean, double _stddev, int _int_scale = 0, double _pnz = 1.0) { - kind = Gaussian; - gaussian.mean = _mean; - gaussian.stddev = _stddev; - gaussian.pnz = _pnz; - int_scale = _int_scale; - return *this; - } - - /// Sets identity - Distribution &set_identity() { - kind = Identity; - return *this; - } - - /// Sets sequential - Distribution &set_sequential(double start, double delta, int _int_scale = 0) { - kind = Sequential; - sequential.start = start; - sequential.delta = delta; - int_scale = _int_scale; - return *this; - } -}; - -} // namespace cutlass - -//////////////////////////////////////////////////////////////////////////////////////////////////// - -/// Prints a Distribution to ostream -inline std::ostream &operator<<(std::ostream &out, cutlass::Distribution const &dist) { - switch (dist.kind) { - case cutlass::Distribution::Uniform: - out << "uniform, min: " << dist.uniform.min << ", max: " << dist.uniform.max - << ", pnan: " << dist.uniform.pnan; - break; - case cutlass::Distribution::Gaussian: - out << "gaussian, mean: " << dist.gaussian.mean << ", stddev: " << dist.gaussian.stddev - << ", pnzA: " << dist.gaussian.pnzA << ", pnzB: " - << dist.gaussian.pnzB << ", pnzC: " << dist.gaussian.pnzC; - break; - case cutlass::Distribution::Identity: - out << "identity"; - break; - case cutlass::Distribution::Sequential: - out << "sequential"; - break; - default: - out << "unknown"; - } - - out << ", int_scale: " << dist.int_scale; - - return out; -} - -//////////////////////////////////////////////////////////////////////////////////////////////////// diff --git a/csrc/sparse/cutlass/example/util/gather_tensor.hpp b/csrc/sparse/cutlass/example/util/gather_tensor.hpp deleted file mode 100644 index 62616e00c7357..0000000000000 --- a/csrc/sparse/cutlass/example/util/gather_tensor.hpp +++ /dev/null @@ -1,215 +0,0 @@ -/*************************************************************************************************** - * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. 
All rights reserved. - * SPDX-License-Identifier: BSD-3-Clause - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * - * 1. Redistributions of source code must retain the above copyright notice, this - * list of conditions and the following disclaimer. - * - * 2. Redistributions in binary form must reproduce the above copyright notice, - * this list of conditions and the following disclaimer in the documentation - * and/or other materials provided with the distribution. - * - * 3. Neither the name of the copyright holder nor the names of its - * contributors may be used to endorse or promote products derived from - * this software without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" - * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE - * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE - * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL - * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR - * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER - * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, - * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - * - **************************************************************************************************/ -#pragma once - -#include "cute/layout.hpp" -#include "cute/tensor.hpp" -#include "cute/util/print.hpp" - -namespace example { - -using namespace cute; - -// Empty type used to disable gather/scatter for a GEMM argument -struct NoGather -{ - template - NoGather(Ts...) 
{}; -}; - -/// Function object that applies an index to its argument -template -struct IndexedGather -{ - CUTE_HOST_DEVICE constexpr - IndexedGather(Index const *indices = {}): indices_(indices) {} - - template - CUTE_HOST_DEVICE constexpr - Index - operator()(I i) const { return indices_[i]; } - - CUTE_HOST_DEVICE friend - void - print(IndexedGather const &s) { - cute::print("Indexed"); - } - - Index const *indices_; -}; - -/// Function object that applies a stride to its argument -/// Example: StridedFunc gathers every other row/column -template -struct StridedGather -{ - CUTE_HOST_DEVICE constexpr - StridedGather(Stride stride = {}): stride_(stride) {} - - template - CUTE_HOST_DEVICE constexpr - auto - operator()(I i) const { return i * stride_; } - - CUTE_HOST_DEVICE friend - void - print(StridedGather const &s) { - cute::print("Strided{"); - print(s.stride_); - cute::print("}"); - } - - Stride stride_; -}; - -/// Custom stride object that applies a function followed by a stride -template -struct CustomStride -{ - CUTE_HOST_DEVICE constexpr - CustomStride(Func const &func, Stride const &stride): func_(func), stride_(stride) {} - - template - CUTE_HOST_DEVICE constexpr friend - auto - operator*(I i, CustomStride const &s) { return s.func_(i) * s.stride_; } - - template - CUTE_HOST_DEVICE constexpr friend - auto - operator*(CustomStride const &s, I i) { return s.func_(i) * s.stride_; } - - CUTE_HOST_DEVICE friend - void - print(CustomStride const & s) { - cute::print("Custom{"); - print(s.func_); - cute::print(","); - print(s.stride_); - cute::print("}"); - } - - template - CUTE_HOST_DEVICE constexpr friend - auto - safe_div(CustomStride const &s, Div const &div) - { - return CustomStride(s.func_, safe_div(s.stride_, div)); - } - - // Circumvent the requirement on make_layout that shape and stride are integral - template - CUTE_HOST_DEVICE constexpr friend - auto - make_layout(Shape const &shape, CustomStride const &stride) - { - return Layout(shape, stride); - } - - Func func_; - Stride stride_; -}; - -template -CUTLASS_HOST_DEVICE -auto -make_custom_stride_layout(Stride const &stride, Func&& func) -{ - // Use a dummy shape and replace the first non-unit stride with a custom gather stride - auto idx = find_if(stride, [](auto x){ return not is_constant<1, decltype(x)>{}; }); - constexpr int I = decltype(idx)::value; - return make_layout(repeat_like(stride, _1{}), - replace(stride, CustomStride{static_cast(func), get(stride)})); -} - -/// Helper function to optionally create a gather tensor -template -CUTLASS_HOST_DEVICE -auto -make_gather_tensor(Iterator iter, Shape const &shape, Stride const &stride, Func &&func) -{ - if constexpr (not cutlass::platform::is_same, NoGather>::value) { - Layout matrix_layout = make_identity_layout(shape); - auto offset = as_arithmetic_tuple(repeat_like(shape, _0{})); - Layout gather_layout = make_custom_stride_layout(stride, static_cast(func)); - return make_tensor(iter, ComposedLayout{gather_layout, offset, matrix_layout}); - } else { - return make_tensor(iter, shape, stride); - } -} - -} // namespace example - -namespace cute -{ - -template -CUTE_HOST_DEVICE constexpr -auto -upcast(Shape const& shape, Stride const& stride) -{ - if constexpr (is_tuple::value) { - return transform_layout(shape, stride, [](auto const& s, auto const& d) { return upcast(s,d); }); - } else if constexpr (is_scaled_basis::value) { - if constexpr (Stride::mode() == I) { - return make_layout(shape_div(shape, Int{}), shape_div(stride, Int{})); - } else { - return make_layout(shape, 
stride); - } - } else { - return upcast(shape, stride); - } - - CUTE_GCC_UNREACHABLE; -} - -template -CUTE_HOST_DEVICE constexpr -auto -upcast(ComposedLayout,Offset,Layout> const& layout) -{ - // Find index of the stride-1 mode - that is the only one that requires updating inner shape and offset - auto idx = find_if(layout.layout_a().stride(), [](auto x){ return is_constant<1, decltype(x)>{}; }); - constexpr int I = decltype(idx)::value; - - // Upcast the outer layout (works as expected) - auto outer = upcast(layout.layout_a()); - - // Upcast the accumulated offset along stride-1 mode - auto offset = as_arithmetic_tuple(replace(layout.offset(), upcast(get(layout.offset())))); - - // Upcast the inner layout's shape along stride-1 mode - auto inner = upcast(layout.layout_b().shape(), layout.layout_b().stride()); - - return composition(outer, offset, inner); -} - -} // namespace example diff --git a/csrc/sparse/cutlass/example/util/helper.h b/csrc/sparse/cutlass/example/util/helper.h deleted file mode 100644 index a7a81e7479022..0000000000000 --- a/csrc/sparse/cutlass/example/util/helper.h +++ /dev/null @@ -1,108 +0,0 @@ -/*************************************************************************************************** - * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. - * SPDX-License-Identifier: BSD-3-Clause - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * - * 1. Redistributions of source code must retain the above copyright notice, this - * list of conditions and the following disclaimer. - * - * 2. Redistributions in binary form must reproduce the above copyright notice, - * this list of conditions and the following disclaimer in the documentation - * and/or other materials provided with the distribution. - * - * 3. Neither the name of the copyright holder nor the names of its - * contributors may be used to endorse or promote products derived from - * this software without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" - * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE - * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE - * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL - * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR - * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER - * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, - * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
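A small sketch of how the gather utilities above fit together; the element type, extents, and include paths are illustrative. IndexedGather remaps the first non-unit-stride mode (here the row mode of a row-major matrix) through a user-supplied index array.

#include "cute/tensor.hpp"
#include "util/gather_tensor.hpp"

// Returns a CuTe tensor view whose row mode is gathered through row_indices,
// i.e. element (i, j) of the view reads ptr[row_indices[i] * K + j].
template <class T>
auto make_row_gathered_view(T* ptr, int M, int K, int const* row_indices) {
  using namespace cute;
  auto shape  = make_shape(M, K);
  auto stride = make_stride(K, _1{});  // row-major: leading dimension K
  return example::make_gather_tensor(
      ptr, shape, stride, example::IndexedGather<int>{row_indices});
}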
- * - **************************************************************************************************/ -#pragma once - -#include "cuda_runtime.h" -#include - -/** - * Panic wrapper for unwinding CUTLASS errors - */ -#define CUTLASS_CHECK(status) \ - { \ - cutlass::Status error = status; \ - if (error != cutlass::Status::kSuccess) { \ - std::cerr << "Got cutlass error: " << cutlassGetStatusString(error) << " at: " << __LINE__ \ - << std::endl; \ - exit(EXIT_FAILURE); \ - } \ - } - - -/** - * Panic wrapper for unwinding CUDA runtime errors - */ -#define CUDA_CHECK(status) \ - { \ - cudaError_t error = status; \ - if (error != cudaSuccess) { \ - std::cerr << "Got bad cuda status: " << cudaGetErrorString(error) \ - << " at line: " << __LINE__ << std::endl; \ - exit(EXIT_FAILURE); \ - } \ - } - - -/** - * GPU timer for recording the elapsed time across kernel(s) launched in GPU stream - */ -struct GpuTimer -{ - cudaStream_t _stream_id; - cudaEvent_t _start; - cudaEvent_t _stop; - - /// Constructor - GpuTimer() : _stream_id(0) - { - CUDA_CHECK(cudaEventCreate(&_start)); - CUDA_CHECK(cudaEventCreate(&_stop)); - } - - /// Destructor - ~GpuTimer() - { - CUDA_CHECK(cudaEventDestroy(_start)); - CUDA_CHECK(cudaEventDestroy(_stop)); - } - - /// Start the timer for a given stream (defaults to the default stream) - void start(cudaStream_t stream_id = 0) - { - _stream_id = stream_id; - CUDA_CHECK(cudaEventRecord(_start, _stream_id)); - } - - /// Stop the timer - void stop() - { - CUDA_CHECK(cudaEventRecord(_stop, _stream_id)); - } - - /// Return the elapsed time (in milliseconds) - float elapsed_millis() - { - float elapsed = 0.0; - CUDA_CHECK(cudaEventSynchronize(_stop)); - CUDA_CHECK(cudaEventElapsedTime(&elapsed, _start, _stop)); - return elapsed; - } -}; diff --git a/csrc/sparse/cutlass/example/util/host_tensor.h b/csrc/sparse/cutlass/example/util/host_tensor.h deleted file mode 100644 index 3f061875b48dc..0000000000000 --- a/csrc/sparse/cutlass/example/util/host_tensor.h +++ /dev/null @@ -1,541 +0,0 @@ -/*************************************************************************************************** - * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. - * SPDX-License-Identifier: BSD-3-Clause - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * - * 1. Redistributions of source code must retain the above copyright notice, this - * list of conditions and the following disclaimer. - * - * 2. Redistributions in binary form must reproduce the above copyright notice, - * this list of conditions and the following disclaimer in the documentation - * and/or other materials provided with the distribution. - * - * 3. Neither the name of the copyright holder nor the names of its - * contributors may be used to endorse or promote products derived from - * this software without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" - * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE - * DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE - * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL - * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR - * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER - * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, - * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - * - **************************************************************************************************/ -#pragma once - -/*! \file - \brief HostTensor contributes management for both host and device memory. - - HostTensor allocates host and device memory upon construction. Basic element-wise operations on - host memory synchronize device memory automatically. Explicit copy operations provide abstractions - for CUDA memcpy operations. - - Call {host, device}_{data, ref, view}() for accessing host or device memory. - - See cutlass/tensor_ref.h and cutlass/tensor_view.h for more details. -*/ - -#include - -#include "cutlass/cutlass.h" -#include "cutlass/tensor_ref.h" -#include "cutlass/tensor_view.h" -#include "cutlass/fast_math.h" - -#include "device_memory.h" - -namespace cutlass { - -/////////////////////////////////////////////////////////////////////////////////////////////////// - -/// Host tensor -template < - /// Data type of element stored within tensor (concept: NumericType) - typename Element_, - /// Defines a mapping from logical coordinate to linear memory (concept: Layout) - typename Layout_ -> -class HostTensor { -public: - - /// Data type of individual access - using Element = Element_; - - /// Mapping function from logical coordinate to linear memory - using Layout = Layout_; - - /// Logical rank of tensor index space - static int const kRank = Layout::kRank; - - /// Index type - using Index = typename Layout::Index; - - /// Long index used for pointer offsets - using LongIndex = typename Layout::LongIndex; - - /// Coordinate in logical tensor space - using TensorCoord = typename Layout::TensorCoord; - - /// Layout's stride vector - using Stride = typename Layout::Stride; - - /// Tensor reference to device memory - using TensorRef = TensorRef; - - /// Tensor reference to constant device memory - using ConstTensorRef = typename TensorRef::ConstTensorRef; - - /// Tensor reference to device memory - using TensorView = TensorView; - - /// Tensor reference to constant device memory - using ConstTensorView = typename TensorView::ConstTensorView; - - /// Reference to element in tensor - using Reference = typename TensorRef::Reference; - - /// Constant reference to element in tensor - using ConstReference = typename ConstTensorRef::Reference; - -private: - using StorageUnit = typename platform::conditional_t, uint8_t, // Avoid the std::vector specialization - typename platform::conditional_t::value % 8 == 0, // Handle subbyte types - Element, uint8_t>>; - using StorageContainerCalculator = cutlass::detail::StorageContainerCalculator; - static constexpr int kContainerTypeNumBits = StorageContainerCalculator::kContainerTypeNumBits; - static constexpr int kContainerTypeNumLogicalElements = StorageContainerCalculator::kContainerTypeNumLogicalElements; - static constexpr int kContainerTypeNumBytes = StorageContainerCalculator::kContainerTypeNumBytes; - static constexpr int kContainerTypeNumStorageUnit = 
StorageContainerCalculator::kContainerTypeNumStorageUnit; - - // - // Data members - // - - /// Extent of tensor in logical dimensions - TensorCoord extent_; - - /// Layout object - Layout layout_; - - /// Host-side memory allocation - std::vector host_; - - /// Device-side memory - device_memory::allocation device_; - - /// number of containers - size_t count_to_container_storage_unit_count(size_t count) { - return (count + kContainerTypeNumLogicalElements - 1) / kContainerTypeNumLogicalElements * kContainerTypeNumStorageUnit; - } - -public: - // - // Device and Host Methods - // - - /// Default constructor - HostTensor() {} - - /// Constructs a tensor given an extent. Assumes a packed layout - HostTensor( - TensorCoord const &extent, - bool device_backed = true - ) { - - this->reset(extent, Layout::packed(extent), device_backed); - } - - /// Constructs a tensor given an extent and layout - HostTensor( - TensorCoord const &extent, - Layout const &layout, - bool device_backed = true - ) { - - this->reset(extent, layout, device_backed); - } - - ~HostTensor() { } - - /// Clears the HostTensor allocation to size/capacity = 0 - void reset() { - extent_ = TensorCoord(); - layout_ = Layout::packed(extent_); - - host_.clear(); - device_.reset(); - } - - /// Resizes internal memory allocations without affecting layout or extent - void reserve( - size_t count, ///< size of tensor in elements - bool device_backed_ = true) { ///< if true, device memory is also allocated -#if (CUTLASS_DEBUG_TRACE_LEVEL > 1) - CUTLASS_TRACE_HOST("cutlass::HostTensor::reserve(count=" << count << ", device_backed_=" << (device_backed_ ? "true" : "false") << ")"); -#endif - - device_.reset(); - host_.clear(); - - size_t count_container = count_to_container_storage_unit_count(count); -#if (CUTLASS_DEBUG_TRACE_LEVEL > 1) - CUTLASS_TRACE_HOST("cutlass::HostTensor::reserve: host_.resize(" << count_container << ")"); -#endif - host_.resize(count_container); - - // Allocate memory - StorageUnit* device_memory = nullptr; - if (device_backed_) { -#if (CUTLASS_DEBUG_TRACE_LEVEL > 1) - CUTLASS_TRACE_HOST("cutlass::HostTensor::reserve: device_memory::allocate(" << count_container << ")"); -#endif - device_memory = device_memory::allocate(count_container); - } - device_.reset(device_memory, device_backed_ ? count_container : 0); - } - - /// Updates the extent and layout of the HostTensor. Allocates memory according to the new - /// extent and layout. - void reset( - TensorCoord const &extent, ///< extent of logical tensor - Layout const &layout, ///< layout object of tensor - bool device_backed_ = true) { ///< if true, device memory is also allocated. - - extent_ = extent; - layout_ = layout; - - reserve(size_t(layout_.capacity(extent_)), device_backed_); - } - - /// Updates the extent and layout of the HostTensor. Allocates memory according to the new - /// extent and layout. Assumes a packed tensor configuration. - void reset( - TensorCoord const &extent, ///< extent of logical tensor - bool device_backed_ = true) { ///< if true, device memory is also allocated. - - reset(extent, Layout::packed(extent), device_backed_); - } - - /// Changes the size of the logical tensor. Only allocates memory if new capacity exceeds reserved capacity. - /// To force allocation, call reset(). - void resize( - TensorCoord const &extent, ///< extent of logical tensor - Layout const &layout, ///< layout object of tensor - bool device_backed_ = true) { ///< if true, device memory is also allocated. 
- - extent_ = extent; - layout_ = layout; - - LongIndex new_size = size_t(layout_.capacity(extent_)); - LongIndex new_size_container = count_to_container_storage_unit_count((layout_.capacity(extent_))); - - if (static_cast(new_size_container) > host_.size()) { - reserve(new_size, device_backed_); - } - } - - /// Changes the size of the logical tensor. Only allocates memory if new capacity exceeds reserved capacity. - /// To force allocation, call reset(). Note, this form of resize() assumes a packed tensor configuration. - void resize( - TensorCoord const &extent, ///< extent of logical tensor - bool device_backed_ = true) { ///< if true, device memory is also allocated. - - resize(extent, Layout::packed(extent), device_backed_); - } - - /// Returns the logical number of elements stored in the host tensor - size_t size() const { - return layout_.capacity(extent_); - } - - /// Returns the logical capacity in terms of number of elements. May be larger than the size(). - LongIndex capacity() const { - return host_.size() / kContainerTypeNumStorageUnit * kContainerTypeNumLogicalElements; - } - - /// Gets pointer to host data - Element * host_data() { return reinterpret_cast(host_.data()); } - - /// Gets pointer to host data with a pointer offset - Element * host_data_ptr_offset(LongIndex ptr_element_offset) { return &ReferenceFactory::get(host_data(), ptr_element_offset); } - - /// Gets a reference to an element in host memory - Reference host_data(LongIndex idx) { - return ReferenceFactory::get(host_data(), idx); - } - - /// Gets pointer to host data - Element const * host_data() const { return reinterpret_cast(host_.data()); } - - /// Gets pointer to host data with a pointer offset - Element const * host_data_ptr_offset(LongIndex ptr_element_offset) const { return &ReferenceFactory::get(host_data(), ptr_element_offset); } - - /// Gets a constant reference to an element in host memory - ConstReference host_data(LongIndex idx) const { - return ReferenceFactory::get(host_data(), idx); - } - - /// Gets pointer to device data - Element * device_data() { return reinterpret_cast(device_.get()); } - - /// Gets pointer to device data - Element const * device_data() const { return reinterpret_cast(device_.get()); } - - /// Gets pointer to device data with a pointer offset - Element * device_data_ptr_offset(LongIndex ptr_element_offset) { return &ReferenceFactory::get(device_data(), ptr_element_offset); } - - /// Gets pointer to device data with a pointer offset - Element const * device_data_ptr_offset(LongIndex ptr_element_offset) const { return &ReferenceFactory::get(device_data(), ptr_element_offset); } - - /// Accesses the tensor reference pointing to data - TensorRef host_ref(LongIndex ptr_element_offset=0) { return TensorRef(host_data_ptr_offset(ptr_element_offset), layout_); } - - /// Accesses the tensor reference pointing to data - ConstTensorRef host_ref(LongIndex ptr_element_offset=0) const { return ConstTensorRef(host_data_ptr_offset(ptr_element_offset), layout_); } - - /// Accesses the tensor reference pointing to data - TensorRef device_ref(LongIndex ptr_element_offset=0) { - return TensorRef(device_data_ptr_offset(ptr_element_offset), layout_); - } - - /// Accesses the tensor reference pointing to data - ConstTensorRef device_ref(LongIndex ptr_element_offset=0) const { - return TensorRef(device_data_ptr_offset(ptr_element_offset), layout_); - } - - /// Accesses the tensor reference pointing to data - TensorView host_view(LongIndex ptr_element_offset=0) { - return 
TensorView(host_data_ptr_offset(ptr_element_offset), layout_, extent_); - } - - /// Accesses the tensor reference pointing to data - ConstTensorView host_view(LongIndex ptr_element_offset=0) const { - return ConstTensorView(host_data_ptr_offset(ptr_element_offset), layout_, extent_); - } - - /// Accesses the tensor reference pointing to data - TensorView device_view(LongIndex ptr_element_offset=0) { - return TensorView(device_data_ptr_offset(ptr_element_offset), layout_, extent_); - } - - /// Accesses the tensor reference pointing to data - ConstTensorView device_view(LongIndex ptr_element_offset=0) const { - return ConstTensorView(device_data_ptr_offset(ptr_element_offset), layout_, extent_); - } - - /// Returns true if device memory is allocated - bool device_backed() const { - return (device_.get() == nullptr) ? false : true; - } - - - /// Returns the layout object - Layout & layout() { - return layout_; - } - - /// Returns the layout object - Layout layout() const { - return layout_; - } - - /// Returns the layout object's stride vector - Stride stride() const { - return layout_.stride(); - } - - /// Returns the layout object's stride vector - Stride & stride() { - return layout_.stride(); - } - - /// Returns the layout object's stride in a given physical dimension - LongIndex stride(int dim) const { - return layout_.stride().at(dim); - } - - /// Returns the layout object's stride in a given physical dimension - LongIndex & stride(int dim) { - return layout_.stride().at(dim); - } - - /// Computes the offset of an index from the origin of the tensor - LongIndex offset(TensorCoord const& coord) const { - return layout_(coord); - } - - /// Returns a reference to the element at the logical Coord in host memory - Reference at(TensorCoord const& coord) { - return host_data(offset(coord)); - } - - /// Returns a const reference to the element at the logical Coord in host memory - ConstReference at(TensorCoord const& coord) const { - return host_data(offset(coord)); - } - - /// Returns the extent of the tensor - TensorCoord extent() const { - return extent_; - } - - /// Returns the extent of the tensor - TensorCoord & extent() { - return extent_; - } - - /// Copies data from device to host - void sync_host() { - if (device_backed()) { - device_memory::copy_to_host( - host_.data(), device_.get(), device_.size()); - } - } - - /// Copies data from host to device - void sync_device() { - if (device_backed()) { - device_memory::copy_to_device( - device_.get(), host_.data(), host_.size()); - } - } - - /// Copy data from a caller-supplied device pointer into host memory. - void copy_in_device_to_host( - Element const* ptr_device, ///< source device memory - LongIndex count = -1) { ///< number of elements to transfer; if negative, entire tensor is overwritten. - - if (count < 0) { - count = capacity(); - } - else { - count = __NV_STD_MIN(capacity(), count); - } - size_t container_count = count_to_container_storage_unit_count(count); - device_memory::copy_to_host( - host_.data(), reinterpret_cast(ptr_device), container_count); - } - - /// Copy data from a caller-supplied device pointer into host memory. - void copy_in_device_to_device( - Element const* ptr_device, ///< source device memory - LongIndex count = -1) { ///< number of elements to transfer; if negative, entire tensor is overwritten. 
- - if (count < 0) { - count = capacity(); - } - else { - count = __NV_STD_MIN(capacity(), count); - } - size_t container_count = count_to_container_storage_unit_count(count); - device_memory::copy_device_to_device( - device_.get(), reinterpret_cast(ptr_device), container_count); - } - - /// Copy data from a caller-supplied device pointer into host memory. - void copy_in_host_to_device( - Element const* ptr_host, ///< source host memory - LongIndex count = -1) { ///< number of elements to transfer; if negative, entire tensor is overwritten. - - if (count < 0) { - count = capacity(); - } - else { - count = __NV_STD_MIN(capacity(), count); - } - size_t container_count = count_to_container_storage_unit_count(count); - device_memory::copy_to_device( - device_.get(), reinterpret_cast(ptr_host), container_count); - } - - /// Copy data from a caller-supplied device pointer into host memory. - void copy_in_host_to_host( - Element const* ptr_host, ///< source host memory - LongIndex count = -1) { ///< number of elements to transfer; if negative, entire tensor is overwritten. - - if (count < 0) { - count = capacity(); - } - else { - count = __NV_STD_MIN(capacity(), count); - } - size_t container_count = count_to_container_storage_unit_count(count); - device_memory::copy_host_to_host( - host_.data(), reinterpret_cast(ptr_host), container_count); - } - - /// Copy data from a caller-supplied device pointer into host memory. - void copy_out_device_to_host( - Element * ptr_host, ///< source device memory - LongIndex count = -1) const { ///< number of elements to transfer; if negative, entire tensor is overwritten. - - if (count < 0) { - count = capacity(); - } - else { - count = __NV_STD_MIN(capacity(), count); - } - size_t container_count = count_to_container_storage_unit_count(count); - device_memory::copy_to_host( - reinterpret_cast(ptr_host), device_.get(), container_count); - } - - /// Copy data from a caller-supplied device pointer into host memory. - void copy_out_device_to_device( - Element * ptr_device, ///< source device memory - LongIndex count = -1) const { ///< number of elements to transfer; if negative, entire tensor is overwritten. - - if (count < 0) { - count = capacity(); - } - else { - count = __NV_STD_MIN(capacity(), count); - } - size_t container_count = count_to_container_storage_unit_count(count); - device_memory::copy_device_to_device( - reinterpret_cast(ptr_device), device_.get(), container_count); - } - - /// Copy data from a caller-supplied device pointer into host memory. - void copy_out_host_to_device( - Element * ptr_device, ///< source host memory - LongIndex count = -1) const { ///< number of elements to transfer; if negative, entire tensor is overwritten. - - if (count < 0) { - count = capacity(); - } - else { - count = __NV_STD_MIN(capacity(), count); - } - size_t container_count = count_to_container_storage_unit_count(count); - device_memory::copy_to_device( - reinterpret_cast(ptr_device), host_.data(), container_count); - } - - /// Copy data from a caller-supplied device pointer into host memory. - void copy_out_host_to_host( - Element * ptr_host, ///< source host memory - LongIndex count = -1) const { ///< number of elements to transfer; if negative, entire tensor is overwritten. 
- - if (count < 0) { - count = capacity(); - } - else { - count = __NV_STD_MIN(capacity(), count); - } - size_t container_count = count_to_container_storage_unit_count(count); - device_memory::copy_host_to_host( - reinterpret_cast(ptr_host), host_.data(), container_count); - } -}; - -/////////////////////////////////////////////////////////////////////////////////////////////////// - -} // namespace cutlass diff --git a/csrc/sparse/cutlass/example/util/packed_stride.hpp b/csrc/sparse/cutlass/example/util/packed_stride.hpp deleted file mode 100644 index e9a243a1322cc..0000000000000 --- a/csrc/sparse/cutlass/example/util/packed_stride.hpp +++ /dev/null @@ -1,570 +0,0 @@ -/*************************************************************************************************** - * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. - * SPDX-License-Identifier: BSD-3-Clause - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * - * 1. Redistributions of source code must retain the above copyright notice, this - * list of conditions and the following disclaimer. - * - * 2. Redistributions in binary form must reproduce the above copyright notice, - * this list of conditions and the following disclaimer in the documentation - * and/or other materials provided with the distribution. - * - * 3. Neither the name of the copyright holder nor the names of its - * contributors may be used to endorse or promote products derived from - * this software without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" - * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE - * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE - * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL - * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR - * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER - * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, - * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - * - **************************************************************************************************/ -/*! \file - \brief Utilities for packing constructing canonical CuTe stride types for 3.x mainloop params. -*/ - -#pragma once - -#include "cute/layout.hpp" -#include "cute/container/array.hpp" // cute::array -#include "cutlass/conv/convolution.h" // cutlass::conv::Operator - -///////////////////////////////////////////////////////////////////////////////////////////////// - -namespace cutlass { - -///////////////////////////////////////////////////////////////////////////////////////////////// - -// Strides without batch mode - -template -CUTLASS_HOST_DEVICE -cute::Stride> -make_cute_packed_stride(cute::Stride> s, cute::Shape shape_MKL) { - static_assert(std::is_integral_v, - "Stride must have an integral type so it can be set dynamically. 
Static strides not supported."); - auto s_copy = s; - cute::get<0>(s_copy) = static_cast(cute::get<1>(shape_MKL)); - return s_copy; -} - -template -CUTLASS_HOST_DEVICE -cute::Stride, IntT> -make_cute_packed_stride(cute::Stride, IntT> s, cute::Shape shape_MKL) { - static_assert(std::is_integral_v, - "Stride must have an integral type so it can be set dynamically. Static strides not supported."); - auto s_copy = s; - cute::get<1>(s_copy) = static_cast(cute::get<0>(shape_MKL)); - return s_copy; -} - -///////////////////////////////////////////////////////////////////////////////////////////////// - -// Strides with batch mode - -template -CUTLASS_HOST_DEVICE -cute::Stride, int64_t> -make_cute_packed_stride(cute::Stride, int64_t> s, cute::Shape shape_MKL) { - static_assert(std::is_integral_v, - "Stride must have an integral type so it can be set dynamically. Static strides not supported."); - auto s_copy = s; - cute::get<0>(s_copy) = static_cast(cute::get<1>(shape_MKL)); - int batch_count = cute::get<2>(shape_MKL); - if (batch_count > 1) { - cute::get<2>(s_copy) = static_cast(cute::get<0>(shape_MKL) * cute::get<1>(shape_MKL)); - } - else { - cute::get<2>(s_copy) = static_cast(0); - } - return s_copy; -} - -template -CUTLASS_HOST_DEVICE -cute::Stride, IntT, int64_t> -make_cute_packed_stride(cute::Stride, IntT, int64_t> s, cute::Shape shape_MKL) { - static_assert(std::is_integral_v, - "Stride must have an integral type so it can be set dynamically. Static strides not supported."); - auto s_copy = s; - cute::get<1>(s_copy) = static_cast(cute::get<0>(shape_MKL)); - int batch_count = cute::get<2>(shape_MKL); - if (batch_count > 1) { - cute::get<2>(s_copy) = static_cast(cute::get<0>(shape_MKL) * cute::get<1>(shape_MKL)); - } - else { - cute::get<2>(s_copy) = static_cast(0); - } - return s_copy; -} - -///////////////////////////////////////////////////////////////////////////////////////////////// - -// Strides with group mode - -template -CUTLASS_HOST_DEVICE -cute::Stride, cute::Int<0>> -make_cute_packed_stride(cute::Stride, cute::Int<0>> s, cute::Shape shape_MKL) { - static_assert(std::is_integral_v, - "Stride must have an integral type so it can be set dynamically. Static strides not supported."); - auto s_copy = s; - cute::get<0>(s_copy) = static_cast(cute::get<1>(shape_MKL)); - return s_copy; -} - -template -CUTLASS_HOST_DEVICE -cute::Stride, StrideIntT, cute::Int<0>> -make_cute_packed_stride(cute::Stride, StrideIntT, cute::Int<0>> s, cute::Shape shape_MKL) { - static_assert(std::is_integral_v, - "Stride must have an integral type so it can be set dynamically. Static strides not supported."); - auto s_copy = s; - cute::get<1>(s_copy) = static_cast(cute::get<0>(shape_MKL)); - return s_copy; -} - -///////////////////////////////////////////////////////////////////////////////////////////////// - -// Strides for convolutions - -// Output cutlass::layout::TensorNDHWC -> rank-3 stride (InT,_1,_0) -// Note: For fprop/dgrad kernel, strides are assumed to be layout right in NZPQK/NDHWC order -// and therefore can be coalesced to just q/w. For wgrad kernel, strides are assumed to be layout -// right in KTRSC order and can be coalesced to just k. -// We enforce this condition here with asserts. -template -CUTLASS_HOST_DEVICE -cute::Stride, cute::Int<0>> -make_cute_packed_stride( - cute::Stride, cute::Int<0>> s, - cute::array shape_output, - cute::array stride_output, - cutlass::conv::Operator conv_op) { - static_assert(std::is_integral_v, - "Stride must have an integral type so it can be set dynamically. 
Static strides not supported."); - static_assert(RankT_ >= 3u); - constexpr static int RankT = static_cast(RankT_); - - assert(stride_output[RankT-1] == 1); - cute::for_each(cute::make_seq{}, [&](auto i) { - assert(stride_output[i] == shape_output[i+1] * stride_output[i+1]); - }); - - auto s_copy = s; - cute::get<0>(s_copy) = (conv_op == cutlass::conv::Operator::kWgrad) ? - stride_output[0] : - stride_output[RankT-2]; - return s_copy; -} - -// -// Activation tensor ((w, h, d, n), _1) for fprop kernel -// - -// Activation cutlass::layout::TensorNWC -> rank-2 stride ((W,N),_1) -template -CUTLASS_HOST_DEVICE -cute::Stride, cute::Int<1>> -make_cute_packed_stride( - cute::Stride, cute::Int<1>> s, - cute::array stride_nwc, - conv::Operator ConvOp) { - static_assert(std::is_integral_v, - "Stride must have an integral type so it can be set dynamically. Static strides not supported."); - assert(stride_nwc[2] == 1); - auto s_copy = s; - cute::get<0,0>(s_copy) = stride_nwc[1]; - cute::get<0,1>(s_copy) = stride_nwc[0]; - return s_copy; -} - -// Activation cutlass::layout::TensorNHWC -> rank-2 stride ((W,H,N),_1) -template -CUTLASS_HOST_DEVICE -cute::Stride, cute::Int<1>> -make_cute_packed_stride( - cute::Stride, cute::Int<1>> s, - cute::array stride_nhwc, - conv::Operator ConvOp) { - static_assert(std::is_integral_v, - "Stride must have an integral type so it can be set dynamically. Static strides not supported."); - assert(stride_nhwc[3] == 1); - auto s_copy = s; - cute::for_each(cute::make_seq<3>{}, [&](auto i) { - cute::get<0,i>(s_copy) = stride_nhwc[2-i]; - }); - return s_copy; -} - -// Activation cutlass::layout::TensorNDHWC -> rank-2 stride ((W,H,D,N),_1) -template -CUTLASS_HOST_DEVICE -cute::Stride, cute::Int<1>> -make_cute_packed_stride( - cute::Stride, cute::Int<1>> s, - cute::array stride_ndhwc, - conv::Operator ConvOp) { - static_assert(std::is_integral_v, - "Stride must have an integral type so it can be set dynamically. Static strides not supported."); - - assert(stride_ndhwc[4] == 1); - auto s_copy = s; - cute::for_each(cute::make_seq<4>{}, [&](auto i) { - cute::get<0,i>(s_copy) = stride_ndhwc[3-i]; - }); - return s_copy; -} - -// -// Filter tensor (k, (_1, s, r, t)) for fprop kernel -// - -// Filter cutlass::layout::TensorNWC -> rank-2 stride (k, (_1, s)) -template -CUTLASS_HOST_DEVICE -cute::Stride, IntT>> -make_cute_packed_stride( - cute::Stride, IntT>> s, - cute::array stride_ksc, - conv::Operator ConvOp) { - static_assert(std::is_integral_v, - "Stride must have an integral type so it can be set dynamically. Static strides not supported."); - - assert(stride_ksc[2] == 1); - auto s_copy = s; - cute::get<0,0>(s_copy) = stride_ksc[0]; - cute::get<1,1>(s_copy) = stride_ksc[1]; - return s_copy; -} - -// Filter cutlass::layout::TensorNHWC -> rank-2 stride (k, (_1, s, r)) -template -CUTLASS_HOST_DEVICE -cute::Stride, IntT, IntT>> -make_cute_packed_stride( - cute::Stride, IntT, IntT>> s, - cute::array stride_krsc, - conv::Operator ConvOp) { - static_assert(std::is_integral_v, - "Stride must have an integral type so it can be set dynamically. 
Static strides not supported."); - - assert(stride_krsc[3] == 1); - auto s_copy = s; - cute::get<0,0>(s_copy) = stride_krsc[0]; - cute::for_each(cute::make_seq<2>{}, [&](auto i) { - cute::get<1,2-i>(s_copy) = stride_krsc[i+1]; - }); - return s_copy; -} - -// Filter cutlass::layout::TensorNDHWC -> rank-2 stride (k, (_1, s, r, t)) -template -CUTLASS_HOST_DEVICE -cute::Stride, IntT, IntT, IntT>> -make_cute_packed_stride( - cute::Stride, IntT, IntT, IntT>> s, - cute::array stride_ktrsc, - conv::Operator ConvOp) { - static_assert(std::is_integral_v, - "Stride must have an integral type so it can be set dynamically. Static strides not supported."); - - assert(stride_ktrsc[4] == 1); - auto s_copy = s; - cute::get<0,0>(s_copy) = stride_ktrsc[0]; - cute::for_each(cute::make_seq<3>{}, [&](auto i) { - cute::get<1,3-i>(s_copy) = stride_ktrsc[i+1]; - }); - return s_copy; -} - -// -// Activation tensor (_1, (w, h, d, n)) for wgrad kernel -// -// It is also Filter tensor ((_1), (k, s, r, t)) for dgrad kernel -// - -// Activation cutlass::layout::TensorNWC -> rank-2 stride (_1, (W,N)) in wgrad -// Filter cutlass::layout::TensorNWC -> rank-2 stride ((_1), (k, s)) in dgrad -template -CUTLASS_HOST_DEVICE -cute::Stride, cute::Stride> -make_cute_packed_stride( - cute::Stride, cute::Stride> s, - cute::array stride_nwc, - conv::Operator ConvOp) { - static_assert(std::is_integral_v, - "Stride must have an integral type so it can be set dynamically. Static strides not supported."); - - assert(stride_nwc[2] == 1); - auto s_copy = s; - if (ConvOp == cutlass::conv::Operator::kWgrad) { - cute::get<1,0>(s_copy) = stride_nwc[1]; - cute::get<1,1>(s_copy) = stride_nwc[0]; - } - else if (ConvOp == cutlass::conv::Operator::kDgrad) { - // stride_nwc in dgrad is ksc. - cute::get<1,0>(s_copy) = stride_nwc[0]; - cute::get<1,1>(s_copy) = stride_nwc[1]; - } - return s_copy; -} - -// Activation cutlass::layout::TensorNHWC -> rank-2 stride (_1, (W,H,N)) in wgrad -// Filter cutlass::layout::TensorNHWC -> rank-2 stride ((_1), (k, s, r)) in dgrad -template -CUTLASS_HOST_DEVICE -cute::Stride, cute::Stride> -make_cute_packed_stride( - cute::Stride, cute::Stride> s, - cute::array stride_nhwc, - conv::Operator ConvOp) { - static_assert(std::is_integral_v, - "Stride must have an integral type so it can be set dynamically. Static strides not supported."); - - assert(stride_nhwc[3] == 1); - auto s_copy = s; - if (ConvOp == cutlass::conv::Operator::kWgrad) { - cute::for_each(cute::make_seq<3>{}, [&](auto i) { - cute::get<1,i>(s_copy) = stride_nhwc[2-i]; - }); - } - else if (ConvOp == cutlass::conv::Operator::kDgrad) { - // stride_nhwc in dgrad is krsc. - cute::get<1,0>(s_copy) = stride_nhwc[0]; - cute::for_each(cute::make_seq<2>{}, [&](auto i) { - cute::get<1,2-i>(s_copy) = stride_nhwc[i+1]; - }); - } - return s_copy; -} - -// Activation cutlass::layout::TensorNDHWC -> rank-2 stride (_1, (W,H,D,N)) in wgrad -// Filter cutlass::layout::TensorNDHWC -> rank-2 stride ((_1), (k, s, r, t)) in dgrad -template -CUTLASS_HOST_DEVICE -cute::Stride, cute::Stride> -make_cute_packed_stride( - cute::Stride, cute::Stride> s, - cute::array stride_ndhwc, - conv::Operator ConvOp) { - static_assert(std::is_integral_v, - "Stride must have an integral type so it can be set dynamically. 
Static strides not supported."); - - assert(stride_ndhwc[4] == 1); - auto s_copy = s; - if (ConvOp == cutlass::conv::Operator::kWgrad) { - cute::for_each(cute::make_seq<4>{}, [&](auto i) { - cute::get<1,i>(s_copy) = stride_ndhwc[3-i]; - }); - } - else if (ConvOp == cutlass::conv::Operator::kDgrad) { - // stride_ndhwc in dgrad is ktrsc. - cute::get<1,0>(s_copy) = stride_ndhwc[0]; - cute::for_each(cute::make_seq<3>{}, [&](auto i) { - cute::get<1,3-i>(s_copy) = stride_ndhwc[i+1]; - }); - } - return s_copy; -} - -// -// NZPQ tensor (_1, nzpq) for wgrad kernel -// - -// cutlass::layout::TensorNWC -> rank-2 stride (_1, nzpq) -template -CUTLASS_HOST_DEVICE -cute::Stride, IntT> -make_cute_packed_stride( - cute::Stride, IntT> s, - cute::array stride_nqk, - conv::Operator ConvOp) { - static_assert(std::is_integral_v, - "Stride must have an integral type so it can be set dynamically. Static strides not supported."); - - assert(stride_nqk[2] == 1); - auto s_copy = s; - cute::get<1>(s_copy) = stride_nqk[1]; - return s_copy; -} - -// cutlass::layout::TensorNHWC -> rank-2 stride (_1, nzpq) -template -CUTLASS_HOST_DEVICE -cute::Stride, IntT> -make_cute_packed_stride( - cute::Stride, IntT> s, - cute::array stride_npqk, - conv::Operator ConvOp) { - static_assert(std::is_integral_v, - "Stride must have an integral type so it can be set dynamically. Static strides not supported."); - - assert(stride_npqk[3] == 1); - auto s_copy = s; - cute::get<1>(s_copy) = stride_npqk[2]; - return s_copy; -} - -// cutlass::layout::TensorNDHWC -> rank-2 stride (_1, nzpq) -template -CUTLASS_HOST_DEVICE -cute::Stride, IntT> -make_cute_packed_stride( - cute::Stride, IntT> s, - cute::array stride_nzpqk, - conv::Operator ConvOp) { - static_assert(std::is_integral_v, - "Stride must have an integral type so it can be set dynamically. Static strides not supported."); - - assert(stride_nzpqk[4] == 1); - auto s_copy = s; - cute::get<1>(s_copy) = stride_nzpqk[3]; - return s_copy; -} - - - -// -// Wgrad output tensor (k, (_1, s, r, t), _0) -// - -// Filter cutlass::layout::TensorKCS -> rank-3 stride (k, (_1, s), _0) -template -CUTLASS_HOST_DEVICE -cute::Stride, IntT>, cute::Int<0>> -make_cute_packed_stride( - cute::Stride, IntT>, cute::Int<0>> s, - [[maybe_unused]] cute::array shape_output, - cute::array stride_ksc, - conv::Operator ConvOp) { - static_assert(std::is_integral_v, - "Stride must have an integral type so it can be set dynamically. Static strides not supported."); - - assert(stride_ksc[2] == 1); - auto s_copy = s; - cute::get<0,0>(s_copy) = stride_ksc[0]; - cute::get<1,1>(s_copy) = stride_ksc[1]; - return s_copy; -} - -// Filter cutlass::layout::TensorKCSR -> rank-3 stride (k, (_1, s, r), _0) -template -CUTLASS_HOST_DEVICE -cute::Stride, IntT, IntT>, cute::Int<0>> -make_cute_packed_stride( - cute::Stride, IntT, IntT>, cute::Int<0>> s, - [[maybe_unused]] cute::array shape_output, - cute::array stride_krsc, - conv::Operator ConvOp) { - static_assert(std::is_integral_v, - "Stride must have an integral type so it can be set dynamically. 
Static strides not supported."); - - assert(stride_krsc[3] == 1); - auto s_copy = s; - cute::get<0,0>(s_copy) = stride_krsc[0]; - cute::for_each(cute::make_seq<2>{}, [&](auto i) { - cute::get<1,2-i>(s_copy) = stride_krsc[i+1]; - }); - return s_copy; -} - -// Filter cutlass::layout::TensorKCSRT -> rank-3 stride (k, (_1, s, r, t), _0) -template -CUTLASS_HOST_DEVICE -cute::Stride, IntT, IntT, IntT>, cute::Int<0>> -make_cute_packed_stride( - cute::Stride, IntT, IntT, IntT>, cute::Int<0>> s, - [[maybe_unused]] cute::array shape_output, - cute::array stride_ktrsc, - conv::Operator ConvOp) { - static_assert(std::is_integral_v, - "Stride must have an integral type so it can be set dynamically. Static strides not supported."); - - assert(stride_ktrsc[4] == 1); - auto s_copy = s; - cute::get<0,0>(s_copy) = stride_ktrsc[0]; - cute::for_each(cute::make_seq<3>{}, [&](auto i) { - cute::get<1,3-i>(s_copy) = stride_ktrsc[i+1]; - }); - return s_copy; -} - - -// -// Wgrad output tensor ((_1, s, r, t), k, _0) -// - -// Filter cutlass::layout::TensorCSK -> rank-3 stride ((_1, s), k, _0) -template -CUTLASS_HOST_DEVICE -cute::Stride, IntT>, IntT, cute::Int<0>> -make_cute_packed_stride( - cute::Stride, IntT>, IntT, cute::Int<0>> s, - [[maybe_unused]] cute::array shape_output, - cute::array stride_ksc, - conv::Operator ConvOp) { - static_assert(std::is_integral_v, - "Stride must have an integral type so it can be set dynamically. Static strides not supported."); - - assert(stride_ksc[2] == 1); - auto s_copy = s; - cute::get<1,0>(s_copy) = stride_ksc[0]; - cute::get<0,1>(s_copy) = stride_ksc[1]; - return s_copy; -} - -// Filter cutlass::layout::TensorCSRK -> rank-3 stride ((_1, s, r), k, _0) -template -CUTLASS_HOST_DEVICE -cute::Stride, IntT, IntT>, IntT, cute::Int<0>> -make_cute_packed_stride( - cute::Stride, IntT, IntT>, IntT, cute::Int<0>> s, - [[maybe_unused]] cute::array shape_output, - cute::array stride_krsc, - conv::Operator ConvOp) { - static_assert(std::is_integral_v, - "Stride must have an integral type so it can be set dynamically. Static strides not supported."); - - assert(stride_krsc[3] == 1); - auto s_copy = s; - cute::get<1,0>(s_copy) = stride_krsc[0]; - cute::for_each(cute::make_seq<2>{}, [&](auto i) { - cute::get<0,2-i>(s_copy) = stride_krsc[i+1]; - }); - return s_copy; -} - -// Filter cutlass::layout::TensorCSRTK -> rank-3 stride ((_1, s, r, t), k, _0) -template -CUTLASS_HOST_DEVICE -cute::Stride, IntT, IntT, IntT>, IntT, cute::Int<0>> -make_cute_packed_stride( - cute::Stride, IntT, IntT, IntT>, IntT, cute::Int<0>> s, - [[maybe_unused]] cute::array shape_output, - cute::array stride_ktrsc, - conv::Operator ConvOp) { - static_assert(std::is_integral_v, - "Stride must have an integral type so it can be set dynamically. 
Static strides not supported."); - - assert(stride_ktrsc[4] == 1); - auto s_copy = s; - cute::get<1,0>(s_copy) = stride_ktrsc[0]; - cute::for_each(cute::make_seq<3>{}, [&](auto i) { - cute::get<0,3-i>(s_copy) = stride_ktrsc[i+1]; - }); - return s_copy; -} -///////////////////////////////////////////////////////////////////////////////////////////////// - -} // namespace cutlass diff --git a/csrc/sparse/cutlass/example/util/reference/detail/inner_product.h b/csrc/sparse/cutlass/example/util/reference/detail/inner_product.h deleted file mode 100644 index 2bce60b1390c0..0000000000000 --- a/csrc/sparse/cutlass/example/util/reference/detail/inner_product.h +++ /dev/null @@ -1,135 +0,0 @@ -/*************************************************************************************************** - * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. - * SPDX-License-Identifier: BSD-3-Clause - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * - * 1. Redistributions of source code must retain the above copyright notice, this - * list of conditions and the following disclaimer. - * - * 2. Redistributions in binary form must reproduce the above copyright notice, - * this list of conditions and the following disclaimer in the documentation - * and/or other materials provided with the distribution. - * - * 3. Neither the name of the copyright holder nor the names of its - * contributors may be used to endorse or promote products derived from - * this software without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" - * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE - * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE - * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL - * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR - * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER - * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, - * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - * - **************************************************************************************************/ -/*! \file - \brief Reference implementation for GEMM in host-side code. -*/ -#pragma once - -#include "cutlass/cutlass.h" -#include "cutlass/array.h" - -namespace cutlass { -namespace reference { -namespace detail { - -//////////////////////////////////////////////////////////////////////////////////////////////////// - -/// Template function to compute an inner product. 
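// A standalone sketch (hypothetical names; GCC/Clang __builtin_popcount is an
// assumption, not part of the original header) of what the templates below
// compute. The generic form returns c + a * b after converting both operands
// to the accumulator type; the Array<bin1_t, 32> specialization sums the XOR
// of corresponding bits, which for a packed 32-bit word is equivalent to a
// popcount of the mismatching bit positions.
#include <cstdint>

template <typename Atype, typename Btype, typename Ctype>
Ctype inner_product_sketch(Atype a, Btype b, Ctype c) {
  return Ctype(a) * Ctype(b) + c;            // multiply-add in accumulator type
}

inline int binary_inner_product_sketch(uint32_t a, uint32_t b, int c) {
  return __builtin_popcount(a ^ b) + c;      // count of differing bits, plus c
}

// Usage: inner_product_sketch(2, 3, 1) == 7,
//        binary_inner_product_sketch(0b1010u, 0b0110u, 0) == 2.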
-#pragma hd_warning_disable // Suppresses warnings when attempting to instantiate with a - // host-only type -template -CUTLASS_HOST_DEVICE -Ctype inner_product(Atype a, Btype b, Ctype c) { - return Ctype(a) * Ctype(b) + c; -} - -/// Specialization for matrix multiplication with binary operands -template <> -CUTLASS_HOST_DEVICE -int inner_product, Array, int>( - Array a, - Array b, - int c) { - - int accum = 0; - for (int bit = 0; bit < 32; bit++) { - accum += a[bit] ^ b[bit]; - } - return accum + c; -} - -/* -/// Specialization for matrix multiplication with signed 4-bit integer operands -template <> -CUTLASS_HOST_DEVICE -int inner_product, Array, int>( - Array a, - Array b, - int c) { - - int accum = 0; - for (int k = 0; k < 8; k++) { - accum += a[k] * b[k]; - } - return accum + c; -} - -/// Specialization for matrix multiplication with unsigned 4-bit integer operands -template <> -CUTLASS_HOST_DEVICE -int inner_product, Array, int>( - Array a, - Array b, - int c) { - - int accum = 0; - for (int k = 0; k < 8; k++) { - accum += a[k] * b[k]; - } - return accum + c; -} -*/ - -//////////////////////////////////////////////////////////////////////////////////////////////////// - -template -struct Cast { - // Default behavior: convert to the destination type -#pragma hd_warning_disable // Suppresses warnings when attempting to instantiate complex with a - // host-only type - CUTLASS_HOST_DEVICE - static DstType apply(SrcType src) { return static_cast(src); }; -}; - -template <> -struct Cast { - CUTLASS_HOST_DEVICE - static int8_t apply(float src) { - // Clamp to the range of signed 8-bit integers. - return static_cast(fmaxf(-128.f, fminf(127.f, src))); - }; -}; - -template <> -struct Cast { - CUTLASS_HOST_DEVICE - static uint8_t apply(float src) { - // Clamp to the range of signed 8-bit integers. - return static_cast(fmaxf(0.f, fminf(255.f, src))); - }; -}; - -//////////////////////////////////////////////////////////////////////////////////////////////////// - -} // namespace detail -} // namespace reference -} // namespace cutlass - diff --git a/csrc/sparse/cutlass/example/util/reference/detail/linear_to_coordinate.h b/csrc/sparse/cutlass/example/util/reference/detail/linear_to_coordinate.h deleted file mode 100644 index 1f784c46f6eb9..0000000000000 --- a/csrc/sparse/cutlass/example/util/reference/detail/linear_to_coordinate.h +++ /dev/null @@ -1,94 +0,0 @@ -/*************************************************************************************************** - * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. - * SPDX-License-Identifier: BSD-3-Clause - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * - * 1. Redistributions of source code must retain the above copyright notice, this - * list of conditions and the following disclaimer. - * - * 2. Redistributions in binary form must reproduce the above copyright notice, - * this list of conditions and the following disclaimer in the documentation - * and/or other materials provided with the distribution. - * - * 3. Neither the name of the copyright holder nor the names of its - * contributors may be used to endorse or promote products derived from - * this software without specific prior written permission. 
- * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" - * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE - * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE - * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL - * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR - * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER - * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, - * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - * - **************************************************************************************************/ -/*! \file - \brief Reference implementation for GEMM in host-side code. -*/ -#pragma once - -#include "cutlass/cutlass.h" -#include "cutlass/coord.h" - -///////////////////////////////////////////////////////////////////////////////////////////////// - -namespace cutlass { -namespace reference { -namespace detail { - -///////////////////////////////////////////////////////////////////////////////////////////////// - -template -struct LinearToCoordinateHelper { - - CUTLASS_HOST_DEVICE - void operator()(Coord &coord, int64_t idx, Coord const &extent) const { - - int64_t prod = 1; - - CUTLASS_PRAGMA_UNROLL - for (int i = Rank - Index; i < Rank; ++i) { - prod *= int64_t(extent[i]); - } - - coord[Rank - Index - 1] = int(idx / prod); - - int64_t residual = idx % prod; - LinearToCoordinateHelper()(coord, residual, extent); - } -}; - -template -struct LinearToCoordinateHelper { - - CUTLASS_HOST_DEVICE - void operator()(Coord &coord, int64_t idx, Coord const &) const { - coord[Rank - 1] = int(idx); - } -}; - -///////////////////////////////////////////////////////////////////////////////////////////////// - -template -struct LinearToCoordinate { - - CUTLASS_HOST_DEVICE - void operator()(Coord &coord, int64_t idx, Coord const &extent) const { - LinearToCoordinateHelper()(coord, idx, extent); - } -}; - -///////////////////////////////////////////////////////////////////////////////////////////////// - -} // namespace detail -} // namespace reference -} // namespace cutlass - -///////////////////////////////////////////////////////////////////////////////////////////////// - diff --git a/csrc/sparse/cutlass/example/util/reference/device/convolution.h b/csrc/sparse/cutlass/example/util/reference/device/convolution.h deleted file mode 100644 index c91cd0e229bdd..0000000000000 --- a/csrc/sparse/cutlass/example/util/reference/device/convolution.h +++ /dev/null @@ -1,1549 +0,0 @@ -/*************************************************************************************************** - * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. - * SPDX-License-Identifier: BSD-3-Clause - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * - * 1. Redistributions of source code must retain the above copyright notice, this - * list of conditions and the following disclaimer. - * - * 2. Redistributions in binary form must reproduce the above copyright notice, - * this list of conditions and the following disclaimer in the documentation - * and/or other materials provided with the distribution. 
- * - * 3. Neither the name of the copyright holder nor the names of its - * contributors may be used to endorse or promote products derived from - * this software without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" - * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE - * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE - * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL - * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR - * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER - * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, - * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - * - **************************************************************************************************/ - -/*! \file - \brief Reference implementation for convolution in device-side code. -*/ - -#pragma once - -#include "cutlass/coord.h" -#include "cutlass/functional.h" -#include "cutlass/layout/tensor.h" -#include "cutlass/matrix_shape.h" -#include "cutlass/numeric_conversion.h" -#include "cutlass/numeric_types.h" -#include "cutlass/tensor_ref.h" -#include "cutlass/conv/convolution.h" -#include "cutlass/conv/conv2d_problem_size.h" -#include "cutlass/conv/conv3d_problem_size.h" - -namespace cutlass { -namespace reference { -namespace device { - -///////////////////////////////////////////////////////////////////////////////////////////////// - -namespace kernel { - -//////////////////////////////////////////////////////////////////////////////////////////////////// -/// Conv2d device reference kernel -//////////////////////////////////////////////////////////////////////////////////////////////////// - -// Conv2d Fprop kernel - y = fprop(x, w) -template < - typename ElementA, - typename LayoutA, - typename ElementB, - typename LayoutB, - typename ElementC, - typename LayoutC, - typename ElementCompute, - typename ElementAccumulator = ElementCompute, - typename ConvertOp = NumericConverter, - typename InnerProductOp = multiply_add, - int kThreadM = 2, // shape of a thread's tile in the GEMM M dimension - int kThreadN = 4, // shape of a thread's tile in the GEMM N dimension - int kCtaShapeM = 16, // shape of a threadblock in units of threads - int kCtaShapeN = 8 // shape of a threadblock in units of threads -> -__global__ void Conv2dFprop( - conv::Conv2dProblemSize problem_size, - TensorRef tensor_x, - TensorRef tensor_w, - TensorRef tensor_y_in, - TensorRef tensor_y_out, - ElementCompute alpha, - ElementCompute beta - ) { - - ConvertOp convert_op; - InnerProductOp inner_product_op; - - ElementAccumulator element_A[kThreadM]; - ElementAccumulator element_B[kThreadN]; - ElementAccumulator accum[kThreadM][kThreadN]; - - int64_t npq_start = int64_t(blockIdx.x) * kCtaShapeM * kThreadM + threadIdx.x * kThreadM; - int k_start = blockIdx.y * kCtaShapeN * kThreadN + threadIdx.y * kThreadN; - - int thread_n[kThreadM]; - int thread_p[kThreadM]; - int thread_q[kThreadM]; - - // Compute N, P, Q coordinates for each row of a thread's tile - int64_t PQ = int64_t(problem_size.P) * problem_size.Q; - - CUTLASS_PRAGMA_UNROLL - for (int m = 0; m < kThreadM; ++m) { - - int64_t npq = npq_start + m; - - thread_n[m] = 
int(npq / PQ); - - int64_t residual = npq % PQ; - thread_p[m] = int(residual / problem_size.Q); - thread_q[m] = int(residual % problem_size.Q); - } - - // Clear accumulators - CUTLASS_PRAGMA_UNROLL - for (int m = 0; m < kThreadM; ++m) { - CUTLASS_PRAGMA_UNROLL - for (int n = 0; n < kThreadN; ++n) { - accum[m][n] = ElementAccumulator(); - } - } - - int c_per_group = problem_size.C / problem_size.groups; - int k_per_group = problem_size.K / problem_size.groups; - - // Compute convolution - for (int R = 0; R < problem_size.R; ++R) { - for (int S = 0; S < problem_size.S; ++S) { - for (int C = 0; C < problem_size.C; ++C) { - - // Get group id of currnet channel - int c_group_idx = C / c_per_group; - - // Load from activations tensor - int filter_r = R; - int filter_s = S; - - if (problem_size.mode == cutlass::conv::Mode::kConvolution) { - filter_r = problem_size.R - 1 - R; - filter_s = problem_size.S - 1 - S; - } - - CUTLASS_PRAGMA_UNROLL - for (int m = 0; m < kThreadM; ++m) { - int h = thread_p[m] * problem_size.stride_h - problem_size.pad_h + filter_r * problem_size.dilation_h; - int w = thread_q[m] * problem_size.stride_w - problem_size.pad_w + filter_s * problem_size.dilation_w; - - if (thread_n[m] < problem_size.N && h >= 0 && h < problem_size.H && w >= 0 && w < problem_size.W) { - element_A[m] = ElementAccumulator(tensor_x.at({thread_n[m], h, w, C})); - } - else { - element_A[m] = ElementAccumulator(); - } - } - - // Load from filters tensor - CUTLASS_PRAGMA_UNROLL - for (int n = 0; n < kThreadN; ++n) { - int thread_k = k_start + n; - int k_group_idx = thread_k / k_per_group; - - if (thread_k < problem_size.K && k_group_idx == c_group_idx) { - element_B[n] = ElementAccumulator(tensor_w.at({thread_k, R, S, C % c_per_group})); - } - else { - element_B[n] = ElementAccumulator(); - } - } - - // Accumulate matrix product - CUTLASS_PRAGMA_UNROLL - for (int m = 0; m < kThreadM; ++m) { - CUTLASS_PRAGMA_UNROLL - for (int n = 0; n < kThreadN; ++n) { - accum[m][n] = inner_product_op(element_A[m], element_B[n], accum[m][n]); - } - } - } - } - } - - // Write out the results - CUTLASS_PRAGMA_UNROLL - for (int m = 0; m < kThreadM; ++m) { - if (thread_n[m] < problem_size.N && thread_p[m] < problem_size.P && thread_q[m] < problem_size.Q) { - CUTLASS_PRAGMA_UNROLL - for (int n = 0; n < kThreadN; ++n) { - int thread_k = k_start + n; - if (thread_k < problem_size.K) { - - ElementCompute c_ref = ElementCompute(); - if (beta != ElementCompute()) { - c_ref = ElementCompute(tensor_y_in.at({thread_n[m], thread_p[m], thread_q[m], thread_k})); - } - - tensor_y_out.at({thread_n[m], thread_p[m], thread_q[m], thread_k}) = convert_op( - alpha * ElementCompute(accum[m][n]) + beta * c_ref); - } - } - } - } -} - -// Conv3d Fprop kernel - y = fprop(x, w) -template < - typename ElementA, - typename LayoutA, - typename ElementB, - typename LayoutB, - typename ElementC, - typename LayoutC, - typename ElementCompute, - typename ElementAccumulator = ElementCompute, - typename ConvertOp = NumericConverter, - typename InnerProductOp = multiply_add, - int kThreadM = 2, // shape of a thread's tile in the GEMM M dimension - int kThreadN = 4, // shape of a thread's tile in the GEMM N dimension - int kCtaShapeM = 16, // shape of a threadblock in units of threads - int kCtaShapeN = 8 // shape of a threadblock in units of threads -> -__global__ void Conv3dFprop( - conv::Conv3dProblemSize problem_size, - TensorRef tensor_x, - TensorRef tensor_w, - TensorRef tensor_y_in, - TensorRef tensor_y_out, - ElementCompute alpha, - ElementCompute 
beta - ) { - - ConvertOp convert_op; - InnerProductOp inner_product_op; - - ElementAccumulator element_A[kThreadM]; - ElementAccumulator element_B[kThreadN]; - ElementAccumulator accum[kThreadM][kThreadN]; - - int64_t nzpq_start = int64_t(blockIdx.x) * kCtaShapeM * kThreadM + threadIdx.x * kThreadM; - int k_start = blockIdx.y * kCtaShapeN * kThreadN + threadIdx.y * kThreadN; - - int thread_n[kThreadM]; - int thread_z[kThreadM]; - int thread_p[kThreadM]; - int thread_q[kThreadM]; - - // Compute N, Z, P, Q coordinates for each row of a thread's tile - int64_t PQ = int64_t(problem_size.P) * problem_size.Q; - int64_t ZPQ = PQ * problem_size.Z; - - CUTLASS_PRAGMA_UNROLL - for (int m = 0; m < kThreadM; ++m) { - - int64_t nzpq = nzpq_start + m; - - thread_n[m] = int(nzpq / ZPQ); - - int64_t residual = nzpq % ZPQ; - thread_z[m] = int(residual / PQ); - - residual = residual % PQ; - thread_p[m] = int(residual / problem_size.Q); - thread_q[m] = int(residual % problem_size.Q); - } - - // Clear accumulators - CUTLASS_PRAGMA_UNROLL - for (int m = 0; m < kThreadM; ++m) { - CUTLASS_PRAGMA_UNROLL - for (int n = 0; n < kThreadN; ++n) { - accum[m][n] = ElementAccumulator(); - } - } - - // Compute convolution - for (int T = 0; T < problem_size.T; ++T) { - for (int R = 0; R < problem_size.R; ++R) { - for (int S = 0; S < problem_size.S; ++S) { - for (int C = 0; C < problem_size.C; ++C) { - - // Load from activations tensor - int filter_t = T; - int filter_r = R; - int filter_s = S; - - if (problem_size.mode == cutlass::conv::Mode::kConvolution) { - filter_t = problem_size.T - 1 - T; - filter_r = problem_size.R - 1 - R; - filter_s = problem_size.S - 1 - S; - } - - CUTLASS_PRAGMA_UNROLL - for (int m = 0; m < kThreadM; ++m) { - int d = thread_z[m] * problem_size.stride_d - problem_size.pad_d + filter_t * problem_size.dilation_d; - int h = thread_p[m] * problem_size.stride_h - problem_size.pad_h + filter_r * problem_size.dilation_h; - int w = thread_q[m] * problem_size.stride_w - problem_size.pad_w + filter_s * problem_size.dilation_w; - - if (thread_n[m] < problem_size.N && - d >= 0 && d < problem_size.D && - h >= 0 && h < problem_size.H && - w >= 0 && w < problem_size.W) { - - element_A[m] = ElementAccumulator(tensor_x.at({thread_n[m], d, h, w, C})); - } - else { - element_A[m] = ElementAccumulator(); - } - } - - // Load from filters tensor - CUTLASS_PRAGMA_UNROLL - for (int n = 0; n < kThreadN; ++n) { - int thread_k = k_start + n; - - if (thread_k < problem_size.K) { - element_B[n] = ElementAccumulator(tensor_w.at({thread_k, T, R, S, C})); - } - else { - element_B[n] = ElementAccumulator(); - } - } - - // Accumulate matrix product - CUTLASS_PRAGMA_UNROLL - for (int m = 0; m < kThreadM; ++m) { - CUTLASS_PRAGMA_UNROLL - for (int n = 0; n < kThreadN; ++n) { - accum[m][n] = inner_product_op(element_A[m], element_B[n], accum[m][n]); - } - } - - } // for (C) - } // for (S) - } // for (R) - } // for (T) - - // Write out the results - CUTLASS_PRAGMA_UNROLL - for (int m = 0; m < kThreadM; ++m) { - - if (thread_n[m] < problem_size.N && - thread_z[m] < problem_size.Z && - thread_p[m] < problem_size.P && - thread_q[m] < problem_size.Q) { - - CUTLASS_PRAGMA_UNROLL - for (int n = 0; n < kThreadN; ++n) { - int thread_k = k_start + n; - if (thread_k < problem_size.K) { - - ElementCompute c_ref = ElementCompute(); - if (beta != ElementCompute()) { - c_ref = ElementCompute(tensor_y_in.at({thread_n[m], thread_z[m], thread_p[m], thread_q[m], thread_k})); - } - - tensor_y_out.at({thread_n[m], thread_z[m], thread_p[m], thread_q[m], 
thread_k}) = convert_op( - alpha * ElementCompute(accum[m][n]) + beta * c_ref); - } - } // for (n) - - } - } // for (m) -} - -/////////////////////////////////////////////////////////////////////////////////////////////////// - -// Conv2d dgrad kernel - dx = dgrad(dy, w) -template < - typename ElementA, - typename LayoutA, - typename ElementB, - typename LayoutB, - typename ElementC, - typename LayoutC, - typename ElementCompute, - typename ElementAccumulator = ElementCompute, - typename ConvertOp = NumericConverter, - typename InnerProductOp = multiply_add, - int kThreadM = 2, // shape of a thread's tile in the GEMM M dimension - int kThreadN = 4, // shape of a thread's tile in the GEMM N dimension - int kCtaShapeM = 16, // shape of a threadblock in units of threads - int kCtaShapeN = 8 // shape of a threadblock in units of threads -> -__global__ void Conv2dDgrad( - conv::Conv2dProblemSize problem_size, - TensorRef tensor_dy, - TensorRef tensor_w, - TensorRef tensor_dx_in, - TensorRef tensor_dx_out, - ElementCompute alpha, - ElementCompute beta - ) { - - ConvertOp convert_op; - InnerProductOp inner_product_op; - - ElementAccumulator element_A[kThreadM]; - ElementAccumulator element_B[kThreadN]; - ElementAccumulator accum[kThreadM][kThreadN]; - - int64_t nhw_start = int64_t(blockIdx.x) * kCtaShapeM * kThreadM + threadIdx.x * kThreadM; - int c_start = blockIdx.y * kCtaShapeN * kThreadN + threadIdx.y * kThreadN; - - int thread_n[kThreadM]; - int thread_h[kThreadM]; - int thread_w[kThreadM]; - - // Compute N, H, W coordinates for each row of a thread's tile - int64_t HW = int64_t(problem_size.H) * problem_size.W; - - CUTLASS_PRAGMA_UNROLL - for (int m = 0; m < kThreadM; ++m) { - - int64_t nhw = nhw_start + m; - - thread_n[m] = int(nhw / HW); - - int64_t residual = nhw % HW; - thread_h[m] = int(residual / problem_size.W); - thread_w[m] = int(residual % problem_size.W); - } - - // Clear accumulators - CUTLASS_PRAGMA_UNROLL - for (int m = 0; m < kThreadM; ++m) { - CUTLASS_PRAGMA_UNROLL - for (int n = 0; n < kThreadN; ++n) { - accum[m][n] = ElementAccumulator(); - } - } - - // Compute convolution - for (int R = 0; R < problem_size.R; ++R) { - for (int S = 0; S < problem_size.S; ++S) { - for (int K = 0; K < problem_size.K; ++K) { - - // Load from activations tensor - int filter_r = R; - int filter_s = S; - - if (problem_size.mode == cutlass::conv::Mode::kConvolution) { - filter_r = problem_size.R - 1 - R; - filter_s = problem_size.S - 1 - S; - } - - CUTLASS_PRAGMA_UNROLL - for (int m = 0; m < kThreadM; ++m) { - - int p = thread_h[m] + problem_size.pad_h - filter_r * problem_size.dilation_h; - int q = thread_w[m] + problem_size.pad_w - filter_s * problem_size.dilation_w; - - element_A[m] = ElementAccumulator(); - - if (p >= 0 && !(p % problem_size.stride_h) && q >= 0 && !(q % problem_size.stride_w)) { - - p = p / problem_size.stride_h; - q = q / problem_size.stride_w; - - if (thread_n[m] < problem_size.N && p < problem_size.P && q < problem_size.Q) { - element_A[m] = ElementAccumulator(tensor_dy.at({thread_n[m], p, q, K})); - } - } - } - - // Load from filters tensor - CUTLASS_PRAGMA_UNROLL - for (int n = 0; n < kThreadN; ++n) { - int thread_c = c_start + n; - - if (thread_c < problem_size.C) { - element_B[n] = ElementAccumulator(tensor_w.at({K, R, S, thread_c})); - } - else { - element_B[n] = ElementAccumulator(); - } - } - - // Accumulate matrix product - CUTLASS_PRAGMA_UNROLL - for (int m = 0; m < kThreadM; ++m) { - CUTLASS_PRAGMA_UNROLL - for (int n = 0; n < kThreadN; ++n) { - accum[m][n] = 
inner_product_op(element_A[m], element_B[n], accum[m][n]); - } - } - } - } - } - - // Write out the results - CUTLASS_PRAGMA_UNROLL - for (int m = 0; m < kThreadM; ++m) { - - if (thread_n[m] < problem_size.N && thread_h[m] < problem_size.H && thread_w[m] < problem_size.W) { - - CUTLASS_PRAGMA_UNROLL - for (int n = 0; n < kThreadN; ++n) { - int thread_c = c_start + n; - if (thread_c < problem_size.C) { - - ElementCompute c_ref = ElementCompute(); - if (beta != ElementCompute()) { - c_ref = ElementCompute(tensor_dx_in.at({thread_n[m], thread_h[m], thread_w[m], thread_c})); - } - - tensor_dx_out.at({thread_n[m], thread_h[m], thread_w[m], thread_c}) = convert_op( - alpha * ElementCompute(accum[m][n]) + beta * c_ref); - } - } - } - } -} - -// Conv3d dgrad kernel - dx = dgrad(dy, w) -template < - typename ElementA, - typename LayoutA, - typename ElementB, - typename LayoutB, - typename ElementC, - typename LayoutC, - typename ElementCompute, - typename ElementAccumulator = ElementCompute, - typename ConvertOp = NumericConverter, - typename InnerProductOp = multiply_add, - int kThreadM = 2, // shape of a thread's tile in the GEMM M dimension - int kThreadN = 4, // shape of a thread's tile in the GEMM N dimension - int kCtaShapeM = 16, // shape of a threadblock in units of threads - int kCtaShapeN = 8 // shape of a threadblock in units of threads -> -__global__ void Conv3dDgrad( - conv::Conv3dProblemSize problem_size, - TensorRef tensor_dy, - TensorRef tensor_w, - TensorRef tensor_dx_in, - TensorRef tensor_dx_out, - ElementCompute alpha, - ElementCompute beta - ) { - - ConvertOp convert_op; - InnerProductOp inner_product_op; - - ElementAccumulator element_A[kThreadM]; - ElementAccumulator element_B[kThreadN]; - ElementAccumulator accum[kThreadM][kThreadN]; - - int64_t ndhw_start = int64_t(blockIdx.x) * kCtaShapeM * kThreadM + threadIdx.x * kThreadM; - int c_start = blockIdx.y * kCtaShapeN * kThreadN + threadIdx.y * kThreadN; - - int thread_n[kThreadM]; - int thread_d[kThreadM]; - int thread_h[kThreadM]; - int thread_w[kThreadM]; - - // Compute N, H, W coordinates for each row of a thread's tile - int64_t HW = int64_t(problem_size.H) * problem_size.W; - int64_t DHW = HW * problem_size.D; - - CUTLASS_PRAGMA_UNROLL - for (int m = 0; m < kThreadM; ++m) { - - int64_t ndhw = ndhw_start + m; - - thread_n[m] = int(ndhw / DHW); - - int64_t residual = ndhw % DHW; - thread_d[m] = int(residual / HW); - - residual = residual % HW; - thread_h[m] = int(residual / problem_size.W); - thread_w[m] = int(residual % problem_size.W); - } - - // Clear accumulators - CUTLASS_PRAGMA_UNROLL - for (int m = 0; m < kThreadM; ++m) { - CUTLASS_PRAGMA_UNROLL - for (int n = 0; n < kThreadN; ++n) { - accum[m][n] = ElementAccumulator(); - } - } - - // Compute convolution - for (int T = 0; T < problem_size.T; ++T) { - for (int R = 0; R < problem_size.R; ++R) { - for (int S = 0; S < problem_size.S; ++S) { - for (int K = 0; K < problem_size.K; ++K) { - - // Load from activations tensor - int filter_t = T; - int filter_r = R; - int filter_s = S; - - if (problem_size.mode == cutlass::conv::Mode::kConvolution) { - filter_t = problem_size.T - 1 - T; - filter_r = problem_size.R - 1 - R; - filter_s = problem_size.S - 1 - S; - } - - CUTLASS_PRAGMA_UNROLL - for (int m = 0; m < kThreadM; ++m) { - - int z = thread_d[m] + problem_size.pad_d - filter_t * problem_size.dilation_d; - int p = thread_h[m] + problem_size.pad_h - filter_r * problem_size.dilation_h; - int q = thread_w[m] + problem_size.pad_w - filter_s * problem_size.dilation_w; - - 
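// Note on the dgrad gather condition enforced just below: an output-gradient
// element contributes to this input location only if the un-strided
// coordinates (z, p, q) are non-negative and divisible by the forward-pass
// strides; after dividing the strides out, the indices must still fall inside
// the (Z, P, Q) extent of dy. Every other case keeps the zero-initialized
// element_A, i.e. contributes nothing to the accumulator.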
element_A[m] = ElementAccumulator(); - - if (z >= 0 && !(z % problem_size.stride_d) && - p >= 0 && !(p % problem_size.stride_h) && - q >= 0 && !(q % problem_size.stride_w)) { - - z = z / problem_size.stride_d; - p = p / problem_size.stride_h; - q = q / problem_size.stride_w; - - if (thread_n[m] < problem_size.N && z < problem_size.Z && p < problem_size.P && q < problem_size.Q) { - element_A[m] = ElementAccumulator(tensor_dy.at({thread_n[m], z, p, q, K})); - } - } - } - - // Load from filters tensor - CUTLASS_PRAGMA_UNROLL - for (int n = 0; n < kThreadN; ++n) { - int thread_c = c_start + n; - - if (thread_c < problem_size.C) { - element_B[n] = ElementAccumulator(tensor_w.at({K, T, R, S, thread_c})); - } - else { - element_B[n] = ElementAccumulator(); - } - } - - // Accumulate matrix product - CUTLASS_PRAGMA_UNROLL - for (int m = 0; m < kThreadM; ++m) { - CUTLASS_PRAGMA_UNROLL - for (int n = 0; n < kThreadN; ++n) { - accum[m][n] = inner_product_op(element_A[m], element_B[n], accum[m][n]); - } - } - - } // for (C) - } // for (S) - } // for (R) - } // for (T) - - // Write out the results - CUTLASS_PRAGMA_UNROLL - for (int m = 0; m < kThreadM; ++m) { - - if (thread_n[m] < problem_size.N && - thread_d[m] < problem_size.D && - thread_h[m] < problem_size.H && - thread_w[m] < problem_size.W) { - - CUTLASS_PRAGMA_UNROLL - for (int n = 0; n < kThreadN; ++n) { - int thread_c = c_start + n; - if (thread_c < problem_size.C) { - - ElementCompute c_ref = ElementCompute(); - if (beta != ElementCompute()) { - c_ref = ElementCompute(tensor_dx_in.at({thread_n[m], thread_d[m], thread_h[m], thread_w[m], thread_c})); - } - - tensor_dx_out.at({thread_n[m], thread_d[m], thread_h[m], thread_w[m], thread_c}) = convert_op( - alpha * ElementCompute(accum[m][n]) + beta * c_ref); - } - } - } - } -} - -/////////////////////////////////////////////////////////////////////////////////////////////////// - -// Conv2d wgrad kernel - dw = wgrad(dy, x) -template < - typename ElementA, - typename LayoutA, - typename ElementB, - typename LayoutB, - typename ElementC, - typename LayoutC, - typename ElementCompute, - typename ElementAccumulator = ElementCompute, - typename ConvertOp = NumericConverter, - typename InnerProductOp = multiply_add, - int kThreadM = 2, // shape of a thread's tile in the GEMM M dimension - int kThreadN = 4, // shape of a thread's tile in the GEMM N dimension - int kCtaShapeM = 8, // shape of a threadblock in units of threads - int kCtaShapeN = 16 // shape of a threadblock in units of threads -> -__global__ void Conv2dWgrad( - conv::Conv2dProblemSize problem_size, - TensorRef tensor_dy, - TensorRef tensor_x, - TensorRef tensor_dw_in, - TensorRef tensor_dw_out, - ElementCompute alpha, - ElementCompute beta - ) { - - ConvertOp convert_op; - InnerProductOp inner_product_op; - - ElementAccumulator element_A[kThreadM]; - ElementAccumulator element_B[kThreadN]; - ElementAccumulator accum[kThreadM][kThreadN]; - - int k_start = blockIdx.x * kCtaShapeM * kThreadM + threadIdx.x * kThreadM; - int64_t rsc_start = int64_t(blockIdx.y) * kCtaShapeN * kThreadN + threadIdx.y * kThreadN; - - int thread_r[kThreadN]; - int thread_s[kThreadN]; - int thread_c[kThreadN]; - - // Compute R, S, C coordinates for each row of a thread's tile - int64_t SC = int64_t(problem_size.S) * problem_size.C; - - CUTLASS_PRAGMA_UNROLL - for (int n = 0; n < kThreadN; ++n) { - - int64_t rsc = rsc_start + n; - int64_t residual = rsc % SC; - - thread_r[n] = int(rsc / SC); - thread_s[n] = int(residual / problem_size.C); - thread_c[n] = int(residual % 
problem_size.C); - } - - // Clear accumulators - CUTLASS_PRAGMA_UNROLL - for (int m = 0; m < kThreadM; ++m) { - CUTLASS_PRAGMA_UNROLL - for (int n = 0; n < kThreadN; ++n) { - accum[m][n] = ElementAccumulator(); - } - } - - // Compute convolution - for (int N = 0; N < problem_size.N; ++N) { - for (int P = 0; P < problem_size.P; ++P) { - for (int Q = 0; Q < problem_size.Q; ++Q) { - - CUTLASS_PRAGMA_UNROLL - for (int m = 0; m < kThreadM; ++m) { - int thread_k = k_start + m; - - element_A[m] = ElementAccumulator(); - - if (thread_k < problem_size.K) { - element_A[m] = ElementAccumulator(tensor_dy.at({N, P, Q, thread_k})); - } - } - - // Load from filters tensor - CUTLASS_PRAGMA_UNROLL - for (int n = 0; n < kThreadN; ++n) { - - // Load from activations tensor - int filter_r = thread_r[n]; - int filter_s = thread_s[n]; - - if (problem_size.mode == cutlass::conv::Mode::kConvolution) { - filter_r = problem_size.R - 1 - filter_r; - filter_s = problem_size.S - 1 - filter_s; - } - - int h = P * problem_size.stride_h - problem_size.pad_h + filter_r * problem_size.dilation_h; - int w = Q * problem_size.stride_w - problem_size.pad_w + filter_s * problem_size.dilation_w; - - element_B[n] = ElementAccumulator(); - - if (h >= 0 && h < problem_size.H && w >= 0 && w < problem_size.W && thread_c[n] < problem_size.C) { - element_B[n] = ElementAccumulator(tensor_x.at({N, h, w, thread_c[n]})); - } - } - - // Accumulate matrix product - CUTLASS_PRAGMA_UNROLL - for (int m = 0; m < kThreadM; ++m) { - CUTLASS_PRAGMA_UNROLL - for (int n = 0; n < kThreadN; ++n) { - accum[m][n] = inner_product_op(element_A[m], element_B[n], accum[m][n]); - } - } - } - } - } - - // Write out the results - CUTLASS_PRAGMA_UNROLL - for (int m = 0; m < kThreadM; ++m) { - int thread_k = k_start + m; - - if (thread_k < problem_size.K) { - - CUTLASS_PRAGMA_UNROLL - for (int n = 0; n < kThreadN; ++n) { - - if (thread_r[n] < problem_size.R && thread_s[n] < problem_size.S && thread_c[n] < problem_size.C) { - - ElementCompute c_ref = ElementCompute(); - - if (beta != ElementCompute()) { - c_ref = ElementCompute(tensor_dw_in.at({thread_k, thread_r[n], thread_s[n], thread_c[n]})); - } - - tensor_dw_out.at({thread_k, thread_r[n], thread_s[n], thread_c[n]}) = convert_op( - alpha * ElementCompute(accum[m][n]) + beta * c_ref); - } - } - } - } -} - -// Conv3d wgrad kernel - dw = wgrad(dy, x) -template < - typename ElementA, - typename LayoutA, - typename ElementB, - typename LayoutB, - typename ElementC, - typename LayoutC, - typename ElementCompute, - typename ElementAccumulator = ElementCompute, - typename ConvertOp = NumericConverter, - typename InnerProductOp = multiply_add, - int kThreadM = 2, // shape of a thread's tile in the GEMM M dimension - int kThreadN = 4, // shape of a thread's tile in the GEMM N dimension - int kCtaShapeM = 8, // shape of a threadblock in units of threads - int kCtaShapeN = 16 // shape of a threadblock in units of threads -> -__global__ void Conv3dWgrad( - conv::Conv3dProblemSize problem_size, - TensorRef tensor_dy, - TensorRef tensor_x, - TensorRef tensor_dw_in, - TensorRef tensor_dw_out, - ElementCompute alpha, - ElementCompute beta - ) { - - ConvertOp convert_op; - InnerProductOp inner_product_op; - - ElementAccumulator element_A[kThreadM]; - ElementAccumulator element_B[kThreadN]; - ElementAccumulator accum[kThreadM][kThreadN]; - - int k_start = blockIdx.x * kCtaShapeM * kThreadM + threadIdx.x * kThreadM; - int64_t trsc_start = int64_t(blockIdx.y) * kCtaShapeN * kThreadN + threadIdx.y * kThreadN; - - int 
thread_t[kThreadN]; - int thread_r[kThreadN]; - int thread_s[kThreadN]; - int thread_c[kThreadN]; - - // Compute R, S, C coordinates for each row of a thread's tile - int64_t SC = int64_t(problem_size.S) * problem_size.C; - int64_t RSC = SC * problem_size.R; - - CUTLASS_PRAGMA_UNROLL - for (int n = 0; n < kThreadN; ++n) { - - int64_t trsc = trsc_start + n; - - thread_t[n] = int(trsc / RSC); - - int64_t residual = trsc % RSC; - thread_r[n] = int(residual / SC); - - residual = residual % SC; - thread_s[n] = int(residual / problem_size.C); - thread_c[n] = int(residual % problem_size.C); - } - - // Clear accumulators - CUTLASS_PRAGMA_UNROLL - for (int m = 0; m < kThreadM; ++m) { - CUTLASS_PRAGMA_UNROLL - for (int n = 0; n < kThreadN; ++n) { - accum[m][n] = ElementAccumulator(); - } - } - - // Compute convolution - for (int N = 0; N < problem_size.N; ++N) { - for (int Z = 0; Z < problem_size.Z; ++Z) { - for (int P = 0; P < problem_size.P; ++P) { - for (int Q = 0; Q < problem_size.Q; ++Q) { - - CUTLASS_PRAGMA_UNROLL - for (int m = 0; m < kThreadM; ++m) { - int thread_k = k_start + m; - - element_A[m] = ElementAccumulator(); - - if (thread_k < problem_size.K) { - element_A[m] = ElementAccumulator(tensor_dy.at({N, Z, P, Q, thread_k})); - } - } - - // Load from filters tensor - CUTLASS_PRAGMA_UNROLL - for (int n = 0; n < kThreadN; ++n) { - - // Load from activations tensor - int filter_t = thread_t[n]; - int filter_r = thread_r[n]; - int filter_s = thread_s[n]; - - if (problem_size.mode == cutlass::conv::Mode::kConvolution) { - filter_t = problem_size.T - 1 - filter_t; - filter_r = problem_size.R - 1 - filter_r; - filter_s = problem_size.S - 1 - filter_s; - } - - int d = Z * problem_size.stride_d - problem_size.pad_w + filter_t * problem_size.dilation_d; - int h = P * problem_size.stride_h - problem_size.pad_h + filter_r * problem_size.dilation_h; - int w = Q * problem_size.stride_w - problem_size.pad_w + filter_s * problem_size.dilation_w; - - element_B[n] = ElementAccumulator(); - - if (d >= 0 && d < problem_size.D && - h >= 0 && h < problem_size.H && - w >= 0 && w < problem_size.W && - thread_c[n] < problem_size.C) { - - element_B[n] = ElementAccumulator(tensor_x.at({N, d, h, w, thread_c[n]})); - } - } - - // Accumulate matrix product - CUTLASS_PRAGMA_UNROLL - for (int m = 0; m < kThreadM; ++m) { - CUTLASS_PRAGMA_UNROLL - for (int n = 0; n < kThreadN; ++n) { - accum[m][n] = inner_product_op(element_A[m], element_B[n], accum[m][n]); - } - } - - } // for (Q) - } // for (P) - } // for (Z) - } // for (N) - - // Write out the results - CUTLASS_PRAGMA_UNROLL - for (int m = 0; m < kThreadM; ++m) { - int thread_k = k_start + m; - - if (thread_k < problem_size.K) { - - CUTLASS_PRAGMA_UNROLL - for (int n = 0; n < kThreadN; ++n) { - - if (thread_t[n] < problem_size.T && - thread_r[n] < problem_size.R && - thread_s[n] < problem_size.S && - thread_c[n] < problem_size.C) { - - ElementCompute c_ref = ElementCompute(); - - if (beta != ElementCompute()) { - c_ref = ElementCompute(tensor_dw_in.at({thread_k, thread_t[n], thread_r[n], thread_s[n], thread_c[n]})); - } - - tensor_dw_out.at({thread_k, thread_t[n], thread_r[n], thread_s[n], thread_c[n]}) = convert_op( - alpha * ElementCompute(accum[m][n]) + beta * c_ref); - } - } - } - } -} - -///////////////////////////////////////////////////////////////////////////////////////////////// - -} // namespace kernel - -///////////////////////////////////////////////////////////////////////////////////////////////// - -/// Conv2d Fprop dispatcher - y = fprop(x, w) 
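// A minimal standalone sketch (hypothetical names, not part of the original
// header) of the launch-shape arithmetic the dispatchers below perform: each
// thread owns a kThreadM x kThreadN tile of the implicit GEMM, each block
// holds kCtaShapeM x kCtaShapeN threads, and the grid covers the GEMM-M
// extent (N*P*Q output pixels) and the GEMM-N extent (K filters) with
// ceiling division.
#include <cstdint>

struct LaunchShapeSketch {
  uint32_t grid_x, grid_y;    // blocks along GEMM-M and GEMM-N
  uint32_t block_x, block_y;  // threads per block
};

inline LaunchShapeSketch fprop_launch_shape_sketch(
    int N, int P, int Q, int K,
    int kThreadM = 4, int kThreadN = 4,
    int kCtaShapeM = 16, int kCtaShapeN = 8) {
  int64_t npq    = int64_t(N) * P * Q;              // GEMM-M extent
  int64_t tile_m = int64_t(kCtaShapeM) * kThreadM;  // rows covered per block
  int64_t tile_n = int64_t(kCtaShapeN) * kThreadN;  // columns covered per block
  return { uint32_t((npq + tile_m - 1) / tile_m),
           uint32_t((int64_t(K) + tile_n - 1) / tile_n),
           uint32_t(kCtaShapeM), uint32_t(kCtaShapeN) };
}

// Example: N=2, P=Q=56, K=64 gives a grid of ceil(6272/64) x ceil(64/32)
// = 98 x 2 blocks of 16 x 8 threads.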
-template < - typename ElementA, - typename LayoutA, - typename ElementB, - typename LayoutB, - typename ElementC, - typename LayoutC, - typename ElementCompute, - typename ElementAccumulator = ElementCompute, - typename ConvertOp = NumericConverter, - typename InnerProductOp = multiply_add -> -Status Conv2dFprop( - conv::Conv2dProblemSize problem_size, - TensorRef tensor_x, - TensorRef tensor_w, - TensorRef tensor_y_in, - TensorRef tensor_y_out, - ElementCompute alpha, - ElementCompute beta, - cudaStream_t stream = nullptr) { - - // - // Blocking factors improve performance of reference implementation - // - - int const kThreadM = 4; // shape of a thread's tile in the GEMM M dimension - int const kThreadN = 4; // shape of a thread's tile in the GEMM N dimension - int const kCtaShapeM = 16; // shape of a threadblock in units of threads - int const kCtaShapeN = 8; // shape of a threadblock in units of threads - - int64_t npq = int64_t(problem_size.N) * problem_size.P * problem_size.Q; - int64_t blocks_m = (npq + (kCtaShapeM * kThreadM) - 1) / (kCtaShapeM * kThreadM); - - dim3 block(kCtaShapeM, kCtaShapeN); - dim3 grid(uint32_t(blocks_m), (problem_size.K + (kCtaShapeN * kThreadN) - 1) / (kCtaShapeN * kThreadN)); - - kernel::Conv2dFprop< - ElementA, - LayoutA, - ElementB, - LayoutB, - ElementC, - LayoutC, - ElementCompute, - ElementAccumulator, - ConvertOp, - InnerProductOp, - kThreadM, - kThreadN, - kCtaShapeM, - kCtaShapeN - ><<< grid, block, 0, stream >>>( - problem_size, - tensor_x, - tensor_w, - tensor_y_in, - tensor_y_out, - alpha, - beta - ); - - cudaError_t result = cudaPeekAtLastError(); - if (result != cudaSuccess) { - return Status::kErrorInternal; - } - - return Status::kSuccess; -} - -/// Conv3d Fprop dispatcher - y = fprop(x, w) -template < - typename ElementA, - typename LayoutA, - typename ElementB, - typename LayoutB, - typename ElementC, - typename LayoutC, - typename ElementCompute, - typename ElementAccumulator = ElementCompute, - typename ConvertOp = NumericConverter, - typename InnerProductOp = multiply_add -> -Status Conv3dFprop( - conv::Conv3dProblemSize problem_size, - TensorRef tensor_x, - TensorRef tensor_w, - TensorRef tensor_y_in, - TensorRef tensor_y_out, - ElementCompute alpha, - ElementCompute beta, - cudaStream_t stream = nullptr) { - - // - // Blocking factors improve performance of reference implementation - // - - int const kThreadM = 4; // shape of a thread's tile in the GEMM M dimension - int const kThreadN = 4; // shape of a thread's tile in the GEMM N dimension - int const kCtaShapeM = 16; // shape of a threadblock in units of threads - int const kCtaShapeN = 8; // shape of a threadblock in units of threads - - int64_t nzpq = int64_t(problem_size.N) * problem_size.Z * problem_size.P * problem_size.Q; - int64_t blocks_m = (nzpq + (kCtaShapeM * kThreadM) - 1) / (kCtaShapeM * kThreadM); - - dim3 block(kCtaShapeM, kCtaShapeN); - dim3 grid(uint32_t(blocks_m), (problem_size.K + (kCtaShapeN * kThreadN) - 1) / (kCtaShapeN * kThreadN)); - - kernel::Conv3dFprop< - ElementA, - LayoutA, - ElementB, - LayoutB, - ElementC, - LayoutC, - ElementCompute, - ElementAccumulator, - ConvertOp, - InnerProductOp, - kThreadM, - kThreadN, - kCtaShapeM, - kCtaShapeN - ><<< grid, block, 0, stream >>>( - problem_size, - tensor_x, - tensor_w, - tensor_y_in, - tensor_y_out, - alpha, - beta - ); - - cudaError_t result = cudaPeekAtLastError(); - if (result != cudaSuccess) { - return Status::kErrorInternal; - } - - return Status::kSuccess; -} - -/// Conv2d Dgrad dispatcher - dx = 
dgrad(dy, w) -template < - typename ElementA, - typename LayoutA, - typename ElementB, - typename LayoutB, - typename ElementC, - typename LayoutC, - typename ElementCompute, - typename ElementAccumulator = ElementCompute, - typename ConvertOp = NumericConverter, - typename InnerProductOp = multiply_add -> -Status Conv2dDgrad( - conv::Conv2dProblemSize problem_size, - TensorRef tensor_dy, - TensorRef tensor_w, - TensorRef tensor_dx_in, - TensorRef tensor_dx_out, - ElementCompute alpha, - ElementCompute beta, - cudaStream_t stream = nullptr) { - - // - // Blocking factors improve performance of reference implementation - // - - int const kThreadM = 2; // shape of a thread's tile in the GEMM M dimension - int const kThreadN = 4; // shape of a thread's tile in the GEMM N dimension - int const kCtaShapeM = 16; // shape of a threadblock in units of threads - int const kCtaShapeN = 8; // shape of a threadblock in units of threads - - int64_t nhw = int64_t(problem_size.N) * problem_size.H * problem_size.W; - int64_t blocks_m = (nhw + (kCtaShapeM * kThreadM) - 1) / (kCtaShapeM * kThreadM); - - dim3 block(kCtaShapeM, kCtaShapeN); - dim3 grid(uint32_t(blocks_m), (problem_size.C + (kCtaShapeN * kThreadN) - 1) / (kCtaShapeN * kThreadN)); - - kernel::Conv2dDgrad< - ElementA, - LayoutA, - ElementB, - LayoutB, - ElementC, - LayoutC, - ElementCompute, - ElementAccumulator, - ConvertOp, - InnerProductOp, - kThreadM, - kThreadN, - kCtaShapeM, - kCtaShapeN - ><<< grid, block, 0, stream >>>( - problem_size, - tensor_dy, - tensor_w, - tensor_dx_in, - tensor_dx_out, - alpha, - beta - ); - - cudaError_t result = cudaPeekAtLastError(); - if (result != cudaSuccess) { - return Status::kErrorInternal; - } - - return Status::kSuccess; -} - -/// Conv3d Dgrad dispatcher - dx = dgrad(dy, w) -template < - typename ElementA, - typename LayoutA, - typename ElementB, - typename LayoutB, - typename ElementC, - typename LayoutC, - typename ElementCompute, - typename ElementAccumulator = ElementCompute, - typename ConvertOp = NumericConverter, - typename InnerProductOp = multiply_add -> -Status Conv3dDgrad( - conv::Conv3dProblemSize problem_size, - TensorRef tensor_dy, - TensorRef tensor_w, - TensorRef tensor_dx_in, - TensorRef tensor_dx_out, - ElementCompute alpha, - ElementCompute beta, - cudaStream_t stream = nullptr) { - - // - // Blocking factors improve performance of reference implementation - // - - int const kThreadM = 2; // shape of a thread's tile in the GEMM M dimension - int const kThreadN = 4; // shape of a thread's tile in the GEMM N dimension - int const kCtaShapeM = 16; // shape of a threadblock in units of threads - int const kCtaShapeN = 8; // shape of a threadblock in units of threads - - int64_t ndhw = int64_t(problem_size.N) * problem_size.D * problem_size.H * problem_size.W; - int64_t blocks_m = (ndhw + (kCtaShapeM * kThreadM) - 1) / (kCtaShapeM * kThreadM); - - dim3 block(kCtaShapeM, kCtaShapeN); - dim3 grid(uint32_t(blocks_m), (problem_size.C + (kCtaShapeN * kThreadN) - 1) / (kCtaShapeN * kThreadN)); - - kernel::Conv3dDgrad< - ElementA, - LayoutA, - ElementB, - LayoutB, - ElementC, - LayoutC, - ElementCompute, - ElementAccumulator, - ConvertOp, - InnerProductOp, - kThreadM, - kThreadN, - kCtaShapeM, - kCtaShapeN - ><<< grid, block, 0, stream >>>( - problem_size, - tensor_dy, - tensor_w, - tensor_dx_in, - tensor_dx_out, - alpha, - beta - ); - - cudaError_t result = cudaPeekAtLastError(); - if (result != cudaSuccess) { - return Status::kErrorInternal; - } - - return Status::kSuccess; -} - -/// Conv2d 
Wgrad dispatcher - dw = wgrad(dy, x) -template < - typename ElementA, - typename LayoutA, - typename ElementB, - typename LayoutB, - typename ElementC, - typename LayoutC, - typename ElementCompute, - typename ElementAccumulator = ElementCompute, - typename ConvertOp = NumericConverter, - typename InnerProductOp = multiply_add -> -Status Conv2dWgrad( - conv::Conv2dProblemSize problem_size, - TensorRef tensor_dy, - TensorRef tensor_x, - TensorRef tensor_dw_in, - TensorRef tensor_dw_out, - ElementCompute alpha, - ElementCompute beta, - cudaStream_t stream = nullptr) { - - // - // Blocking factors improve performance of reference implementation - // - - int const kThreadM = 2; // shape of a thread's tile in the GEMM M dimension - int const kThreadN = 4; // shape of a thread's tile in the GEMM N dimension - int const kCtaShapeM = 8; // shape of a threadblock in units of threads - int const kCtaShapeN = 16; // shape of a threadblock in units of threads - - int64_t rsc = int64_t(problem_size.R) * problem_size.S * problem_size.C; - int64_t blocks_n = (rsc + (kCtaShapeN * kThreadN) - 1) / (kCtaShapeN * kThreadN); - - dim3 block(kCtaShapeM, kCtaShapeN); - dim3 grid((problem_size.K + (kCtaShapeM * kThreadM) - 1) / (kCtaShapeM * kThreadM), uint32_t(blocks_n)); - - kernel::Conv2dWgrad< - ElementA, - LayoutA, - ElementB, - LayoutB, - ElementC, - LayoutC, - ElementCompute, - ElementAccumulator, - ConvertOp, - InnerProductOp, - kThreadM, - kThreadN, - kCtaShapeM, - kCtaShapeN - ><<< grid, block, 0, stream >>>( - problem_size, - tensor_dy, - tensor_x, - tensor_dw_in, - tensor_dw_out, - alpha, - beta - ); - - cudaError_t result = cudaPeekAtLastError(); - if (result != cudaSuccess) { - return Status::kErrorInternal; - } - - return Status::kSuccess; -} - -/// Conv3d Wgrad dispatcher - dw = wgrad(dy, x) -template < - typename ElementA, - typename LayoutA, - typename ElementB, - typename LayoutB, - typename ElementC, - typename LayoutC, - typename ElementCompute, - typename ElementAccumulator = ElementCompute, - typename ConvertOp = NumericConverter, - typename InnerProductOp = multiply_add -> -Status Conv3dWgrad( - conv::Conv3dProblemSize problem_size, - TensorRef tensor_dy, - TensorRef tensor_x, - TensorRef tensor_dw_in, - TensorRef tensor_dw_out, - ElementCompute alpha, - ElementCompute beta, - cudaStream_t stream = nullptr) { - - // - // Blocking factors improve performance of reference implementation - // - - int const kThreadM = 2; // shape of a thread's tile in the GEMM M dimension - int const kThreadN = 4; // shape of a thread's tile in the GEMM N dimension - int const kCtaShapeM = 8; // shape of a threadblock in units of threads - int const kCtaShapeN = 16; // shape of a threadblock in units of threads - - int64_t trsc = int64_t(problem_size.T) * problem_size.R * problem_size.S * problem_size.C; - int64_t blocks_n = (trsc + (kCtaShapeN * kThreadN) - 1) / (kCtaShapeN * kThreadN); - - dim3 block(kCtaShapeM, kCtaShapeN); - dim3 grid((problem_size.K + (kCtaShapeM * kThreadM) - 1) / (kCtaShapeM * kThreadM), uint32_t(blocks_n)); - - kernel::Conv3dWgrad< - ElementA, - LayoutA, - ElementB, - LayoutB, - ElementC, - LayoutC, - ElementCompute, - ElementAccumulator, - ConvertOp, - InnerProductOp, - kThreadM, - kThreadN, - kCtaShapeM, - kCtaShapeN - ><<< grid, block, 0, stream >>>( - problem_size, - tensor_dy, - tensor_x, - tensor_dw_in, - tensor_dw_out, - alpha, - beta - ); - - cudaError_t result = cudaPeekAtLastError(); - if (result != cudaSuccess) { - return Status::kErrorInternal; - } - - return 
Status::kSuccess; -} - -///////////////////////////////////////////////////////////////////////////////////////////////// - -/// Generic 2D convolution targeting Conv2dFprop, Conv2dDgrad, and Conv2dWgrad. -template < - typename ElementA, - typename LayoutA, - typename ElementB, - typename LayoutB, - typename ElementC, - typename LayoutC, - typename ElementCompute, - typename ElementAccumulator = ElementCompute, - typename ConvertOp = NumericConverter, - typename InnerProductOp = multiply_add -> -Status Conv2d( - conv::Operator convolutional_operator, - conv::Conv2dProblemSize problem_size, - TensorRef tensor_A, - TensorRef tensor_B, - TensorRef tensor_C, - TensorRef tensor_D, - ElementCompute alpha, - ElementCompute beta, - cudaStream_t stream = nullptr) { - - switch (convolutional_operator) { - case conv::Operator::kFprop: - return Conv2dFprop< - ElementA, LayoutA, - ElementB, LayoutB, - ElementC, LayoutC, - ElementCompute, - ElementAccumulator, - ConvertOp, InnerProductOp - >(problem_size, tensor_A, tensor_B, tensor_C, tensor_D, alpha, beta, stream); - break; - - case conv::Operator::kDgrad: - return Conv2dDgrad< - ElementA, LayoutA, - ElementB, LayoutB, - ElementC, LayoutC, - ElementCompute, - ElementAccumulator, - ConvertOp, InnerProductOp - >(problem_size, tensor_A, tensor_B, tensor_C, tensor_D, alpha, beta, stream); - break; - - case conv::Operator::kWgrad: - return Conv2dWgrad< - ElementA, LayoutA, - ElementB, LayoutB, - ElementC, LayoutC, - ElementCompute, - ElementAccumulator, - ConvertOp, InnerProductOp - >(problem_size, tensor_A, tensor_B, tensor_C, tensor_D, alpha, beta, stream); - break; - - default: break; - } - - return Status::kErrorNotSupported; -} - -/// Generic 3D convolution targeting Conv3dFprop, Conv3dDgrad, and Conv3dWgrad. -template < - typename ElementA, - typename LayoutA, - typename ElementB, - typename LayoutB, - typename ElementC, - typename LayoutC, - typename ElementCompute, - typename ElementAccumulator = ElementCompute, - typename ConvertOp = NumericConverter, - typename InnerProductOp = multiply_add -> -Status Conv3d( - conv::Operator convolutional_operator, - conv::Conv3dProblemSize problem_size, - TensorRef tensor_A, - TensorRef tensor_B, - TensorRef tensor_C, - TensorRef tensor_D, - ElementCompute alpha, - ElementCompute beta, - cudaStream_t stream = nullptr) { - - switch (convolutional_operator) { - case conv::Operator::kFprop: - return Conv3dFprop< - ElementA, LayoutA, - ElementB, LayoutB, - ElementC, LayoutC, - ElementCompute, - ElementAccumulator, - ConvertOp, InnerProductOp - >(problem_size, tensor_A, tensor_B, tensor_C, tensor_D, alpha, beta, stream); - - case conv::Operator::kDgrad: - return Conv3dDgrad< - ElementA, LayoutA, - ElementB, LayoutB, - ElementC, LayoutC, - ElementCompute, - ElementAccumulator, - ConvertOp, InnerProductOp - >(problem_size, tensor_A, tensor_B, tensor_C, tensor_D, alpha, beta, stream); - - case conv::Operator::kWgrad: - return Conv3dWgrad< - ElementA, LayoutA, - ElementB, LayoutB, - ElementC, LayoutC, - ElementCompute, - ElementAccumulator, - ConvertOp, InnerProductOp - >(problem_size, tensor_A, tensor_B, tensor_C, tensor_D, alpha, beta, stream); - - default: break; - } - - return Status::kErrorNotSupported; -} - -//////////////////////////////////////////////////////////////////////////////////////////////////// - -} // namespace device -} // namespace reference -} // namespace cutlass - -//////////////////////////////////////////////////////////////////////////////////////////////////// - diff --git 
a/csrc/sparse/cutlass/example/util/reference/device/gemm.h b/csrc/sparse/cutlass/example/util/reference/device/gemm.h deleted file mode 100644 index 1a1bd3751801a..0000000000000 --- a/csrc/sparse/cutlass/example/util/reference/device/gemm.h +++ /dev/null @@ -1,385 +0,0 @@ -/*************************************************************************************************** - * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. - * SPDX-License-Identifier: BSD-3-Clause - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * - * 1. Redistributions of source code must retain the above copyright notice, this - * list of conditions and the following disclaimer. - * - * 2. Redistributions in binary form must reproduce the above copyright notice, - * this list of conditions and the following disclaimer in the documentation - * and/or other materials provided with the distribution. - * - * 3. Neither the name of the copyright holder nor the names of its - * contributors may be used to endorse or promote products derived from - * this software without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" - * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE - * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE - * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL - * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR - * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER - * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, - * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - * - **************************************************************************************************/ -/*! \file - \brief Reference implementation for GEMM in device-side code. -*/ - -#pragma once - -#include "cutlass/coord.h" - -#include "cutlass/numeric_types.h" -#include "cutlass/functional.h" -#include "cutlass/numeric_conversion.h" - -#include "cutlass/tensor_view.h" -#include "cutlass/gemm/gemm.h" - -#include "cutlass/util/reference/device/kernel/gemm.h" - -namespace cutlass { -namespace reference { -namespace device { - -//////////////////////////////////////////////////////////////////////////////////////////////////// - -/// Computes a general matrix product among matrices (tensors of rank=2) pointed to by TensorRef -/// objects. -/// -/// Explicitly naming types needed by this template can be cumbersome, particularly for the -/// accumulator type, so a function argument 'initial_accum' is exposed. Passing -/// AccumulatorType(0) as the last function argument can be easier than naming all template -/// arguments explicitly. 
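[Editor's note] The doc comment above describes the 'initial_accum' idiom used by the compute_gemm reference that follows: passing an explicit starting accumulator value lets the accumulator type be deduced from a function argument instead of being spelled out as a template parameter. The snippet below is a minimal scalar analogue of that idiom, not part of the deleted CUTLASS code; the name naive_gemm and its layout assumptions are hypothetical.

// Illustrative only: a scalar analogue of the 'initial_accum' idiom. The
// accumulator type is deduced from the last argument rather than named
// explicitly as a template argument. naive_gemm is a hypothetical name.
#include <cstdio>
#include <vector>

template <typename ElementA, typename ElementB, typename ElementC,
          typename AccumulatorType>
void naive_gemm(int M, int N, int K,
                const std::vector<ElementA>& A,   // row-major M x K
                const std::vector<ElementB>& B,   // row-major K x N
                std::vector<ElementC>& C,         // row-major M x N
                AccumulatorType initial_accum) {
  for (int m = 0; m < M; ++m) {
    for (int n = 0; n < N; ++n) {
      AccumulatorType acc = initial_accum;        // accumulator type comes from the argument
      for (int k = 0; k < K; ++k) {
        acc += AccumulatorType(A[m * K + k]) * AccumulatorType(B[k * N + n]);
      }
      C[m * N + n] = ElementC(acc);
    }
  }
}

int main() {
  int M = 2, N = 2, K = 2;
  std::vector<float> A{1, 2, 3, 4}, B{5, 6, 7, 8}, C(4);
  naive_gemm(M, N, K, A, B, C, 0.0f);             // float accumulation deduced from 0.0f
  std::printf("%g %g %g %g\n", C[0], C[1], C[2], C[3]);
  return 0;
}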
-template < - typename ElementA, - typename LayoutA, - typename ElementB, - typename LayoutB, - typename ElementC, - typename LayoutC, - typename ScalarType, - typename AccumulatorType, - typename InnerProductOp = multiply_add, - typename ConvertOp = NumericConverter -> -void compute_gemm( - gemm::GemmCoord problem_size, - ScalarType alpha, - TensorRef tensor_a, - TensorRef tensor_b, - ScalarType beta, - TensorRef tensor_c, - TensorRef tensor_d, - AccumulatorType initial_accum) { - - static_assert( - LayoutA::kRank == 2 && - LayoutB::kRank == 2 && - LayoutC::kRank == 2, "Tensors must be of rank 2"); - - // Blocking structure potentially improves performance of reference implementation - // with a minor increase in complexity. - // - // Note, this reference implementation is NOT expected to approach peak performance. - using OutputTile = MatrixShape<4, 4>; - - dim3 block(16, 8); - - dim3 grid( - (problem_size.m() + block.x * OutputTile::kRow - 1) / (block.x * OutputTile::kRow), - (problem_size.n() + block.y * OutputTile::kColumn - 1) / (block.y * OutputTile::kColumn) - ); - - // Launch a GEMM kernel - kernel::Gemm< - TensorRef, - TensorRef, - TensorRef, - ScalarType, - AccumulatorType, - OutputTile, - InnerProductOp, - ConvertOp - ><<< grid, block >>>( - problem_size, - alpha, - tensor_a, - tensor_b, - beta, - tensor_c, - tensor_d, - initial_accum - ); -} -//////////////////////////////////////////////////////////////////////////////////////////////////// - -/// Computes a general matrix product among matrices (tensors of rank=2) pointed to by TensorRef -/// objects. -/// -/// This assumes the accumulator type is the same type as the scalars. -template < - typename ElementA, - typename LayoutA, - typename ElementB, - typename LayoutB, - typename ElementC, - typename LayoutC, - typename ScalarType, - typename AccumulatorType, - typename InnerProductOp = multiply_add, - typename ConvertOp = NumericConverter -> -void compute_gemm( - gemm::GemmCoord problem_size, - ScalarType alpha, - TensorRef tensor_a, - TensorRef tensor_b, - ScalarType beta, - TensorRef tensor_c, - AccumulatorType initial_accum) { - - compute_gemm( - problem_size, alpha, tensor_a, tensor_b, beta, tensor_c, tensor_c, - initial_accum); -} - -template < - typename ElementA, - typename LayoutA, - typename ElementB, - typename LayoutB, - typename ElementC, - typename LayoutC, - typename ScalarType, - typename AccumulatorType, - typename InnerProductOp = cutlass::arch::OpMultiplyAdd -> -struct Gemm; - -//////////////////////////////////////////////////////////////////////////////////////////////////// - -/// Partial specialization for multiply-add -template -struct Gemm { - - void operator()(gemm::GemmCoord problem_size, ScalarType alpha, - TensorRef tensor_a, - TensorRef tensor_b, ScalarType beta, - TensorRef tensor_c, - AccumulatorType initial_accum = AccumulatorType(0)) { - - static_assert( - LayoutA::kRank == 2 && LayoutB::kRank == 2 && LayoutC::kRank == 2, - "Tensors must be of rank 2"); - - compute_gemm>( - problem_size, alpha, tensor_a, tensor_b, beta, tensor_c, initial_accum); - } - - void operator()(gemm::GemmCoord problem_size, ScalarType alpha, - TensorRef tensor_a, - TensorRef tensor_b, ScalarType beta, - TensorRef tensor_c, - TensorRef tensor_d, - AccumulatorType initial_accum = AccumulatorType(0)) { - static_assert( - LayoutA::kRank == 2 && LayoutB::kRank == 2 && LayoutC::kRank == 2, - "Tensors must be of rank 2"); - - compute_gemm>( - problem_size, alpha, tensor_a, tensor_b, beta, tensor_c, tensor_d, initial_accum); 
- } -}; - -//////////////////////////////////////////////////////////////////////////////////////////////////// - -/// Partial specialization for multiply-add-saturate -template -struct Gemm { - - void operator()(gemm::GemmCoord problem_size, ScalarType alpha, - TensorRef tensor_a, - TensorRef tensor_b, ScalarType beta, - TensorRef tensor_c, - AccumulatorType initial_accum = AccumulatorType(0)) { - static_assert( - LayoutA::kRank == 2 && LayoutB::kRank == 2 && LayoutC::kRank == 2, - "Tensors must be of rank 2"); - - compute_gemm, - NumericConverterClamp>( - problem_size, alpha, tensor_a, tensor_b, beta, tensor_c, initial_accum); - } - - void operator()(gemm::GemmCoord problem_size, ScalarType alpha, - TensorRef tensor_a, - TensorRef tensor_b, ScalarType beta, - TensorRef tensor_c, - TensorRef tensor_d, - AccumulatorType initial_accum = AccumulatorType(0)) { - static_assert( - LayoutA::kRank == 2 && LayoutB::kRank == 2 && LayoutC::kRank == 2, - "Tensors must be of rank 2"); - - compute_gemm, - NumericConverterClamp>( - problem_size, alpha, tensor_a, tensor_b, beta, tensor_c, tensor_d, initial_accum); - } -}; - -//////////////////////////////////////////////////////////////////////////////////////////////////// - -/// Partial specialization for XOR-popc -template -struct Gemm { - - void operator()(gemm::GemmCoord problem_size, ScalarType alpha, - TensorRef tensor_a, - TensorRef tensor_b, ScalarType beta, - TensorRef tensor_c, - AccumulatorType initial_accum = AccumulatorType(0)) { - static_assert( - LayoutA::kRank == 2 && LayoutB::kRank == 2 && LayoutC::kRank == 2, - "Tensors must be of rank 2"); - - compute_gemm>( - problem_size, alpha, tensor_a, tensor_b, beta, tensor_c, initial_accum); - } - - void operator()(gemm::GemmCoord problem_size, ScalarType alpha, - TensorRef tensor_a, - TensorRef tensor_b, ScalarType beta, - TensorRef tensor_c, - TensorRef tensor_d, - AccumulatorType initial_accum = AccumulatorType(0)) { - static_assert( - LayoutA::kRank == 2 && LayoutB::kRank == 2 && LayoutC::kRank == 2, - "Tensors must be of rank 2"); - - compute_gemm>( - problem_size, alpha, tensor_a, tensor_b, beta, tensor_c, tensor_d, initial_accum); - } -}; - - -//////////////////////////////////////////////////////////////////////////////////////////////////// -// -// Batched GEMM -// -//////////////////////////////////////////////////////////////////////////////////////////////////// - -/// Computes a batch of GEMMs over a set of matrices of common dimension. -// -// TensorRefCollection* is a type satisfying the TensorRefCollection concept. -// -template < - typename TensorRefCollectionA, - typename TensorRefCollectionB, - typename TensorRefCollectionC, - typename ScalarType, - typename AccumulatorType, - typename InnerProductOp, - typename ConvertOp -> -void BatchedGemm( - gemm::GemmCoord problem_size, - int batch_count, - ScalarType alpha, - TensorRefCollectionA const& tensor_a, - TensorRefCollectionB const& tensor_b, - ScalarType beta, - TensorRefCollectionC &tensor_c, - AccumulatorType initial_accum) { - - static_assert( - TensorRefCollectionA::kRank == 2 && - TensorRefCollectionB::kRank == 2 && - TensorRefCollectionC::kRank == 2, "Tensors must be of rank 2"); - - // Blocking structure potentially improves performance of reference implementation - // with a minor increase in complexity. - // - // Note, this reference implementation is NOT expected to approach peak performance. 
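[Editor's note] The batched launch that follows sizes its grid with the same ceiling-division pattern used throughout these reference implementations: each thread owns a small output tile, each block covers block.x*kRow by block.y*kColumn output elements, and batches go into grid.z. The sketch below only reproduces that arithmetic; Dim3 and reference_gemm_grid are hypothetical stand-ins, not CUTLASS types.

// Illustrative sketch of the reference launch's grid sizing (not CUTLASS code).
#include <cstdio>

struct Dim3 { unsigned x, y, z; };

Dim3 reference_gemm_grid(int m, int n, int batch_count) {
  const int kRow = 4, kColumn = 4;   // per-thread output tile (OutputTile below)
  const Dim3 block{16, 8, 1};        // threads per block
  Dim3 grid;
  grid.x = (m + block.x * kRow - 1) / (block.x * kRow);
  grid.y = (n + block.y * kColumn - 1) / (block.y * kColumn);
  grid.z = static_cast<unsigned>(batch_count);   // one z-slice per batch
  return grid;
}

int main() {
  Dim3 g = reference_gemm_grid(1024, 512, 3);
  std::printf("grid = (%u, %u, %u)\n", g.x, g.y, g.z);   // (16, 16, 3)
  return 0;
}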
- using OutputTile = MatrixShape<4, 4>; - - dim3 block(16, 8); - dim3 grid( - (problem_size.m() + block.x * OutputTile::kRow - 1) / (block.x * OutputTile::kRow), - (problem_size.n() + block.y * OutputTile::kColumn - 1) / (block.y * OutputTile::kColumn), - batch_count - ); - - // Launch a GEMM kernel - kernel::BatchedGemm< - TensorRefCollectionA, - TensorRefCollectionB, - TensorRefCollectionC, - ScalarType, - AccumulatorType, - OutputTile, - InnerProductOp, - ConvertOp - ><<< grid, block >>>( - problem_size, - alpha, - tensor_a, - tensor_b, - beta, - tensor_c, - initial_accum - ); -} - -/// Computes a general matrix product among matrices (tensors of rank=2) pointed to by TensorRef -/// objects. -// -// TensorRefCollection* is a type satisfying the TensorRefCollection concept. -// -template < - typename TensorRefCollectionA, - typename TensorRefCollectionB, - typename TensorRefCollectionC, - typename ScalarType, - typename AccumulatorType -> -void BatchedGemm( - gemm::GemmCoord problem_size, - int batch_count, - ScalarType alpha, - TensorRefCollectionA const& tensor_a, - TensorRefCollectionB const& tensor_b, - ScalarType beta, - TensorRefCollectionC &tensor_c) { - - BatchedGemm(problem_size, alpha, tensor_a, tensor_b, beta, tensor_c, ScalarType(0)); -} - -//////////////////////////////////////////////////////////////////////////////////////////////////// - -} // namespace device -} // namespace reference -} // namespace cutlass diff --git a/csrc/sparse/cutlass/example/util/reference/device/gemm_complex.h b/csrc/sparse/cutlass/example/util/reference/device/gemm_complex.h deleted file mode 100644 index b4d41bd28efb5..0000000000000 --- a/csrc/sparse/cutlass/example/util/reference/device/gemm_complex.h +++ /dev/null @@ -1,350 +0,0 @@ -/*************************************************************************************************** - * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. - * SPDX-License-Identifier: BSD-3-Clause - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * - * 1. Redistributions of source code must retain the above copyright notice, this - * list of conditions and the following disclaimer. - * - * 2. Redistributions in binary form must reproduce the above copyright notice, - * this list of conditions and the following disclaimer in the documentation - * and/or other materials provided with the distribution. - * - * 3. Neither the name of the copyright holder nor the names of its - * contributors may be used to endorse or promote products derived from - * this software without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" - * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE - * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE - * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL - * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR - * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER - * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, - * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
- * - **************************************************************************************************/ -/*! \file - \brief Reference implementation for complex-valued GEMM in device-side code. -*/ - -#pragma once - -#include "cutlass/coord.h" -#include "cutlass/complex.h" -#include "cutlass/numeric_types.h" -#include "cutlass/functional.h" -#include "cutlass/numeric_conversion.h" - -#include "cutlass/tensor_view.h" -#include "cutlass/gemm/gemm.h" - -namespace cutlass { -namespace reference { -namespace device { - -//////////////////////////////////////////////////////////////////////////////////////////////////// - -namespace kernel { - -/// Computes a general matrix product among matrices (tensors of rank=2) pointed to by TensorRef -/// objects. -/// -/// Explicitly naming types needed by this template can be cumbersome, particularly for the -/// accumulator type, so a function argument 'initial_accum' is exposed. Passing -/// AccumulatorType(0) as the last function argument can be easier than naming all template -/// arguments explicitly. -template < - typename ElementA, - typename LayoutA, - typename ElementB, - typename LayoutB, - typename ElementC, - typename LayoutC, - typename ScalarType, - typename ComputeType, - typename ElementD = ElementC, - typename ConvertOp = NumericConverter, - typename InnerProductOp = multiply_add, - int kMblock = 4, - int kNblock = 4 -> -__global__ void GemmComplex( - gemm::GemmCoord problem_size, - ScalarType alpha, - TensorRef tensor_a, - ComplexTransform transform_a, - TensorRef tensor_b, - ComplexTransform transform_b, - ScalarType beta, - TensorRef tensor_c, - TensorRef tensor_d, - ComputeType initial_accum, - int batch_count = 1, - int64_t batch_stride_A = 0, - int64_t batch_stride_B = 0, - int64_t batch_stride_C = 0, - int64_t batch_stride_D = 0) { - - static_assert( - LayoutA::kRank == 2 && - LayoutB::kRank == 2 && - LayoutC::kRank == 2, "Tensors must be of rank 2"); - - int const M = problem_size.m(); - int const N = problem_size.n(); - int const K = problem_size.k(); - - ConvertOp convert_op; - InnerProductOp inner_product_op; - - int row_block = (blockIdx.x * blockDim.x + threadIdx.x) * kMblock; - int col_block = (blockIdx.y * blockDim.y + threadIdx.y) * kNblock; - int batch_idx = blockIdx.z; - - tensor_a.add_pointer_offset(batch_idx * batch_stride_A); - tensor_b.add_pointer_offset(batch_idx * batch_stride_B); - tensor_c.add_pointer_offset(batch_idx * batch_stride_C); - tensor_d.add_pointer_offset(batch_idx * batch_stride_D); - - for (; batch_idx < batch_count; batch_idx += gridDim.z) { - - // Compute matrix product using blocks - ComputeType accum[kMblock][kNblock]; - - CUTLASS_PRAGMA_UNROLL - for (int j = 0; j < kNblock; j++) { - CUTLASS_PRAGMA_UNROLL - for (int i = 0; i < kMblock; i++) { - accum[i][j] = initial_accum; - } - } - - for (int k_block = 0; k_block < K; ++k_block) { - CUTLASS_PRAGMA_UNROLL - for (int j = 0; j < kNblock; j++) { - CUTLASS_PRAGMA_UNROLL - for (int i = 0; i < kMblock; i++) { - int row = row_block + i; - int col = col_block + j; - - if (row < M && col < N) { - ElementA a = tensor_a.at(MatrixCoord(row, k_block)); - ElementB b = tensor_b.at(MatrixCoord(k_block, col)); - - ComputeType a_ik = ComputeType(a); - ComputeType b_kj = ComputeType(b); - - if (transform_a == ComplexTransform::kConjugate) { - a_ik = conj(a_ik); - } - - if (transform_b == ComplexTransform::kConjugate) { - b_kj = conj(b_kj); - } - - accum[i][j] = inner_product_op(a_ik, b_kj, accum[i][j]); - } - } - } - } - - CUTLASS_PRAGMA_UNROLL - for (int j = 0; 
j < kNblock; j++) { - CUTLASS_PRAGMA_UNROLL - for (int i = 0; i < kMblock; i++) { - int row = row_block + i; - int col = col_block + j; - - MatrixCoord coord = MatrixCoord(row, col); - - if (row < M && col < N) { - - tensor_d.at(coord) = convert_op( - alpha * ScalarType(accum[i][j]) + - beta * ScalarType(tensor_c.at(coord))); - } - } - } - - tensor_a.add_pointer_offset(batch_stride_A * gridDim.z); - tensor_b.add_pointer_offset(batch_stride_B * gridDim.z); - tensor_c.add_pointer_offset(batch_stride_C * gridDim.z); - tensor_d.add_pointer_offset(batch_stride_D * gridDim.z); - - } // for (batch_idx) -} - -} // namespace kernel - -//////////////////////////////////////////////////////////////////////////////////////////////////// - -/// Computes a general matrix product among matrices (tensors of rank=2) pointed to by TensorRef -/// objects. -/// -/// Explicitly naming types needed by this template can be cumbersome, particularly for the -/// accumulator type, so a function argument 'initial_accum' is exposed. Passing -/// AccumulatorType(0) as the last function argument can be easier than naming all template -/// arguments explicitly. -template < - typename ElementA, - typename LayoutA, - typename ElementB, - typename LayoutB, - typename ElementC, - typename LayoutC, - typename ScalarType, - typename ComputeType, - typename ElementD = ElementC, - typename ConvertOp = NumericConverter, - typename InnerProductOp = multiply_add -> -void GemmComplex( - gemm::GemmCoord problem_size, - ScalarType alpha, - TensorRef tensor_a, - ComplexTransform transform_a, - TensorRef tensor_b, - ComplexTransform transform_b, - ScalarType beta, - TensorRef tensor_c, - TensorRef tensor_d, - ComputeType initial_accum, - int batch_count = 1, - int64_t batch_stride_A = 0, - int64_t batch_stride_B = 0, - int64_t batch_stride_C = 0, - int64_t batch_stride_D = 0) { - - static_assert( - LayoutA::kRank == 2 && - LayoutB::kRank == 2 && - LayoutC::kRank == 2, "Tensors must be of rank 2"); - - int const kMblock = 4; - int const kNblock = 4; - - dim3 block(16, 8); - dim3 grid( - (problem_size.m() + block.x * kMblock - 1) / (block.x * kMblock), - (problem_size.n() + block.y * kNblock - 1) / (block.y * kNblock), - batch_count % std::numeric_limits::max() - ); - - if (grid.y <= std::numeric_limits::max()) { - kernel::GemmComplex< - ElementA, - LayoutA, - ElementB, - LayoutB, - ElementC, - LayoutC, - ScalarType, - ComputeType, - ElementD, - ConvertOp, - InnerProductOp, - kMblock, - kNblock - ><<< grid, block >>>( - problem_size, - alpha, - tensor_a, - transform_a, - tensor_b, - transform_b, - beta, - tensor_c, - tensor_d, - initial_accum, - batch_count, - batch_stride_A, - batch_stride_B, - batch_stride_C, - batch_stride_D - ); - } else { - // Using bigger thread tile size - int const kBigMblock = 4; - int const kBigNblock = 16; - - dim3 Bigblock(16, 8); - dim3 Biggrid( - (problem_size.m() + block.x * kBigMblock - 1) / (block.x * kBigMblock), - (problem_size.n() + block.y * kBigNblock - 1) / (block.y * kBigNblock), - batch_count % std::numeric_limits::max() - ); - - kernel::GemmComplex< - ElementA, - LayoutA, - ElementB, - LayoutB, - ElementC, - LayoutC, - ScalarType, - ComputeType, - ElementD, - ConvertOp, - InnerProductOp, - kBigMblock, - kBigNblock - ><<< Biggrid, Bigblock >>>( - problem_size, - alpha, - tensor_a, - transform_a, - tensor_b, - transform_b, - beta, - tensor_c, - tensor_d, - initial_accum, - batch_count, - batch_stride_A, - batch_stride_B, - batch_stride_C, - batch_stride_D - ); - } -} - 
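[Editor's note] The host-side GemmComplex wrapper above falls back to a larger per-thread tile when the computed grid.y would exceed the device's grid-dimension limit, so the same problem fits in fewer blocks. The sketch below shows only that selection logic; choose_grid_y and MAX_GRID_Y are hypothetical, and the actual limit checked by the deleted code is whatever numeric limit its launch used.

// Illustrative only: fall back to a bigger per-thread N tile when the grid's
// y-dimension would exceed a (here assumed) hardware limit.
#include <cstdio>

unsigned ceil_div(unsigned a, unsigned b) { return (a + b - 1) / b; }

unsigned choose_grid_y(unsigned n, unsigned block_y, unsigned max_grid_y) {
  const unsigned kNblock = 4, kBigNblock = 16;    // small and large thread tiles
  unsigned grid_y = ceil_div(n, block_y * kNblock);
  if (grid_y > max_grid_y) {
    grid_y = ceil_div(n, block_y * kBigNblock);   // fewer, fatter tiles per thread
  }
  return grid_y;
}

int main() {
  const unsigned MAX_GRID_Y = 65535;              // assumed limit, for illustration only
  std::printf("%u\n", choose_grid_y(4000000u, 8, MAX_GRID_Y));   // prints 31250
  return 0;
}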
-//////////////////////////////////////////////////////////////////////////////////////////////////// - -/// Computes a general matrix product among matrices (tensors of rank=2) pointed to by TensorRef -/// objects. -/// -/// This assumes the accumulator type is the same type as the scalars. -template < - typename ElementA, - typename LayoutA, - typename ElementB, - typename LayoutB, - typename ElementC, - typename LayoutC, - typename ScalarType, - typename ElementD = ElementC -> -void GemmComplex( - gemm::GemmCoord problem_size, - ScalarType alpha, - TensorRef tensor_a, - ComplexTransform transform_a, - TensorRef tensor_b, - ComplexTransform transform_b, - ScalarType beta, - TensorRef tensor_c, - TensorRef tensor_d) { - - GemmComplex(problem_size, alpha, tensor_a, transform_a, tensor_b, transform_b, beta, tensor_c, tensor_d, ScalarType(0)); -} - -//////////////////////////////////////////////////////////////////////////////////////////////////// - -} // namespace device -} // namespace reference -} // namespace cutlass diff --git a/csrc/sparse/cutlass/example/util/reference/device/gemm_planar_complex.h b/csrc/sparse/cutlass/example/util/reference/device/gemm_planar_complex.h deleted file mode 100644 index 37c103c3fcb45..0000000000000 --- a/csrc/sparse/cutlass/example/util/reference/device/gemm_planar_complex.h +++ /dev/null @@ -1,311 +0,0 @@ -/*************************************************************************************************** - * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. - * SPDX-License-Identifier: BSD-3-Clause - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * - * 1. Redistributions of source code must retain the above copyright notice, this - * list of conditions and the following disclaimer. - * - * 2. Redistributions in binary form must reproduce the above copyright notice, - * this list of conditions and the following disclaimer in the documentation - * and/or other materials provided with the distribution. - * - * 3. Neither the name of the copyright holder nor the names of its - * contributors may be used to endorse or promote products derived from - * this software without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" - * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE - * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE - * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL - * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR - * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER - * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, - * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - * - **************************************************************************************************/ -/*! \file - \brief Reference implementation for complex-valued GEMM in device code. 
-*/ - -#pragma once - -#include "cutlass/coord.h" -#include "cutlass/complex.h" -#include "cutlass/matrix_coord.h" -#include "cutlass/numeric_types.h" -#include "cutlass/functional.h" -#include "cutlass/numeric_conversion.h" -#include "cutlass/tensor_ref_planar_complex.h" - -#include "cutlass/tensor_view.h" -#include "cutlass/gemm/gemm.h" - -namespace cutlass { -namespace reference { -namespace device { - -//////////////////////////////////////////////////////////////////////////////////////////////////// - -namespace kernel { - -//////////////////////////////////////////////////////////////////////////////////////////////////// - -static int const kGemmPlanarComplexBlockSize = 4; - -template < - typename ElementA, - typename LayoutA, - typename ElementB, - typename LayoutB, - typename ElementC, - typename LayoutC, - typename ScalarType, - typename ComputeType, - typename ConvertOp = NumericConverter, - typename InnerProductOp = multiply_add> -> -__global__ void GemmPlanarComplex( - gemm::GemmCoord problem_size, - complex alpha, - TensorRefPlanarComplex tensor_a, - ComplexTransform transform_a, - TensorRefPlanarComplex tensor_b, - ComplexTransform transform_b, - complex beta, - TensorRefPlanarComplex tensor_c, - TensorRefPlanarComplex tensor_d, - complex initial_accum) { - - int const kMblock = kGemmPlanarComplexBlockSize; - int const kNblock = kGemmPlanarComplexBlockSize; - - using ComplexA = typename TensorRefPlanarComplex::ComplexElement; - using ComplexB = typename TensorRefPlanarComplex::ComplexElement; - using ComplexC = typename TensorRefPlanarComplex::ComplexElement; - - // Note: batch is ignored. - int const M = problem_size.m(); - int const N = problem_size.n(); - int const K = problem_size.k(); - - ConvertOp convert_op; - InnerProductOp inner_product_op; - - complex accum[kMblock][kNblock]; - - int row_block = (blockIdx.x * blockDim.x + threadIdx.x) * kMblock; - int col_block = (blockIdx.y * blockDim.y + threadIdx.y) * kNblock; - - CUTLASS_PRAGMA_UNROLL - for (int j = 0; j < kNblock; j++) { - CUTLASS_PRAGMA_UNROLL - for (int i = 0; i < kMblock; i++) { - accum[i][j] = initial_accum; - } - } - - CUTLASS_PRAGMA_NO_UNROLL - for (int k_block = 0; k_block < K; ++k_block) { - - CUTLASS_PRAGMA_UNROLL - for (int j = 0; j < kNblock; j++) { - - CUTLASS_PRAGMA_UNROLL - for (int i = 0; i < kMblock; i++) { - - int row = row_block + i; - int col = col_block + j; - - if (row < M && col < N) { - - ComplexA a_ik = tensor_a.at(MatrixCoord(row, k_block)); - ComplexB b_kj = tensor_b.at(MatrixCoord(k_block, col)); - - complex a = complex{ - ComputeType(a_ik.real()), - ComputeType(a_ik.imag()) - }; - - complex b = complex{ - ComputeType(b_kj.real()), - ComputeType(b_kj.imag()) - }; - - if (transform_a == ComplexTransform::kConjugate) { - a = conj(a); - } - - if (transform_b == ComplexTransform::kConjugate) { - b = conj(b); - } - - accum[i][j] = inner_product_op(a, b, accum[i][j]); - } - } - } - } - - CUTLASS_PRAGMA_UNROLL - for (int j = 0; j < kNblock; j++) { - CUTLASS_PRAGMA_UNROLL - for (int i = 0; i < kMblock; i++) { - - int row = row_block + i; - int col = col_block + j; - - MatrixCoord coord = MatrixCoord(row, col); - - if (row < M && col < N) { - - complex acc{ - ScalarType(accum[i][j].real()), - ScalarType(accum[i][j].imag()) - }; - - ComplexC c_ij = ComplexC(); - - if (beta.real() != ScalarType() || beta.imag() != ScalarType()) { - c_ij = tensor_c.at(coord); - } - - complex src{ - ScalarType(c_ij.real()), - ScalarType(c_ij.imag()) - }; - - complex result = alpha * acc + beta * src; - - 
ComplexC d_ij; - - d_ij.real() = convert_op(result.real()); - d_ij.imag() = convert_op(result.imag()); - - tensor_d.at(coord) = d_ij; - } - } - } -} - -//////////////////////////////////////////////////////////////////////////////////////////////////// - -} // namespace kernel - -//////////////////////////////////////////////////////////////////////////////////////////////////// - -/// Computes a general matrix product among matrices (tensors of rank=2) pointed to by TensorRef -/// objects. -/// -/// Explicitly naming types needed by this template can be cumbersome, particularly for the -/// accumulator type, so a function argument 'initial_accum' is exposed. Passing -/// AccumulatorType(0) as the last function argument can be easier than naming all template -/// arguments explicitly. -template < - typename ElementA, - typename LayoutA, - typename ElementB, - typename LayoutB, - typename ElementC, - typename LayoutC, - typename ScalarType, - typename ComputeType, - typename ConvertOp = NumericConverter, - typename InnerProductOp = multiply_add> -> -void GemmPlanarComplex( - gemm::GemmCoord problem_size, - complex alpha, - TensorRefPlanarComplex tensor_a, - ComplexTransform transform_a, - TensorRefPlanarComplex tensor_b, - ComplexTransform transform_b, - complex beta, - TensorRefPlanarComplex tensor_c, - TensorRefPlanarComplex tensor_d, - complex initial_accum) { - - static_assert( - LayoutA::kRank == 2 && - LayoutB::kRank == 2 && - LayoutC::kRank == 2, "Tensors must be of rank 2"); - - int const kMblock = kernel::kGemmPlanarComplexBlockSize; - int const kNblock = kernel::kGemmPlanarComplexBlockSize; - - dim3 block(16, 8); - - dim3 grid( - (problem_size.m() + block.x * kMblock - 1) / (block.x * kMblock), - (problem_size.n() + block.y * kNblock - 1) / (block.y * kNblock), - 1); - - kernel::GemmPlanarComplex< - ElementA, LayoutA, - ElementB, LayoutB, - ElementC, LayoutC, - ScalarType, - ComputeType, - ConvertOp, - InnerProductOp - ><<< grid, block >>>( - problem_size, - alpha, - tensor_a, - transform_a, - tensor_b, - transform_b, - beta, - tensor_c, - tensor_d, - initial_accum - ); -} - -//////////////////////////////////////////////////////////////////////////////////////////////////// - -/// Computes a general matrix product among matrices (tensors of rank=2) pointed to by TensorRef -/// objects. -/// -/// This assumes the accumulator type is the same type as the scalars. 
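[Editor's note] The planar-complex overloads below operate on tensors whose real and imaginary parts are stored in two separate planes at a fixed element offset, rather than interleaved. The sketch below illustrates that storage convention only; PlanarComplexRef is a hypothetical type and is not the CUTLASS TensorRefPlanarComplex API.

// Illustrative sketch of a planar-complex element access (not CUTLASS code).
#include <complex>
#include <cstdio>
#include <vector>

template <typename T>
struct PlanarComplexRef {
  T* data;                 // base pointer to the real plane
  long imaginary_offset;   // element offset from the real plane to the imaginary plane
  long ld;                 // leading dimension (row-major rows of length ld)

  std::complex<T> at(int row, int col) const {
    long idx = long(row) * ld + col;
    return {data[idx], data[idx + imaginary_offset]};
  }
};

int main() {
  const int M = 2, N = 2;
  std::vector<float> storage(2 * M * N);          // real plane, then imaginary plane
  PlanarComplexRef<float> ref{storage.data(), M * N, N};
  storage[0 * N + 1] = 3.0f;                      // real part of element (0,1)
  storage[M * N + 0 * N + 1] = -2.0f;             // imaginary part of element (0,1)
  std::complex<float> v = ref.at(0, 1);
  std::printf("(%g, %g)\n", v.real(), v.imag());  // (3, -2)
  return 0;
}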
-template < - typename ElementA, - typename LayoutA, - typename ElementB, - typename LayoutB, - typename ElementC, - typename LayoutC, - typename ScalarType -> -void GemmPlanarComplex( - gemm::GemmCoord problem_size, - complex alpha, - TensorRefPlanarComplex tensor_a, - ComplexTransform transform_a, - TensorRefPlanarComplex tensor_b, - ComplexTransform transform_b, - complex beta, - TensorRefPlanarComplex tensor_c, - TensorRefPlanarComplex tensor_d) { - - GemmPlanarComplex( - problem_size, - alpha, - tensor_a, transform_a, - tensor_b, transform_b, - beta, - tensor_c, - tensor_d, - complex()); -} - -//////////////////////////////////////////////////////////////////////////////////////////////////// - -} // namespace device -} // namespace reference -} // namespace cutlass - -//////////////////////////////////////////////////////////////////////////////////////////////////// diff --git a/csrc/sparse/cutlass/example/util/reference/device/gett.hpp b/csrc/sparse/cutlass/example/util/reference/device/gett.hpp deleted file mode 100644 index 78586ad62dc18..0000000000000 --- a/csrc/sparse/cutlass/example/util/reference/device/gett.hpp +++ /dev/null @@ -1,146 +0,0 @@ -/*************************************************************************************************** - * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. - * SPDX-License-Identifier: BSD-3-Clause - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * - * 1. Redistributions of source code must retain the above copyright notice, this - * list of conditions and the following disclaimer. - * - * 2. Redistributions in binary form must reproduce the above copyright notice, - * this list of conditions and the following disclaimer in the documentation - * and/or other materials provided with the distribution. - * - * 3. Neither the name of the copyright holder nor the names of its - * contributors may be used to endorse or promote products derived from - * this software without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" - * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE - * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE - * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL - * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR - * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER - * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, - * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - * - **************************************************************************************************/ -/*! 
\file - \brief GETT device reference code -*/ -#pragma once - -#include - -namespace cutlass::reference::device { - -template < - class ATensor, - class BTensor, - class CTensor, - class DTensor, - class ElementAccumulator, - class ElementEpilogue> -__global__ static -void -gett_kernel( - DTensor D, - ATensor const A, - BTensor const B, - CTensor const C, - ElementEpilogue alpha, ElementEpilogue beta, - ElementAccumulator acc_init) -{ - using namespace cute; - - static_assert(DTensor::rank == 3, "(M,N,L)"); - static_assert(ATensor::rank == 3, "(M,K,L)"); - static_assert(BTensor::rank == 3, "(N,K,L)"); - static_assert(CTensor::rank == 3, "(M,N,L)"); - - assert(size<0>(A) == size<0>(D)); // M - assert(size<0>(C) == size<0>(D)); // M - assert(size<0>(B) == size<1>(D)); // N - assert(size<1>(C) == size<1>(D)); // N - assert(size<1>(A) == size<1>(B)); // K - assert(size<2>(A) == size<2>(D)); // L - assert(size<2>(B) == size<2>(D)); // L - assert(size<2>(C) == size<2>(D)); // L - - NumericConverter a_converter; - NumericConverter b_converter; - NumericConverter acc_converter; - NumericConverter source_converter; - NumericConverter output_converter; - - // Thread id to each element of D - for (int tid = threadIdx.x + blockDim.x * blockIdx.x; - tid < size(D); - tid += blockDim.x * gridDim.x) { - // (m,n,l) coordinate - auto mnl_coord = idx2crd(tid, product_each(shape(D))); - auto m = get<0>(mnl_coord); - auto n = get<1>(mnl_coord); - auto l = get<2>(mnl_coord); - - auto A_ml = A(m,_,l); - auto B_nl = B(n,_,l); - - ElementAccumulator accum = ElementAccumulator(0); - for (int k = 0; k < size<1>(A); ++k) { - ElementAccumulator a = a_converter(A_ml(k)); - ElementAccumulator b = b_converter(B_nl(k)); - accum += a * b; - } - - ElementEpilogue scaled_output = (alpha * acc_converter(accum)) + (beta * source_converter(C(m,n,l))); - D(m,n,l) = output_converter(scaled_output); - } -} - -// Most general version -template < - class ProblemShapeMNKL, - class ElementA, - class StrideA, - class ElementB, - class StrideB, - class ElementAccumulator, - class ElementC, - class StrideC, - class ElementD, - class StrideD, - class ElementEpilogue> -void -gett( - ProblemShapeMNKL problem_shape_mnkl, - ElementA const* ptr_A, StrideA stride_a_mkl, - ElementB const* ptr_B, StrideB stride_b_nkl, - ElementAccumulator _, - ElementC const* ptr_C, StrideC stride_c_mnl, - ElementD * ptr_D, StrideD stride_d_mnl, - ElementEpilogue alpha, ElementEpilogue beta, - cudaStream_t stream = 0) { - using namespace cute; - - static_assert(cute::rank(ProblemShapeMNKL{}) == 4); - auto M = get<0>(problem_shape_mnkl); - auto N = get<1>(problem_shape_mnkl); - auto K = get<2>(problem_shape_mnkl); - auto L = get<3>(problem_shape_mnkl); - - // Represent the full tensors - auto A = make_tensor(make_gmem_ptr(ptr_A), make_shape(M,K,L), stride_a_mkl); // (M,K,L) - auto B = make_tensor(make_gmem_ptr(ptr_B), make_shape(N,K,L), stride_b_nkl); // (N,K,L) - auto C = make_tensor(make_gmem_ptr(ptr_C), make_shape(M,N,L), stride_c_mnl); // (M,N,L) - auto D = make_tensor(make_gmem_ptr(ptr_D), make_shape(M,N,L), stride_d_mnl); // (M,N,L) - - dim3 dimBlock(256); - dim3 dimGrid(240); - gett_kernel<<< dimGrid, dimBlock, 0, stream >>>(D, A, B, C, alpha, beta, ElementAccumulator(0)); -} - -} // namespace cutlass::reference::device diff --git a/csrc/sparse/cutlass/example/util/reference/device/kernel/gemm.h b/csrc/sparse/cutlass/example/util/reference/device/kernel/gemm.h deleted file mode 100644 index f7731213013d5..0000000000000 --- 
a/csrc/sparse/cutlass/example/util/reference/device/kernel/gemm.h +++ /dev/null @@ -1,162 +0,0 @@ -/*************************************************************************************************** - * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. - * SPDX-License-Identifier: BSD-3-Clause - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * - * 1. Redistributions of source code must retain the above copyright notice, this - * list of conditions and the following disclaimer. - * - * 2. Redistributions in binary form must reproduce the above copyright notice, - * this list of conditions and the following disclaimer in the documentation - * and/or other materials provided with the distribution. - * - * 3. Neither the name of the copyright holder nor the names of its - * contributors may be used to endorse or promote products derived from - * this software without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" - * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE - * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE - * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL - * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR - * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER - * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, - * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - * - **************************************************************************************************/ -/*! \file - \brief Reference implementation for GEMM in host-side code. -*/ - -#pragma once - -#include "cutlass/coord.h" -#include "cutlass/tensor_view.h" -#include "cutlass/gemm/gemm.h" - -#include "cutlass/util/reference/device/thread/gemm.h" - -namespace cutlass { -namespace reference { -namespace device { -namespace kernel { - -//////////////////////////////////////////////////////////////////////////////////////////////////// - -/// Computes a general matrix product among matrices (tensors of rank=2) pointed to by TensorRef -/// objects. 
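[Editor's note] The kernel below maps each thread to a disjoint per-thread output tile by scaling the flattened (block, thread) index by the tile shape. The sketch below reproduces only that coordinate arithmetic; Coord2 and output_coord are hypothetical names.

// Illustrative only: thread-to-output-tile mapping used by the reference kernel.
#include <cstdio>

struct Coord2 { int row, col; };

Coord2 output_coord(int block_x, int block_y, int thread_x, int thread_y,
                    int block_dim_x, int block_dim_y) {
  const int kRow = 4, kColumn = 4;   // per-thread output tile
  return { (thread_x + block_x * block_dim_x) * kRow,
           (thread_y + block_y * block_dim_y) * kColumn };
}

int main() {
  // Thread (3, 2) of block (1, 0) with a 16 x 8 block covers rows 76..79, cols 8..11.
  Coord2 c = output_coord(1, 0, 3, 2, 16, 8);
  std::printf("row %d, col %d\n", c.row, c.col);
  return 0;
}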
-template < - typename TensorRefA, - typename TensorRefB, - typename TensorRefC, - typename ScalarType, - typename AccumulatorType, - typename OutputTile, - typename InnerProductOp, - typename ConvertOp -> -__global__ void Gemm( - gemm::GemmCoord problem_size, - ScalarType alpha, - TensorRefA tensor_a, - TensorRefB tensor_b, - ScalarType beta, - TensorRefC tensor_c, - TensorRefC tensor_d, - AccumulatorType initial_accum) { - - // Map each thread to a unique tile of the output matrix - MatrixCoord output_coord( - MatrixCoord::Index((threadIdx.x + blockIdx.x * blockDim.x) * OutputTile::kRow), - MatrixCoord::Index((threadIdx.y + blockIdx.y * blockDim.y) * OutputTile::kColumn) - ); - - // Compute the general matrix product - thread::Gemm< - TensorRefA, - TensorRefB, - TensorRefC, - ScalarType, - AccumulatorType, - OutputTile, - InnerProductOp, - ConvertOp - > gemm(initial_accum); - - gemm.multiply_add( - problem_size, - tensor_a, - tensor_b, - output_coord); - - gemm.epilogue(problem_size, alpha, beta, tensor_c, tensor_d, output_coord); -} - -//////////////////////////////////////////////////////////////////////////////////////////////////// - -/// Computes a general matrix product among matrices (tensors of rank=2) pointed to by TensorRef -/// objects. -template < - typename TensorRefCollectionA, - typename TensorRefCollectionB, - typename TensorRefCollectionC, - typename ScalarType, - typename AccumulatorType, - typename OutputTile, - typename InnerProductOp, - typename ConvertOp -> -__global__ void BatchedGemm( - gemm::GemmCoord problem_size, - ScalarType alpha, - TensorRefCollectionA tensor_collection_a, - TensorRefCollectionB tensor_collection_b, - ScalarType beta, - TensorRefCollectionC tensor_collection_c, - AccumulatorType initial_accum) { - - // Obtain batch ID - int batch_id = blockIdx.z; - - // Dereference based on batch_id - typename TensorRefCollectionA::TensorRef tensor_a = tensor_collection_a.at(batch_id); - typename TensorRefCollectionB::TensorRef tensor_b = tensor_collection_b.at(batch_id); - typename TensorRefCollectionC::TensorRef tensor_c = tensor_collection_c.at(batch_id); - - // Map each thread to a unique tile of the output matrix - MatrixCoord output_coord( - (threadIdx.x + blockIdx.x * blockDim.x) * OutputTile::kColumn, - (threadIdx.y + blockIdx.y * blockDim.y) * OutputTile::kRow - ); - - // Compute the general matrix product - thread::Gemm< - typename TensorRefCollectionA::TensorRef, - typename TensorRefCollectionB::TensorRef, - typename TensorRefCollectionC::TensorRef, - ScalarType, - AccumulatorType, - OutputTile, - InnerProductOp, - ConvertOp - > gemm(initial_accum); - - gemm.multiply_add( - problem_size, - tensor_a, - tensor_b, - output_coord); - - gemm.epilogue(problem_size, alpha, beta, tensor_c, output_coord); -} - -//////////////////////////////////////////////////////////////////////////////////////////////////// - -} // namespace kernel -} // namespace device -} // namespace reference -} // namespace cutlass diff --git a/csrc/sparse/cutlass/example/util/reference/device/kernel/tensor_elementwise.h b/csrc/sparse/cutlass/example/util/reference/device/kernel/tensor_elementwise.h deleted file mode 100644 index c703f07f78a24..0000000000000 --- a/csrc/sparse/cutlass/example/util/reference/device/kernel/tensor_elementwise.h +++ /dev/null @@ -1,168 +0,0 @@ -/*************************************************************************************************** - * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
- * SPDX-License-Identifier: BSD-3-Clause - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * - * 1. Redistributions of source code must retain the above copyright notice, this - * list of conditions and the following disclaimer. - * - * 2. Redistributions in binary form must reproduce the above copyright notice, - * this list of conditions and the following disclaimer in the documentation - * and/or other materials provided with the distribution. - * - * 3. Neither the name of the copyright holder nor the names of its - * contributors may be used to endorse or promote products derived from - * this software without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" - * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE - * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE - * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL - * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR - * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER - * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, - * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - * - **************************************************************************************************/ - -#pragma once - -#include - -#include "cutlass/cutlass.h" - -namespace cutlass { -namespace reference { -namespace device { -namespace kernel { - -//////////////////////////////////////////////////////////////////////////////////////////////////// - -/// Kernel to initialize tensor to uniform random distribution -template -__global__ void TensorInitializeUniform( - Distribution dist, int64_t seed, int dim_contiguous, int dim_strided, T *tensor, int ldm) { - __shared__ curandState_t rng_state[1024]; - - uint64_t gtid = threadIdx.x + blockIdx.x * blockDim.x + blockIdx.y * gridDim.x * blockDim.x; - - curand_init(seed, gtid, 0, &rng_state[threadIdx.x]); - - int c_idx = blockIdx.x * blockDim.x + threadIdx.x; - int s_idx = blockIdx.y * blockDim.x; - - tensor += s_idx * ldm + c_idx; - - for (int s_offset = 0; s_offset < blockDim.x; ++s_offset, ++s_idx) { - if (s_idx < dim_strided && c_idx < dim_contiguous) { - double range = dist.uniform.max - dist.uniform.min; - - double rnd = curand_uniform(&rng_state[threadIdx.x]); - - rnd = dist.uniform.min + range * rnd; - - // Random values are cast to integer after scaling by a power of two to facilitate error - // testing - if (dist.int_scale >= 0) { - rnd = double(int(rnd * double(1 << dist.int_scale))); - *tensor = T(rnd / double(1 << dist.int_scale)); - } else { - *tensor = T(rnd); - } - - tensor += ldm; - } - } -} - -/////////////////////////////////////////////////////////////////////////////////////////////////// - -/// Kernel to initialize tensor to uniform distribution -template -__global__ void TensorInitializeGaussian( - Distribution dist, int64_t seed, int dim_contiguous, int dim_strided, T *tensor, int ldm) { - __shared__ curandState_t rng_state[1024]; - - uint64_t gtid = threadIdx.x + blockIdx.x * blockDim.x + blockIdx.y * gridDim.x * blockDim.x; - - curand_init(seed, gtid, 0, 
&rng_state[threadIdx.x]); - - int c_idx = blockIdx.x * blockDim.x + threadIdx.x; - int s_idx = blockIdx.y * blockDim.x; - - tensor += s_idx * ldm + c_idx; - - for (int s_offset = 0; s_offset < blockDim.x; ++s_offset, ++s_idx) { - if (s_idx < dim_strided && c_idx < dim_contiguous) { - // Random values are cast to integer after scaling by a power of two to facilitate error - // testing - - double rnd = curand_normal(&rng_state[threadIdx.x]); - - rnd = dist.gaussian.mean + dist.gaussian.stddev * rnd; - - if (dist.int_scale >= 0) { - rnd = double(int(rnd * double(1 << dist.int_scale))); - *tensor = T(rnd / double(1 << dist.int_scale)); - } else { - *tensor = T(rnd); - } - } - } -} - -/// Kernel to initialize tensor to an identity matrix -template -__global__ void TensorInitializeLinear( - Distribution dist, int64_t seed, int dim_contiguous, int dim_strided, T *tensor, int ldm) { - __shared__ curandState_t rng_state[1024]; - - uint64_t gtid = threadIdx.x + blockIdx.x * blockDim.x + blockIdx.y * gridDim.x * blockDim.x; - - curand_init(seed, gtid, 0, &rng_state[threadIdx.x]); - - int c_idx = blockIdx.x * blockDim.x + threadIdx.x; - int s_idx = blockIdx.y * blockDim.x; - - tensor += s_idx * ldm + c_idx; - - for (int s_offset = 0; s_offset < blockDim.x; ++s_offset, ++s_idx) { - if (s_idx < dim_strided && c_idx < dim_contiguous) { - *tensor = - dist.linear.offset + dist.linear.delta_row * c_idx + dist.linear.delta_column * s_idx; - } - } -} - -/// Kernel to initialize tensor to an identity matrix -template -__global__ void TensorInitializeIdentity( - Distribution dist, int64_t seed, int dim_contiguous, int dim_strided, T *tensor, int ldm) { - __shared__ curandState_t rng_state[1024]; - - uint64_t gtid = threadIdx.x + blockIdx.x * blockDim.x + blockIdx.y * gridDim.x * blockDim.x; - - curand_init(seed, gtid, 0, &rng_state[threadIdx.x]); - - int c_idx = blockIdx.x * blockDim.x + threadIdx.x; - int s_idx = blockIdx.y * blockDim.x; - - tensor += s_idx * ldm + c_idx; - - for (int s_offset = 0; s_offset < blockDim.x; ++s_offset, ++s_idx) { - if (s_idx < dim_strided && c_idx < dim_contiguous) { - *tensor = (c_idx == s_idx ? T(1) : T(0)); - } - } -} - -//////////////////////////////////////////////////////////////////////////////////////////////////// - -} // namespace kernel -} // namespace device -} // namespace reference -} // namespace cutlass diff --git a/csrc/sparse/cutlass/example/util/reference/device/kernel/tensor_foreach.h b/csrc/sparse/cutlass/example/util/reference/device/kernel/tensor_foreach.h deleted file mode 100644 index a64a419d8a193..0000000000000 --- a/csrc/sparse/cutlass/example/util/reference/device/kernel/tensor_foreach.h +++ /dev/null @@ -1,159 +0,0 @@ -/*************************************************************************************************** - * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. - * SPDX-License-Identifier: BSD-3-Clause - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * - * 1. Redistributions of source code must retain the above copyright notice, this - * list of conditions and the following disclaimer. - * - * 2. Redistributions in binary form must reproduce the above copyright notice, - * this list of conditions and the following disclaimer in the documentation - * and/or other materials provided with the distribution. - * - * 3. 
Neither the name of the copyright holder nor the names of its - * contributors may be used to endorse or promote products derived from - * this software without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" - * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE - * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE - * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL - * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR - * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER - * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, - * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - * - **************************************************************************************************/ - -#pragma once - -#include "cutlass/cutlass.h" -#include "cutlass/coord.h" -#include "cutlass/subbyte_reference.h" -#include "cutlass/fast_math.h" - -namespace cutlass { -namespace reference { -namespace device { -namespace kernel { - -/////////////////////////////////////////////////////////////////////////////////////////////////// - -/// Defines several helpers -namespace detail { - -/// Helper to perform for-each operation -template -struct TensorForEachHelper { - - /// Constructor for general rank - __inline__ __device__ - TensorForEachHelper(Func &func, Coord const &size, Coord &coord, int64_t index) { - - int64_t product = 1; - - CUTLASS_PRAGMA_UNROLL - for (int i = Rank - RankRemaining; i < Rank; ++i) { - product *= size[i]; - } - - coord[Rank - 1 - RankRemaining] = index / product; - int64_t remaining = index % product; - - TensorForEachHelper(func, size, coord, remaining); - } -}; - -/// Helper to perform for-each operation -template -struct TensorForEachHelper { - - /// Constructor for fastest changing rank - __inline__ __device__ - TensorForEachHelper(Func &func, Coord const &size, Coord &coord, int64_t index) { - - coord[Rank - 1] = index; - - if (coord < size) { - func(coord); - } - } -}; - -} // namespace detail - -/////////////////////////////////////////////////////////////////////////////////////////////////// - -/// Kernel calls a functor for each element in a tensor's index space -template -__global__ void TensorForEach(Coord size, Params params = Params()) { - - Func func(params); - - int64_t index = threadIdx.x + blockIdx.x * blockDim.x; - int64_t max_index = 1; - - CUTLASS_PRAGMA_UNROLL - for (int i = 0; i < Rank; ++i) { - max_index *= size[i]; - } - - CUTLASS_PRAGMA_NO_UNROLL - while (index < max_index) { - Coord coord; - - detail::TensorForEachHelper(func, size, coord, index); - index += blockDim.x * gridDim.x; - } -} - -/////////////////////////////////////////////////////////////////////////////////////////////////// - -/// Kernel calls a functor for each element along a tensor's diagonal -template -__global__ void TensorDiagonalForEach(Coord size, Params params, int start, int end) { - - Func func(params); - - int64_t index = threadIdx.x + blockIdx.x * blockDim.x + start; - - if (index < end) { - Coord coord; - - CUTLASS_PRAGMA_UNROLL - for (int i = 0; i < Rank; ++i) { - coord[i] = index; - } - - func(coord); - } -} - 
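[Editor's note] The TensorForEach machinery above recovers an N-dimensional coordinate from a flat thread index by repeatedly dividing by the product of the extents of the remaining (faster-varying) ranks, most-significant rank first. The standalone sketch below shows that decomposition on the host; index_to_coord is a hypothetical name, not the CUTLASS helper.

// Illustrative sketch of the index-to-coordinate decomposition (not CUTLASS code).
#include <cstdint>
#include <cstdio>

void index_to_coord(int64_t index, const int64_t* extent, int rank, int64_t* coord) {
  for (int r = 0; r < rank; ++r) {
    int64_t product = 1;
    for (int i = r + 1; i < rank; ++i) {
      product *= extent[i];            // elements spanned by the remaining ranks
    }
    coord[r] = index / product;
    index %= product;
  }
}

int main() {
  const int64_t extent[3] = {2, 3, 4}; // a rank-3 index space
  int64_t coord[3];
  index_to_coord(17, extent, 3, coord);
  std::printf("(%lld, %lld, %lld)\n",
              (long long)coord[0], (long long)coord[1], (long long)coord[2]);  // (1, 1, 1)
  return 0;
}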
-/////////////////////////////////////////////////////////////////////////////////////////////////// - -template -__global__ void BlockForEach( - Element *ptr, - size_t capacity, - typename Func::Params params) { - - Func func(params); - - size_t index = threadIdx.x + blockIdx.x * blockDim.x; - - for (; index < capacity; index += blockDim.x * gridDim.x) { - ReferenceFactory::get(ptr, index) = func(); - } -} - -/////////////////////////////////////////////////////////////////////////////////////////////////// - -} // namespace kernel -} // namespace device -} // namespace reference -} // namespace cutlass - diff --git a/csrc/sparse/cutlass/example/util/reference/device/rank_2k_complex.h b/csrc/sparse/cutlass/example/util/reference/device/rank_2k_complex.h deleted file mode 100644 index d5892457ca942..0000000000000 --- a/csrc/sparse/cutlass/example/util/reference/device/rank_2k_complex.h +++ /dev/null @@ -1,355 +0,0 @@ -/*************************************************************************************************** - * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. - * SPDX-License-Identifier: BSD-3-Clause - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * - * 1. Redistributions of source code must retain the above copyright notice, this - * list of conditions and the following disclaimer. - * - * 2. Redistributions in binary form must reproduce the above copyright notice, - * this list of conditions and the following disclaimer in the documentation - * and/or other materials provided with the distribution. - * - * 3. Neither the name of the copyright holder nor the names of its - * contributors may be used to endorse or promote products derived from - * this software without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" - * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE - * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE - * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL - * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR - * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER - * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, - * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - * - **************************************************************************************************/ -/*! \file - \brief Reference implementation for complex-valued GEMM in device-side code. -*/ - -#pragma once - -#include "cutlass/blas3.h" -#include "cutlass/complex.h" -#include "cutlass/numeric_conversion.h" -#include "cutlass/tensor_view.h" -#include "cutlass/gemm/gemm.h" - -namespace cutlass { -namespace reference { -namespace device { - -//////////////////////////////////////////////////////////////////////////////////////////////////// - -namespace kernel { - -/// Computes a general matrix product among matrices (tensors of rank=2) pointed to by TensorRef -/// objects. -/// -/// Explicitly naming types needed by this template can be cumbersome, particularly for the -/// accumulator type, so a function argument 'initial_accum' is exposed. 
Passing -/// AccumulatorType(0) as the last function argument can be easier than naming all template -/// arguments explicitly. -template < - typename ElementA, - typename LayoutA, - typename ElementB, - typename LayoutB, - typename ElementC, - typename LayoutC, - typename ScalarType, - typename ComputeType, - typename ConvertOp = NumericConverter, - typename InnerProductOp = multiply_add, - int kMblock = 4, - int kNblock = 4 -> -__global__ void Rank2KComplex( - gemm::GemmCoord problem_size, - ScalarType alpha, - TensorRef tensor_a, - ComplexTransform transform_a, - TensorRef tensor_b, - ComplexTransform transform_b, - ScalarType beta, - TensorRef tensor_c, - TensorRef tensor_d, - ComputeType initial_accum, - FillMode fill_mode_c, - BlasMode blas_mode, - int batch_count = 1, - int64_t batch_stride_A = 0, - int64_t batch_stride_B = 0, - int64_t batch_stride_C = 0, - int64_t batch_stride_D = 0) { - - static_assert( - LayoutA::kRank == 2 && - LayoutB::kRank == 2 && - LayoutC::kRank == 2, "Tensors must be of rank 2"); - - int const M = problem_size.m(); - int const N = problem_size.n(); - int const K = problem_size.k(); - - assert(M=N); - - ConvertOp convert_op; - InnerProductOp inner_product_op; - - int row_block = (blockIdx.x * blockDim.x + threadIdx.x) * kMblock; - int col_block = (blockIdx.y * blockDim.y + threadIdx.y) * kNblock; - int batch_idx = blockIdx.z; - - tensor_a.add_pointer_offset(batch_idx * batch_stride_A); - tensor_b.add_pointer_offset(batch_idx * batch_stride_B); - tensor_c.add_pointer_offset(batch_idx * batch_stride_C); - tensor_d.add_pointer_offset(batch_idx * batch_stride_D); - - for (; batch_idx < batch_count; batch_idx += gridDim.z) { - - // Compute matrix product using blocks - ComputeType accum[kMblock][kNblock]; - - CUTLASS_PRAGMA_UNROLL - for (int j = 0; j < kNblock; j++) { - CUTLASS_PRAGMA_UNROLL - for (int i = 0; i < kMblock; i++) { - accum[i][j] = initial_accum; - } - } - - for (int k_block = 0; k_block < K; ++k_block) { - CUTLASS_PRAGMA_UNROLL - for (int j = 0; j < kNblock; j++) { - CUTLASS_PRAGMA_UNROLL - for (int i = 0; i < kMblock; i++) { - int row = row_block + i; - int col = col_block + j; - - if (row < M && col < N && - ( (fill_mode_c == FillMode::kLower && row >= col) || - (fill_mode_c == FillMode::kUpper && row <= col) ) - ) { - - // A x B^T (Symmetric) or A x B^H (Hermitian) - // complex conjugation on operandB (b_t) is function of blas3 computation - ElementA a = tensor_a.at(MatrixCoord(row, k_block)); - ElementB b_t = (blas_mode == BlasMode::kHermitian) ? - conj(tensor_b.at(MatrixCoord(col, k_block))) : - tensor_b.at(MatrixCoord(col, k_block)); - - ComputeType a_ik = ComputeType(a); - ComputeType b_jk = ComputeType(b_t); - - // complex conjugation is a function of operand layouts - if (transform_a == ComplexTransform::kConjugate) { - a_ik = conj(a_ik); - } - // complex conjugation is a function of operand layouts - if (transform_b == ComplexTransform::kConjugate) { - b_jk = conj(b_jk); - } - - accum[i][j] = inner_product_op(a_ik, b_jk, accum[i][j]); - - // B x A^T (Symmetric) or B x A^H (Hermitian) - // complex conjugation on operandB (a_t) is function of blas3 computation - ElementB b = tensor_b.at(MatrixCoord(row, k_block)); - ElementA a_t = (blas_mode == BlasMode::kHermitian) ? 
- conj(tensor_a.at(MatrixCoord(col, k_block))): - tensor_a.at(MatrixCoord(col, k_block)); - - ComputeType b_ik = ComputeType(b); - ComputeType a_jk = ComputeType(a_t); - - // complex conjugation here is a function of operand layouts - if (transform_b == ComplexTransform::kConjugate) { - b_ik = conj(b_ik); - } - // complex conjugation here is a function of operand layouts - if (transform_a == ComplexTransform::kConjugate) { - a_jk = conj(a_jk); - } - - accum[i][j] = inner_product_op(a_ik, b_kj, accum[i][j]); - } - } - } - } - - CUTLASS_PRAGMA_UNROLL - for (int j = 0; j < kNblock; j++) { - CUTLASS_PRAGMA_UNROLL - for (int i = 0; i < kMblock; i++) { - int row = row_block + i; - int col = col_block + j; - - MatrixCoord coord = MatrixCoord(row, col); - - if (row < M && col < N && - ((fill_mode_c == FillMode::kLower && row >= col) || - (fill_mode_c == FillMode::kUpper && row <= col)) - ) { - - ScalarType c = tensor_c.at(coord); - // The imaginary parts of the diagonal elements of - // a complex data type are assumed and set to zero - if (blas_mode == BlasMode::kHermitian) { - c = (row == col) ? real(c) : c; - } - - tensor_d.at(coord) = convert_op( - alpha * ScalarType(accum[i][j]) + - beta * c); - } - } - } - - tensor_a.add_pointer_offset(batch_stride_A * gridDim.z); - tensor_b.add_pointer_offset(batch_stride_B * gridDim.z); - tensor_c.add_pointer_offset(batch_stride_C * gridDim.z); - tensor_d.add_pointer_offset(batch_stride_D * gridDim.z); - - } // for (batch_idx) -} - -} // namespace kernel - -//////////////////////////////////////////////////////////////////////////////////////////////////// - -/// Computes a general matrix product among matrices (tensors of rank=2) pointed to by TensorRef -/// objects. -/// -/// Explicitly naming types needed by this template can be cumbersome, particularly for the -/// accumulator type, so a function argument 'initial_accum' is exposed. Passing -/// AccumulatorType(0) as the last function argument can be easier than naming all template -/// arguments explicitly. 
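The launcher that follows assigns each thread a kMblock x kNblock tile of the output, so the grid has to be the ceiling of M and N divided by the block dimensions times the per-thread tile. A small host-side sketch of that launch arithmetic, assuming the same 16x8 thread block and 4x4 tile used below (the problem size is made up for illustration):

#include <cstdio>

// Ceiling division: how many groups of `divisor` are needed to cover `value`.
static int ceil_div(int value, int divisor) {
  return (value + divisor - 1) / divisor;
}

int main() {
  // Assumed problem size, illustration only.
  const int M = 1000, N = 1000;

  // Same tiling as the launcher: 16x8 threads, each owning a 4x4 tile of C.
  const int block_x = 16, block_y = 8;
  const int kMblock = 4, kNblock = 4;

  const int grid_x = ceil_div(M, block_x * kMblock);  // ceil(1000 / 64)  = 16
  const int grid_y = ceil_div(N, block_y * kNblock);  // ceil(1000 / 32)  = 32

  std::printf("grid = (%d, %d), covering %d x %d elements\n",
              grid_x, grid_y,
              grid_x * block_x * kMblock, grid_y * block_y * kNblock);
  return 0;
}

The kernel's own row/col bounds checks then discard the overhang beyond M x N.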
-template < - typename ElementA, - typename LayoutA, - typename ElementB, - typename LayoutB, - typename ElementC, - typename LayoutC, - typename ScalarType, - typename ComputeType, - typename ConvertOp = NumericConverter, - typename InnerProductOp = multiply_add -> -void Rank2KComplex( - gemm::GemmCoord problem_size, - ScalarType alpha, - TensorRef tensor_a, - ComplexTransform transform_a, - TensorRef tensor_b, - ComplexTransform transform_b, - ScalarType beta, - TensorRef tensor_c, - TensorRef tensor_d, - ComputeType initial_accum, - FillMode fill_mode_c, - BlasMode blas_mode, - int batch_count = 1, - int64_t batch_stride_A = 0, - int64_t batch_stride_B = 0, - int64_t batch_stride_C = 0, - int64_t batch_stride_D = 0) { - - static_assert( - LayoutA::kRank == 2 && - LayoutB::kRank == 2 && - LayoutC::kRank == 2, "Tensors must be of rank 2"); - - int const kMblock = 4; - int const kNblock = 4; - - dim3 block(16, 8); - dim3 grid( - (problem_size.m() + block.x * kMblock - 1) / (block.x * kMblock), - (problem_size.n() + block.y * kNblock - 1) / (block.y * kNblock), - batch_count % std::numeric_limits::max() - ); - - kernel::Rank2KComplex< - ElementA, - LayoutA, - ElementB, - LayoutB, - ElementC, - LayoutC, - ScalarType, - ComputeType, - ConvertOp, - InnerProductOp, - kMblock, - kNblock - ><<< grid, block >>>( - problem_size, - alpha, - tensor_a, - transform_a, - tensor_b, - transform_b, - beta, - tensor_c, - tensor_d, - initial_accum, - fill_mode_c, - blas_mode, - batch_count, - batch_stride_A, - batch_stride_B, - batch_stride_C, - batch_stride_D - ); -} - -//////////////////////////////////////////////////////////////////////////////////////////////////// - -/// Computes a general matrix product among matrices (tensors of rank=2) pointed to by TensorRef -/// objects. -/// -/// This assumes the accumulator type is the same type as the scalars. -template < - typename ElementA, - typename LayoutA, - typename ElementB, - typename LayoutB, - typename ElementC, - typename LayoutC, - typename ScalarType -> -void Rank2KComplex( - gemm::GemmCoord problem_size, - ScalarType alpha, - TensorRef tensor_a, - ComplexTransform transform_a, - TensorRef tensor_b, - ComplexTransform transform_b, - ScalarType beta, - TensorRef tensor_c, - TensorRef tensor_d, - FillMode fill_mode_c, - BlasMode blas_mode) { - - Rank2KComplex( - problem_size, alpha, - tensor_a, transform_a, - tensor_b, transform_b, - beta, tensor_c, tensor_d, - ScalarType(0), - fill_mode_c, - blas_mode); -} - -//////////////////////////////////////////////////////////////////////////////////////////////////// - -} // namespace device -} // namespace reference -} // namespace cutlass diff --git a/csrc/sparse/cutlass/example/util/reference/device/tensor_compare.h b/csrc/sparse/cutlass/example/util/reference/device/tensor_compare.h deleted file mode 100644 index e6b36990f0f1a..0000000000000 --- a/csrc/sparse/cutlass/example/util/reference/device/tensor_compare.h +++ /dev/null @@ -1,246 +0,0 @@ -/*************************************************************************************************** - * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. - * SPDX-License-Identifier: BSD-3-Clause - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * - * 1. Redistributions of source code must retain the above copyright notice, this - * list of conditions and the following disclaimer. - * - * 2. 
Redistributions in binary form must reproduce the above copyright notice, - * this list of conditions and the following disclaimer in the documentation - * and/or other materials provided with the distribution. - * - * 3. Neither the name of the copyright holder nor the names of its - * contributors may be used to endorse or promote products derived from - * this software without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" - * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE - * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE - * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL - * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR - * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER - * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, - * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - * - **************************************************************************************************/ -/* \file - \brief Defines host-side elementwise operations on TensorView. -*/ - -#pragma once -// Standard Library includes -#include - -// Cutlass includes -#include "cutlass/cutlass.h" -#include "cutlass/relatively_equal.h" - -#include "cutlass/util/distribution.h" - -#include "tensor_foreach.h" - -namespace cutlass { -namespace reference { -namespace device { - -/////////////////////////////////////////////////////////////////////////////////////////////////// - -namespace kernel { - -template -__global__ void BlockCompareEqual( - int *equal, - Element const *ptr_A, - Element const *ptr_B, - size_t capacity) { - - size_t idx = threadIdx.x + blockDim.x * blockIdx.x; - - for (; idx < capacity; idx += gridDim.x * blockDim.x) { - - Element a = cutlass::ReferenceFactory::get(ptr_A, idx); - Element b = cutlass::ReferenceFactory::get(ptr_B, idx); - - if (a != b) { - *equal = 0; - - return; - } - } -} - -template -__global__ void BlockCompareRelativelyEqual( - int *equal, - Element const *ptr_A, - Element const *ptr_B, - size_t capacity, - Element epsilon, - Element nonzero_floor) { - - size_t idx = threadIdx.x + blockDim.x * blockIdx.x; - - for (; idx < capacity; idx += gridDim.x * blockDim.x) { - - Element a = cutlass::ReferenceFactory::get(ptr_A, idx); - Element b = cutlass::ReferenceFactory::get(ptr_B, idx); - - if (!relatively_equal(a, b, epsilon, nonzero_floor)) { - *equal = 0; - return; - } - } -} - -} // namespace kernel - - -/////////////////////////////////////////////////////////////////////////////////////////////////// - -/// Performs a bit-level equality check between two blocks -template -bool BlockCompareEqual( - Element const *ptr_A, - Element const *ptr_B, - size_t capacity, - int grid_size = 0, - int block_size = 0) { - - int equal_flag = 1; - int *device_equal_flag = nullptr; - - if (cudaMalloc((void **)&device_equal_flag, sizeof(int)) != cudaSuccess) { - throw std::runtime_error("Failed to allocate device flag."); - } - - if (cudaMemcpy( - device_equal_flag, - &equal_flag, - sizeof(int), - cudaMemcpyHostToDevice) != cudaSuccess) { - - throw std::runtime_error("Failed to copy equality flag to device."); - } - - if (!grid_size || !block_size) { - - // if grid_size or 
block_size are zero, query occupancy using the CUDA Occupancy API - cudaError_t result = cudaOccupancyMaxPotentialBlockSize( - &grid_size, - &block_size, - reinterpret_cast(kernel::BlockCompareEqual)); - - if (result != cudaSuccess) { - throw std::runtime_error("Failed to query occupancy."); - } - - // Limit block size. This has the effect of increasing the number of items processed by a - // single thread and reduces the impact of initialization overhead. - block_size = (block_size < 128 ? block_size : 128); - } - - dim3 grid(grid_size, 1, 1); - dim3 block(block_size, 1, 1); - - kernel::BlockCompareEqual<<< grid, block >>>(device_equal_flag, ptr_A, ptr_B, capacity); - - if (cudaMemcpy( - &equal_flag, - device_equal_flag, - sizeof(int), - cudaMemcpyDeviceToHost) != cudaSuccess) { - - cudaFree(device_equal_flag); - - throw std::runtime_error("Failed to copy equality flag from device."); - } - - cudaFree(device_equal_flag); - - return equal_flag; -} - -/////////////////////////////////////////////////////////////////////////////////////////////////// - -/// Performs a bit-level equality check between two blocks -template -bool BlockCompareRelativelyEqual( - Element const *ptr_A, - Element const *ptr_B, - size_t capacity, - Element epsilon, - Element nonzero_floor, - int grid_size = 0, - int block_size = 0) { - - int equal_flag = 1; - int *device_equal_flag = nullptr; - - if (cudaMalloc((void **)&device_equal_flag, sizeof(int)) != cudaSuccess) { - throw std::runtime_error("Failed to allocate device flag."); - } - - if (cudaMemcpy( - device_equal_flag, - &equal_flag, - sizeof(int), - cudaMemcpyHostToDevice) != cudaSuccess) { - - throw std::runtime_error("Failed to copy equality flag to device."); - } - - if (!grid_size || !block_size) { - - // if grid_size or block_size are zero, query occupancy using the CUDA Occupancy API - cudaError_t result = cudaOccupancyMaxPotentialBlockSize( - &grid_size, - &block_size, - reinterpret_cast(kernel::BlockCompareRelativelyEqual)); - - if (result != cudaSuccess) { - throw std::runtime_error("Failed to query occupancy."); - } - - // Limit block size. This has the effect of increasing the number of items processed by a - // single thread and reduces the impact of initialization overhead. - block_size = (block_size < 128 ? block_size : 128); - } - - dim3 grid(grid_size, 1, 1); - dim3 block(block_size, 1, 1); - - kernel::BlockCompareRelativelyEqual<<< grid, block >>>( - device_equal_flag, - ptr_A, - ptr_B, - capacity, - epsilon, - nonzero_floor - ); - - if (cudaMemcpy( - &equal_flag, - device_equal_flag, - sizeof(int), - cudaMemcpyDeviceToHost) != cudaSuccess) { - - cudaFree(device_equal_flag); - - throw std::runtime_error("Failed to copy equality flag from device."); - } - - cudaFree(device_equal_flag); - - return equal_flag; -} - -/////////////////////////////////////////////////////////////////////////////////////////////////// - -} // device -} // reference -} // cutlass diff --git a/csrc/sparse/cutlass/example/util/reference/device/tensor_fill.h b/csrc/sparse/cutlass/example/util/reference/device/tensor_fill.h deleted file mode 100644 index 13aedf14d113f..0000000000000 --- a/csrc/sparse/cutlass/example/util/reference/device/tensor_fill.h +++ /dev/null @@ -1,2077 +0,0 @@ -/*************************************************************************************************** - * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
- * SPDX-License-Identifier: BSD-3-Clause - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * - * 1. Redistributions of source code must retain the above copyright notice, this - * list of conditions and the following disclaimer. - * - * 2. Redistributions in binary form must reproduce the above copyright notice, - * this list of conditions and the following disclaimer in the documentation - * and/or other materials provided with the distribution. - * - * 3. Neither the name of the copyright holder nor the names of its - * contributors may be used to endorse or promote products derived from - * this software without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" - * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE - * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE - * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL - * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR - * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER - * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, - * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - * - **************************************************************************************************/ -/* \file - \brief Defines device-side elementwise operations on TensorView. Note, the operations defined - in this header are not specialized for any particular data layout and are therefore not - intended to offer the best possible performance. Rather, they are intended to be generic - reference implementations to support the CUTLASS unit tests. 
-*/ - -#pragma once - -#if !defined(__CUDACC_RTC__) - -// Standard Library includes -#include -#include -#include -#include -#include - -#endif - -// CUDA includes -#include - -// Cutlass includes -#include "cutlass/cutlass.h" -#include "cutlass/array.h" -#include "cutlass/complex.h" -#include "cutlass/tensor_view.h" -#include "cutlass/blas3.h" -#include "cutlass/numeric_types.h" - -#include "cutlass/layout/vector.h" - -#include "cutlass/util/reference/device/tensor_foreach.h" -#include "cutlass/util/distribution.h" - -/////////////////////////////////////////////////////////////////////////////////////////////////// - -namespace cutlass { -namespace reference { -namespace device { - -/////////////////////////////////////////////////////////////////////////////////////////////////// -/////////////////////////////////////////////////////////////////////////////////////////////////// - -namespace detail { - -template -CUTLASS_DEVICE -FloatType random_normal_float(curandState_t *state) { - return curand_normal(state); -} - -template <> -CUTLASS_DEVICE -double random_normal_float(curandState_t *state) { - return curand_normal_double(state); -} - -template -CUTLASS_DEVICE -FloatType random_uniform_float(curandState_t *state) { - return curand_uniform(state); -} - -template <> -CUTLASS_DEVICE -double random_uniform_float(curandState_t *state) { - return curand_uniform_double(state); -} - -template -struct RandomGaussianFunc { - - using FloatType = typename std::conditional<(sizeof(Element) > 4), double, float>::type; - using IntType = typename std::conditional<(sizeof(Element) > 4), int64_t, int>::type; - - /// Parameters structure - struct Params { - - // - // Data members - // - - uint64_t seed; - FloatType mean; - FloatType stddev; - int int_scale; - FloatType float_scale_up; - FloatType float_scale_down; - int exclude_zero; ///< If non-negative, excludes zeros - - // - // Methods - // - - /// Construction of Gaussian RNG functor. 
- Params( - uint64_t seed_ = 0, - Element mean_ = 0, - Element stddev_ = 1, - int int_scale_ = -1, - int exclude_zero_ = -1 - ): - seed(seed_), - mean(static_cast(mean_)), - stddev(static_cast(stddev_)), - int_scale(int_scale_), - exclude_zero(exclude_zero_) { - - float_scale_up = FloatType(IntType(2) << int_scale); // scale up to clamp low order bits - float_scale_down = FloatType(1) / FloatType(IntType(2) << int_scale); - } - }; - - // - // Data members - // - - /// Parameters object - Params params; - - /// RNG state object - curandState_t rng_state; - - // - // Methods - // - - /// Device-side initialization of RNG - CUTLASS_DEVICE - RandomGaussianFunc(Params const ¶ms): params(params) { - - uint64_t gtid = threadIdx.x + blockIdx.x * blockDim.x; - - curand_init(params.seed, gtid, 0, &rng_state); - } - - /// Compute random value and update RNG state - CUTLASS_DEVICE - Element operator()() { - - FloatType rnd = random_normal_float(&rng_state); - rnd = params.mean + params.stddev * rnd; - - Element result; - if (params.int_scale >= 0) { - rnd = FloatType(IntType(std::llround(rnd * params.float_scale_up))); - result = Element(IntType(rnd * params.float_scale_down)); - } - else { - result = Element(rnd); - } - - if (params.exclude_zero >=0 && result == Element(0.0)) { - if (rnd > FloatType(0)) { - rnd += FloatType(1); - } else { - rnd -= FloatType(1); - } - result = Element(rnd); - } - - return result; - } -}; - - -template -struct RandomGaussianFunc> { - - using Element = complex; - using FloatType = typename std::conditional<(sizeof(Real) > 4), double, float>::type; - using IntType = typename std::conditional<(sizeof(Real) > 4), int64_t, int>::type; - - /// Parameters structure - struct Params { - - // - // Data members - // - - uint64_t seed; - FloatType mean; - FloatType stddev; - int int_scale; - FloatType float_scale_up; - FloatType float_scale_down; - int exclude_zero; ///< If non-negative, excludes zeros - - // - // Methods - // - - /// Construction of Gaussian RNG functor. 
- Params( - uint64_t seed_ = 0, - Real mean_ = 0, - Real stddev_ = 1, - int int_scale_ = -1, - int exclude_zero_ = -1 - ): - seed(seed_), - mean(static_cast(mean_)), - stddev(static_cast(stddev_)), - int_scale(int_scale_), - exclude_zero(exclude_zero_) { - - float_scale_up = FloatType(IntType(1) << int_scale); - float_scale_up += FloatType(0.5) * float_scale_up; - float_scale_down = FloatType(1) / FloatType(IntType(1) << int_scale); - } - }; - - // - // Data members - // - - /// Parameters object - Params params; - - /// RNG state object - curandState_t rng_state; - - // - // Methods - // - - /// Device-side initialization of RNG - CUTLASS_DEVICE - RandomGaussianFunc(Params const ¶ms): params(params) { - - uint64_t gtid = threadIdx.x + blockIdx.x * blockDim.x; - - curand_init(params.seed, gtid, 0, &rng_state); - } - - /// Compute random value and update RNG state - CUTLASS_DEVICE - Element operator()() { - - FloatType rnd_r = random_normal_float(&rng_state); - FloatType rnd_i = random_normal_float(&rng_state); - rnd_r = params.mean + params.stddev * rnd_r; - rnd_i = params.mean + params.stddev * rnd_i; - - Element result; - if (params.int_scale >= 0) { - rnd_r = FloatType(IntType(rnd_r * params.float_scale_up)); - rnd_i = FloatType(IntType(rnd_i * params.float_scale_down)); - - result = { - Real(rnd_r * params.float_scale_down), - Real(rnd_i * params.float_scale_down) - }; - } - else { - result = Element(Real(rnd_r), Real(rnd_i)); - } - - if (params.exclude_zero >= 0 && - result.real() == Real(0.0) && - result.imag() == Real(0.0)) { - - if (rnd_r > FloatType(0)) { - rnd_r += FloatType(1); - } else { - rnd_r -= FloatType(1); - } - result = Element(Real(rnd_r), Real(rnd_i)); - } - - return result; - } -}; - -/// Computes a random Gaussian distribution -template < - typename Element, ///< Element type - typename Layout> ///< Layout function -struct TensorFillRandomGaussianFunc { - - /// View type - using TensorView = TensorView; - - /// Scalar type - typedef typename TensorView::Element T; - - /// Coordinate in tensor's index space - typedef typename TensorView::TensorCoord TensorCoord; - - using RandomFunc = RandomGaussianFunc; - - /// Parameters structure - struct Params { - - // - // Data members - // - - TensorView view; - typename RandomFunc::Params random; - - // - // Methods - // - - /// Construction of Gaussian RNG functor. - Params( - TensorView view_ = TensorView(), - typename RandomFunc::Params random_ = typename RandomFunc::Params() - ): - view(view_), random(random_) { - - } - }; - - // - // Data members - // - - Params params; - RandomFunc random; - - // - // Methods - // - - /// Device-side initialization of RNG - CUTLASS_DEVICE - TensorFillRandomGaussianFunc(Params const ¶ms): params(params), random(params.random) { - - } - - /// Compute random value and update RNG state - CUTLASS_DEVICE - void operator()(TensorCoord const &coord) { - - params.view.at(coord) = random(); - } -}; - -} // namespace detail - -/////////////////////////////////////////////////////////////////////////////////////////////////// - -/// Fills a tensor with random values with a Gaussian distribution. 
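When a non-negative bits value is supplied, the fill functors scale each random draw up by a power of two, round to an integer, and scale back down, which zeroes the low-order fraction bits and appears intended to make reference comparisons less sensitive to rounding. A host-side sketch of that quantization step (the helper name is illustrative, not from the deleted header):

#include <cmath>
#include <cstdint>
#include <cstdio>

// Keep only the fraction bits controlled by `bits`, mirroring the
// scale-up / round / scale-down step in the random fill functors.
static float truncate_fraction(float x, int bits) {
  const float scale = static_cast<float>(int64_t{2} << bits);
  return static_cast<float>(std::llround(x * scale)) / scale;
}

int main() {
  const float x = 0.7071067f;
  const int choices[] = {0, 2, 4};
  for (int bits : choices) {
    std::printf("bits = %d -> %f\n", bits, truncate_fraction(x, bits));
  }
  return 0;
}

For x = 0.7071067 this prints 0.5, 0.75, and 0.71875: coarser grids for smaller bits.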
-template < - typename Element, ///< Element type - typename Layout> ///< Layout function -void TensorFillRandomGaussian( - TensorView view, ///< destination tensor - uint64_t seed, ///< seed for RNG - typename RealType::Type mean = Element(0), ///< Gaussian distribution's mean - typename RealType::Type stddev = Element(1), ///< Gaussian distribution's standard deviation - int bits = -1, ///< If non-negative, specifies number of fractional bits that - /// are not truncated to zero. Permits reducing precision of - /// data. - int exclude_zero = -1, ///< If non-negative, excludes zeros from tensor init - cudaStream_t stream = nullptr) { - - using RandomFunc = detail::RandomGaussianFunc; - using Func = detail::TensorFillRandomGaussianFunc; - using Params = typename Func::Params; - - TensorForEach( - view.extent(), - Params(view, typename RandomFunc::Params(seed, mean, stddev, bits, exclude_zero)), - /*grid_size*/0, /*block_size*/0, - stream - ); -} - -/////////////////////////////////////////////////////////////////////////////////////////////////// - -/// Fills a tensor with random values with a Gaussian distribution. -template ///< Element type -void BlockFillRandomGaussian( - Element *ptr, - size_t capacity, - uint64_t seed, ///< seed for RNG - typename RealType::Type mean, ///< Gaussian distribution's mean - typename RealType::Type stddev, ///< Gaussian distribution's standard deviation - int bits = -1, ///< If non-negative, specifies number of fractional bits that - /// are not truncated to zero. Permits reducing precision of - /// data. - cudaStream_t stream = nullptr) { - - using RandomFunc = detail::RandomGaussianFunc; - - typename RandomFunc::Params params(seed, mean, stddev, bits); - - BlockForEach(ptr, capacity, params, /*grid_size*/0, /*block_size*/0, stream); -} - -/////////////////////////////////////////////////////////////////////////////////////////////////// -/////////////////////////////////////////////////////////////////////////////////////////////////// - -namespace detail { - -/// Computes a random uniform distribution -template ///< Element type -struct RandomUniformFunc { - - using FloatType = typename std::conditional< - (sizeof(Element) > 4), - double, - float>::type; - - using IntType = typename std::conditional< - (sizeof(Element) > 4), - int64_t, - int>::type; - - /// Parameters structure - struct Params { - - // - // Data members - // - - uint64_t seed; - FloatType range; - FloatType max; - int int_scale; - double pnan; - FloatType float_scale_up; - FloatType float_scale_down; - int exclude_zero; ///< If non-negative, excludes zeros - - /// Default ctor - CUTLASS_HOST_DEVICE - Params() { } - - // - // Methods - // - - /// Construction of Gaussian RNG functor. - Params( - uint64_t seed_ = 0, - Element max_ = 1, - Element min = 0, - int int_scale_ = -1, - double pnan_ = 0, - int exclude_zero_ = -1 - ): - seed(seed_), - range(static_cast(max_) - static_cast(min)), - max(static_cast(max_)), - int_scale(int_scale_), - pnan(pnan_), - exclude_zero(exclude_zero_) { - - float_scale_up = FloatType(IntType(2) << int_scale); // scale up to clamp low order bits - float_scale_down = FloatType(1) / FloatType(IntType(2) << int_scale); - - // Handle cases where min = 0 or max = 0 for excluding zeros - if (exclude_zero >= 0) { - range = (min == Element(0)) ? range - FloatType(1): range; - max = (max_ == Element(0)) ? 
max - FloatType(1): max; - } - } - }; - - // - // Data members - // - - /// Parameters object - Params params; - - /// RNG state object - curandState_t rng_state; - - // - // Methods - // - - /// Device-side initialization of RNG - CUTLASS_DEVICE - RandomUniformFunc(Params const ¶ms): params(params) { - - uint64_t gtid = threadIdx.x + blockIdx.x * blockDim.x; - - curand_init(params.seed, gtid, 0, &rng_state); - } - - /// Compute random value and update RNG state - CUTLASS_DEVICE - Element operator()() { - - // Draw random float in [0.0, 1.0] to determine if element should be NaN. - if constexpr (std::numeric_limits::has_quiet_NaN) { - if (params.pnan > 0 && (curand_uniform(&rng_state) < (params.pnan))) { - return Element(NAN); - } - } - - FloatType rnd = random_uniform_float(&rng_state); - rnd = params.max - params.range * rnd; - - // Random values are cast to integer after scaling by a power of two to facilitate error - // testing - Element result; - - if (params.int_scale >= 0) { - rnd = FloatType(IntType(std::llround(rnd * params.float_scale_up))); - result = Element(IntType(rnd * params.float_scale_down)); - } - else { - result = Element(rnd); - } - - if (params.exclude_zero >=0 && result == Element(0.0)) { - if (rnd > FloatType(0)) { - rnd = std::min(params.max, rnd + FloatType(1)); - } else { - rnd = std::max((params.max - params.range), rnd - FloatType(1)); - } - result = Element(rnd); - } - - return result; - } -}; - -/// Computes a random Gaussian distribution -template -struct RandomUniformFunc> { - - using Element = complex; - - using FloatType = typename std::conditional< - (sizeof(Real) > 4), - double, - float>::type; - - using IntType = typename std::conditional< - (sizeof(Real) > 4), - int64_t, - int>::type; - - /// Parameters structure - struct Params { - - // - // Data members - // - - uint64_t seed; - FloatType range; - FloatType min; - int int_scale; - double pnan; - FloatType float_scale_up; - FloatType float_scale_down; - int exclude_zero; ///< If non-negative, excludes zeros - - /// Default ctor - CUTLASS_HOST_DEVICE - Params() { } - - // - // Methods - // - - /// Construction of Gaussian RNG functor. - Params( - uint64_t seed_ = 0, - FloatType max = 1, - FloatType min_ = 0, - int int_scale_ = -1, - double pnan_ = 0, - int exclude_zero_ = -1 - ): - seed(seed_), - range(static_cast(max - min_)), - min(static_cast(min_)), - int_scale(int_scale_), - pnan(pnan_), - exclude_zero(exclude_zero_) { - - float_scale_up = FloatType(IntType(1) << int_scale); - float_scale_up += FloatType(0.5) * float_scale_up; - float_scale_down = FloatType(1) / FloatType(IntType(1) << int_scale); - - // Handle cases where min = 0 or max = 0 for excluding zeros - if (exclude_zero >= 0) { - min = (min == FloatType(0)) ? min + FloatType(1): min; - range = (max == FloatType(0)) ? range - FloatType(1): range; - } - } - }; - - // - // Data members - // - - /// Parameters object - Params params; - - /// RNG state object - curandState_t rng_state; - - // - // Methods - // - - /// Device-side initialization of RNG - CUTLASS_DEVICE - RandomUniformFunc(Params const ¶ms): params(params) { - - uint64_t gtid = threadIdx.x + blockIdx.x * blockDim.x; - - curand_init(params.seed, gtid, 0, &rng_state); - } - - /// Compute random value and update RNG state - CUTLASS_DEVICE - Element operator()() { - - // Draw random float in [0.0, 1.0] to determine if element should be NaN. 
- if constexpr (std::numeric_limits::has_quiet_NaN) { - if (params.pnan > 0 && (curand_uniform(&rng_state) < (params.pnan))) { - return Element(Real(NAN), Real(NAN)); - } - } - - FloatType rnd_r = random_uniform_float(&rng_state); - FloatType rnd_i = random_uniform_float(&rng_state); - - rnd_r = params.min + params.range * rnd_r; - rnd_i = params.min + params.range * rnd_i; - - // Random values are cast to integer after scaling by a power of two to facilitate error - // testing - Element result; - - if (params.int_scale >= 0) { - rnd_r = FloatType(IntType(rnd_r * params.float_scale_up)); - rnd_i = FloatType(IntType(rnd_i * params.float_scale_up)); - - result = { - Real(rnd_r * params.float_scale_down), - Real(rnd_i * params.float_scale_down) - }; - } - else { - result = Element(Real(rnd_r), Real(rnd_i)); - } - - if (params.exclude_zero >= 0 && - result.real() == Real(0.0) && - result.imag() == Real(0.0)) { - - if (rnd_r > FloatType(0)) { - rnd_r = std::min(params.min + params.range, rnd_r + FloatType(1)); - } else { - rnd_r = std::max((params.min), rnd_r - FloatType(1)); - } - result = Element(Real(rnd_r), Real(rnd_i)); - } - - return result; - } -}; - -/// Computes a random uniform distribution -template < - typename Element, ///< Element type - typename Layout> ///< Layout function -struct TensorFillRandomUniformFunc { - - /// View type - using TensorView = TensorView; - - /// Scalar type - typedef typename TensorView::Element T; - - /// Coordinate in tensor's index space - typedef typename TensorView::TensorCoord TensorCoord; - - using RandomFunc = RandomUniformFunc; - - /// Parameters structure - struct Params { - - // - // Data members - // - - TensorView view; - typename RandomFunc::Params random; - - /// Default ctor - CUTLASS_HOST_DEVICE - Params() { } - - // - // Methods - // - - /// Construction of Gaussian RNG functor. - Params( - TensorView view_ = TensorView(), - typename RandomFunc::Params random_ = RandomFunc::Params() - ): - view(view_), random(random_) { - - } - }; - - // - // Data members - // - - Params params; - RandomFunc random; - - // - // Methods - // - - /// Device-side initialization of RNG - CUTLASS_DEVICE - TensorFillRandomUniformFunc(Params const ¶ms): params(params), random(params.random) { - } - - /// Compute random value and update RNG state - CUTLASS_DEVICE - void operator()(TensorCoord const &coord) { - - params.view.at(coord) = random(); - } -}; - -} // namespace detail - -/////////////////////////////////////////////////////////////////////////////////////////////////// - -/// Fills a tensor with random values with a uniform random distribution. -template < - typename Element, ///< Element type - typename Layout> ///< Layout function -void TensorFillRandomUniform( - TensorView view, ///< destination tensor - uint64_t seed, ///< seed for RNG - typename RealType::Type max = Element(1), ///< upper bound of distribution - typename RealType::Type min = Element(0), ///< lower bound for distribution - int bits = -1, ///< If non-negative, specifies number of fractional bits that - /// are not truncated to zero. Permits reducing precision of - /// data. - double pnan = 0, ///< Percentage of NaN elements. 
- int exclude_zero = -1, ///< If non-negative, excludes zeros from tensor init - cudaStream_t stream = nullptr) { - - using RandomFunc = detail::RandomUniformFunc; - using Func = detail::TensorFillRandomUniformFunc; - using Params = typename Func::Params; - - typename RandomFunc::Params random(seed, max, min, bits, pnan, exclude_zero); - - TensorForEach( - view.extent(), - Params(view, random), - /*grid_size*/0, /*block_size*/0, - stream - ); -} - -/////////////////////////////////////////////////////////////////////////////////////////////////// - -/// Fills a tensor with random values with a uniform random distribution. -template -void BlockFillRandomUniform( - Element *ptr, - size_t capacity, - uint64_t seed, ///< seed for RNG - typename RealType::Type max, ///< upper bound of distribution - typename RealType::Type min, ///< lower bound for distribution - int bits = -1, ///< If non-negative, specifies number of fractional bits that - /// are not truncated to zero. Permits reducing precision of - /// data. - double pnan = 0, ///< Percentage of NaN elements. - cudaStream_t stream = nullptr) { - - using RandomFunc = detail::RandomUniformFunc; - - typename RandomFunc::Params params(seed, max, min, bits, pnan); - - BlockForEach(ptr, capacity, params, /*grid_size*/0, /*block_size*/0, stream); -} - -/////////////////////////////////////////////////////////////////////////////////////////////////// -/////////////////////////////////////////////////////////////////////////////////////////////////// - -namespace detail { - -/// Computes a random sparse meta -template ///< Element type -struct RandomSparseMetaFunc { - - using FloatType = float; - - using IntType = int32_t; - - /// Parameters structure - struct Params { - - // - // Data members - // - - uint64_t seed; - FloatType range; - int MetaSizeInBits; - - /// Default ctor - CUTLASS_HOST_DEVICE - Params() { } - - // - // Methods - // - - /// Construction of Gaussian RNG functor. - Params( - uint64_t seed_ = 0, - int MetaSizeInBits_ = 2 - ): - seed(seed_), - MetaSizeInBits(MetaSizeInBits_) { - if (MetaSizeInBits_ == 2) { - range = 6; - } - else if (MetaSizeInBits_ == 4) { - range = 2; - } - else { - throw std::invalid_argument("Invalid MetaSizeInBits"); - } - } - }; - - // - // Data members - // - - /// Parameters object - Params params; - - /// RNG state object - curandState_t rng_state; - - // - // Methods - // - - /// Device-side initialization of RNG - CUTLASS_DEVICE - RandomSparseMetaFunc(Params const ¶ms): params(params) { - - uint64_t gtid = threadIdx.x + blockIdx.x * blockDim.x; - - curand_init(params.seed, gtid, 0, &rng_state); - } - - /// Compute random value and update RNG state - CUTLASS_DEVICE - Element operator()() { - Element FourToTwoMeta[6] = {0x4, 0x8, 0x9, 0xc, 0xd, 0xe}; - Element TwoToOneMeta[2] = {0x4, 0xe}; - - Element *MetaArray = - (params.MetaSizeInBits == 2) ? 
FourToTwoMeta : TwoToOneMeta; - - Element result = 0x0; - - CUTLASS_PRAGMA_UNROLL - for (int i = 0; i < cutlass::sizeof_bits::value / 4; ++i) { - FloatType rnd = random_uniform_float(&rng_state); - rnd = params.range * rnd; - Element meta = MetaArray[(int)rnd]; - - result = (Element)(result | ((Element)(meta << (i * 4)))); - } - - return result; - } -}; - -/// Computes a random Gaussian distribution -template < - typename Element, ///< Element type - typename Layout> ///< Layout function -struct TensorFillRandomSparseMetaFunc { - - /// View type - using TensorView = TensorView; - - /// Scalar type - typedef typename TensorView::Element T; - - /// Coordinate in tensor's index space - typedef typename TensorView::TensorCoord TensorCoord; - - using RandomFunc = RandomSparseMetaFunc; - - /// Parameters structure - struct Params { - - // - // Data members - // - - TensorView view; - typename RandomFunc::Params random; - - /// Default ctor - CUTLASS_HOST_DEVICE - Params() { } - - // - // Methods - // - - /// Construction of Gaussian RNG functor. - Params( - TensorView view_ = TensorView(), - typename RandomFunc::Params random_ = RandomFunc::Params() - ): - view(view_), random(random_) { - - } - }; - - // - // Data members - // - - Params params; - RandomFunc random; - - // - // Methods - // - - /// Device-side initialization of RNG - CUTLASS_DEVICE - TensorFillRandomSparseMetaFunc(Params const ¶ms): params(params), random(params.random) { - } - - /// Compute random value and update RNG state - CUTLASS_DEVICE - void operator()(TensorCoord const &coord) { - - params.view.at(coord) = random(); - } -}; - -} // namespace detail - -/////////////////////////////////////////////////////////////////////////////////////////////////// - -/// Fills a tensor with random values with a uniform random distribution. -template < - typename Element, ///< Element type - typename Layout> ///< Layout function -void TensorFillRandomSparseMeta( - TensorView view, ///< destination tensor - uint64_t seed, ///< seed for RNG - int MetaSizeInBits = 2, ///< meta data size - cudaStream_t stream = nullptr) { - - using RandomFunc = detail::RandomSparseMetaFunc; - using Func = detail::TensorFillRandomUniformFunc; - using Params = typename Func::Params; - - typename RandomFunc::Params random(seed, MetaSizeInBits); - - TensorForEach( - view.extent(), - Params(view, random), - /*grid_size*/0, /*block_size*/0, - stream - ); -} - -/////////////////////////////////////////////////////////////////////////////////////////////////// - -/// Fills a tensor with random values with a uniform random distribution. -template -void BlockFillRandomSparseMeta( - Element *ptr, - size_t capacity, - uint64_t seed, ///< seed for RNG - int MetaSizeInBits = 2, ///< meta data size - cudaStream_t stream = nullptr) { - - using RandomFunc = detail::RandomSparseMetaFunc; - - typename RandomFunc::Params params(seed, MetaSizeInBits); - - BlockForEach(ptr, capacity, params, /*grid_size*/0, /*block_size*/0, stream); -} - -/////////////////////////////////////////////////////////////////////////////////////////////////// -/////////////////////////////////////////////////////////////////////////////////////////////////// - -namespace detail { - -/// Functor to fill a tensor with zeros off the diagonal and a uniform value on the diagonal. 
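The sparse-meta generator above draws each 4-bit nibble from {0x4, 0x8, 0x9, 0xc, 0xd, 0xe} and packs several nibbles into one metadata word. Under the usual 2:4 structured-sparsity encoding, the low two bits and high two bits of a nibble give the two columns kept out of each group of four. A small host-side decoder illustrating that reading (a hypothetical helper, not part of the deleted code):

#include <cstdint>
#include <cstdio>

// Decode one 4-bit 2:4 metadata nibble into the two kept column indices.
// Low two bits hold the first index, high two bits the second.
static void decode_nibble(uint8_t nibble, int *first, int *second) {
  *first  = nibble & 0x3;
  *second = (nibble >> 2) & 0x3;
}

int main() {
  const uint8_t valid[6] = {0x4, 0x8, 0x9, 0xc, 0xd, 0xe};
  for (uint8_t nibble : valid) {
    int a = 0, b = 0;
    decode_nibble(nibble, &a, &b);
    std::printf("0x%x -> keep columns %d and %d of the 4-wide group\n", nibble, a, b);
  }
  return 0;
}

The six values are exactly the six ways to choose two distinct ordered positions out of four, which is why the generator's range is 6 for 2-bit metadata.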
-template < - typename Element, ///< Element type - typename Layout> ///< Layout function -struct TensorFillDiagonalFunc { - - /// View type - using TensorView = TensorView; - - /// Scalar type - typedef typename TensorView::Element T; - - /// Coordinate in tensor's index space - typedef typename TensorView::TensorCoord TensorCoord; - - /// Parameters structure - struct Params { - - // - // Data members - // - - TensorView view; - Element diag; - Element other; - - /// Default ctor - CUTLASS_HOST_DEVICE - Params() { } - - // - // Methods - // - - Params( - TensorView view_ = TensorView(), - Element diag_ = Element(1), - Element other_ = Element(0) - ): - view(view_), diag(diag_), other(other_) { - - } - }; - - // - // Data members - // - - /// Parameters object - Params params; - - // - // Methods - // - - /// Device-side initialization of RNG - CUTLASS_DEVICE - TensorFillDiagonalFunc(Params const ¶ms): params(params) { - - } - - /// Updates the tensor - CUTLASS_DEVICE - void operator()(TensorCoord const &coord) { - - bool is_diag = true; - - CUTLASS_PRAGMA_UNROLL - for (int i = 1; i < Layout::kRank; ++i) { - if (coord[i] != coord[i - 1]) { - is_diag = false; - break; - } - } - - params.view.at(coord) = (is_diag ? params.diag : params.other); - } -}; - -// Overwrites the elements of a tensor with a uniform value depending on fill mode -template < - typename Element, ///< Element type - typename Layout> ///< Layout function -struct TensorFillPartialFunc { - - /// View type - using TensorView = TensorView; - - /// Scalar type - typedef typename TensorView::Element T; - - /// Coordinate in tensor's index space - typedef typename TensorView::TensorCoord TensorCoord; - - /// Parameters structure - struct Params { - - // - // Data members - // - - TensorView view; - Element element; - FillMode fill_mode; - - /// Default ctor - CUTLASS_HOST_DEVICE - Params(): fill_mode(FillMode::kNone) { } - - // - // Methods - // - - /// Construction of Gaussian RNG functor. - Params( - TensorView view_, - Element element_, - FillMode fill_mode_ - ): - view(view_), element(element_), fill_mode(fill_mode_) { - - } - }; - - // - // Data members - // - - /// Parameters object - Params params; - - // - // Methods - // - - CUTLASS_DEVICE - TensorFillPartialFunc(Params const ¶ms): params(params) { - - } - - /// Overwrites the element if it is within the covered region. 
- CUTLASS_DEVICE - void operator()(TensorCoord const &coord) { - - bool predicate = true; - - switch (params.fill_mode) { - case FillMode::kFull: - predicate = true; - break; - - case FillMode::kLower: - CUTLASS_PRAGMA_UNROLL - for (int i = 1; i < Layout::kRank; ++i) { - if (coord[i - 1] < coord[i]) { - predicate = false; - break; - } - } - break; - - case FillMode::kUpper: - CUTLASS_PRAGMA_UNROLL - for (int i = 1; i < Layout::kRank; ++i) { - if (coord[i - 1] > coord[i]) { - predicate = false; - break; - } - } - break; - - case FillMode::kDiagonal: - CUTLASS_PRAGMA_UNROLL - for (int i = 1; i < Layout::kRank; ++i) { - if (coord[i - 1] != coord[i]) { - predicate = false; - break; - } - } - break; - - case FillMode::kNone: // fall-through - - default: - predicate = false; - break; - } - - if (predicate) { - params.view.at(coord) = params.element; - } - } -}; - - -template < - typename Element, ///< Element type - typename Layout> ///< Layout function -struct TensorClearPartialFunc { - - /// View type - using TensorView = TensorView; - - /// Scalar type - typedef typename TensorView::Element T; - - /// Coordinate in tensor's index space - typedef typename TensorView::TensorCoord TensorCoord; - - /// - static_assert((Layout::kRank == 2), "TensorClearPartial is only supported for matrices"); - - /// Parameters structure - struct Params { - TensorView view{}; - Element element{}; - FillMode fill_mode{FillMode::kNone}; - int alignment{0}; - }; - - // - // Data members - // - - /// Parameters object - Params params; - - // - // Methods - // - - CUTLASS_DEVICE - TensorClearPartialFunc(Params const ¶ms): params(params) { - - } - - /// Overwrites the element if it is within the covered region. - CUTLASS_DEVICE - void operator()(TensorCoord const &coord) { - - bool predicate = true; - - switch (params.fill_mode) { - - case FillMode::kLower: - if ((coord[0] >= coord[1]) || - ((coord[1] - coord[0]) >= params.alignment)) { - predicate = false; - break; - } - break; - - case FillMode::kUpper: - if ((coord[0] <= coord[1]) || - ((coord[0] - coord[1]) >= params.alignment)) { - predicate = false; - break; - } - break; - - case FillMode::kNone: // fall-through - - default: - predicate = false; - break; - } - - if (predicate) { - params.view.at(coord) = params.element; - } - } -}; - -} // namespace detail - -/////////////////////////////////////////////////////////////////////////////////////////////////// - -/// Fills a tensor everywhere with a unique value for its diagonal. -template < - typename Element, ///< Element type - typename Layout> ///< Layout function -void TensorFillDiagonal( - TensorView view, ///< destination tensor - Element diag = Element(1), ///< value to write in the diagonal - Element other = Element(0), ///< value to write off the diagonal - cudaStream_t stream = nullptr) { - - typedef detail::TensorFillDiagonalFunc Func; - typedef typename Func::Params Params; - - TensorForEach( - view.extent(), - Params(view, diag, other), - /*grid_size*/0, /*block_size*/0, - stream - ); -} - -/// Fills a tensor partially depending on fill mode. Elements not covered by the fillmode are -/// not written. 
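The fill-mode predicates used here reduce to plain coordinate comparisons: kLower keeps row >= col, kUpper keeps row <= col, and kDiagonal keeps row == col. A host-side sketch that prints the coverage of each mode on a 4x4 matrix (a plain enum stands in for cutlass::FillMode; illustration only):

#include <cstdio>

enum class Fill { kLower, kUpper, kDiagonal };  // stand-in for cutlass::FillMode

// True if (row, col) is covered by the given fill mode.
static bool covered(Fill mode, int row, int col) {
  switch (mode) {
    case Fill::kLower:    return row >= col;
    case Fill::kUpper:    return row <= col;
    case Fill::kDiagonal: return row == col;
  }
  return false;
}

int main() {
  const Fill modes[] = {Fill::kLower, Fill::kUpper, Fill::kDiagonal};
  const char *names[] = {"lower", "upper", "diagonal"};
  for (int m = 0; m < 3; ++m) {
    std::printf("%s:\n", names[m]);
    for (int row = 0; row < 4; ++row) {
      for (int col = 0; col < 4; ++col) {
        std::printf("%c", covered(modes[m], row, col) ? 'x' : '.');
      }
      std::printf("\n");
    }
  }
  return 0;
}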
-template < - typename Element, ///< Element type - typename Layout> ///< Layout function -void TensorFillPartial( - TensorView view, ///< destination tensor - Element element, - FillMode fill_mode, - cudaStream_t stream = nullptr) { - - typedef detail::TensorFillPartialFunc Func; - typedef typename Func::Params Params; - - TensorForEach( - view.extent(), - Params(view, element, fill_mode), - stream - ); -} - -/// Clears a tensor partially depending on fill mode and alignment. Elements on the wrong-side -/// of fillmode (upto the alignment) are overwritten with the user supplied element (typically zeros) -template < - typename Element, ///< Element type - typename Layout> ///< Layout function -void TensorClearPartial( - TensorView view, ///< destination tensor - Element element, - FillMode fill_mode, - int alignment, - cudaStream_t stream = nullptr) { - - typedef detail::TensorClearPartialFunc Func; - typedef typename Func::Params Params; - - TensorForEach( - view.extent(), - Params{view, element, fill_mode, alignment}, - /*grid_size*/0, /*block_size*/0, - stream - ); -} - -/////////////////////////////////////////////////////////////////////////////////////////////////// - -/// Fills a tensor with a uniform value -template < - typename Element, ///< Element type - typename Layout> ///< Layout function -void TensorFill( - TensorView view, ///< destination tensor - Element val = Element(0), ///< value to uniformly fill it with - cudaStream_t stream = nullptr) { - - TensorFillDiagonal(view, val, val, stream); -} - -/////////////////////////////////////////////////////////////////////////////////////////////////// - -/// Fills a tensor's diagonal with 1 and 0 everywhere else. -template < - typename Element, ///< Element type - typename Layout> ///< Layout function -void TensorFillIdentity( - TensorView view, ///< destination tensor - cudaStream_t stream = nullptr) { - - TensorFillDiagonal(view, Element(1), Element(0), stream); -} - -/////////////////////////////////////////////////////////////////////////////////////////////////// -/////////////////////////////////////////////////////////////////////////////////////////////////// - -namespace detail { - -/// Computes a random Gaussian distribution -template < - typename Element, ///< Element type - typename Layout> ///< Layout function -struct TensorUpdateDiagonalFunc { - - /// View type - using TensorView = TensorView; - - /// Scalar type - typedef typename TensorView::Element T; - - /// Coordinate in tensor's index space - typedef typename TensorView::TensorCoord TensorCoord; - - /// Parameters structure - struct Params { - - // - // Data members - // - - TensorView view; - Element diag; - - /// Default ctor - CUTLASS_HOST_DEVICE - Params() { } - - // - // Methods - // - - /// Construction of Gaussian RNG functor. 
- Params( - TensorView view_ = TensorView(), - Element diag_ = Element(1) - ): - view(view_), diag(diag_) { - - } - }; - - // - // Data members - // - - /// Parameters object - Params params; - - // - // Methods - // - - /// Device-side initialization of RNG - CUTLASS_DEVICE - TensorUpdateDiagonalFunc(Params const ¶ms): params(params) { - - } - - /// Compute random value and update RNG state - CUTLASS_DEVICE - void operator()(TensorCoord const &coord) { - - bool is_diag = true; - - CUTLASS_PRAGMA_UNROLL - for (int i = 1; i < Layout::kRank; ++i) { - if (coord[i] != coord[i - 1]) { - is_diag = false; - break; - } - } - - if (is_diag) { - params.view.at(coord) = params.diag; - } - } -}; - -} // namespace detail - -/////////////////////////////////////////////////////////////////////////////////////////////////// - -/// Writes a uniform value to the diagonal of a tensor without modifying off-diagonal elements. -template < - typename Element, ///< Element type - typename Layout> ///< Layout function -void TensorUpdateDiagonal( - TensorView view, ///< destination tensor - Element diag = Element(1), - cudaStream_t stream = nullptr) { - - typedef detail::TensorUpdateDiagonalFunc Func; - typedef typename Func::Params Params; - - TensorForEach( - view.extent(), - Params(view, diag), - /*grid_size*/0, /*block_size*/0, - stream - ); -} - -/////////////////////////////////////////////////////////////////////////////////////////////////// -/////////////////////////////////////////////////////////////////////////////////////////////////// - -namespace detail { - -/// Computes a random Gaussian distribution -template < - typename Element, ///< Element type - typename Layout> ///< Layout function -struct TensorUpdateOffDiagonalFunc { - - /// View type - using TensorView = TensorView; - - /// Scalar type - typedef typename TensorView::Element T; - - /// Coordinate in tensor's index space - typedef typename TensorView::TensorCoord TensorCoord; - - /// Parameters structure - struct Params { - - // - // Data members - // - - TensorView view; - Element other; - - /// Default ctor - CUTLASS_HOST_DEVICE - Params() { } - - // - // Methods - // - - /// Construction of Gaussian RNG functor. - Params( - TensorView view_ = TensorView(), - Element other_ = Element(0) - ): - view(view_), other(other_) { - - } - }; - - // - // Data members - // - - /// Parameters object - Params params; - - // - // Methods - // - - /// Device-side initialization of RNG - CUTLASS_DEVICE - TensorUpdateOffDiagonalFunc(Params const ¶ms): params(params) { - - } - - /// Compute random value and update RNG state - CUTLASS_DEVICE - void operator()(TensorCoord const &coord) { - - bool is_diag = true; - - CUTLASS_PRAGMA_UNROLL - for (int i = 1; i < Layout::kRank; ++i) { - if (coord[i] != coord[i - 1]) { - is_diag = false; - break; - } - } - - if (!is_diag) { - params.view.at(coord) = params.other; - } - } -}; - -} // namespace detail - -/////////////////////////////////////////////////////////////////////////////////////////////////// - -/// Writes a uniform value to all elements in the tensor without modifying diagonal elements. 
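The diagonal test in these functors generalizes past matrices: a coordinate lies on the diagonal exactly when all of its components are equal, which is what the pairwise coord[i] == coord[i-1] loop checks. A host-side sketch of that predicate (a hypothetical helper, not from the deleted header):

#include <cstdio>

// A coordinate is on the generalized diagonal iff all components are equal.
static bool is_diagonal(const int *coord, int rank) {
  for (int i = 1; i < rank; ++i) {
    if (coord[i] != coord[i - 1]) {
      return false;
    }
  }
  return true;
}

int main() {
  const int on_diag[3]  = {2, 2, 2};
  const int off_diag[3] = {2, 2, 3};
  std::printf("{2,2,2}: %s\n", is_diagonal(on_diag, 3) ? "diagonal" : "off-diagonal");
  std::printf("{2,2,3}: %s\n", is_diagonal(off_diag, 3) ? "diagonal" : "off-diagonal");
  return 0;
}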
-template < - typename Element, ///< Element type - typename Layout> ///< Layout function -void TensorUpdateOffDiagonal( - TensorView view, ///< destination tensor - Element other = Element(1), - cudaStream_t stream = nullptr) { - - typedef detail::TensorUpdateOffDiagonalFunc Func; - typedef typename Func::Params Params; - - TensorForEach( - view.extent(), - Params(view, other), - /*grid_size*/0, /*block_size*/0, - stream - ); -} - -/////////////////////////////////////////////////////////////////////////////////////////////////// -/////////////////////////////////////////////////////////////////////////////////////////////////// - -namespace detail { - -/// Computes a random Gaussian distribution -template < - typename Element, ///< Element type - typename Layout> ///< Layout function -struct TensorFillLinearFunc { - - /// View type - using TensorView = TensorView; - - /// Scalar type - typedef typename TensorView::Element T; - - /// Coordinate in tensor's index space - typedef typename TensorView::TensorCoord TensorCoord; - - /// Parameters structure - struct Params { - - // - // Data members - // - - TensorView view; - Array v; - Element s; - - /// Default ctor - CUTLASS_HOST_DEVICE - Params() { } - - // - // Methods - // - - /// Construction of Gaussian RNG functor. - Params( - TensorView view_, ///< destination tensor - Array const & v_, - Element s_ = Element(0) - ): - view(view_), v(v_), s(s_) { - - } - }; - - // - // Data members - // - - /// Parameters object - Params params; - - // - // Methods - // - - /// Device-side initialization of RNG - CUTLASS_DEVICE - TensorFillLinearFunc(Params const ¶ms): params(params) { - - } - - /// Compute random value and update RNG state - CUTLASS_DEVICE - void operator()(TensorCoord const &coord) { - - Element sum = params.s; - - CUTLASS_PRAGMA_UNROLL - for (int i = 0; i < Layout::kRank; ++i) { - if constexpr (is_complex::value) { - if constexpr (sizeof_bits::value <= 32) { - sum = Element(static_cast>(sum) + - static_cast>(params.v[i]) * static_cast>(coord[i])); - } - } - else if constexpr (sizeof_bits::value <= 32) { - if constexpr (std::numeric_limits::is_integer) { - sum = Element(static_cast(sum) + - static_cast(params.v[i]) * static_cast(coord[i])); - } - else { - sum = Element(static_cast(sum) + - static_cast(params.v[i]) * static_cast(coord[i])); - } - } - else { - sum += params.v[i] * coord[i]; - } - } - - params.view.at(coord) = sum; - } -}; - -} // namespace detail - -/////////////////////////////////////////////////////////////////////////////////////////////////// - -/// Fills tensor with a linear combination of its coordinate and another vector -template < - typename Element, ///< Element type - typename Layout> ///< Layout function -void TensorFillLinear( - TensorView view, ///< destination tensor - Array const & v, - Element s = Element(0), - cudaStream_t stream = nullptr) { - - using Func = detail::TensorFillLinearFunc; - using Params = typename Func::Params; - - TensorForEach( - view.extent(), - Params(view, v, s), - /*grid_size*/0, /*block_size*/0, - stream - ); -} - -/////////////////////////////////////////////////////////////////////////////////////////////////// -/////////////////////////////////////////////////////////////////////////////////////////////////// - -/// Fills a tensor with random values from a distribution. 
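TensorFillLinear, defined just above, writes s + v[0]*coord[0] + ... + v[rank-1]*coord[rank-1] at each coordinate, and BlockFillSequential further below applies it to a packed 1-D view with v = {1} so index i receives s + i. A host-side sketch of the same arithmetic for a small row-major matrix (the values are chosen only for illustration):

#include <cstdio>

int main() {
  // Element at (row, col) = s + v0*row + v1*col, mirroring TensorFillLinear.
  const float s = 0.0f, v0 = 10.0f, v1 = 1.0f;
  const int rows = 3, cols = 4;

  float tensor[rows][cols];
  for (int row = 0; row < rows; ++row) {
    for (int col = 0; col < cols; ++col) {
      tensor[row][col] = s + v0 * row + v1 * col;
    }
  }

  for (int row = 0; row < rows; ++row) {
    for (int col = 0; col < cols; ++col) {
      std::printf("%5.1f ", tensor[row][col]);
    }
    std::printf("\n");
  }
  return 0;
}

With v = {10, 1} each row starts at a multiple of ten and counts up by one, which makes layout mistakes easy to spot in reference tests.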
-template < - typename Element, ///< Element type - typename Layout> ///< Layout function -void TensorFillRandom( - TensorView view, ///< destination tensor - uint64_t seed, - Distribution dist, - cudaStream_t stream = nullptr, - int exclude_zero = -1 ///< If non-negative, excludes 0. - /// Note that setting this flag will result in more 1's, - /// as we use a simple mechanism to replace 0's by adding/subtracting 1's. - ) { - - using Real = typename RealType::Type; - - if (dist.kind == Distribution::Gaussian) { - TensorFillRandomGaussian( - view, - seed, - static_cast(dist.gaussian.mean), - static_cast(dist.gaussian.stddev), - dist.int_scale, - exclude_zero, - stream); - } else if (dist.kind == Distribution::Uniform) { - TensorFillRandomUniform( - view, - seed, - static_cast(dist.uniform.max), - static_cast(dist.uniform.min), - dist.int_scale, - dist.uniform.pnan, - exclude_zero, - stream); - } -} - -/////////////////////////////////////////////////////////////////////////////////////////////////// -/////////////////////////////////////////////////////////////////////////////////////////////////// - -/// Fills a block of data with sequential elements -template < - typename Element -> -void BlockFillSequential( - Element *ptr, - int64_t capacity, - Element v = Element(1), - Element s = Element(0)) { - - using Layout = layout::PackedVectorLayout; - Layout::TensorCoord size(static_cast(capacity)); // -Wconversion - Layout layout = Layout::packed(size); - TensorView view(ptr, layout, size); - - Array c{}; - c[0] = v; - - TensorFillLinear(view, c, s); -} - -/////////////////////////////////////////////////////////////////////////////////////////////////// -/////////////////////////////////////////////////////////////////////////////////////////////////// - -/// Fills a block of data with sequential elements -template < - typename Element -> -void BlockFillRandom( - Element *ptr, - size_t capacity, - uint64_t seed, - Distribution dist, - cudaStream_t stream = nullptr) { - - using Real = typename RealType::Type; - - if (dist.kind == Distribution::Gaussian) { - BlockFillRandomGaussian( - ptr, - capacity, - seed, - static_cast(dist.gaussian.mean), - static_cast(dist.gaussian.stddev), - dist.int_scale, - stream); - } - else if (dist.kind == Distribution::Uniform) { - BlockFillRandomUniform( - ptr, - capacity, - seed, - static_cast(dist.uniform.max), - static_cast(dist.uniform.min), - dist.int_scale, - dist.uniform.pnan, - stream); - } -} - -/////////////////////////////////////////////////////////////////////////////////////////////////// -/////////////////////////////////////////////////////////////////////////////////////////////////// - -namespace detail { - -/// Computes a random Gaussian distribution -template < - typename Element, ///< Element type - typename Layout> ///< Layout function -struct TensorCopyDiagonalInFunc { - - /// View type - using TensorView = TensorView; - - /// Scalar type - typedef typename TensorView::Element T; - - /// Coordinate in tensor's index space - typedef typename TensorView::TensorCoord TensorCoord; - - /// Parameters structure - struct Params { - - // - // Data members - // - - TensorView view; - Element const *ptr; - - /// Default ctor - CUTLASS_HOST_DEVICE - Params() { } - - // - // Methods - // - - /// Construction of Gaussian RNG functor. 
- Params( - TensorView view_, ///< destination tensor - Element const *ptr_ - ): - view(view_), ptr(ptr_) { - - } - }; - - // - // Data members - // - - /// Parameters object - Params params; - - // - // Methods - // - - /// Device-side initialization of RNG - CUTLASS_DEVICE - TensorCopyDiagonalInFunc(Params const ¶ms): params(params) { - - } - - /// Only update the diagonal element - CUTLASS_DEVICE - void operator()(TensorCoord const &coord) { - bool is_diagonal = true; - - CUTLASS_PRAGMA_UNROLL - for (int i = 1; i < Layout::kRank; ++i) { - if (coord[i] != coord[0]) { - is_diagonal = false; - } - } - if (is_diagonal) { - params.view.at(coord) = params.ptr[coord[0]]; - } - } -}; - -} // namespace detail - -/////////////////////////////////////////////////////////////////////////////////////////////////// - -/// Copies a diagonal in from host memory without modifying off-diagonal elements. -template < - typename Element, ///< Element type - typename Layout> ///< Layout function -void TensorCopyDiagonalIn( - TensorView view, ///< destination tensor - Element const *ptr, ///< dense buffer of elements - cudaStream_t stream = nullptr) { - - using Func = detail::TensorCopyDiagonalInFunc; - using Params = typename Func::Params; - - TensorForEach( - view.extent(), - Params(view, ptr), - /*grid_size*/0, /*block_size*/0, - stream - ); -} - -/////////////////////////////////////////////////////////////////////////////////////////////////// -/////////////////////////////////////////////////////////////////////////////////////////////////// - - -namespace detail { - -/// Computes a random Gaussian distribution -template < - typename Element, ///< Element type - typename Layout> ///< Layout function -struct TensorCopyDiagonalOutFunc { - - /// View type - using TensorView = TensorView; - - /// Scalar type - typedef typename TensorView::Element T; - - /// Coordinate in tensor's index space - typedef typename TensorView::TensorCoord TensorCoord; - - /// Parameters structure - struct Params { - - // - // Data members - // - - TensorView view; - Element *ptr; - - /// Default ctor - CUTLASS_HOST_DEVICE - Params() { } - - // - // Methods - // - - /// Construction of Gaussian RNG functor. - Params( - TensorView view_, ///< destination tensor - Element *ptr_ - ): - view(view_), ptr(ptr_) { - - } - }; - - // - // Data members - // - - /// Parameters object - Params params; - - // - // Methods - // - - /// Device-side initialization of RNG - CUTLASS_DEVICE - TensorCopyDiagonalOutFunc(Params const ¶ms): params(params) { - - } - - /// Compute random value and update RNG state - CUTLASS_DEVICE - void operator()(TensorCoord const &coord) { - bool is_diagonal = true; - - CUTLASS_PRAGMA_UNROLL - for (int i = 1; i < Layout::kRank; ++i) { - if (coord[i] != coord[0]) { - is_diagonal = false; - } - } - if (is_diagonal) { - params.ptr[coord[0]] = params.view.at(coord); - } - } -}; - -} // namespace detail - -/////////////////////////////////////////////////////////////////////////////////////////////////// - -/// Copies the diagonal of a tensor into a dense buffer in host memory. 
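Using the copy-out helper amounts to handing it a buffer of length min(M, N) that device code can write, then reading that buffer back. A standalone sketch of the same diagonal-extraction step for a row-major matrix (plain CUDA rather than the CUTLASS API; all names here are illustrative):

#include <cstdio>
#include <cuda_runtime.h>

// Copy the diagonal of a row-major rows x cols matrix into diag[0 .. min(rows, cols)).
__global__ void copy_diagonal(const float *matrix, int rows, int cols, float *diag) {
  int i = blockIdx.x * blockDim.x + threadIdx.x;
  int diag_len = rows < cols ? rows : cols;
  if (i < diag_len) {
    diag[i] = matrix[i * cols + i];
  }
}

int main() {
  const int rows = 8, cols = 5, diag_len = 5;
  float host_matrix[rows * cols];
  for (int i = 0; i < rows * cols; ++i) host_matrix[i] = static_cast<float>(i);

  float *d_matrix = nullptr, *d_diag = nullptr;
  cudaMalloc(&d_matrix, sizeof(host_matrix));
  cudaMalloc(&d_diag, diag_len * sizeof(float));
  cudaMemcpy(d_matrix, host_matrix, sizeof(host_matrix), cudaMemcpyHostToDevice);

  copy_diagonal<<<1, 32>>>(d_matrix, rows, cols, d_diag);

  float host_diag[diag_len];
  cudaMemcpy(host_diag, d_diag, sizeof(host_diag), cudaMemcpyDeviceToHost);
  for (int i = 0; i < diag_len; ++i) std::printf("diag[%d] = %g\n", i, host_diag[i]);

  cudaFree(d_matrix);
  cudaFree(d_diag);
  return 0;
}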
-template < - typename Element, ///< Element type - typename Layout> ///< Layout function -void TensorCopyDiagonalOut( - Element *ptr, ///< dense buffer of elements - TensorView view, ///< source tensor - cudaStream_t stream = nullptr) { - - using Func = detail::TensorCopyDiagonalOutFunc; - using Params = typename Func::Params; - - TensorForEach( - view.extent(), - Params(view, ptr), - /*grid_size*/0, /*block_size*/0, - stream - ); -} - -/////////////////////////////////////////////////////////////////////////////////////////////////// -/////////////////////////////////////////////////////////////////////////////////////////////////// - -} // namespace device -} // namespace reference -} // namespace cutlass diff --git a/csrc/sparse/cutlass/example/util/reference/device/tensor_foreach.h b/csrc/sparse/cutlass/example/util/reference/device/tensor_foreach.h deleted file mode 100644 index 3911b0240c6d2..0000000000000 --- a/csrc/sparse/cutlass/example/util/reference/device/tensor_foreach.h +++ /dev/null @@ -1,144 +0,0 @@ -/*************************************************************************************************** - * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. - * SPDX-License-Identifier: BSD-3-Clause - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * - * 1. Redistributions of source code must retain the above copyright notice, this - * list of conditions and the following disclaimer. - * - * 2. Redistributions in binary form must reproduce the above copyright notice, - * this list of conditions and the following disclaimer in the documentation - * and/or other materials provided with the distribution. - * - * 3. Neither the name of the copyright holder nor the names of its - * contributors may be used to endorse or promote products derived from - * this software without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" - * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE - * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE - * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL - * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR - * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER - * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, - * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - * - **************************************************************************************************/ -#pragma once - -#include -#include "cutlass/cutlass.h" -#include "cutlass/util/reference/device/kernel/tensor_foreach.h" - -namespace cutlass { -namespace reference { -namespace device { - -/////////////////////////////////////////////////////////////////////////////////////////////////// - -/// Launches a kernel calling a functor for each element in a tensor's index space. -template -struct TensorForEach { - - /// Constructor performs the operation. 
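// --- Illustrative sketch (not part of the deleted CUTLASS sources) --------------------------
// The TensorForEach wrapper defined next picks its launch configuration with the CUDA
// occupancy API whenever the caller passes grid_size == 0 or block_size == 0, then launches a
// kernel that invokes a functor once per element. A stripped-down CUDA version of that
// pattern, using a flat index instead of a tensor coordinate and the templated occupancy
// helper from <cuda_runtime.h> (names are illustrative):
#include <cuda_runtime.h>
#include <cstdint>
#include <stdexcept>

template <typename Func>
__global__ void for_each_kernel(std::int64_t size, Func func) {
  for (std::int64_t i = blockIdx.x * blockDim.x + threadIdx.x; i < size;
       i += std::int64_t(blockDim.x) * gridDim.x) {
    func(i);  // functor applied once per linear index (grid-stride loop)
  }
}

template <typename Func>
void launch_for_each(std::int64_t size, Func func) {
  int grid_size = 0;
  int block_size = 0;
  // Ask the occupancy API for a block size that maximizes occupancy for this kernel.
  cudaError_t result =
      cudaOccupancyMaxPotentialBlockSize(&grid_size, &block_size, for_each_kernel<Func>);
  if (result != cudaSuccess) {
    throw std::runtime_error("Failed to query occupancy.");
  }
  block_size = (block_size < 128 ? block_size : 128);  // cap block size, as the original does
  for_each_kernel<<<grid_size, block_size>>>(size, func);
}
// ---------------------------------------------------------------------------------------------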
- TensorForEach( - Coord size, Params params = Params(), - int grid_size = 0, int block_size = 0, - cudaStream_t stream = nullptr) { - - if (!grid_size || !block_size) { - - // if grid_size or block_size are zero, query occupancy using the CUDA Occupancy API - cudaError_t result = cudaOccupancyMaxPotentialBlockSize( - &grid_size, - &block_size, - reinterpret_cast(kernel::TensorForEach)); - - if (result != cudaSuccess) { - throw std::runtime_error("Failed to query occupancy."); - } - - // Limit block size. This has the effect of increasing the number of items processed by a - // single thread and reduces the impact of initialization overhead. - block_size = (block_size < 128 ? block_size : 128); - } - - dim3 grid(grid_size, 1, 1); - dim3 block(block_size, 1, 1); - - kernel::TensorForEach<<< grid, block, 0, stream >>>(size, params); - } -}; - -/////////////////////////////////////////////////////////////////////////////////////////////////// - -/// Launches a kernel calling a functor for each element along a tensor's diagonal -template -struct TensorDiagonalForEach { - - /// Constructor performs the operation - TensorDiagonalForEach( - Coord size, Params params = Params(), - int start = 0, int end = -1, - int block_size = 128, cudaStream_t stream = nullptr) { - - if (end < 0) { - end = size.min(); - } - - dim3 block(block_size, 1, 1); - dim3 grid((end - start + block_size - 1) / block_size, 1, 1); - - kernel::TensorDiagonalForEach<<< grid, block, 0, stream >>>( - size, params, start, end); - } -}; - - -/////////////////////////////////////////////////////////////////////////////////////////////////// - -template -struct BlockForEach { - - /// Constructor performs the operation. - BlockForEach( - Element *ptr, - size_t capacity, - typename Func::Params params = typename Func::Params(), - int grid_size = 0, - int block_size = 0, - cudaStream_t stream = nullptr) { - - if (!grid_size || !block_size) { - - // if grid_size or block_size are zero, query occupancy using the CUDA Occupancy API - cudaError_t result = cudaOccupancyMaxPotentialBlockSize( - &grid_size, - &block_size, - reinterpret_cast(kernel::BlockForEach)); - - if (result != cudaSuccess) { - throw std::runtime_error("Failed to query occupancy."); - } - - // Limit block size. This has the effect of increasing the number of items processed by a - // single thread and reduces the impact of initialization overhead. - block_size = (block_size < 128 ? block_size : 128); - } - - dim3 grid(grid_size, 1, 1); - dim3 block(block_size, 1, 1); - - kernel::BlockForEach<<< grid, block, 0, stream >>>(ptr, capacity, params); - } -}; - -/////////////////////////////////////////////////////////////////////////////////////////////////// - -} // namespace device -} // namespace reference -} // namespace cutlass diff --git a/csrc/sparse/cutlass/example/util/reference/device/tensor_reduce.h b/csrc/sparse/cutlass/example/util/reference/device/tensor_reduce.h deleted file mode 100644 index 47b898b4fd161..0000000000000 --- a/csrc/sparse/cutlass/example/util/reference/device/tensor_reduce.h +++ /dev/null @@ -1,510 +0,0 @@ -/*************************************************************************************************** - * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. - * SPDX-License-Identifier: BSD-3-Clause - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * - * 1. 
Redistributions of source code must retain the above copyright notice, this - * list of conditions and the following disclaimer. - * - * 2. Redistributions in binary form must reproduce the above copyright notice, - * this list of conditions and the following disclaimer in the documentation - * and/or other materials provided with the distribution. - * - * 3. Neither the name of the copyright holder nor the names of its - * contributors may be used to endorse or promote products derived from - * this software without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" - * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE - * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE - * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL - * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR - * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER - * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, - * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - * - **************************************************************************************************/ -#pragma once - -#include - -#include "cutlass/cutlass.h" -#include "cutlass/complex.h" -#include "cutlass/functional.h" -#include "cutlass/numeric_conversion.h" -#include "cutlass/tensor_view.h" -#include "cutlass/util/device_memory.h" -#include "cutlass/util/reference/detail/linear_to_coordinate.h" - -///////////////////////////////////////////////////////////////////////////////////////////////// - -namespace cutlass { -namespace reference { -namespace device { - -///////////////////////////////////////////////////////////////////////////////////////////////// - -namespace kernel { - -template < - typename Element, - typename Layout, - typename ComputeType, - typename ReduceOp, - typename TransformOp, - int kBlockSize = 128 -> -__global__ void TensorTransformReducePartial( - TensorView view, /// View of the tensor to reduce over - ComputeType identity, /// Identity element of the reduction operation - ReduceOp reduce, /// Reduces an accumulated value with a transformed element: f(ComputeType, ComputeType) => ComputeType - TransformOp transform, /// Transforms the tensor element to ComputeType: g(Element) => ComputeType - ComputeType *workspace) { /// Device-side workspace for accumulating partial results. The reduced element is stored in workspace[0] - - int64_t idx = threadIdx.x + blockIdx.x * blockDim.x; - int64_t size = view.size(); - - __shared__ ComputeType scratchpad[kBlockSize]; - - for (; idx < size; idx += blockDim.x * gridDim.x) { - - // Map linear thread ID onto tensor coordinate - typename Layout::TensorCoord coord; - - cutlass::reference::detail::LinearToCoordinate()(coord, idx, view.extent()); - - if (view.contains(coord)) { - - // Fetch element - Element x = view.at(coord); - - // Transform - identity = reduce(identity, transform(x)); - } - } - - scratchpad[threadIdx.x] = identity; - - __syncthreads(); - - // One thread performs the final reduction and stores out. This could be enhanced via - // a tree reduction and pipelining. 
- if (threadIdx.x == 0) { - - for (int i = 1; i < kBlockSize; ++i) { - identity = reduce(identity, scratchpad[i]); - } - - workspace[blockIdx.x] = identity; - } -} - -template < - typename Element, - typename Layout, - typename ComputeType, - typename ReduceOp, - typename TransformOp, - int kBlockSize = 128 -> -__global__ void TensorTransformReducePartial( - TensorView view_A, /// View of the tensor to reduce over - TensorView view_B, /// View of the tensor to reduce over - ComputeType identity, /// Identity element of the reduction operation - ReduceOp reduce, /// Reduces an accumulated value with a transformed element: f(ComputeType, ComputeType) => ComputeType - TransformOp transform, /// Transforms the tensor element to ComputeType: g(Element) => ComputeType - ComputeType *workspace) { /// Device-side workspace for accumulating partial results. The reduced element is stored in workspace[0] - - int64_t idx = threadIdx.x + blockIdx.x * blockDim.x; - auto size = static_cast(view_A.size()); - - __shared__ ComputeType scratchpad[kBlockSize]; - - for (; idx < size; idx += blockDim.x * gridDim.x) { - - // Map linear thread ID onto tensor coordinate - typename Layout::TensorCoord coord; - - cutlass::reference::detail::LinearToCoordinate()(coord, idx, view_A.extent()); - - if (view_A.contains(coord)) { - - // Fetch element - Element a = view_A.at(coord); - Element b = view_B.at(coord); - - // Transform - identity = reduce(identity, transform(a, b)); - } - } - - scratchpad[threadIdx.x] = identity; - - __syncthreads(); - - // One thread performs the final reduction and stores out. This could be enhanced via - // a tree reduction and pipelining. - if (threadIdx.x == 0) { - - for (int i = 1; i < kBlockSize; ++i) { - identity = reduce(identity, scratchpad[i]); - } - - workspace[blockIdx.x] = identity; - } -} - - -template < - typename ComputeType, - typename ReduceOp, - int kBlockSize = 32 -> -__global__ void TensorTransformReduceFinalize( - ComputeType *workspace, - ComputeType identity, - int workspace_size, - ReduceOp reduce) { - - __shared__ ComputeType scratchpad[kBlockSize]; - - for (int idx = threadIdx.x; idx < workspace_size; idx += kBlockSize) { - identity = reduce(identity, workspace[idx]); - } - - scratchpad[threadIdx.x] = identity; - - __syncthreads(); - - if (threadIdx.x == 0) { - - for (int i = 1; i < kBlockSize; ++i) { - identity = reduce(identity, scratchpad[i]); - } - - workspace[0] = identity; - } -} - -} // namespace kernel - -///////////////////////////////////////////////////////////////////////////////////////////////// - -/// Transform-reduce operation over the elements of a tensor -template < - typename Element, - typename Layout, - typename ComputeType, - typename ReduceOp, - typename TransformOp -> -ComputeType TensorTransformReduce( - TensorView view, /// View of the tensor to reduce over - ComputeType identity, /// Identity element of the reduction operation - ReduceOp reduce, /// Reduces an accumulated value with a transformed element: f(ComputeType, ComputeType) => ComputeType - TransformOp transform, /// Transforms the tensor element to ComputeType: g(Element) => ComputeType - ComputeType *workspace, /// Device-side workspace for accumulating partial results. The reduced element is stored in workspace[0] - int workspace_size, /// Number of elements in workspace - cudaStream_t stream = nullptr, /// CUDA stream to launch into - bool copy_out = true /// If true, the value of workspace[0] is copied to host and returned. Otherwise, `identity` is returned. 
-) { - - int const kBlockSize = 128; - - dim3 block(kBlockSize, 1); - dim3 grid(workspace_size, 1); - - kernel::TensorTransformReducePartial< - Element, Layout, ComputeType, ReduceOp, TransformOp, kBlockSize - ><<< grid, block, 0, stream >>>( - view, identity, reduce, transform, workspace - ); - - int const kFinalizeBlockSize = 32; - - kernel::TensorTransformReduceFinalize< - ComputeType, ReduceOp, kFinalizeBlockSize - ><<< dim3(1, 1), dim3(kFinalizeBlockSize, 1), 0, stream >>>( - workspace, identity, workspace_size, reduce - ); - - if (copy_out) { - cudaError_t result = cudaMemcpy(&identity, workspace, sizeof(identity), cudaMemcpyDeviceToHost); - if (result != cudaSuccess) { - throw std::runtime_error("cudaMemcpy() failed"); - } - } - - return identity; -} - -/// Transform-reduce operation over the elements of two tensors, zipped together -template < - typename Element, - typename Layout, - typename ComputeType, - typename ReduceOp, - typename TransformOp -> -ComputeType TensorTransformReduce( - TensorView view_A, /// View of the tensor to reduce over - TensorView view_B, /// View of the tensor to reduce over - ComputeType identity, /// Identity element of the reduction operation - ReduceOp reduce, /// Reduces an accumulated value with a transformed element: f(ComputeType, ComputeType) => ComputeType - TransformOp transform, /// Transforms the tensor element to ComputeType: g(Element) => ComputeType - ComputeType *workspace, /// Device-side workspace for accumulating partial results. The reduced element is stored in workspace[0] - int workspace_size, /// Number of elements in workspace - cudaStream_t stream = nullptr, /// CUDA stream to launch into - bool copy_out = true /// If true, the value of workspace[0] is copied to host and returned. Otherwise, `identity` is returned. -) { - - if (view_A.extent() != view_B.extent()) { - throw std::runtime_error("Extents must be equal."); - } - - int const kBlockSize = 128; - - dim3 block(kBlockSize, 1); - dim3 grid(workspace_size, 1); - - kernel::TensorTransformReducePartial< - Element, Layout, ComputeType, ReduceOp, TransformOp, kBlockSize - ><<< grid, block, 0, stream >>>( - view_A, view_B, identity, reduce, transform, workspace - ); - - int const kFinalizeBlockSize = 32; - - kernel::TensorTransformReduceFinalize< - ComputeType, ReduceOp, kFinalizeBlockSize - ><<< dim3(1, 1), dim3(kFinalizeBlockSize, 1), 0, stream >>>( - workspace, identity, workspace_size, reduce - ); - - if (copy_out) { - cudaError_t result = cudaMemcpy(&identity, workspace, sizeof(identity), cudaMemcpyDeviceToHost); - if (result != cudaSuccess) { - throw std::runtime_error("cudaMemcpy() failed"); - } - } - - return identity; -} - -/// Transform-reduce operation over the elements of a tensor. This helper allocates the device-side -/// workspace -template < - typename Element, - typename Layout, - typename ComputeType, - typename ReduceOp, - typename TransformOp -> -ComputeType TensorTransformReduce( - TensorView view, - ComputeType identity, - ReduceOp reduce, - TransformOp transform, - cudaStream_t stream = nullptr, - int workspace_size = 0 -) { - - // Optionally query for the SM count to size the workspace. 
- if (!workspace_size) { - - int device_idx = 0; - cudaDeviceProp prop; - - cudaError_t result = cudaGetDevice(&device_idx); - if (result != cudaSuccess) { - throw std::runtime_error("cudaGetDevice() failed"); - } - - result = cudaGetDeviceProperties(&prop, device_idx); - if (result != cudaSuccess) { - throw std::runtime_error("cudaGetDeviceProp() failed"); - } - - workspace_size = int(prop.multiProcessorCount); - } - - DeviceAllocation workspace(workspace_size); - - ComputeType output = TensorTransformReduce( - view, - identity, - reduce, - transform, - workspace.get(), - workspace_size, - stream, - true); - - return output; -} - - -/// Transform-reduce operation over the elements of a tensor. This helper allocates the device-side -/// workspace -template < - typename Element, - typename Layout, - typename ComputeType, - typename ReduceOp, - typename TransformOp -> -ComputeType TensorTransformReduce( - TensorView view_A, - TensorView view_B, - ComputeType identity, - ReduceOp reduce, - TransformOp transform, - cudaStream_t stream = nullptr, - int workspace_size = 0 -) { - - // Optionally query for the SM count to size the workspace. - if (!workspace_size) { - - int device_idx = 0; - cudaDeviceProp prop; - - cudaError_t result = cudaGetDevice(&device_idx); - if (result != cudaSuccess) { - throw std::runtime_error("cudaGetDevice() failed"); - } - - result = cudaGetDeviceProperties(&prop, device_idx); - if (result != cudaSuccess) { - throw std::runtime_error("cudaGetDeviceProp() failed"); - } - - workspace_size = int(prop.multiProcessorCount); - } - - DeviceAllocation workspace(workspace_size); - - ComputeType output = TensorTransformReduce( - view_A, - view_B, - identity, - reduce, - transform, - workspace.get(), - workspace_size, - stream, - true); - - return output; -} - -///////////////////////////////////////////////////////////////////////////////////////////////// - -/// Helper to compute the sum of the elements of a tensor -template < - typename Element, - typename Layout, - typename ComputeType = Element -> -ComputeType TensorSum( - TensorView view, - ComputeType identity = ComputeType(), - cudaStream_t stream = nullptr, - int workspace_size = 0 -) { - - plus reduce; - NumericConverter transform; - - return TensorTransformReduce( - view, identity, reduce, transform, stream, workspace_size); -} - -/// Helper to compute the sum of the squares of the elements of a tensor -template < - typename Element, - typename Layout, - typename ComputeType = Element -> -ComputeType TensorSumSq( - TensorView view, - ComputeType identity = ComputeType(), - cudaStream_t stream = nullptr, - int workspace_size = 0 -) { - - plus reduce; - magnitude_squared transform; - - return TensorTransformReduce( - view, identity, reduce, transform, stream, workspace_size); -} - -/// Helper to compute the norm of the elements of a tensor. 
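// --- Illustrative sketch (not part of the deleted CUTLASS sources) --------------------------
// The reductions above run in two stages: a grid-strided kernel folds elements into one
// partial result per block (the workspace is typically sized to the SM count), then a single
// block folds the partials into workspace[0], which is copied back to the host. A
// self-contained CUDA sum over a float buffer with the same structure; `workspace` must hold
// at least `workspace_size` floats in device memory (illustrative names):
#include <cuda_runtime.h>
#include <cstdint>
#include <stdexcept>

__global__ void partial_sum_kernel(float const* data, std::int64_t size, float* workspace) {
  constexpr int kBlockSize = 128;  // must match the launch configuration below
  __shared__ float scratch[kBlockSize];
  float acc = 0.0f;
  for (std::int64_t i = blockIdx.x * blockDim.x + threadIdx.x; i < size;
       i += std::int64_t(blockDim.x) * gridDim.x) {
    acc += data[i];
  }
  scratch[threadIdx.x] = acc;
  __syncthreads();
  if (threadIdx.x == 0) {  // serial fold by one thread, as in the deleted reference
    for (int i = 1; i < kBlockSize; ++i) acc += scratch[i];
    workspace[blockIdx.x] = acc;
  }
}

__global__ void finalize_sum_kernel(float* workspace, int workspace_size) {
  if (threadIdx.x == 0 && blockIdx.x == 0) {
    float acc = 0.0f;
    for (int i = 0; i < workspace_size; ++i) acc += workspace[i];
    workspace[0] = acc;
  }
}

float device_sum(float const* data, std::int64_t size, float* workspace, int workspace_size) {
  partial_sum_kernel<<<workspace_size, 128>>>(data, size, workspace);
  finalize_sum_kernel<<<1, 1>>>(workspace, workspace_size);
  float result = 0.0f;
  if (cudaMemcpy(&result, workspace, sizeof(result), cudaMemcpyDeviceToHost) != cudaSuccess) {
    throw std::runtime_error("cudaMemcpy() failed");
  }
  return result;
}
// ---------------------------------------------------------------------------------------------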
-template < - typename Element, - typename Layout, - typename ComputeType = double -> -ComputeType TensorNorm( - TensorView view, - ComputeType identity = ComputeType(), - cudaStream_t stream = nullptr, - int workspace_size = 0 -) { - - return std::sqrt(TensorSumSq(view, identity, stream, workspace_size)); -} - -///////////////////////////////////////////////////////////////////////////////////////////////// - -/// Helper to compute the sum of the squares of the differences of two tensors -template < - typename Element, - typename Layout, - typename ComputeType = double -> -ComputeType TensorSumSqDiff( - TensorView view_A, - TensorView view_B, - ComputeType identity = ComputeType(), - cudaStream_t stream = nullptr, - int workspace_size = 0 -) { - - plus reduce; - magnitude_squared_difference transform; - - return TensorTransformReduce( - view_A, view_B, identity, reduce, transform, stream, workspace_size); -} - - -/// Helper to compute the norm of the tensor computed as the difference of two tensors in memory -template < - typename Element, - typename Layout, - typename ComputeType = double -> -ComputeType TensorNormDiff( - TensorView view_A, - TensorView view_B, - ComputeType identity = ComputeType(), - cudaStream_t stream = nullptr, - int workspace_size = 0 -) { - - return std::sqrt(TensorSumSqDiff(view_A, view_B, identity, stream, workspace_size)); -} - -///////////////////////////////////////////////////////////////////////////////////////////////// - -} // namespace device -} // namespace reference -} // namespace cutlass - -///////////////////////////////////////////////////////////////////////////////////////////////// diff --git a/csrc/sparse/cutlass/example/util/reference/device/tensor_relu.h b/csrc/sparse/cutlass/example/util/reference/device/tensor_relu.h deleted file mode 100644 index 4e5a50403cf8d..0000000000000 --- a/csrc/sparse/cutlass/example/util/reference/device/tensor_relu.h +++ /dev/null @@ -1,141 +0,0 @@ -/*************************************************************************************************** - * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. - * SPDX-License-Identifier: BSD-3-Clause - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * - * 1. Redistributions of source code must retain the above copyright notice, this - * list of conditions and the following disclaimer. - * - * 2. Redistributions in binary form must reproduce the above copyright notice, - * this list of conditions and the following disclaimer in the documentation - * and/or other materials provided with the distribution. - * - * 3. Neither the name of the copyright holder nor the names of its - * contributors may be used to endorse or promote products derived from - * this software without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" - * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE - * DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE - * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL - * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR - * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER - * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, - * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - * - **************************************************************************************************/ -/* \file - \brief Defines device-side elementwise operations on TensorView. Note, the operations defined - in this header are not specialized for any particular data layout and are therefore not - intended to offer the best possible performance. Rather, they are intended to be generic - reference implementations to support the CUTLASS unit tests. -*/ - -#pragma once - -// Cutlass includes -#include "cutlass/cutlass.h" -#include "cutlass/tensor_view.h" - -#include "cutlass/util/reference/device/tensor_foreach.h" - -/////////////////////////////////////////////////////////////////////////////////////////////////// - -namespace cutlass { -namespace reference { -namespace device { - -/////////////////////////////////////////////////////////////////////////////////////////////////// -/////////////////////////////////////////////////////////////////////////////////////////////////// - -namespace detail { - -template < - typename Element, ///< Element type - typename Layout> ///< Layout function -struct TensorReLuFunc { - - /// View type - using TensorView = TensorView; - - /// Coordinate in tensor's index space - using TensorCoord = typename TensorView::TensorCoord; - - /// Parameters structure - struct Params { - - // - // Data members - // - - TensorView view; - Element threshold; - - - // - // Methods - // - - Params( - TensorView view_ = TensorView(), - Element threshold_ = Element(0) - ): - view(view_), threshold(threshold_) { - - } - }; - - // - // Data members - // - - Params params; - - // - // Methods - // - - CUTLASS_DEVICE - TensorReLuFunc(Params const ¶ms): params(params) { - - } - - CUTLASS_DEVICE - void operator()(TensorCoord const &coord) { - - Element const & value = params.view.at(coord); - params.view.at(coord) = (value < params.threshold) ? 
params.threshold : value; - } -}; - -} // namespace detail - -/////////////////////////////////////////////////////////////////////////////////////////////////// - -/// Apply ReLu on a tensor -template < - typename Element, ///< Element type - typename Layout> ///< Layout function -void TensorReLu( - TensorView view, ///< destination tensor - Element threshold = Element(0)) { ///< ReLu threshold - - using Func = detail::TensorReLuFunc; - using Params = typename Func::Params; - - TensorForEach( - view.extent(), - Params(view, threshold) - ); -} - -/////////////////////////////////////////////////////////////////////////////////////////////////// -/////////////////////////////////////////////////////////////////////////////////////////////////// - -} // namespace device -} // namespace reference -} // namespace cutlass diff --git a/csrc/sparse/cutlass/example/util/reference/device/thread/gemm.h b/csrc/sparse/cutlass/example/util/reference/device/thread/gemm.h deleted file mode 100644 index 04775a746ad16..0000000000000 --- a/csrc/sparse/cutlass/example/util/reference/device/thread/gemm.h +++ /dev/null @@ -1,186 +0,0 @@ -/*************************************************************************************************** - * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. - * SPDX-License-Identifier: BSD-3-Clause - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * - * 1. Redistributions of source code must retain the above copyright notice, this - * list of conditions and the following disclaimer. - * - * 2. Redistributions in binary form must reproduce the above copyright notice, - * this list of conditions and the following disclaimer in the documentation - * and/or other materials provided with the distribution. - * - * 3. Neither the name of the copyright holder nor the names of its - * contributors may be used to endorse or promote products derived from - * this software without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" - * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE - * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE - * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL - * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR - * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER - * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, - * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - * - **************************************************************************************************/ -/*! \file - \brief Reference implementation for GEMM in host-side code. -*/ - -#pragma once - -#include "cutlass/coord.h" -#include "cutlass/tensor_view.h" -#include "cutlass/gemm/gemm.h" - -namespace cutlass { -namespace reference { -namespace device { -namespace thread { - -//////////////////////////////////////////////////////////////////////////////////////////////////// - -/// Thread-level blocked general matrix product. -// -// Note, this is a reference implementation. Performance is not expected to approach peak. 
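// --- Illustrative sketch (not part of the deleted CUTLASS sources) --------------------------
// The thread-level Gemm defined next is a plain reference: accumulate A(i,k) * B(k,j) over k,
// then apply the linear epilogue D = alpha * accum + beta * C. The same computation for
// row-major host buffers of an ordinary arithmetic element type (illustrative names, no
// tiling):
template <typename Element, typename Accumulator = float>
void reference_gemm(int m, int n, int k, Element alpha, Element const* A, Element const* B,
                    Element beta, Element const* C, Element* D) {
  for (int i = 0; i < m; ++i) {
    for (int j = 0; j < n; ++j) {
      Accumulator acc = Accumulator(0);
      for (int kk = 0; kk < k; ++kk) {
        acc += Accumulator(A[i * k + kk]) * Accumulator(B[kk * n + j]);  // inner product over k
      }
      D[i * n + j] = alpha * Element(acc) + beta * C[i * n + j];  // linear scaling epilogue
    }
  }
}
// ---------------------------------------------------------------------------------------------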
-// -template < - typename TensorRefA, - typename TensorRefB, - typename TensorRefC, - typename ScalarType, - typename AccumulatorType, - typename OutputTile, - typename InnerProductOp = multiply_add, - typename ConvertOp = NumericConverter -> -struct Gemm { - - using ElementA = typename TensorRefA::Element; - using ElementB = typename TensorRefB::Element; - using ElementC = typename TensorRefC::Element; - - // - // Data members - // - - /// Tile for A operand - ElementA A_tile[OutputTile::kColumn]; - - /// Tile for B operand - ElementB B_tile[OutputTile::kRow]; - - /// Tile for Accumulator - AccumulatorType accum[OutputTile::kColumn][OutputTile::kRow]; - - // - // Methods - // - - /// Constructor - CUTLASS_HOST_DEVICE - Gemm(AccumulatorType initial_accum = AccumulatorType(0)) { - - // Clear fetch registers - for (int i = 0; i < OutputTile::kColumn; ++i) { - A_tile[i] = ElementA(0); - } - - for (int j = 0; j < OutputTile::kRow; ++j) { - B_tile[j] = ElementB(0); - } - - // Clear accumulators - CUTLASS_PRAGMA_UNROLL - for (int j = 0; j < OutputTile::kColumn; ++j) { - CUTLASS_PRAGMA_UNROLL - for (int i = 0; i < OutputTile::kRow; ++i) { - accum[j][i] = initial_accum; - } - } - } - - /// Computes a matrix product - CUTLASS_HOST_DEVICE - Gemm & multiply_add( - gemm::GemmCoord problem_size, - TensorRefA tensor_a, - TensorRefB tensor_b, - MatrixCoord output_coord = MatrixCoord()) { - - InnerProductOp inner_product_op; - - // Loop over the GEMM K dimension - CUTLASS_PRAGMA_NO_UNROLL - for (int k = 0; k < problem_size.k(); ++k) { - - // Fetch a slice of the A matrix - CUTLASS_PRAGMA_UNROLL - for (int i = 0; i < OutputTile::kColumn; ++i) { - if (output_coord.row() + i < problem_size.m()) { - A_tile[i] = tensor_a.at(make_Coord(output_coord.row() + i, k)); - } - } - - // Fetch a slice of the B matrix - CUTLASS_PRAGMA_UNROLL - for (int j = 0; j < OutputTile::kRow; ++j) { - if (output_coord.column() + j < problem_size.n()) { - B_tile[j] = tensor_b.at(make_Coord(k, output_coord.column() + j)); - } - } - - // Compute an accumulated matrix product - CUTLASS_PRAGMA_UNROLL - for (int j = 0; j < OutputTile::kRow; ++j) { - CUTLASS_PRAGMA_UNROLL - for (int i = 0; i < OutputTile::kColumn; ++i) { - accum[j][i] = inner_product_op(A_tile[i], B_tile[j], accum[j][i]); - } - } - } - - return *this; - } - - /// Performs linear scaling of matrix product and updates output tensor - CUTLASS_HOST_DEVICE - Gemm & epilogue( - gemm::GemmCoord problem_size, - ScalarType alpha, - ScalarType beta, - TensorRefC tensor_c, - TensorRefC tensor_d, - MatrixCoord output_coord = MatrixCoord()) { - - ConvertOp convert_op; - - // Update the output tensor - for (int j = 0; j < OutputTile::kRow; ++j) { - for (int i = 0; i < OutputTile::kColumn; ++i) { - MatrixCoord coord = output_coord + MatrixCoord(i, j); - if (coord.row() < problem_size.m() && coord.column() < problem_size.n()) { - - tensor_d.at(coord) = convert_op( - alpha * ScalarType(accum[j][i]) + - beta * ScalarType(tensor_c.at(coord)) - ); - } - } - } - - return *this; - } -}; - -//////////////////////////////////////////////////////////////////////////////////////////////////// - -} // namespace thread -} // namespace device -} // namespace reference -} // namespace cutlass diff --git a/csrc/sparse/cutlass/example/util/reference/host/conv.hpp b/csrc/sparse/cutlass/example/util/reference/host/conv.hpp deleted file mode 100644 index 545dbba9a4e89..0000000000000 --- a/csrc/sparse/cutlass/example/util/reference/host/conv.hpp +++ /dev/null @@ -1,698 +0,0 @@ 
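// --- Illustrative sketch (not part of the deleted CUTLASS sources) --------------------------
// The conv.hpp reference removed below computes fprop/dgrad/wgrad with plain nested loops. The
// heart of the fprop path is the output-to-input coordinate map
//   w = q * stride - pad + s * dilation,
// with out-of-bounds taps skipped (zero padding). A 1D, single-channel host version of that
// idea, with illustrative names and float accumulation:
void reference_conv1d_fprop(int W, int Q, int S, int stride, int pad, int dilation,
                            float const* activation,  // [W]
                            float const* filter,      // [S]
                            float* output) {          // [Q]
  for (int q = 0; q < Q; ++q) {
    float acc = 0.0f;
    for (int s = 0; s < S; ++s) {
      int w = q * stride - pad + s * dilation;  // map output position q to input position w
      if (w >= 0 && w < W) {                    // skip taps that fall into the padding
        acc += activation[w] * filter[s];
      }
    }
    output[q] = acc;
  }
}
// ---------------------------------------------------------------------------------------------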
-/*************************************************************************************************** - * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. - * SPDX-License-Identifier: BSD-3-Clause - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * - * 1. Redistributions of source code must retain the above copyright notice, this - * list of conditions and the following disclaimer. - * - * 2. Redistributions in binary form must reproduce the above copyright notice, - * this list of conditions and the following disclaimer in the documentation - * and/or other materials provided with the distribution. - * - * 3. Neither the name of the copyright holder nor the names of its - * contributors may be used to endorse or promote products derived from - * this software without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" - * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE - * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE - * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL - * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR - * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER - * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, - * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - * - **************************************************************************************************/ -/*! \file - \brief Reference implementation for CONV in host-side code. 
-*/ -#pragma once - -///////////////////////////////////////////////////////////////////////////////////////////////// - -#include "cutlass/complex.h" -#include "cutlass/numeric_conversion.h" -#include "cutlass/epilogue/thread/activation.h" - -#include "cute/tensor.hpp" - -#include - -///////////////////////////////////////////////////////////////////////////////////////////////// - -namespace cutlass::reference::host { - -///////////////////////////////////////////////////////////////////////////////////////////////// - -namespace detail { - -template -bool -is_activation_in_bounds( - cute::Tensor const& activation, - int32_t n_, int32_t d_, int32_t h_, int32_t w_, int32_t c_) { - return ((n_ >= 0 && n_ < size<4>(activation)) && - (d_ >= 0 && d_ < size<3>(activation)) && - (h_ >= 0 && h_ < size<2>(activation)) && - (w_ >= 0 && w_ < size<1>(activation)) && - (c_ >= 0 && c_ < size<0>(activation))); -} - -template -bool -is_activation_in_bounds( - cute::Tensor const& activation, - int32_t n_, int32_t h_, int32_t w_, int32_t c_) { - return ((n_ >= 0 && n_ < size<3>(activation)) && - (h_ >= 0 && h_ < size<2>(activation)) && - (w_ >= 0 && w_ < size<1>(activation)) && - (c_ >= 0 && c_ < size<0>(activation))); -} - -template -bool -is_activation_in_bounds( - cute::Tensor const& activation, - int32_t n_, int32_t w_, int32_t c_) { - return ((n_ >= 0 && n_ < size<2>(activation)) && - (w_ >= 0 && w_ < size<1>(activation)) && - (c_ >= 0 && c_ < size<0>(activation))); -} - -} // namespace detail - -template< - class ElementAcc_, - class ElementScalar_, - class ElementCompute_, - class ElementC_, - class ElementOut_, - class TensorAlpha_, - class TensorBeta_, - class TensorBias_, - class ActivationFunctor_ = cutlass::epilogue::thread::Identity -> -struct ConvEpilogueFusionParams { - using ElementAcc = ElementAcc_; - using ElementScalar = ElementScalar_; - using ElementCompute = ElementCompute_; - using ElementC = ElementC_; - using ElementOut = ElementOut_; - using TensorAlpha = TensorAlpha_; - using TensorBeta = TensorBeta_; - using TensorBias = TensorBias_; - using ActivationFunctor = ActivationFunctor_; - ElementScalar alpha = ElementScalar(1); - ElementScalar beta = ElementScalar(0); - - TensorAlpha tensor_alpha{}; - TensorBeta tensor_beta{}; - TensorBias tensor_bias{}; -}; - -template< - cutlass::conv::Operator ConvOp, - int NumSpatialDims, - class TensorA, - class TensorB, - class TensorC, - class TensorD, - class ShapePadding, - class StrideTraversal, - class ShapeDilation, - class EpilogueFusionParams -> -struct ConvReferenceImpl { - // Hard code accumlulator type to float to avoid data lost in accumulating add. 
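// --- Illustrative sketch (not part of the deleted CUTLASS sources) --------------------------
// "Hard code accumulator type to float" above means: whatever the element type of A and B, the
// per-output sum is carried in float (double only for double inputs), so low-order bits are
// not discarded while accumulating many small products. This is the usual wide-accumulator
// argument; with float elements and a long running sum, for example, a double accumulator
// keeps contributions that a float accumulator eventually rounds away (illustrative,
// host-only):
float sum_in_element_type(float const* x, int n) {
  float acc = 0.0f;  // accumulator as narrow as the elements: rounding error grows with n
  for (int i = 0; i < n; ++i) acc += x[i];
  return acc;
}

float sum_in_wide_type(float const* x, int n) {
  double acc = 0.0;  // wider accumulator: small addends are not lost against a large sum
  for (int i = 0; i < n; ++i) acc += x[i];
  return static_cast<float>(acc);
}
// ---------------------------------------------------------------------------------------------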
- using ElementAcc = cutlass::platform::conditional_t, double, float>; - using ElementC = typename EpilogueFusionParams::ElementC; - using ElementOut = typename EpilogueFusionParams::ElementOut; - using ElementScalar = typename EpilogueFusionParams::ElementScalar; - using ElementCompute = typename EpilogueFusionParams::ElementCompute; - using ElementBias = typename EpilogueFusionParams::TensorBias::value_type; - using ActivationFunctor = typename EpilogueFusionParams::ActivationFunctor; - - // Input related converter - NumericConverter acc_converter; - NumericConverter residual_converter; - NumericConverter bias_converter; - // Scale related converter - NumericConverter scale_converter; - // Output related converter - NumericConverter output_converter; - - EpilogueFusionParams& epi_fusion_params_; - TensorA const& tensor_a_; - TensorB const& tensor_b_; - TensorC const& tensor_c_; - TensorD& tensor_d_; - - ShapePadding const& padding_; - StrideTraversal const& tstride_; - ShapeDilation const& dilation_; - - // Epilogue activation operation - ActivationFunctor epi_activation; - - ConvReferenceImpl( - TensorA const& tensor_a, - TensorB const& tensor_b, - TensorC const& tensor_c, - TensorD& tensor_d, - ShapePadding const& padding, - StrideTraversal const& tstride, - ShapeDilation const& dilation, - EpilogueFusionParams& epi_fusion_params) - : tensor_a_(tensor_a), - tensor_b_(tensor_b), - tensor_c_(tensor_c), - tensor_d_(tensor_d), - padding_(padding), - tstride_(tstride), - dilation_(dilation), - epi_fusion_params_(epi_fusion_params) - { - static_assert(rank(ShapePadding{}) == rank(ShapeDilation{})); - static_assert(rank(ShapePadding{}) == rank(StrideTraversal{})); - } - - void compute_reference() { - if constexpr (ConvOp == cutlass::conv::Operator::kFprop) { - fprop_reference(cute::Int{}); - } - else if constexpr (ConvOp == cutlass::conv::Operator::kDgrad) { - dgrad_reference(cute::Int{}); - } - else { - wgrad_reference(cute::Int{}); - } - } - -private: - // Specialization for 1D fprop kernel - void fprop_reference(cute::Int<1> spatial_dims) { - int32_t N = size<2>(tensor_d_); - int32_t Q = size<1>(tensor_d_); - int32_t K = size<0>(tensor_d_); - int32_t S = size<1>(tensor_b_); - int32_t C = size<0>(tensor_b_); - -#if defined(_OPENMP) - #pragma omp parallel for collapse(2) -#endif - for (int32_t n = 0; n < N; ++n) { - for (int32_t q = 0; q < Q; ++q) { - for (int32_t k = 0; k < K; ++k) { - auto accumulator = ElementAcc(0); - for (int32_t s = 0; s < S; ++s) { - for (int32_t c = 0; c < C; ++c) { - int32_t w = q * cute::get<0>(tstride_) - cute::get<0>(padding_) + s * cute::get<0>(dilation_); - if (detail::is_activation_in_bounds(tensor_a_, n, w, c)) { - auto a = tensor_a_(c, w, n); - auto b = tensor_b_(c, s, k); - accumulator += ElementAcc(a * b); - } - } - } - ElementScalar alpha = raw_pointer_cast(epi_fusion_params_.tensor_alpha.data()) ? - epi_fusion_params_.tensor_alpha[k] : epi_fusion_params_.alpha; - ElementScalar beta = raw_pointer_cast(epi_fusion_params_.tensor_beta.data()) ? 
- epi_fusion_params_.tensor_beta[k] : epi_fusion_params_.beta; - ElementCompute output = scale_converter(alpha) * acc_converter(accumulator) + - scale_converter(beta) * residual_converter(tensor_c_(k, q, n)); - if (raw_pointer_cast(epi_fusion_params_.tensor_bias.data())) { - output += bias_converter(epi_fusion_params_.tensor_bias[k]); - } - output = epi_activation(output); - tensor_d_(k, q, n) = output_converter(output); - } - } - } - - } - - // Specialization for 2D fprop kernel - void fprop_reference(cute::Int<2> spatial_dims) { - int32_t N = size<3>(tensor_d_); - int32_t P = size<2>(tensor_d_); - int32_t Q = size<1>(tensor_d_); - int32_t K = size<0>(tensor_d_); - int32_t R = size<2>(tensor_b_); - int32_t S = size<1>(tensor_b_); - int32_t C = size<0>(tensor_b_); - -#if defined(_OPENMP) - #pragma omp parallel for collapse(3) -#endif - for (int32_t n = 0; n < N; ++n) { - for (int32_t p = 0; p < P; ++p) { - for (int32_t q = 0; q < Q; ++q) { - for (int32_t k = 0; k < K; ++k) { - auto accumulator = ElementAcc(0); - for (int32_t r = 0; r < R; ++r) { - for (int32_t s = 0; s < S; ++s) { - for (int32_t c = 0; c < C; ++c) { - int32_t w = q * cute::get<0>(tstride_) - cute::get<0>(padding_) + s * cute::get<0>(dilation_); - int32_t h = p * cute::get<1>(tstride_) - cute::get<1>(padding_) + r * cute::get<1>(dilation_); - if (detail::is_activation_in_bounds(tensor_a_, n, h, w, c)) { - auto a = tensor_a_(c, w, h, n); - auto b = tensor_b_(c, s, r, k); - accumulator += ElementAcc(a * b); - } - } - } - } - ElementScalar alpha = raw_pointer_cast(epi_fusion_params_.tensor_alpha.data()) ? - epi_fusion_params_.tensor_alpha[k] : epi_fusion_params_.alpha; - ElementScalar beta = raw_pointer_cast(epi_fusion_params_.tensor_beta.data()) ? - epi_fusion_params_.tensor_beta[k] : epi_fusion_params_.beta; - ElementCompute output = scale_converter(alpha) * acc_converter(accumulator) + - scale_converter(beta) * residual_converter(tensor_c_(k, q, p, n)); - if (raw_pointer_cast(epi_fusion_params_.tensor_bias.data())) { - output += bias_converter(epi_fusion_params_.tensor_bias[k]); - } - output = epi_activation(output); - tensor_d_(k, q, p, n) = output_converter(output); - } - } - } - } - - } - - // Specialization for 3D fprop kernel - void fprop_reference(cute::Int<3> spatial_dims) { - int32_t N = size<4>(tensor_d_); - int32_t Z = size<3>(tensor_d_); - int32_t P = size<2>(tensor_d_); - int32_t Q = size<1>(tensor_d_); - int32_t K = size<0>(tensor_d_); - int32_t T = size<3>(tensor_b_); - int32_t R = size<2>(tensor_b_); - int32_t S = size<1>(tensor_b_); - int32_t C = size<0>(tensor_b_); - -#if defined(_OPENMP) - #pragma omp parallel for collapse(3) -#endif - for (int32_t n = 0; n < N; ++n) { - for (int32_t z = 0; z < Z; ++z) { - for (int32_t p = 0; p < P; ++p) { - for (int32_t q = 0; q < Q; ++q) { - for (int32_t k = 0; k < K; ++k) { - auto accumulator = ElementAcc(0); - for (int32_t t = 0; t < T; ++t) { - for (int32_t r = 0; r < R; ++r) { - for (int32_t s = 0; s < S; ++s) { - for (int32_t c = 0; c < C; ++c) { - int32_t w = q * cute::get<0>(tstride_) - cute::get<0>(padding_) + s * cute::get<0>(dilation_); - int32_t h = p * cute::get<1>(tstride_) - cute::get<1>(padding_) + r * cute::get<1>(dilation_); - int32_t d = z * cute::get<2>(tstride_) - cute::get<2>(padding_) + t * cute::get<2>(dilation_); - if (detail::is_activation_in_bounds(tensor_a_, n, d, h, w, c)) { - auto a = tensor_a_(c, w, h, d, n); - auto b = tensor_b_(c, s, r, t, k); - accumulator += ElementAcc(a * b); - } - } - } - } - } - ElementScalar alpha = 
raw_pointer_cast(epi_fusion_params_.tensor_alpha.data()) ? - epi_fusion_params_.tensor_alpha[k] : epi_fusion_params_.alpha; - ElementScalar beta = raw_pointer_cast(epi_fusion_params_.tensor_beta.data()) ? - epi_fusion_params_.tensor_beta[k] : epi_fusion_params_.beta; - ElementCompute output = scale_converter(alpha) * acc_converter(accumulator) + - scale_converter(beta) * residual_converter(tensor_c_(k, q, p, z, n)); - if (raw_pointer_cast(epi_fusion_params_.tensor_bias.data())) { - output += bias_converter(epi_fusion_params_.tensor_bias[k]); - } - output = epi_activation(output); - tensor_d_(k, q, p, z, n) = output_converter(output); - } - } - } - } - } - - } - - // Specialization for 1D dgrad kernel - void dgrad_reference(cute::Int<1> spatial_dims) { - int32_t N = size<2>(tensor_d_); - int32_t W = size<1>(tensor_d_); - int32_t C = size<0>(tensor_d_); - int32_t K = size<2>(tensor_b_); - int32_t S = size<1>(tensor_b_); - -#if defined(_OPENMP) - #pragma omp parallel for collapse(2) -#endif - for (int32_t n = 0; n < N; ++n) { - for (int32_t w = 0; w < W; ++w) { - for (int32_t c = 0; c < C; ++c) { - auto accumulator = ElementAcc(0); - for (int32_t k = 0; k < K; ++k) { - for (int32_t s = 0; s < S; ++s) { - int32_t q = w + cute::get<0>(padding_) - s * cute::get<0>(dilation_); - - if (q % cute::get<0>(tstride_) == 0) { - q /= cute::get<0>(tstride_); - } else { - continue; - } - - if (detail::is_activation_in_bounds(tensor_a_, n, q, k)) { - accumulator += ElementAcc(tensor_a_(k, q, n) * tensor_b_(c, s, k)); - } - } - } - ElementScalar alpha = raw_pointer_cast(epi_fusion_params_.tensor_alpha.data()) - ? epi_fusion_params_.tensor_alpha[c] : epi_fusion_params_.alpha; - ElementScalar beta = raw_pointer_cast(epi_fusion_params_.tensor_beta.data()) - ? epi_fusion_params_.tensor_beta[c] : epi_fusion_params_.beta; - ElementCompute output = scale_converter(alpha) * acc_converter(accumulator) + - scale_converter(beta) * residual_converter(tensor_c_(c, w, n)); - if (raw_pointer_cast(epi_fusion_params_.tensor_bias.data())) { - output += bias_converter(epi_fusion_params_.tensor_bias[c]); - } - output = epi_activation(output); - tensor_d_(c, w, n) = output_converter(output); - } - } - } - - } - - // Specialization for 2D dgrad kernel - void dgrad_reference(cute::Int<2> spatial_dims) { - int32_t N = size<3>(tensor_d_); - int32_t H = size<2>(tensor_d_); - int32_t W = size<1>(tensor_d_); - int32_t C = size<0>(tensor_d_); - int32_t K = size<3>(tensor_b_); - int32_t R = size<2>(tensor_b_); - int32_t S = size<1>(tensor_b_); - -#if defined(_OPENMP) - #pragma omp parallel for collapse(3) -#endif - for (int32_t n = 0; n < N; ++n) { - for (int32_t h = 0; h < H; ++h) { - for (int32_t w = 0; w < W; ++w) { - for (int32_t c = 0; c < C; ++c) { - auto accumulator = ElementAcc(0); - for (int32_t k = 0; k < K; ++k) { - for (int32_t r = 0; r < R; ++r) { - for (int32_t s = 0; s < S; ++s) { - int32_t q = w + cute::get<0>(padding_) - s * cute::get<0>(dilation_); - int32_t p = h + cute::get<1>(padding_) - r * cute::get<1>(dilation_); - - if (q % cute::get<0>(tstride_) == 0) { - q /= cute::get<0>(tstride_); - } else { - continue; - } - - if (p % cute::get<1>(tstride_) == 0) { - p /= cute::get<1>(tstride_); - } else { - continue; - } - - if (detail::is_activation_in_bounds(tensor_a_, n, p, q, k)) { - accumulator += ElementAcc(tensor_a_(k, q, p, n) * tensor_b_(c, s, r, k)); - } - } - } - } - ElementScalar alpha = raw_pointer_cast(epi_fusion_params_.tensor_alpha.data()) - ? 
epi_fusion_params_.tensor_alpha[c] : epi_fusion_params_.alpha; - ElementScalar beta = raw_pointer_cast(epi_fusion_params_.tensor_beta.data()) - ? epi_fusion_params_.tensor_beta[c] : epi_fusion_params_.beta; - ElementCompute output = scale_converter(alpha) * acc_converter(accumulator) + - scale_converter(beta) * residual_converter(tensor_c_(c, w, h, n)); - if (raw_pointer_cast(epi_fusion_params_.tensor_bias.data())) { - output += bias_converter(epi_fusion_params_.tensor_bias[c]); - } - output = epi_activation(output); - - tensor_d_(c, w, h, n) = output_converter(output); - } - } - } - } - - } - - // Specialization for 3D dgrad kernel - void dgrad_reference(cute::Int<3> spatial_dims) { - int32_t N = size<4>(tensor_d_); - int32_t D = size<3>(tensor_d_); - int32_t H = size<2>(tensor_d_); - int32_t W = size<1>(tensor_d_); - int32_t C = size<0>(tensor_d_); - int32_t K = size<4>(tensor_b_); - int32_t T = size<3>(tensor_b_); - int32_t R = size<2>(tensor_b_); - int32_t S = size<1>(tensor_b_); - -#if defined(_OPENMP) - #pragma omp parallel for collapse(3) -#endif - for (int32_t n = 0; n < N; ++n) { - for (int32_t d = 0; d < D; ++d) { - for (int32_t h = 0; h < H; ++h) { - for (int32_t w = 0; w < W; ++w) { - for (int32_t c = 0; c < C; ++c) { - auto accumulator = ElementAcc(0); - for (int32_t k = 0; k < K; ++k) { - for (int32_t t = 0; t < T; ++t) { - for (int32_t r = 0; r < R; ++r) { - for (int32_t s = 0; s < S; ++s) { - int32_t q = w + cute::get<0>(padding_) - s * cute::get<0>(dilation_); - int32_t p = h + cute::get<1>(padding_) - r * cute::get<1>(dilation_); - int32_t z = d + cute::get<2>(padding_) - t * cute::get<2>(dilation_); - - if (q % cute::get<0>(tstride_) == 0) { - q /= cute::get<0>(tstride_); - } else { - continue; - } - - if (p % cute::get<1>(tstride_) == 0) { - p /= cute::get<1>(tstride_); - } else { - continue; - } - - if (z % cute::get<2>(tstride_) == 0) { - z /= cute::get<2>(tstride_); - } else { - continue; - } - - if (detail::is_activation_in_bounds(tensor_a_, n, z, p, q, k)) { - accumulator += ElementAcc(tensor_a_(k, q, p, z, n) * tensor_b_(c, s, r, t, k)); - } - } - } - } - } - ElementScalar alpha = raw_pointer_cast(epi_fusion_params_.tensor_alpha.data()) - ? epi_fusion_params_.tensor_alpha[c] : epi_fusion_params_.alpha; - ElementScalar beta = raw_pointer_cast(epi_fusion_params_.tensor_beta.data()) - ? epi_fusion_params_.tensor_beta[c] : epi_fusion_params_.beta; - ElementCompute output = scale_converter(alpha) * acc_converter(accumulator) + - scale_converter(beta) * residual_converter(tensor_c_(c, w, h, d, n)); - if (raw_pointer_cast(epi_fusion_params_.tensor_bias.data())) { - output += bias_converter(epi_fusion_params_.tensor_bias[c]); - } - output = epi_activation(output); - tensor_d_(c, w, h, d, n) = output_converter(output); - } - } - } - } - } - - } - - // Specialization for 1D wgrad kernel - void wgrad_reference(cute::Int<1> spatial_dims) { - int32_t N = - size<2>(tensor_a_); - int32_t Q = - size<1>(tensor_a_); - int32_t K = - size<0>(tensor_a_); - int32_t S = size<1>(tensor_d_); - int32_t C = size<0>(tensor_d_); - -#if defined(_OPENMP) - #pragma omp parallel for collapse(2) -#endif - for (int32_t k = 0; k < K; ++k) { - ElementScalar alpha = raw_pointer_cast(epi_fusion_params_.tensor_alpha.data()) ? - epi_fusion_params_.tensor_alpha[k] : epi_fusion_params_.alpha; - ElementScalar beta = raw_pointer_cast(epi_fusion_params_.tensor_beta.data()) ? 
- epi_fusion_params_.tensor_beta[k] : epi_fusion_params_.beta; - for (int32_t s = 0; s < S; ++s) { - for (int32_t c = 0; c < C; ++c) { - auto accumulator = ElementAcc(0); - for (int32_t n = 0; n < N; ++n) { - for (int32_t q = 0; q < Q; ++q) { - int32_t w = q * cute::get<0>(tstride_) - cute::get<0>(padding_) + s * cute::get<0>(dilation_); - bool is_in_bounds = - detail::is_activation_in_bounds(tensor_b_, n, w, c); - if (is_in_bounds) { - auto act = - tensor_b_(c, w, n); - auto xformed_act = - tensor_a_(k, q, n); - accumulator += ElementAcc(act * xformed_act); - } - } - } - ElementCompute output = scale_converter(alpha) * acc_converter(accumulator) + - scale_converter(beta) * residual_converter(tensor_c_(c, s, k)); - if (raw_pointer_cast(epi_fusion_params_.tensor_bias.data())) { - output += bias_converter(epi_fusion_params_.tensor_bias[k]); - } - output = epi_activation(output); - tensor_d_(c, s, k) = output_converter(output); - } - } - } - } - - // Specialization for 2D wgrad kernel - void wgrad_reference(cute::Int<2> spatial_dims) { - int32_t N = - size<3>(tensor_a_); - int32_t P = - size<2>(tensor_a_); - int32_t Q = - size<1>(tensor_a_); - int32_t K = - size<0>(tensor_a_); - int32_t R = size<2>(tensor_d_); - int32_t S = size<1>(tensor_d_); - int32_t C = size<0>(tensor_d_); - -#if defined(_OPENMP) - #pragma omp parallel for collapse(3) -#endif - for (int32_t k = 0; k < K; ++k) { - ElementScalar alpha = raw_pointer_cast(epi_fusion_params_.tensor_alpha.data()) ? - epi_fusion_params_.tensor_alpha[k] : epi_fusion_params_.alpha; - ElementScalar beta = raw_pointer_cast(epi_fusion_params_.tensor_beta.data()) ? - epi_fusion_params_.tensor_beta[k] : epi_fusion_params_.beta; - for (int32_t r = 0; r < R; ++r) { - for (int32_t s = 0; s < S; ++s) { - for (int32_t c = 0; c < C; ++c) { - auto accumulator = ElementAcc(0); - for (int32_t n = 0; n < N; ++n) { - for (int32_t p = 0; p < P; ++p) { - for (int32_t q = 0; q < Q; ++q) { - int32_t w = q * cute::get<0>(tstride_) - cute::get<0>(padding_) + s * cute::get<0>(dilation_); - int32_t h = p * cute::get<1>(tstride_) - cute::get<1>(padding_) + r * cute::get<1>(dilation_); - bool is_in_bounds = - detail::is_activation_in_bounds(tensor_b_, n, h, w, c); - if (is_in_bounds) { - auto act = - tensor_b_(c, w, h, n); - auto xformed_act = - tensor_a_(k, q, p, n); - accumulator += ElementAcc(act * xformed_act); - } - } - } - } - ElementCompute output = scale_converter(alpha) * acc_converter(accumulator) + - scale_converter(beta) * residual_converter(tensor_c_(c, s, r, k)); - if (raw_pointer_cast(epi_fusion_params_.tensor_bias.data())) { - output += bias_converter(epi_fusion_params_.tensor_bias[k]); - } - output = epi_activation(output); - tensor_d_(c, s, r, k) = output_converter(output); - } - } - } - } - } - - // Specialization for 3D wgrad kernel - void wgrad_reference(cute::Int<3> spatial_dims) { - int32_t N = - size<4>(tensor_a_); - int32_t Z = - size<3>(tensor_a_); - int32_t P = - size<2>(tensor_a_); - int32_t Q = - size<1>(tensor_a_); - int32_t K = - size<0>(tensor_a_); - int32_t T = size<3>(tensor_d_); - int32_t R = size<2>(tensor_d_); - int32_t S = size<1>(tensor_d_); - int32_t C = size<0>(tensor_d_); - -#if defined(_OPENMP) - #pragma omp parallel for collapse(3) -#endif - for (int32_t k = 0; k < K; ++k) { - ElementScalar alpha = raw_pointer_cast(epi_fusion_params_.tensor_alpha.data()) ? - epi_fusion_params_.tensor_alpha[k] : epi_fusion_params_.alpha; - ElementScalar beta = raw_pointer_cast(epi_fusion_params_.tensor_beta.data()) ? 
- epi_fusion_params_.tensor_beta[k] : epi_fusion_params_.beta; - for (int32_t t = 0; t < T; ++t) { - for (int32_t r = 0; r < R; ++r) { - for (int32_t s = 0; s < S; ++s) { - for (int32_t c = 0; c < C; ++c) { - auto accumulator = ElementAcc(0); - for (int32_t n = 0; n < N; ++n) { - for (int32_t z = 0; z < Z; ++z) { - for (int32_t p = 0; p < P; ++p) { - for (int32_t q = 0; q < Q; ++q) { - int32_t w = q * cute::get<0>(tstride_) - cute::get<0>(padding_) + s * cute::get<0>(dilation_); - int32_t h = p * cute::get<1>(tstride_) - cute::get<1>(padding_) + r * cute::get<1>(dilation_); - int32_t d = z * cute::get<2>(tstride_) - cute::get<2>(padding_) + t * cute::get<2>(dilation_); - bool is_in_bounds = - detail::is_activation_in_bounds(tensor_b_, n, d, h, w, c); - if (is_in_bounds) { - auto act = - tensor_b_(c, w, h, d, n); - auto xformed_act = - tensor_a_(k, q, p, z, n); - accumulator += ElementAcc(act * xformed_act); - } - } - } - } - } - ElementCompute output = scale_converter(alpha) * acc_converter(accumulator) + - scale_converter(beta) * residual_converter(tensor_c_(c, s, r, t, k)); - if (raw_pointer_cast(epi_fusion_params_.tensor_bias.data())) { - output += bias_converter(epi_fusion_params_.tensor_bias[k]); - } - output = epi_activation(output); - tensor_d_(c, s, r, t, k) = output_converter(output); - } - } - } - } - } - } -}; - -///////////////////////////////////////////////////////////////////////////////////////////////// - -} // cutlass::reference::host - -///////////////////////////////////////////////////////////////////////////////////////////////// diff --git a/csrc/sparse/cutlass/example/util/reference/host/convolution.h b/csrc/sparse/cutlass/example/util/reference/host/convolution.h deleted file mode 100644 index f28b4a658a388..0000000000000 --- a/csrc/sparse/cutlass/example/util/reference/host/convolution.h +++ /dev/null @@ -1,802 +0,0 @@ -/*************************************************************************************************** - * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. - * SPDX-License-Identifier: BSD-3-Clause - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * - * 1. Redistributions of source code must retain the above copyright notice, this - * list of conditions and the following disclaimer. - * - * 2. Redistributions in binary form must reproduce the above copyright notice, - * this list of conditions and the following disclaimer in the documentation - * and/or other materials provided with the distribution. - * - * 3. Neither the name of the copyright holder nor the names of its - * contributors may be used to endorse or promote products derived from - * this software without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" - * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE - * DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE - * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL - * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR - * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER - * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, - * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - * - **************************************************************************************************/ - -/*! \file - \brief Reference implementation for convolution in host-side code. -*/ - -#pragma once - -#include "cutlass/coord.h" -#include "cutlass/functional.h" -#include "cutlass/layout/tensor.h" -#include "cutlass/numeric_conversion.h" -#include "cutlass/numeric_types.h" -#include "cutlass/tensor_ref.h" -#include "cutlass/tensor_view.h" -#include "cutlass/conv/convolution.h" -#include "cutlass/conv/conv2d_problem_size.h" -#include "cutlass/conv/conv3d_problem_size.h" -#include - -namespace cutlass { -namespace reference { -namespace host { - -//////////////////////////////////////////////////////////////////////////////////////////////////// -/// Forward propagation -//////////////////////////////////////////////////////////////////////////////////////////////////// - -/// y = conv2d(x, w) -template < - typename ElementA, - typename LayoutA, - typename ElementB, - typename LayoutB, - typename ElementC, - typename LayoutC, - typename ElementCompute, - typename ElementAccumulator = ElementCompute, - typename ElementD = ElementC, - typename ConvertOp = NumericConverter, - typename InnerProductOp = multiply_add -> -void Conv2dFprop( - conv::Conv2dProblemSize problem_size, - TensorRef tensor_x, - TensorRef tensor_w, - TensorRef tensor_y_in, - TensorRef tensor_y_out, - ElementCompute alpha, - ElementCompute beta) { - - ConvertOp convert_op; - InnerProductOp inner_product_op; - - // Apply MMA and accumulate ElementAccumulator - for (int n = 0; n < problem_size.N; ++n) { - for (int p = 0; p < problem_size.P; ++p) { - for (int q = 0; q < problem_size.Q; ++q) { - for (int k = 0; k < problem_size.K; ++k) { - - int group_idx = k / (problem_size.K / problem_size.groups); - int channels_per_group = problem_size.C / problem_size.groups; - - ElementAccumulator acc = ElementAccumulator(); - - for (int r = 0; r < problem_size.R; ++r) { - for (int s = 0; s < problem_size.S; ++s) { - for (int c = 0; c < channels_per_group; ++c) { - - int filter_r = r; - int filter_s = s; - - if (problem_size.mode == cutlass::conv::Mode::kConvolution) { - filter_r = problem_size.R - 1 - r; - filter_s = problem_size.S - 1 - s; - } - - int h = p * problem_size.stride_h - problem_size.pad_h + filter_r * problem_size.dilation_h; - int w = q * problem_size.stride_w - problem_size.pad_w + filter_s * problem_size.dilation_w; - - if (h >= 0 && h < problem_size.H && w >= 0 && w < problem_size.W) { - - ElementA a = tensor_x.at({n, h, w, c + group_idx * channels_per_group}); - ElementB b = tensor_w.at({k, r, s, c}); - - acc = inner_product_op(ElementAccumulator(a), ElementAccumulator(b), acc); - - } - } - } - } - - // Apply Epilogue, compute ElementCompute, convert and store ElementC - ElementC c_ref = ElementC(); - - if (beta != ElementCompute()) { - c_ref = tensor_y_in.at(cutlass::make_Coord(n, p, q, k)); - } - - tensor_y_out.at(cutlass::make_Coord(n, p, q, k)) = - convert_op(alpha * 
ElementCompute(acc) + beta * ElementCompute(c_ref)); - } - } - } - } -} - -/// Depthwise-separable convolution -template , - typename InnerProductOp = multiply_add> -void Depsep_Fprop(cutlass::TensorView tensor_A, - cutlass::TensorView tensor_B, - cutlass::TensorView tensor_C, - cutlass::TensorView tensor_D, - ElementCompute alpha, - ElementCompute beta, - cutlass::Tensor4DCoord padding = cutlass::Tensor4DCoord(), - cutlass::Coord<2> conv_stride = cutlass::Coord<2>(), - cutlass::Coord<2> dilation = cutlass::Coord<2>(), - cutlass::conv::Mode mode = cutlass::conv::Mode::kCrossCorrelation) { - - ConvertOp convert_op; - InnerProductOp inner_product_op; - - // Apply MMA and accumulate ElementAccumulator - for (int n = 0; n < tensor_C.extent().n(); ++n) { - for (int p = 0; p < tensor_C.extent().h(); ++p) { - for (int q = 0; q < tensor_C.extent().w(); ++q) { - for (int g = 0; g < tensor_C.extent().c(); ++g) { - ElementAccumulator acc = ElementAccumulator(); - for (int r = 0; r < tensor_B.extent().h(); ++r) { - for (int s = 0; s < tensor_B.extent().w(); ++s) { - - // input activation H and W - int h = p * conv_stride[0] - padding[0] + r * dilation[0]; - int w = q * conv_stride[1] - padding[2] + s * dilation[1]; - - if (h < tensor_A.extent().h() && h >= 0 && w < tensor_A.extent().w() && w >= 0) { - ElementA a = tensor_A.at(cutlass::make_Coord(n, h, w, g)); - - ElementB b = (mode == cutlass::conv::Mode::kCrossCorrelation) - ? tensor_B.at(cutlass::make_Coord(g, r, s, 0)) - : tensor_B.at(cutlass::make_Coord( - g, tensor_B.extent().h() - r - 1, tensor_B.extent().w() - s - 1, 0)); - - acc = inner_product_op(ElementAccumulator(a), ElementAccumulator(b), acc); - } - } - } - - // Apply Epilogue, compute ElementCompute, convert and store ElementC - ElementC c_ref = tensor_C.at(cutlass::make_Coord(n, p, q, g)); - tensor_D.at(cutlass::make_Coord(n, p, q, g)) = - convert_op(alpha * ElementCompute(acc) + beta * ElementCompute(c_ref)); - } - } - } - } -} - -//////////////////////////////////////////////////////////////////////////////////////////////////// -/// Dgrad / Deconv -//////////////////////////////////////////////////////////////////////////////////////////////////// - -/// dx = dgrad(dy, w) -template < - typename ElementA, - typename LayoutA, - typename ElementB, - typename LayoutB, - typename ElementC, - typename LayoutC, - typename ElementCompute, - typename ElementAccumulator = ElementCompute, - typename ElementD = ElementC, - typename ConvertOp = NumericConverter, - typename InnerProductOp = multiply_add -> -void Conv2dDgrad( - cutlass::conv::Conv2dProblemSize problem_size, - TensorRef tensor_dy, - TensorRef tensor_w, - TensorRef tensor_dx_in, - TensorRef tensor_dx_out, - ElementCompute alpha, - ElementCompute beta, - bool is_deconv = false) { - - ConvertOp convert_op; - InnerProductOp inner_product_op; - - // Apply MMA and accumulate ElementAccumulator - for (int n = 0; n < problem_size.N; ++n) { - for (int h = 0; h < problem_size.H; ++h) { - for (int w = 0; w < problem_size.W; ++w) { - for (int c = 0; c < problem_size.C; ++c) { - - ElementAccumulator acc = ElementAccumulator(); - - for (int r = 0; r < problem_size.R; ++r) { - for (int s = 0; s < problem_size.S; ++s) { - for (int k = 0; k < problem_size.K; ++k) { - - int filter_r = r; - int filter_s = s; - - if (problem_size.mode == cutlass::conv::Mode::kConvolution) { - filter_r = problem_size.R - 1 - r; - filter_s = problem_size.S - 1 - s; - } - - int p = h + problem_size.pad_h - filter_r * problem_size.dilation_h; - int q = w + 
problem_size.pad_w - filter_s * problem_size.dilation_w; - - if (p >= 0 && (p % problem_size.stride_h) == 0 && - q >= 0 && (q % problem_size.stride_w) == 0) { - - p = p / problem_size.stride_h; - q = q / problem_size.stride_w; -#if 0 - std::cout << "row:" - << n * problem_size.H * problem_size.W + - h * problem_size.W + - w << " " - << "n, p, q: (" - << n << ", " - << p << ", " - << q << ") * " - << "r, s: (" - << r << ", " - << s << ") [" - << ((p < problem_size.P && q < problem_size.Q) ? "true":"false") << "]" - << std::endl; -#endif - if (p < problem_size.P && q < problem_size.Q) { - - ElementA a = tensor_dy.at(cutlass::make_Coord(n, p, q, k)); - ElementB b = is_deconv ? tensor_w.at(cutlass::make_Coord(c, r, s, k)) - : tensor_w.at(cutlass::make_Coord(k, r, s, c)); - - acc = inner_product_op(ElementAccumulator(a), ElementAccumulator(b), acc); - } - } - - } // for (K) - } // for (S) - } // for (R) - - // Apply Epilogue, compute ElementCompute, convert and store ElementC - ElementC c_ref = ElementC(); - - if (beta != ElementCompute()) { - c_ref = tensor_dx_in.at(cutlass::make_Coord(n, h, w, c)); - } - - tensor_dx_out.at(cutlass::make_Coord(n, h, w, c)) = - convert_op(alpha * ElementCompute(acc) + beta * ElementCompute(c_ref)); - - } // for (C) - } // for (W) - } // for (H) - } // for (N) -} - -//////////////////////////////////////////////////////////////////////////////////////////////////// -/// Wgrad -//////////////////////////////////////////////////////////////////////////////////////////////////// - -/// dw = wgrad(dy, x) -template < - typename ElementA, - typename LayoutA, - typename ElementB, - typename LayoutB, - typename ElementC, - typename LayoutC, - typename ElementCompute, - typename ElementAccumulator = ElementCompute, - typename ElementD = ElementC, - typename ConvertOp = NumericConverter, - typename InnerProductOp = multiply_add -> -void Conv2dWgrad( - cutlass::conv::Conv2dProblemSize problem_size, - TensorRef tensor_dy, - TensorRef tensor_x, - TensorRef tensor_dw_in, - TensorRef tensor_dw_out, - ElementCompute alpha, - ElementCompute beta) { - - InnerProductOp inner_product_op; - ConvertOp convert_op; - - // Apply MMA and accumulate ElementAccumulator - for (int k = 0; k < problem_size.K; ++k) { - for (int r = 0; r < problem_size.R; ++r) { - for (int s = 0; s < problem_size.S; ++s) { - for (int c = 0; c < problem_size.C; ++c) { - - ElementAccumulator acc = ElementAccumulator(); - - for (int n = 0; n < problem_size.N; ++n) { - for (int p = 0; p < problem_size.P; ++p) { - for (int q = 0; q < problem_size.Q; ++q) { - - cutlass::Tensor4DCoord b_coord; - - int filter_r = r; - int filter_s = s; - - if (problem_size.mode == cutlass::conv::Mode::kConvolution) { - filter_r = problem_size.R - 1 - r; - filter_s = problem_size.S - 1 - s; - } - - b_coord = make_Coord( - n, - p * problem_size.stride_h - problem_size.pad_h + filter_r * problem_size.dilation_h, - q * problem_size.stride_w - problem_size.pad_w + filter_s * problem_size.dilation_w, - c); - - if (b_coord.h() < problem_size.H && b_coord.h() >= 0 && - b_coord.w() < problem_size.W && b_coord.w() >= 0) { - - ElementAccumulator a = ElementAccumulator(tensor_dy.at(cutlass::make_Coord(n, p, q, k))); - ElementAccumulator b = ElementAccumulator(tensor_x.at(b_coord)); - acc = inner_product_op(a, b, acc); - } - } - } - } - - // Apply Epilogue, compute ElementCompute, convert and store ElementC - ElementC c_ref = ElementC(); - - if (beta != ElementCompute()) { - c_ref = tensor_dw_in.at(cutlass::make_Coord(k, r, s, c)); - } - - 
tensor_dw_out.at(cutlass::make_Coord(k, r, s, c)) = - convert_op(alpha * ElementCompute(acc) + beta * ElementCompute(c_ref)); - - } // for (C) - } // for (S) - } // for (R) - } // for (K) -} - -/// Generic 2D convolution targeting Conv2dFprop, Conv2dDgrad, and Conv2dWgrad. -template < - typename ElementA, - typename LayoutA, - typename ElementB, - typename LayoutB, - typename ElementC, - typename LayoutC, - typename ElementCompute, - typename ElementAccumulator = ElementCompute, - typename ElementD = ElementC, - typename ConvertOp = NumericConverter, - typename InnerProductOp = multiply_add -> -void Conv2d( - conv::Operator convolutional_operator, - conv::Conv2dProblemSize problem_size, - TensorRef tensor_A, - TensorRef tensor_B, - TensorRef tensor_C, - TensorRef tensor_D, - ElementCompute alpha, - ElementCompute beta) { - - switch (convolutional_operator) { - case conv::Operator::kFprop: - Conv2dFprop< - ElementA, LayoutA, - ElementB, LayoutB, - ElementC, LayoutC, - ElementCompute, - ElementAccumulator, - ElementD, - ConvertOp, InnerProductOp - >(problem_size, tensor_A, tensor_B, tensor_C, tensor_D, alpha, beta); - break; - - case conv::Operator::kDeconv: - case conv::Operator::kDgrad: - Conv2dDgrad< - ElementA, LayoutA, - ElementB, LayoutB, - ElementC, LayoutC, - ElementCompute, - ElementAccumulator, - ElementD, - ConvertOp, InnerProductOp - >(problem_size, tensor_A, tensor_B, tensor_C, tensor_D, alpha, beta, (convolutional_operator == conv::Operator::kDeconv)); - break; - - case conv::Operator::kWgrad: - Conv2dWgrad< - ElementA, LayoutA, - ElementB, LayoutB, - ElementC, LayoutC, - ElementCompute, - ElementAccumulator, - ElementD, - ConvertOp, InnerProductOp - >(problem_size, tensor_A, tensor_B, tensor_C, tensor_D, alpha, beta); - break; - - default: - break; - } -} - -//////////////////////////////////////////////////////////////////////////////////////////////////// -/// 3D convolution -//////////////////////////////////////////////////////////////////////////////////////////////////// - -/// y = conv3d(x, w) -template < - typename ElementA, - typename LayoutA, - typename ElementB, - typename LayoutB, - typename ElementC, - typename LayoutC, - typename ElementCompute, - typename ElementAccumulator = ElementCompute, - typename ConvertOp = NumericConverter, - typename InnerProductOp = multiply_add -> -void Conv3dFprop( - conv::Conv3dProblemSize problem_size, - TensorRef tensor_x, - TensorRef tensor_w, - TensorRef tensor_y_in, - TensorRef tensor_y_out, - ElementCompute alpha, - ElementCompute beta) { - - ConvertOp convert_op; - InnerProductOp inner_product_op; - - // Apply MMA and accumulate ElementAccumulator - for (int n = 0; n < problem_size.N; ++n) { - for (int z = 0; z < problem_size.Z; ++z) { - for (int p = 0; p < problem_size.P; ++p) { - for (int q = 0; q < problem_size.Q; ++q) { - for (int k = 0; k < problem_size.K; ++k) { - - ElementAccumulator acc = ElementAccumulator(); - - for (int t = 0; t < problem_size.T; ++t) { - for (int r = 0; r < problem_size.R; ++r) { - for (int s = 0; s < problem_size.S; ++s) { - for (int c = 0; c < problem_size.C; ++c) { - - int filter_t = t; - int filter_r = r; - int filter_s = s; - - if (problem_size.mode == cutlass::conv::Mode::kConvolution) { - filter_t = problem_size.T - 1 - t; - filter_r = problem_size.R - 1 - r; - filter_s = problem_size.S - 1 - s; - } - - int d = z * problem_size.stride_d - problem_size.pad_d + filter_t * problem_size.dilation_d; - int h = p * problem_size.stride_h - problem_size.pad_h + filter_r * problem_size.dilation_h; - 
int w = q * problem_size.stride_w - problem_size.pad_w + filter_s * problem_size.dilation_w; - - if (d >= 0 && d < problem_size.D && - h >=0 && h < problem_size.H && - w >= 0 && w < problem_size.W) { - - ElementA a = tensor_x.at({n, d, h, w, c}); - ElementB b = tensor_w.at({k, t, r, s, c}); - - acc = inner_product_op(ElementAccumulator(a), ElementAccumulator(b), acc); - } - } - } - } - } - - // Apply Epilogue, compute ElementCompute, convert and store ElementC - ElementC c_ref = ElementC(); - - if (beta != ElementCompute()) { - c_ref = tensor_y_in.at(cutlass::make_Coord(n, z, p, q, k)); - } - - tensor_y_out.at(cutlass::make_Coord(n, z, p, q, k)) = - convert_op(alpha * ElementCompute(acc) + beta * ElementCompute(c_ref)); - } - } - } - } - } -} - -//////////////////////////////////////////////////////////////////////////////////////////////////// -/// Dgrad / Deconv -//////////////////////////////////////////////////////////////////////////////////////////////////// - -/// dx = dgrad(dy, w) -template < - typename ElementA, - typename LayoutA, - typename ElementB, - typename LayoutB, - typename ElementC, - typename LayoutC, - typename ElementCompute, - typename ElementAccumulator = ElementCompute, - typename ConvertOp = NumericConverter, - typename InnerProductOp = multiply_add -> -void Conv3dDgrad( - cutlass::conv::Conv3dProblemSize problem_size, - TensorRef tensor_dy, - TensorRef tensor_w, - TensorRef tensor_dx_in, - TensorRef tensor_dx_out, - ElementCompute alpha, - ElementCompute beta, - bool is_deconv = false) { - - ConvertOp convert_op; - InnerProductOp inner_product_op; - - // Apply MMA and accumulate ElementAccumulator - for (int n = 0; n < problem_size.N; ++n) { - for (int d = 0; d < problem_size.D; ++d) { - for (int h = 0; h < problem_size.H; ++h) { - for (int w = 0; w < problem_size.W; ++w) { - for (int c = 0; c < problem_size.C; ++c) { - - ElementAccumulator acc = ElementAccumulator(); - - for (int t = 0; t < problem_size.T; ++t) { - for (int r = 0; r < problem_size.R; ++r) { - for (int s = 0; s < problem_size.S; ++s) { - for (int k = 0; k < problem_size.K; ++k) { - - int filter_t = t; - int filter_r = r; - int filter_s = s; - - if (problem_size.mode == cutlass::conv::Mode::kConvolution) { - filter_t = problem_size.T - 1 - t; - filter_r = problem_size.R - 1 - r; - filter_s = problem_size.S - 1 - s; - } - - int z = d + problem_size.pad_d - filter_t * problem_size.dilation_d; - int p = h + problem_size.pad_h - filter_r * problem_size.dilation_h; - int q = w + problem_size.pad_w - filter_s * problem_size.dilation_w; - - if (z >= 0 && (z % problem_size.stride_d) == 0 && - p >= 0 && (p % problem_size.stride_h) == 0 && - q >= 0 && (q % problem_size.stride_w) == 0) { - - z = z / problem_size.stride_d; - p = p / problem_size.stride_h; - q = q / problem_size.stride_w; - - if (z < problem_size.Z && p < problem_size.P && q < problem_size.Q) { - - ElementA a = tensor_dy.at(cutlass::make_Coord(n, z, p, q, k)); - ElementB b = is_deconv ? 
tensor_w.at(cutlass::make_Coord(c, t, r, s, k)) - : tensor_w.at(cutlass::make_Coord(k, t, r, s, c)); - acc = inner_product_op(ElementAccumulator(a), ElementAccumulator(b), acc); - } - } - - } // for (K) - } // for (S) - } // for (R) - } // for (T) - - // Apply Epilogue, compute ElementCompute, convert and store ElementC - ElementC c_ref = ElementC(); - - if (beta != ElementCompute()) { - c_ref = tensor_dx_in.at(cutlass::make_Coord(n, d, h, w, c)); - } - - tensor_dx_out.at(cutlass::make_Coord(n, d, h, w, c)) = - convert_op(alpha * ElementCompute(acc) + beta * ElementCompute(c_ref)); - - } // for (C) - } // for (W) - } // for (H) - } // for (D) - } // for (N) -} - -//////////////////////////////////////////////////////////////////////////////////////////////////// -/// Wgrad -//////////////////////////////////////////////////////////////////////////////////////////////////// - -/// dw = wgrad(dy, x) -template < - typename ElementA, - typename LayoutA, - typename ElementB, - typename LayoutB, - typename ElementC, - typename LayoutC, - typename ElementCompute, - typename ElementAccumulator = ElementCompute, - typename ConvertOp = NumericConverter, - typename InnerProductOp = multiply_add -> -void Conv3dWgrad( - cutlass::conv::Conv3dProblemSize problem_size, - TensorRef tensor_dy, - TensorRef tensor_x, - TensorRef tensor_dw_in, - TensorRef tensor_dw_out, - ElementCompute alpha, - ElementCompute beta) { - - InnerProductOp inner_product_op; - ConvertOp convert_op; - - // Apply MMA and accumulate ElementAccumulator - for (int k = 0; k < problem_size.K; ++k) { - for (int t = 0; t < problem_size.T; ++t) { - for (int r = 0; r < problem_size.R; ++r) { - for (int s = 0; s < problem_size.S; ++s) { - for (int c = 0; c < problem_size.C; ++c) { - - ElementAccumulator acc = ElementAccumulator(); - - for (int n = 0; n < problem_size.N; ++n) { - for (int z = 0; z < problem_size.Z; ++z) { - for (int p = 0; p < problem_size.P; ++p) { - for (int q = 0; q < problem_size.Q; ++q) { - - int filter_t = t; - int filter_r = r; - int filter_s = s; - - if (problem_size.mode == cutlass::conv::Mode::kConvolution) { - filter_t = problem_size.T - 1 - t; - filter_r = problem_size.R - 1 - r; - filter_s = problem_size.S - 1 - s; - } - - Tensor5DCoord b_coord = make_Coord( - n, - z * problem_size.stride_d - problem_size.pad_d + filter_t * problem_size.dilation_d, - p * problem_size.stride_h - problem_size.pad_h + filter_r * problem_size.dilation_h, - q * problem_size.stride_w - problem_size.pad_w + filter_s * problem_size.dilation_w, - c); - - if (b_coord.d() < problem_size.D && b_coord.d() >= 0 && - b_coord.h() < problem_size.H && b_coord.h() >= 0 && - b_coord.w() < problem_size.W && b_coord.w() >= 0) { - - ElementAccumulator a = ElementAccumulator(tensor_dy.at(cutlass::make_Coord(n, z, p, q, k))); - ElementAccumulator b = ElementAccumulator(tensor_x.at(b_coord)); - - acc = inner_product_op(a, b, acc); - } - } - } - } - } - - // Apply Epilogue, compute ElementCompute, convert and store ElementC - ElementC c_ref = ElementC(); - - if (beta != ElementCompute()) { - c_ref = tensor_dw_in.at(cutlass::make_Coord(k, t, r, s, c)); - } - - tensor_dw_out.at(cutlass::make_Coord(k, t, r, s, c)) = - convert_op(alpha * ElementCompute(acc) + beta * ElementCompute(c_ref)); - - } // for (C) - } // for (S) - } // for (R) - } // for (T) - } // for (K) -} - -/////////////////////////////////////////////////////////////////////////////////////////////////// - -/// Generic 3D convolution targeting Conv2dFprop, Conv2dDgrad, and Conv2dWgrad. 
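// Despite the 2D names in the brief above, the dispatcher below targets
// Conv3dFprop, Conv3dDgrad (also used for kDeconv), and Conv3dWgrad,
// selected by the conv::Operator argument in the same way as the Conv2d
// dispatcher earlier in this header.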
-template < - typename ElementA, - typename LayoutA, - typename ElementB, - typename LayoutB, - typename ElementC, - typename LayoutC, - typename ElementCompute, - typename ElementAccumulator = ElementCompute, - typename ConvertOp = NumericConverter, - typename InnerProductOp = multiply_add -> -void Conv3d( - conv::Operator convolutional_operator, - conv::Conv3dProblemSize problem_size, - TensorRef tensor_A, - TensorRef tensor_B, - TensorRef tensor_C, - TensorRef tensor_D, - ElementCompute alpha, - ElementCompute beta) { - - switch (convolutional_operator) { - case conv::Operator::kFprop: - Conv3dFprop< - ElementA, LayoutA, - ElementB, LayoutB, - ElementC, LayoutC, - ElementCompute, - ElementAccumulator, - ConvertOp, InnerProductOp - >(problem_size, tensor_A, tensor_B, tensor_C, tensor_D, alpha, beta); - break; - - case conv::Operator::kDeconv: - case conv::Operator::kDgrad: - Conv3dDgrad< - ElementA, LayoutA, - ElementB, LayoutB, - ElementC, LayoutC, - ElementCompute, - ElementAccumulator, - ConvertOp, InnerProductOp - >(problem_size, tensor_A, tensor_B, tensor_C, tensor_D, alpha, beta, (convolutional_operator == conv::Operator::kDeconv)); - break; - - case conv::Operator::kWgrad: - Conv3dWgrad< - ElementA, LayoutA, - ElementB, LayoutB, - ElementC, LayoutC, - ElementCompute, - ElementAccumulator, - ConvertOp, InnerProductOp - >(problem_size, tensor_A, tensor_B, tensor_C, tensor_D, alpha, beta); - break; - - default: - break; - } -} - -///////////////////////////////////////////////////////////////////////////////////////////////// - -} // namespace host -} // namespace reference -} // namespace cutlass - -///////////////////////////////////////////////////////////////////////////////////////////////// - diff --git a/csrc/sparse/cutlass/example/util/reference/host/error_metrics.h b/csrc/sparse/cutlass/example/util/reference/host/error_metrics.h deleted file mode 100644 index 86db65ccc441e..0000000000000 --- a/csrc/sparse/cutlass/example/util/reference/host/error_metrics.h +++ /dev/null @@ -1,66 +0,0 @@ - -/*************************************************************************************************** - * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. - * SPDX-License-Identifier: BSD-3-Clause - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * - * 1. Redistributions of source code must retain the above copyright notice, this - * list of conditions and the following disclaimer. - * - * 2. Redistributions in binary form must reproduce the above copyright notice, - * this list of conditions and the following disclaimer in the documentation - * and/or other materials provided with the distribution. - * - * 3. Neither the name of the copyright holder nor the names of its - * contributors may be used to endorse or promote products derived from - * this software without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" - * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE - * DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE - * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL - * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR - * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER - * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, - * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - * - **************************************************************************************************/ -#pragma once - -#include - -#include "cutlass/cutlass.h" -#include "cutlass/complex.h" -#include "cutlass/util/reference/host/tensor_reduce.h" -#include "cutlass/core_io.h" - -namespace cutlass { -namespace reference { -namespace host { - -/// Helper to compute the relative error metric for tensor A_computed w.r.t. to tensor A_reference -template < - typename Element, - typename Layout, - typename ComputeType = double -> -ComputeType TensorRelativeErrorMetric( - TensorView view_A_computed, - TensorView view_B_reference, - ComputeType identity = ComputeType() -) { - - return cutlass::reference::host::TensorNormDiff(view_A_computed, view_B_reference, identity) / - cutlass::reference::host::TensorNorm(view_B_reference, identity); -} - - -/////////////////////////////////////////////////////////////////////////////////////////////////// - -} // namespace host -} // namespace reference -} // namespace cutlass diff --git a/csrc/sparse/cutlass/example/util/reference/host/gemm.h b/csrc/sparse/cutlass/example/util/reference/host/gemm.h deleted file mode 100644 index 03888131095fc..0000000000000 --- a/csrc/sparse/cutlass/example/util/reference/host/gemm.h +++ /dev/null @@ -1,531 +0,0 @@ -/*************************************************************************************************** - * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. - * SPDX-License-Identifier: BSD-3-Clause - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * - * 1. Redistributions of source code must retain the above copyright notice, this - * list of conditions and the following disclaimer. - * - * 2. Redistributions in binary form must reproduce the above copyright notice, - * this list of conditions and the following disclaimer in the documentation - * and/or other materials provided with the distribution. - * - * 3. Neither the name of the copyright holder nor the names of its - * contributors may be used to endorse or promote products derived from - * this software without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" - * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE - * DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE - * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL - * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR - * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER - * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, - * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - * - **************************************************************************************************/ -/*! \file - \brief Reference implementation for GEMM in host-side code. -*/ - -#pragma once - -#include "cutlass/coord.h" -#include "cutlass/numeric_types.h" -#include "cutlass/functional.h" -#include "cutlass/numeric_conversion.h" - -#include "cutlass/tensor_view.h" -#include "cutlass/gemm/gemm.h" -#include "cutlass/arch/mma.h" -#include "cutlass/util/host_tensor.h" - -namespace cutlass { -namespace reference { -namespace host { - -template -struct CastIfScalar { - static Out cast(In in) { - return Out(in); - } -}; - -template -struct CastIfScalar, In> { - typedef cutlass::complex Out; - static Out cast(In in) { - return Out(static_cast(in)); - } -}; - -template -struct CastIfScalar, cutlass::complex> { - typedef cutlass::complex Out; - typedef cutlass::complex In; - static Out cast(In in) { - return Out(in); - } -}; - -template -Out cast_if_scalar(In in) { - return CastIfScalar::cast(in); -} - -//////////////////////////////////////////////////////////////////////////////////////////////////// - -/// Computes a general matrix product among matrices (tensors of rank=2) pointed to by TensorRef -/// objects. -template < - typename ElementA, - typename LayoutA, - typename ElementB, - typename LayoutB, - typename ElementC, - typename LayoutC, - typename ScalarType, - typename ComputeType, - typename InnerProductOp = multiply_add, - typename ConvertOp = NumericConverter -> -void compute_gemm( - gemm::GemmCoord problem_size, - ScalarType alpha, - TensorRef tensor_a, - TensorRef tensor_b, - ScalarType beta, - TensorRef tensor_c, - TensorRef tensor_d, - ComputeType initial_accum) { - - static_assert( - LayoutA::kRank == 2 && - LayoutB::kRank == 2 && - LayoutC::kRank == 2, "Tensors must be of rank 2"); - - - // Note: batch is ignored. 
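// compute_gemm walks the output in Mblock x Nblock tiles, keeps a small accumulator
// array per tile, sweeps the full K extent, and then applies the epilogue
// D = alpha * accum + beta * C through ConvertOp; the blocking only helps host-side
// cache behaviour and does not change the reference result.
//
// Typical host-side usage, as a sketch with illustrative names (A, B, C, D assumed to
// be cutlass::HostTensor<float, cutlass::layout::ColumnMajor> instances):
//
//   cutlass::reference::host::Gemm<float, cutlass::layout::ColumnMajor,
//                                  float, cutlass::layout::ColumnMajor,
//                                  float, cutlass::layout::ColumnMajor,
//                                  float, float> gemm_ref;
//   gemm_ref(cutlass::gemm::GemmCoord(M, N, K), alpha,
//            A.host_ref(), B.host_ref(), beta, C.host_ref(), D.host_ref());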
- int const M = problem_size.m(); - int const N = problem_size.n(); - int const K = problem_size.k(); - - // Blocking necessary to speedup reference implementation - int const Mblock = 16; - int const Nblock = 16; - - ConvertOp convert_op; - InnerProductOp inner_product_op; - - for (int row_block = 0; row_block < M; row_block += Mblock) { - for (int col_block = 0; col_block < N; col_block += Nblock) { - - ComputeType accum[Mblock][Nblock]; - - for (int j = 0; j < Nblock; j++) { - for (int i = 0; i < Mblock; i++) { - accum[i][j] = initial_accum; - } - } - - for (int k_block = 0; k_block < K; ++k_block) { - for (int j = 0; j < Nblock; j++) { - for (int i = 0; i < Mblock; i++) { - int row = row_block + i; - int col = col_block + j; - - if (row < M && col < N) { - ElementA a = tensor_a.at(MatrixCoord(row, k_block)); - ElementB b = tensor_b.at(MatrixCoord(k_block, col)); - - ComputeType compute_a(cast_if_scalar(a)); - ComputeType compute_b(cast_if_scalar(b)); - - accum[i][j] = inner_product_op(compute_a, compute_b, accum[i][j]); - } - } - } - } - - for (int j = 0; j < Nblock; j++) { - for (int i = 0; i < Mblock; i++) { - int row = row_block + i; - int col = col_block + j; - - MatrixCoord coord = MatrixCoord(row, col); - - if (row < M && col < N) { - tensor_d.at(coord) = convert_op( - alpha * ScalarType(accum[i][j]) + - beta * ScalarType(tensor_c.at(coord))); - } - } - } - } - } -} - -//////////////////////////////////////////////////////////////////////////////////////////////////// - -/// Computes a general matrix product among matrices (tensors of rank=2) pointed to by TensorRef -/// objects. -template < - typename ElementA, - typename LayoutA, - typename ElementB, - typename LayoutB, - typename ElementC, - typename LayoutC, - typename ScalarType, - typename ComputeType, - typename InnerProductOp = multiply_add, - typename ConvertOp = NumericConverter -> -void compute_gemm( - gemm::GemmCoord problem_size, - ScalarType alpha, - TensorRef tensor_a, - TensorRef tensor_b, - ScalarType beta, - TensorRef tensor_c, - ComputeType initial_accum) { - compute_gemm( - problem_size, alpha, tensor_a, tensor_b, beta, tensor_c, tensor_c, - initial_accum); -} - -//////////////////////////////////////////////////////////////////////////////////////////////////// - -template < - typename ElementA, - typename LayoutA, - typename ElementB, - typename LayoutB, - typename ElementC, - typename LayoutC, - typename ScalarType, - typename ComputeType, - typename InnerProductOp = cutlass::arch::OpMultiplyAdd -> -struct Gemm; - -//////////////////////////////////////////////////////////////////////////////////////////////////// - -/// Partial specialization for multiply-add -template -struct Gemm { - - void operator()(gemm::GemmCoord problem_size, ScalarType alpha, - TensorRef tensor_a, - TensorRef tensor_b, ScalarType beta, - TensorRef tensor_c, - ComputeType initial_accum = ComputeType(0)) { - static_assert( - LayoutA::kRank == 2 && LayoutB::kRank == 2 && LayoutC::kRank == 2, - "Tensors must be of rank 2"); - - compute_gemm>( - problem_size, alpha, tensor_a, tensor_b, beta, tensor_c, initial_accum); - } - - void operator()(gemm::GemmCoord problem_size, ScalarType alpha, - TensorRef tensor_a, - TensorRef tensor_b, ScalarType beta, - TensorRef tensor_c, - TensorRef tensor_d, - ComputeType initial_accum = ComputeType(0)) { - static_assert( - LayoutA::kRank == 2 && LayoutB::kRank == 2 && LayoutC::kRank == 2, - "Tensors must be of rank 2"); - - compute_gemm>( - problem_size, alpha, tensor_a, tensor_b, beta, tensor_c, 
tensor_d, initial_accum); - } -}; - -//////////////////////////////////////////////////////////////////////////////////////////////////// - -/// Partial specialization for multiply-add -template -struct Gemm { - - void operator()(gemm::GemmCoord problem_size, ScalarType alpha, - TensorRef tensor_a, - TensorRef tensor_b, ScalarType beta, - TensorRef tensor_c, - ComputeType initial_accum = ComputeType(0)) { - static_assert( - LayoutA::kRank == 2 && LayoutB::kRank == 2 && LayoutC::kRank == 2, - "Tensors must be of rank 2"); - - compute_gemm>( - problem_size, alpha, tensor_a, tensor_b, beta, tensor_c, initial_accum); - } - - void operator()(gemm::GemmCoord problem_size, ScalarType alpha, - TensorRef tensor_a, - TensorRef tensor_b, ScalarType beta, - TensorRef tensor_c, - TensorRef tensor_d, - ComputeType initial_accum = ComputeType(0)) { - static_assert( - LayoutA::kRank == 2 && LayoutB::kRank == 2 && LayoutC::kRank == 2, - "Tensors must be of rank 2"); - - compute_gemm>( - problem_size, alpha, tensor_a, tensor_b, beta, tensor_c, tensor_d, initial_accum); - } -}; - -//////////////////////////////////////////////////////////////////////////////////////////////////// - -/// Partial specialization for multiply-add-saturate -template -struct Gemm { - - void operator()(gemm::GemmCoord problem_size, ScalarType alpha, - TensorRef tensor_a, - TensorRef tensor_b, ScalarType beta, - TensorRef tensor_c, - ComputeType initial_accum = ComputeType(0)) { - static_assert( - LayoutA::kRank == 2 && LayoutB::kRank == 2 && LayoutC::kRank == 2, - "Tensors must be of rank 2"); - - compute_gemm, - NumericConverterClamp>( - problem_size, alpha, tensor_a, tensor_b, beta, tensor_c, initial_accum); - } - - void operator()(gemm::GemmCoord problem_size, ScalarType alpha, - TensorRef tensor_a, - TensorRef tensor_b, ScalarType beta, - TensorRef tensor_c, - TensorRef tensor_d, - ComputeType initial_accum = ComputeType(0)) { - static_assert( - LayoutA::kRank == 2 && LayoutB::kRank == 2 && LayoutC::kRank == 2, - "Tensors must be of rank 2"); - - compute_gemm, - NumericConverterClamp>( - problem_size, alpha, tensor_a, tensor_b, beta, tensor_c, tensor_d, initial_accum); - } -}; - -//////////////////////////////////////////////////////////////////////////////////////////////////// - -/// Partial specialization for XOR-popc -template -struct Gemm { - - void operator()(gemm::GemmCoord problem_size, ScalarType alpha, - TensorRef tensor_a, - TensorRef tensor_b, ScalarType beta, - TensorRef tensor_c, - ComputeType initial_accum = ComputeType(0)) { - static_assert( - LayoutA::kRank == 2 && LayoutB::kRank == 2 && LayoutC::kRank == 2, - "Tensors must be of rank 2"); - - compute_gemm>( - problem_size, alpha, tensor_a, tensor_b, beta, tensor_c, initial_accum); - } - - void operator()(gemm::GemmCoord problem_size, ScalarType alpha, - TensorRef tensor_a, - TensorRef tensor_b, ScalarType beta, - TensorRef tensor_c, - TensorRef tensor_d, - ComputeType initial_accum = ComputeType(0)) { - static_assert( - LayoutA::kRank == 2 && LayoutB::kRank == 2 && LayoutC::kRank == 2, - "Tensors must be of rank 2"); - - compute_gemm>( - problem_size, alpha, tensor_a, tensor_b, beta, tensor_c, tensor_d, initial_accum); - } -}; - -/// Partial specialization for AND-popc -template -struct Gemm { - - void operator()(gemm::GemmCoord problem_size, ScalarType alpha, - TensorRef tensor_a, - TensorRef tensor_b, ScalarType beta, - TensorRef tensor_c, - ComputeType initial_accum = ComputeType(0)) { - static_assert( - LayoutA::kRank == 2 && LayoutB::kRank == 2 && 
LayoutC::kRank == 2, - "Tensors must be of rank 2"); - - compute_gemm>( - problem_size, alpha, tensor_a, tensor_b, beta, tensor_c, initial_accum); - } - - void operator()(gemm::GemmCoord problem_size, ScalarType alpha, - TensorRef tensor_a, - TensorRef tensor_b, ScalarType beta, - TensorRef tensor_c, - TensorRef tensor_d, - ComputeType initial_accum = ComputeType(0)) { - static_assert( - LayoutA::kRank == 2 && LayoutB::kRank == 2 && LayoutC::kRank == 2, - "Tensors must be of rank 2"); - - compute_gemm>( - problem_size, alpha, tensor_a, tensor_b, beta, tensor_c, tensor_d, initial_accum); - } -}; - -//////////////////////////////////////////////////////////////////////////////////////////////////// - -/// Partial specialization for multiply-add -template -struct Gemm { - - void operator()(gemm::GemmCoord problem_size, ScalarType alpha, - TensorRef tensor_a, - TensorRef tensor_b, ScalarType beta, - TensorRef tensor_c, - ComputeType initial_accum = ComputeType(0)) { - static_assert( - LayoutA::kRank == 2 && LayoutB::kRank == 2 && LayoutC::kRank == 2, - "Tensors must be of rank 2"); - - compute_gemm>( - problem_size, alpha, tensor_a, tensor_b, beta, tensor_c, initial_accum); - } - - void operator()(gemm::GemmCoord problem_size, ScalarType alpha, - TensorRef tensor_a, - TensorRef tensor_b, ScalarType beta, - TensorRef tensor_c, - TensorRef tensor_d, - ComputeType initial_accum = ComputeType(0)) { - static_assert( - LayoutA::kRank == 2 && LayoutB::kRank == 2 && LayoutC::kRank == 2, - "Tensors must be of rank 2"); - - compute_gemm>( - problem_size, alpha, tensor_a, tensor_b, beta, tensor_c, tensor_d, initial_accum); - } -}; - -//////////////////////////////////////////////////////////////////////////////////////////////////// - -//////////////////////////////////////////////////////////////////////////////////////////////////// -// -// Batched GEMM -// -//////////////////////////////////////////////////////////////////////////////////////////////////// - -/// Computes a batch of GEMMs over a set of matrices of common dimension. -// -// TensorRefCollection* is a type satisfying the TensorRefCollection concept. -// -template < - typename TensorRefCollectionA, - typename TensorRefCollectionB, - typename TensorRefCollectionC, - typename ScalarType, - typename AccumulatorType -> -void BatchedGemm( - gemm::GemmCoord problem_size, - int batch_count, - ScalarType alpha, - TensorRefCollectionA const& tensor_a, - TensorRefCollectionB const& tensor_b, - ScalarType beta, - TensorRefCollectionC &tensor_c, - AccumulatorType initial_accum) { - - typename TensorRefCollectionA::ConstIterator tensor_a_it = tensor_a.begin(); - typename TensorRefCollectionB::ConstIterator tensor_b_it = tensor_b.begin(); - typename TensorRefCollectionC::ConstIterator tensor_c_it = tensor_c.begin(); - - for (int batch = 0; - batch < batch_count; - ++batch, ++tensor_a_it, ++tensor_b_it, ++tensor_c_it) { - - Gemm - gemm; - - gemm(problem_size, alpha, *tensor_a_it, *tensor_b_it, beta, *tensor_c_it, - initial_accum); - } -} - -/// Computes a general matrix product among matrices (tensors of rank=2) pointed to by TensorRef -/// objects. -// -// TensorRefCollection* is a type satisfying the TensorRefCollection concept. 
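// The second BatchedGemm overload below forwards to the one above with ScalarType(0)
// as the initial accumulator; each batch advances the ConstIterator of every
// TensorRefCollection and runs an independent reference Gemm with the shared
// problem size, alpha, and beta.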
-// -template < - typename TensorRefCollectionA, - typename TensorRefCollectionB, - typename TensorRefCollectionC, - typename ScalarType, - typename AccumulatorType -> -void BatchedGemm( - gemm::GemmCoord problem_size, - int batch_count, - ScalarType alpha, - TensorRefCollectionA const& tensor_a, - TensorRefCollectionB const& tensor_b, - ScalarType beta, - TensorRefCollectionC &tensor_c) { - - BatchedGemm(problem_size, batch_count, alpha, tensor_a, tensor_b, beta, tensor_c, ScalarType(0)); -} - -//////////////////////////////////////////////////////////////////////////////////////////////////// - -} // namespace host -} // namespace reference -} // namespace cutlass diff --git a/csrc/sparse/cutlass/example/util/reference/host/gemm_complex.h b/csrc/sparse/cutlass/example/util/reference/host/gemm_complex.h deleted file mode 100644 index 92da343a9c222..0000000000000 --- a/csrc/sparse/cutlass/example/util/reference/host/gemm_complex.h +++ /dev/null @@ -1,210 +0,0 @@ -/*************************************************************************************************** - * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. - * SPDX-License-Identifier: BSD-3-Clause - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * - * 1. Redistributions of source code must retain the above copyright notice, this - * list of conditions and the following disclaimer. - * - * 2. Redistributions in binary form must reproduce the above copyright notice, - * this list of conditions and the following disclaimer in the documentation - * and/or other materials provided with the distribution. - * - * 3. Neither the name of the copyright holder nor the names of its - * contributors may be used to endorse or promote products derived from - * this software without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" - * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE - * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE - * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL - * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR - * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER - * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, - * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - * - **************************************************************************************************/ -/*! \file - \brief Reference implementation for complex-valued GEMM in host-side code. -*/ - -#pragma once - -#include "cutlass/coord.h" -#include "cutlass/complex.h" -#include "cutlass/numeric_types.h" -#include "cutlass/functional.h" -#include "cutlass/numeric_conversion.h" -#include "cutlass/matrix_coord.h" - -#include "cutlass/tensor_view.h" - -#include "cutlass/gemm/gemm.h" - -namespace cutlass { -namespace reference { -namespace host { - -//////////////////////////////////////////////////////////////////////////////////////////////////// - -/// Computes a general matrix product among matrices (tensors of rank=2) pointed to by TensorRef -/// objects. 
-/// -/// Explicitly naming types needed by this template can be cumbersome, particularly for the -/// accumulator type, so a function argument 'initial_accum' is exposed. Passing -/// AccumulatorType(0) as the last function argument can be easier than naming all template -/// arguments explicitly. -template < - typename ElementA, - typename LayoutA, - typename ElementB, - typename LayoutB, - typename ElementC, - typename LayoutC, - typename ScalarType, - typename ComputeType, - typename ElementD = ElementC, - typename ConvertOp = NumericConverter, - typename InnerProductOp = multiply_add -> -void GemmComplex( - gemm::GemmCoord problem_size, - ScalarType alpha, - TensorRef tensor_a, - ComplexTransform transform_a, - TensorRef tensor_b, - ComplexTransform transform_b, - ScalarType beta, - TensorRef tensor_c, - TensorRef tensor_d, - ComputeType initial_accum, - int batch_count = 1, - int64_t batch_stride_A = 0, - int64_t batch_stride_B = 0, - int64_t batch_stride_C = 0, - int64_t batch_stride_D = 0) { - - static_assert( - LayoutA::kRank == 2 && - LayoutB::kRank == 2 && - LayoutC::kRank == 2, "Tensors must be of rank 2"); - - // Note: batch is ignored. - int const M = problem_size.m(); - int const N = problem_size.n(); - int const K = problem_size.k(); - - // Blocking necessary to speedup reference implementation - int const Mblock = 16; - int const Nblock = 16; - - ConvertOp convert_op; - InnerProductOp inner_product_op; - - for (int batch_idx = 0; batch_idx < batch_count; ++batch_idx) { - - // Compute matrix product using blocks - for (int row_block = 0; row_block < M; row_block += Mblock) { - for (int col_block = 0; col_block < N; col_block += Nblock) { - - ComputeType accum[Mblock][Nblock]; - - for (int j = 0; j < Nblock; j++) { - for (int i = 0; i < Mblock; i++) { - accum[i][j] = initial_accum; - } - } - - for (int k_block = 0; k_block < K; ++k_block) { - for (int j = 0; j < Nblock; j++) { - for (int i = 0; i < Mblock; i++) { - int row = row_block + i; - int col = col_block + j; - - if (row < M && col < N) { - ElementA a = tensor_a.at(MatrixCoord(row, k_block)); - ElementB b = tensor_b.at(MatrixCoord(k_block, col)); - - ComputeType a_ik = ComputeType(a); - ComputeType b_kj = ComputeType(b); - - if (transform_a == ComplexTransform::kConjugate) { - a_ik = conj(a_ik); - } - - if (transform_b == ComplexTransform::kConjugate) { - b_kj = conj(b_kj); - } - - accum[i][j] = inner_product_op(a_ik, b_kj, accum[i][j]); - } - } - } - } - - for (int j = 0; j < Nblock; j++) { - for (int i = 0; i < Mblock; i++) { - int row = row_block + i; - int col = col_block + j; - - MatrixCoord coord = MatrixCoord(row, col); - - if (row < M && col < N) { - - tensor_d.at(coord) = convert_op( - alpha * ScalarType(accum[i][j]) + - beta * ScalarType(tensor_c.at(coord))); - } - } - } - - } // for (col_block) - } // for (row_block) - - tensor_a.add_pointer_offset(batch_stride_A); - tensor_b.add_pointer_offset(batch_stride_B); - tensor_c.add_pointer_offset(batch_stride_C); - tensor_d.add_pointer_offset(batch_stride_D); - - } // for (batch_idx) -} - -//////////////////////////////////////////////////////////////////////////////////////////////////// - -/// Computes a general matrix product among matrices (tensors of rank=2) pointed to by TensorRef -/// objects. -/// -/// This assumes the accumulator type is the same type as the scalars. 
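// The convenience overload below forwards to the full GemmComplex with the scalar
// type reused as the compute/accumulator type and ScalarType(0) as the initial
// accumulator, so callers do not have to spell out a separate accumulator type.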
-template < - typename ElementA, - typename LayoutA, - typename ElementB, - typename LayoutB, - typename ElementC, - typename LayoutC, - typename ScalarType, - typename ElementD = ElementC -> -void GemmComplex( - gemm::GemmCoord problem_size, - ScalarType alpha, - TensorRef tensor_a, - ComplexTransform transform_a, - TensorRef tensor_b, - ComplexTransform transform_b, - ScalarType beta, - TensorRef tensor_c, - TensorRef tensor_d) { - - GemmComplex(problem_size, alpha, tensor_a, transform_a, tensor_b, transform_b, beta, tensor_c, tensor_d, ScalarType(0)); -} - -//////////////////////////////////////////////////////////////////////////////////////////////////// - -} // namespace host -} // namespace reference -} // namespace cutlass diff --git a/csrc/sparse/cutlass/example/util/reference/host/gemm_planar_complex.h b/csrc/sparse/cutlass/example/util/reference/host/gemm_planar_complex.h deleted file mode 100644 index 094af8b37b695..0000000000000 --- a/csrc/sparse/cutlass/example/util/reference/host/gemm_planar_complex.h +++ /dev/null @@ -1,228 +0,0 @@ -/*************************************************************************************************** - * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. - * SPDX-License-Identifier: BSD-3-Clause - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * - * 1. Redistributions of source code must retain the above copyright notice, this - * list of conditions and the following disclaimer. - * - * 2. Redistributions in binary form must reproduce the above copyright notice, - * this list of conditions and the following disclaimer in the documentation - * and/or other materials provided with the distribution. - * - * 3. Neither the name of the copyright holder nor the names of its - * contributors may be used to endorse or promote products derived from - * this software without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" - * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE - * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE - * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL - * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR - * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER - * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, - * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - * - **************************************************************************************************/ -/*! \file - \brief Reference implementation for complex-valued GEMM in host-side code. 
-*/ - -#pragma once - -#include "cutlass/coord.h" -#include "cutlass/complex.h" -#include "cutlass/numeric_types.h" -#include "cutlass/functional.h" -#include "cutlass/numeric_conversion.h" -#include "cutlass/tensor_ref_planar_complex.h" - -#include "cutlass/tensor_view.h" -#include "cutlass/gemm/gemm.h" - -namespace cutlass { -namespace reference { -namespace host { - -//////////////////////////////////////////////////////////////////////////////////////////////////// - -/// Computes a general matrix product among matrices (tensors of rank=2) pointed to by TensorRef -/// objects. -/// -/// Explicitly naming types needed by this template can be cumbersome, particularly for the -/// accumulator type, so a function argument 'initial_accum' is exposed. Passing -/// AccumulatorType(0) as the last function argument can be easier than naming all template -/// arguments explicitly. -template < - typename ElementA, - typename LayoutA, - typename ElementB, - typename LayoutB, - typename ElementC, - typename LayoutC, - typename ScalarType, - typename ComputeType, - typename ConvertOp = NumericConverter, - typename InnerProductOp = multiply_add> -> -void GemmPlanarComplex( - gemm::GemmCoord problem_size, - complex alpha, - TensorRefPlanarComplex tensor_a, - ComplexTransform transform_a, - TensorRefPlanarComplex tensor_b, - ComplexTransform transform_b, - complex beta, - TensorRefPlanarComplex tensor_c, - TensorRefPlanarComplex tensor_d, - complex initial_accum) { - - static_assert( - LayoutA::kRank == 2 && - LayoutB::kRank == 2 && - LayoutC::kRank == 2, "Tensors must be of rank 2"); - - using ComplexA = typename TensorRefPlanarComplex::ComplexElement; - using ComplexB = typename TensorRefPlanarComplex::ComplexElement; - using ComplexC = typename TensorRefPlanarComplex::ComplexElement; - - // Note: batch is ignored. 
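// Planar-complex operands keep their real and imaginary parts in separate planes, and
// TensorRefPlanarComplex::ComplexElement reassembles one complex value per coordinate.
// The loop below then follows the blocked reference GEMM: optional conjugation of the
// A/B fragments, complex multiply-add into the accumulator tile, and an epilogue
// alpha * accum + beta * C whose real and imaginary parts are converted through ConvertOp.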
- int const M = problem_size.m(); - int const N = problem_size.n(); - int const K = problem_size.k(); - - // Blocking necessary to speedup reference implementation - int const Mblock = 16; - int const Nblock = 16; - - ConvertOp convert_op; - InnerProductOp inner_product_op; - - for (int row_block = 0; row_block < M; row_block += Mblock) { - for (int col_block = 0; col_block < N; col_block += Nblock) { - - complex accum[Mblock][Nblock]; - - for (int j = 0; j < Nblock; j++) { - for (int i = 0; i < Mblock; i++) { - accum[i][j] = initial_accum; - } - } - - for (int k_block = 0; k_block < K; ++k_block) { - for (int j = 0; j < Nblock; j++) { - for (int i = 0; i < Mblock; i++) { - int row = row_block + i; - int col = col_block + j; - - if (row < M && col < N) { - - ComplexA a_ik = tensor_a.at(MatrixCoord(row, k_block)); - ComplexB b_kj = tensor_b.at(MatrixCoord(k_block, col)); - - complex a = complex{ - ComputeType(a_ik.real()), - ComputeType(a_ik.imag()) - }; - - complex b = complex{ - ComputeType(b_kj.real()), - ComputeType(b_kj.imag()) - }; - - if (transform_a == ComplexTransform::kConjugate) { - a = conj(a); - } - - if (transform_b == ComplexTransform::kConjugate) { - b = conj(b); - } - - accum[i][j] = inner_product_op(a, b, accum[i][j]); - } - } - } - } - - for (int j = 0; j < Nblock; j++) { - for (int i = 0; i < Mblock; i++) { - int row = row_block + i; - int col = col_block + j; - - MatrixCoord coord = MatrixCoord(row, col); - - if (row < M && col < N) { - - complex acc{ - ScalarType(accum[i][j].real()), - ScalarType(accum[i][j].imag()) - }; - - ComplexC d_ij = tensor_c.at(coord); - - complex src{ - ScalarType(d_ij.real()), - ScalarType(d_ij.imag()) - }; - - complex result = alpha * acc + beta * src; - - d_ij.real() = convert_op(result.real()); - d_ij.imag() = convert_op(result.imag()); - - tensor_d.at(coord) = d_ij; - } - } - } - } - } -} - -//////////////////////////////////////////////////////////////////////////////////////////////////// - -/// Computes a general matrix product among matrices (tensors of rank=2) pointed to by TensorRef -/// objects. -/// -/// This assumes the accumulator type is the same type as the scalars. -template < - typename ElementA, - typename LayoutA, - typename ElementB, - typename LayoutB, - typename ElementC, - typename LayoutC, - typename ScalarType -> -void GemmPlanarComplex( - gemm::GemmCoord problem_size, - complex alpha, - TensorRefPlanarComplex tensor_a, - ComplexTransform transform_a, - TensorRefPlanarComplex tensor_b, - ComplexTransform transform_b, - complex beta, - TensorRefPlanarComplex tensor_c, - TensorRefPlanarComplex tensor_d) { - - GemmPlanarComplex( - problem_size, - alpha, - tensor_a, transform_a, - tensor_b, transform_b, - beta, - tensor_c, - tensor_d, - complex()); -} - -//////////////////////////////////////////////////////////////////////////////////////////////////// - -} // namespace host -} // namespace reference -} // namespace cutlass - -//////////////////////////////////////////////////////////////////////////////////////////////////// diff --git a/csrc/sparse/cutlass/example/util/reference/host/gett.hpp b/csrc/sparse/cutlass/example/util/reference/host/gett.hpp deleted file mode 100644 index f6984fb2ba9c5..0000000000000 --- a/csrc/sparse/cutlass/example/util/reference/host/gett.hpp +++ /dev/null @@ -1,538 +0,0 @@ -/*************************************************************************************************** - * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
- * SPDX-License-Identifier: BSD-3-Clause - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * - * 1. Redistributions of source code must retain the above copyright notice, this - * list of conditions and the following disclaimer. - * - * 2. Redistributions in binary form must reproduce the above copyright notice, - * this list of conditions and the following disclaimer in the documentation - * and/or other materials provided with the distribution. - * - * 3. Neither the name of the copyright holder nor the names of its - * contributors may be used to endorse or promote products derived from - * this software without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" - * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE - * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE - * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL - * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR - * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER - * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, - * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - * - **************************************************************************************************/ -/*! \file - \brief Reference implementation for GETT in host-side code. -*/ - -#pragma once - -///////////////////////////////////////////////////////////////////////////////////////////////// -#include "cutlass/gemm/gemm.h" -#include "cutlass/complex.h" -#include "cutlass/numeric_conversion.h" -#include "cutlass/epilogue/thread/activation.h" -#include "cutlass/relatively_equal.h" - -#include "cute/tensor.hpp" - -///////////////////////////////////////////////////////////////////////////////////////////////// - -namespace cutlass::reference::host { - -template -struct ElementTraits { - using type = T; -}; - -template -struct ElementTraits().get()), void> > > { - using type = decltype(std::declval().get()); -}; - -///////////////////////////////////////////////////////////////////////////////////////////////// - -template< - class ElementAccumulator_, - class TensorA_, // (M, K, L) - class TensorB_ // (N, K, L) -> -struct GettMainloopParams { - using ElementAccumulator = ElementAccumulator_; - using TensorA = TensorA_; - using TensorB = TensorB_; - using EngineA = typename TensorA::engine_type; - using LayoutA = typename TensorA::layout_type; - using EngineB = typename TensorB::engine_type; - using LayoutB = typename TensorB::layout_type; - - TensorA A{}; - TensorB B{}; - - ComplexTransform transform_A = ComplexTransform::kNone; - ComplexTransform transform_B = ComplexTransform::kNone; - -}; - -///////////////////////////////////////////////////////////////////////////////////////////////// -template< - class ElementScalar_, - class ElementScalingFactor_, - class ElementAccumulator_, - class ElementCompute_, - class TensorC_, // (M, N, L) - class TensorD_, // (M, N, L) - class VectorBias_ = TensorD_, // (M, 1) - class TensorAux_ = TensorD_, // (M, N, L) - class VectorAlpha_ = TensorD_, // (M, 1) - class VectorBeta_ = VectorAlpha_, // (M, 1) - class 
ActivationFunctor_ = cutlass::epilogue::thread::Identity, - class BiasBinaryOp_ = cutlass::plus, - bool PerColumnBias_ = false -> -struct GettEpilogueParams { - using ElementScalar = ElementScalar_; - using ElementScalingFactor = ElementScalingFactor_; - using ElementAccumulator = ElementAccumulator_; - using ElementCompute = ElementCompute_; - using TensorC = TensorC_; - using TensorD = TensorD_; - using TensorAux = TensorAux_; - using VectorBias = VectorBias_; - using VectorAlpha = VectorAlpha_; - using VectorBeta = VectorBeta_; - using ActivationFunctor = ActivationFunctor_; - using BiasBinaryOp = BiasBinaryOp_; - - using EngineC = typename TensorC::engine_type; - using LayoutC = typename TensorC::layout_type; - using EngineD = typename TensorD::engine_type; - using LayoutD = typename TensorD::layout_type; - static constexpr bool PerColumnBias = PerColumnBias_; - ElementScalar alpha = ElementScalar(1); - ElementScalar beta = ElementScalar(0); - - TensorC C{}; - TensorD D{}; - VectorBias Bias{}; - TensorAux Aux{}; - VectorAlpha Valpha{}; - VectorBeta Vbeta{}; - ElementCompute st = ElementCompute(1); - - ElementAccumulator* abs_max_D = nullptr; - ElementAccumulator* abs_max_Aux = nullptr; - - ElementScalingFactor scale_a = ElementScalingFactor(1); - ElementScalingFactor scale_b = ElementScalingFactor(1); - ElementScalingFactor scale_c = ElementScalingFactor(1); - ElementScalingFactor scale_d = ElementScalingFactor(1); - ElementScalingFactor scale_aux = ElementScalingFactor(1); - - bool beta_per_channel_scaling = false; -}; - -///////////////////////////////////////////////////////////////////////////////////////////////// - -/// GETT - General Tensor-Tensor contraction reference kernel -template < - class MainloopParams, - class EpilogueParams -> -void Gett( - MainloopParams const& mainloop_params, - EpilogueParams const& epilogue_params) -{ - - static int constexpr kBlockM = 64; - static int constexpr kBlockN = 64; - -#if defined(_OPENMP) - #pragma omp parallel for collapse(3) -#endif - for (int64_t l = 0; l < cute::size<2>(mainloop_params.A.layout()); ++l) { - for (int64_t m = 0; m < cute::size<0>(mainloop_params.A.layout()); m += kBlockM) { - for (int64_t n = 0; n < cute::size<0>(mainloop_params.B.layout()); n += kBlockN) { - typename MainloopParams::ElementAccumulator acc[kBlockM][kBlockN]; - gett_mainloop(mainloop_params, m, n, l, acc); - gett_epilogue(epilogue_params, m, n, l, acc); - } - } - } -} - -///////////////////////////////////////////////////////////////////////////////////////////////// - -/// GETT - Mainloop -template -void gett_mainloop( - MainloopParams const& mainloop_params, - int64_t m, - int64_t n, - int64_t l, - ElementAccumulator (&acc)[kBlockM][kBlockN]) -{ - - static_assert(cute::rank(typename MainloopParams::LayoutA{}) == 3, "M, K, B"); - static_assert(cute::rank(typename MainloopParams::LayoutB{}) == 3, "N, K, B"); - - using cute::raw_pointer_cast; - - using ElementA = typename ElementTraits::type; - using ElementB = typename ElementTraits::type; - - using RingOp = multiply_add; - RingOp fma_op; - - // Zero out accumulators - for (int m_b = 0; m_b < kBlockM; ++m_b) { - for (int n_b = 0; n_b < kBlockN; ++n_b) { - acc[m_b][n_b] = ElementAccumulator(0); // RingOp::AdditionIdentity - } - } - - // Compute on this k-block - for (int64_t k = 0; k < cute::size<1>(mainloop_params.A.layout()); ++k) { - // Load A - ElementAccumulator a_frag[kBlockM]; - for (int m_b = 0; m_b < kBlockM; ++m_b) { - if (m + m_b < cute::size<0>(mainloop_params.A.layout())) { - // Perform 
reference GEMM calculations at the accumulator's precision. Cast A value to accumulator type. - a_frag[m_b] = static_cast(ElementA(mainloop_params.A(m + m_b, k, l))); - - if (mainloop_params.transform_A == ComplexTransform::kConjugate) { - a_frag[m_b] = conj(a_frag[m_b]); - } - } else { - a_frag[m_b] = ElementAccumulator(0); // RingOp::AdditionIdentity - } - } - - // Load B - ElementAccumulator b_frag[kBlockN]; - for (int n_b = 0; n_b < kBlockN; ++n_b) { - if (n + n_b < cute::size<0>(mainloop_params.B.layout())) { - // Perform reference GEMM calculations at the accumulator's precision. Cast A value to accumulator type. - b_frag[n_b] = static_cast(ElementB(mainloop_params.B(n + n_b, k, l))); - - if (mainloop_params.transform_B == ComplexTransform::kConjugate) { - b_frag[n_b] = conj(b_frag[n_b]); - } - } else { - b_frag[n_b] = ElementAccumulator(0); // RingOp::AdditionIdentity - } - } - - // do compute - for (int m_b = 0; m_b < kBlockM; ++m_b) { - for (int n_b = 0; n_b < kBlockN; ++n_b) { - acc[m_b][n_b] = fma_op(a_frag[m_b], b_frag[n_b], acc[m_b][n_b]); - } - } - - } -} - -///////////////////////////////////////////////////////////////////////////////////////////////// - -/// GETT - Epilogue -template -void gett_epilogue( - EpilogueParams const& epilogue_params, - int64_t m, - int64_t n, - int64_t l, - ElementAccumulator (&acc)[kBlockM][kBlockN]) -{ - static_assert(cute::rank(typename EpilogueParams::LayoutC{}) == 3, "M, K, B"); - static_assert(cute::rank(typename EpilogueParams::LayoutD{}) == 3, "N, K, B"); - - using cute::raw_pointer_cast; - - using ElementCompute = typename EpilogueParams::ElementCompute; - using ElementC = typename EpilogueParams::TensorC::value_type; - using ElementD = typename EpilogueParams::TensorD::value_type; - using ElementAux = typename EpilogueParams::TensorAux::value_type; - using ElementBias = typename EpilogueParams::VectorBias::value_type; - using ElementScalar = typename EpilogueParams::ElementScalar; - using ElementScalingFactor = typename EpilogueParams::ElementScalingFactor; - using ActivationFunctor = typename EpilogueParams::ActivationFunctor; - using BiasBinaryOp = typename EpilogueParams::BiasBinaryOp; - - constexpr bool PerColBias = EpilogueParams::PerColumnBias; - constexpr bool IsScalingAndAmaxOutputNeeded = - cute::is_same_v or - cute::is_same_v; - - constexpr bool IsScalingAndAmaxAuxOutputNeeded = - cute::is_same_v or - cute::is_same_v; - - constexpr bool IsReLUAuxNeeded = - (cute::is_same_v> or - cute::is_same_v>) and - cute::is_same_v; - constexpr bool IsClamp = - cute::is_same_v>; - - constexpr bool IsBackpropFusion = - cute::is_same_v> or - cute::is_same_v>; - - // Input related converter - NumericConverter accumulator_converter; - NumericConverter source_converter; - NumericConverter bias_converter; - [[maybe_unused]] NumericConverter aux_source_converter; - - // Scale related converter - NumericConverter scale_converter; - NumericConverter scaling_factor_converter; - - // Abs max converter - [[maybe_unused]] NumericConverter abs_max_output_converter; - - // Output related converter - NumericConverter destination_converter; - [[maybe_unused]] NumericConverter aux_destination_converter; - NumericConverter dBias_converter; - - // Epilogue operations - multiply_add epilogue_fma; - multiplies mul; - plus add; - - // Activation operation - ActivationFunctor activation; - - // Bias binary operation - BiasBinaryOp bias_op; - - // Do conversion - ElementCompute converted_alpha = scale_converter(epilogue_params.alpha); - ElementCompute 
converted_beta = scale_converter(epilogue_params.beta); - ElementCompute converted_scale_a = scaling_factor_converter(epilogue_params.scale_a); - ElementCompute converted_scale_b = scaling_factor_converter(epilogue_params.scale_b); - ElementCompute converted_scale_c = scaling_factor_converter(epilogue_params.scale_c); - ElementCompute converted_scale_d = scaling_factor_converter(epilogue_params.scale_d); - ElementCompute converted_scale_aux = scaling_factor_converter(epilogue_params.scale_aux); - - // Init local var - [[maybe_unused]] ElementCompute local_abs_max_output = ElementCompute(0); - [[maybe_unused]] ElementCompute local_abs_max_aux_output = ElementCompute(0); - - converted_alpha = mul(converted_alpha, mul(converted_scale_a, converted_scale_b)); - converted_beta = mul(converted_beta, converted_scale_c); - - ElementCompute inter_accum[kBlockM][kBlockN]; - - for (int m_b = 0; m_b < kBlockM; ++m_b) { - ElementCompute local_dBias = ElementCompute(0); - - for (int n_b = 0; n_b < kBlockN; ++n_b) { - if (m + m_b < cute::size<0>(epilogue_params.D.layout()) && n + n_b < cute::size<1>(epilogue_params.D.layout())) { - // Convert every type to ElementCompute first, do compute, convert to output type, write it out - ElementCompute converted_acc = accumulator_converter(acc[m_b][n_b]); - // per-row alpha - if (raw_pointer_cast(epilogue_params.Valpha.data())) { - converted_alpha = scale_converter(epilogue_params.Valpha(m + m_b, n + n_b, l)); - converted_alpha = mul(converted_alpha, mul(converted_scale_a, converted_scale_b)); - } - ElementCompute output = mul(converted_alpha, converted_acc); - - if (raw_pointer_cast(epilogue_params.Bias.data()) && not IsBackpropFusion) { - ElementCompute converted_bias = bias_converter(epilogue_params.Bias(PerColBias ? n + n_b : m + m_b)); - output = bias_op(output, converted_bias); - } - - if (raw_pointer_cast(epilogue_params.C.data())) { - ElementCompute converted_src = source_converter(epilogue_params.C(m + m_b, n + n_b, l)); - // per-row beta - if (epilogue_params.Vbeta.data()) { - converted_beta = scale_converter(epilogue_params.Vbeta(m + m_b, n + n_b, l)); - converted_beta = mul(converted_beta, converted_scale_c); - } - output = epilogue_fma(converted_beta, converted_src, output); - } - - if constexpr (IsBackpropFusion) { - ElementAux aux_input = ElementAux(0); - if (raw_pointer_cast(epilogue_params.Aux.data())) { - aux_input = epilogue_params.Aux(m + m_b, n + n_b, l); - } - - output = activation(output, aux_source_converter(aux_input)); - local_dBias = add(local_dBias, output); - } - else { - if (raw_pointer_cast(epilogue_params.Aux.data())) { - auto aux_output = output; - if constexpr (IsScalingAndAmaxAuxOutputNeeded) { - maximum_absolute_value_reduction amax_op; - local_abs_max_aux_output = amax_op(local_abs_max_aux_output, aux_output); - aux_output = epilogue_fma(converted_scale_aux, aux_output, ElementCompute(0)); - } - - if constexpr (IsReLUAuxNeeded) { - epilogue_params.Aux(m + m_b, n + n_b, l) = not (aux_output < 0) ? 
uint1b_t(1) : uint1b_t(0); - } else { - epilogue_params.Aux(m + m_b, n + n_b, l) = aux_destination_converter(aux_output); - } - } - - if constexpr (IsClamp) { // Treat Clamp as ReLU - output = activation(output, {0, std::numeric_limits::max()}); - } - else { - output = activation(output); - } - } - - if constexpr (IsScalingAndAmaxOutputNeeded) { - maximum_absolute_value_reduction amax_op; - local_abs_max_output = amax_op(local_abs_max_output, output); - output = epilogue_fma(converted_scale_d, output, ElementCompute(0)); - } - - inter_accum[m_b][n_b] = ElementCompute(output); - } - } // n_b - - if (m + m_b < cute::size<0>(epilogue_params.D.layout()) && n < cute::size<1>(epilogue_params.D.layout())) { - if (raw_pointer_cast(epilogue_params.Bias.data()) && IsBackpropFusion) { - ElementCompute converted_dBias = bias_converter(epilogue_params.Bias(m + m_b)); - local_dBias = add(local_dBias, converted_dBias); - epilogue_params.Bias(m + m_b) = dBias_converter(local_dBias); - } - } - } // m_b - for (int m_b = 0; m_b < kBlockM; ++m_b) { - for (int n_b = 0; n_b < kBlockN; ++n_b) { - if (m + m_b < cute::size<0>(epilogue_params.D.layout()) && n + n_b < cute::size<1>(epilogue_params.D.layout())) { - epilogue_params.D(m + m_b, n + n_b, l) = destination_converter(inter_accum[m_b][n_b]); - } - } - } - -#if defined(_OPENMP) - #pragma omp critical(Abs_Max_Data_Update) -#endif - { - if constexpr (IsScalingAndAmaxOutputNeeded) { - if (epilogue_params.abs_max_D) { - *epilogue_params.abs_max_D = maximum_with_nan_propogation{}( - *epilogue_params.abs_max_D, abs_max_output_converter(local_abs_max_output)); - } - } - - if constexpr (IsScalingAndAmaxAuxOutputNeeded) { - if (epilogue_params.abs_max_Aux) { - *epilogue_params.abs_max_Aux = maximum_with_nan_propogation{}( - *epilogue_params.abs_max_Aux, abs_max_output_converter(local_abs_max_aux_output)); - } - } - } -} - -///////////////////////////////////////////////////////////////////////////////////////////////// - -template -auto make_layout_rank3(const TensorType& tensor) { - // append a batch mode of size 1 if we do not have tensors that are rank 3 - return make_layout( - make_shape(cute::get<0>(tensor.shape()), cute::get<1>(tensor.shape()), cute::Int<1>{}), - make_stride(cute::get<0>(tensor.stride()), cute::get<1>(tensor.stride()), int64_t(cosize(tensor.layout())))); -} - -/// GEMM - General Matrix-Matrix contraction without conjugation options -template < - class MainloopParams, - class EpilogueParams -> -void Gemm3x( - MainloopParams const& mainloop_params, - EpilogueParams const& epilogue_params) -{ - using namespace cute; - - static_assert(cute::rank(typename MainloopParams::LayoutA{}) == cute::rank(typename MainloopParams::LayoutB{})); - static_assert(cute::rank(typename EpilogueParams::LayoutC{}) == cute::rank(typename EpilogueParams::LayoutD{})); - static_assert(cute::rank(typename MainloopParams::LayoutA{}) == cute::rank(typename EpilogueParams::LayoutC{})); - - if constexpr (cute::rank(typename MainloopParams::LayoutA{}) == 2) { - cute::Layout layout_A = make_layout_rank3(mainloop_params.A); - cute::Layout layout_B = make_layout_rank3(mainloop_params.B); - cute::Layout layout_C = make_layout_rank3(epilogue_params.C); - cute::Layout layout_D = make_layout_rank3(epilogue_params.D); - cute::Layout layout_Aux = make_layout_rank3(epilogue_params.Aux); - cute::Layout layout_Bias = make_layout_rank3(epilogue_params.Bias); - cute::Layout layout_Valpha = make_layout_rank3(epilogue_params.Valpha); - cute::Layout layout_Vbeta = 
make_layout_rank3(epilogue_params.Vbeta); - - auto TensorA = make_tensor(mainloop_params.A.data(), layout_A); - auto TensorB = make_tensor(mainloop_params.B.data(), layout_B); - auto TensorC = make_tensor(epilogue_params.C.data(), layout_C); - auto TensorD = make_tensor(epilogue_params.D.data(), layout_D); - auto TensorAux = make_tensor(epilogue_params.Aux.data(), layout_Aux); - auto VectorBias = make_tensor(epilogue_params.Bias.data(), layout_Bias); - auto VectorAlpha = make_tensor(epilogue_params.Valpha.data(), layout_Valpha); - auto VectorBeta = make_tensor(epilogue_params.Vbeta.data(), layout_Vbeta); - - // Reconstruct mainloop params - GettMainloopParams - mainloop_params_converted{TensorA, - TensorB, - mainloop_params.transform_A, - mainloop_params.transform_B}; - - // Reconstruct epilogue params - GettEpilogueParams - epilogue_params_converted{epilogue_params.alpha, - epilogue_params.beta, - TensorC, - TensorD, - VectorBias, - TensorAux, - VectorAlpha, - VectorBeta, - epilogue_params.abs_amax_D, - epilogue_params.abs_amax_Aux, - epilogue_params.scale_a, - epilogue_params.scale_b, - epilogue_params.scale_c, - epilogue_params.scale_d, - epilogue_params.scale_aux - }; - - Gett(mainloop_params_converted, epilogue_params_converted); - } - else { - // if we already have a batch mode, just pass it through - Gett(mainloop_params, epilogue_params); - } -} - -///////////////////////////////////////////////////////////////////////////////////////////////// - -} // cutlass::reference::host - -///////////////////////////////////////////////////////////////////////////////////////////////// diff --git a/csrc/sparse/cutlass/example/util/reference/host/rank_2k.h b/csrc/sparse/cutlass/example/util/reference/host/rank_2k.h deleted file mode 100644 index 2a99bc03a35ba..0000000000000 --- a/csrc/sparse/cutlass/example/util/reference/host/rank_2k.h +++ /dev/null @@ -1,261 +0,0 @@ -/*************************************************************************************************** - * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. - * SPDX-License-Identifier: BSD-3-Clause - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * - * 1. Redistributions of source code must retain the above copyright notice, this - * list of conditions and the following disclaimer. - * - * 2. Redistributions in binary form must reproduce the above copyright notice, - * this list of conditions and the following disclaimer in the documentation - * and/or other materials provided with the distribution. - * - * 3. Neither the name of the copyright holder nor the names of its - * contributors may be used to endorse or promote products derived from - * this software without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" - * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE - * DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE - * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL - * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR - * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER - * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, - * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - * - **************************************************************************************************/ -/*! \file - \brief Reference implementation for Rank 2k update in host-side code. - - - -*/ - -#pragma once - -#include "cutlass/blas3.h" -#include "cutlass/numeric_conversion.h" -#include "cutlass/tensor_view.h" -#include "cutlass/gemm/gemm.h" -#include "cutlass/arch/mma.h" -#include "cutlass/util/host_tensor.h" -#include "cutlass/util/reference/host/gemm.h" - -namespace cutlass { -namespace reference { -namespace host { - -//////////////////////////////////////////////////////////////////////////////////////////////////// - -/// Computes a general matrix product among matrices (tensors of rank=2) pointed to by TensorRef -/// objects. -template < - typename ElementA, - typename LayoutA, - typename ElementB, - typename LayoutB, - typename ElementC, - typename LayoutC, - FillMode FillModeC, - typename ScalarType, - typename ComputeType, - typename InnerProductOp = multiply_add, - typename ConvertOp = NumericConverter -> -void compute_rank2k( - gemm::GemmCoord problem_size, - ScalarType alpha, - TensorRef tensor_a, - TensorRef tensor_b, - ScalarType beta, - TensorRef tensor_c, - TensorRef tensor_d, - ComputeType initial_accum) { - - static_assert( - LayoutA::kRank == 2 && - LayoutB::kRank == 2 && - LayoutC::kRank == 2, - "Tensors must be of rank 2"); - - static_assert( - FillModeC == FillMode::kLower || - FillModeC == FillMode::kUpper, - "Fill Mode can either be Lower or Upper."); - - using CompareOp = typename platform::conditional<(FillModeC == FillMode::kLower), - std::greater_equal, - std::less_equal>::type; - - // Note: batch is ignored. 
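For orientation, the blocked loops that follow implement the standard rank-2k update, restricted to the triangle selected by FillModeC (this is a paraphrase of the deleted reference code, not an authoritative specification):

    D_{ij} = \alpha \sum_{k=0}^{K-1} \left( A_{ik} B_{jk} + B_{ik} A_{jk} \right) + \beta \, C_{ij},
    \qquad \text{i.e.} \qquad D = \alpha \left( A B^{T} + B A^{T} \right) + \beta C,

with A and B of shape N x K and C, D of shape N x N; entries outside the selected triangle are left untouched.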
- // Note: M is same as N for Rank 2k update - int const N = problem_size.n(); - int const K = problem_size.k(); - - // Blocking necessary to speedup reference implementation - int const Nblock = 16; - - ConvertOp convert_op; - InnerProductOp inner_product_op; - CompareOp compare_op; - - for (int row_block = 0; row_block < N; row_block += Nblock) { - for (int col_block = 0; col_block < N; col_block += Nblock) { - - ComputeType accum[Nblock][Nblock]; - - for (int j = 0; j < Nblock; j++) { - for (int i = 0; i < Nblock; i++) { - accum[i][j] = initial_accum; - } - } - - for (int k_block = 0; k_block < K; ++k_block) { - for (int j = 0; j < Nblock; j++) { - for (int i = 0; i < Nblock; i++) { - int row = row_block + i; - int col = col_block + j; - - if (row < N && col < N && compare_op(row, col)) - { - - // A x B^T - ElementA a = tensor_a.at(MatrixCoord(row, k_block)); - ElementB b_t = tensor_b.at(MatrixCoord(col, k_block)); - - ComputeType compute_a(cast_if_scalar(a)); - ComputeType compute_b_t(cast_if_scalar(b_t)); - - accum[i][j] = inner_product_op(compute_a, compute_b_t, accum[i][j]); - - // B x A^T - ElementB b = tensor_b.at(MatrixCoord(row, k_block)); - ElementA a_t = tensor_a.at(MatrixCoord(col, k_block)); - - ComputeType compute_b(cast_if_scalar(b)); - ComputeType compute_a_t(cast_if_scalar(a_t)); - - accum[i][j] = inner_product_op(compute_b, compute_a_t, accum[i][j]); - } - } - } - } - - for (int j = 0; j < Nblock; j++) { - for (int i = 0; i < Nblock; i++) { - int row = row_block + i; - int col = col_block + j; - - MatrixCoord coord = MatrixCoord(row, col); - - if (row < N && col < N && - ( (FillModeC == FillMode::kLower && row >= col) || - (FillModeC == FillMode::kUpper && row <= col) ) - ) { - tensor_d.at(coord) = convert_op( - alpha * ScalarType(accum[i][j]) + - beta * ScalarType(tensor_c.at(coord))); - } - } - } - } - } -} - -//////////////////////////////////////////////////////////////////////////////////////////////////// - -/// Computes a general Rank 2k update (tensors of rank=2) pointed to by TensorRef -/// objects. 
-template < - typename ElementA, - typename LayoutA, - typename ElementB, - typename LayoutB, - typename ElementC, - typename LayoutC, - FillMode FillModeC, - typename ScalarType, - typename ComputeType, - typename InnerProductOp = multiply_add, - typename ConvertOp = NumericConverter -> -void compute_rank2k( - gemm::GemmCoord problem_size, - ScalarType alpha, - TensorRef tensor_a, - TensorRef tensor_b, - ScalarType beta, - TensorRef tensor_c, - ComputeType initial_accum) { - compute_rank2k( - problem_size, alpha, tensor_a, tensor_b, beta, tensor_c, tensor_c, - initial_accum); -} - -//////////////////////////////////////////////////////////////////////////////////////////////////// - -template < - typename ElementA, - typename LayoutA, - typename ElementB, - typename LayoutB, - typename ElementC, - typename LayoutC, - FillMode FillModeC, - typename ScalarType, - typename ComputeType, - typename InnerProductOp = cutlass::arch::OpMultiplyAdd -> -struct Rank2K; - -//////////////////////////////////////////////////////////////////////////////////////////////////// - -/// Partial specialization for multiply-add -template -struct Rank2K { - - void operator()(gemm::GemmCoord problem_size, ScalarType alpha, - TensorRef tensor_a, - TensorRef tensor_b, ScalarType beta, - TensorRef tensor_c, - ComputeType initial_accum = ComputeType(0)) { - static_assert( - LayoutA::kRank == 2 && LayoutB::kRank == 2 && LayoutC::kRank == 2, - "Tensors must be of rank 2"); - - compute_rank2k>( - problem_size, alpha, tensor_a, tensor_b, beta, tensor_c, initial_accum); - } - - void operator()(gemm::GemmCoord problem_size, ScalarType alpha, - TensorRef tensor_a, - TensorRef tensor_b, ScalarType beta, - TensorRef tensor_c, - TensorRef tensor_d, - ComputeType initial_accum = ComputeType(0)) { - static_assert( - LayoutA::kRank == 2 && LayoutB::kRank == 2 && LayoutC::kRank == 2, - "Tensors must be of rank 2"); - - compute_rank2k>( - problem_size, alpha, tensor_a, tensor_b, beta, tensor_c, tensor_d, initial_accum); - } -}; - - -//////////////////////////////////////////////////////////////////////////////////////////////////// - -} // namespace host -} // namespace reference -} // namespace cutlass diff --git a/csrc/sparse/cutlass/example/util/reference/host/rank_2k_complex.h b/csrc/sparse/cutlass/example/util/reference/host/rank_2k_complex.h deleted file mode 100644 index 090019c100396..0000000000000 --- a/csrc/sparse/cutlass/example/util/reference/host/rank_2k_complex.h +++ /dev/null @@ -1,318 +0,0 @@ -/*************************************************************************************************** - * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. - * SPDX-License-Identifier: BSD-3-Clause - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * - * 1. Redistributions of source code must retain the above copyright notice, this - * list of conditions and the following disclaimer. - * - * 2. Redistributions in binary form must reproduce the above copyright notice, - * this list of conditions and the following disclaimer in the documentation - * and/or other materials provided with the distribution. - * - * 3. Neither the name of the copyright holder nor the names of its - * contributors may be used to endorse or promote products derived from - * this software without specific prior written permission. 
- * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" - * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE - * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE - * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL - * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR - * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER - * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, - * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - * - **************************************************************************************************/ -/*! \file - \brief Reference implementation for complex-valued Rank 2K update in host-side code. - - -*/ - -#pragma once - -#include "cutlass/blas3.h" -#include "cutlass/complex.h" -#include "cutlass/numeric_conversion.h" -#include "cutlass/tensor_view.h" -#include "cutlass/gemm/gemm.h" -#include - -namespace cutlass { -namespace reference { -namespace host { - -//////////////////////////////////////////////////////////////////////////////////////////////////// - -/// Computes a general matrix product among matrices (tensors of rank=2) pointed to by TensorRef -/// objects. -/// -/// Explicitly naming types needed by this template can be cumbersome, particularly for the -/// accumulator type, so a function argument 'initial_accum' is exposed. Passing -/// AccumulatorType(0) as the last function argument can be easier than naming all template -/// arguments explicitly. -template < - typename ElementA, - typename LayoutA, - typename ElementB, - typename LayoutB, - typename ElementC, - typename LayoutC, - typename ScalarType, - typename ComputeType, - typename ConvertOp = NumericConverter, - typename InnerProductOp = multiply_add -> -void Rank2KComplex( - gemm::GemmCoord problem_size, - ScalarType alpha, - TensorRef tensor_a, - ComplexTransform transform_a, - TensorRef tensor_b, - ComplexTransform transform_b, - ScalarType beta, - TensorRef tensor_c, - TensorRef tensor_d, - ComputeType initial_accum, - FillMode fill_mode_c, - BlasMode blas_mode, - int batch_count = 1, - int64_t batch_stride_A = 0, - int64_t batch_stride_B = 0, - int64_t batch_stride_C = 0, - int64_t batch_stride_D = 0) { - - static_assert( - LayoutA::kRank == 2 && - LayoutB::kRank == 2 && - LayoutC::kRank == 2, "Tensors must be of rank 2"); - - // Note: batch is ignored. 
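As a summary of the deleted Rank2KComplex reference below (a paraphrase of the code, not NVIDIA documentation): in symmetric (SYR2K) mode it computes

    D = \alpha \left( A B^{T} + B A^{T} \right) + \beta C,

while in Hermitian (HER2K) mode the two terms are handled by two separate epilogue passes so that the second term can carry the conjugated scalar,

    D = \alpha \, A B^{H} + \bar{\alpha} \, B A^{H} + \beta C,

with the imaginary parts of the diagonal of D set to zero. In both modes only the triangle selected by fill_mode_c is written.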
- int const M = problem_size.m(); - int const N = problem_size.n(); - int const K = problem_size.k(); - - // Rank2K update operates on A=NxK, B=NxK, and C=NxN - assert(M==N); - - // Blocking necessary to speedup reference implementation - int const Mblock = 16; - int const Nblock = 16; - - ConvertOp convert_op; - InnerProductOp inner_product_op; - - for (int batch_idx = 0; batch_idx < batch_count; ++batch_idx) { - - // Compute matrix product using blocks - for (int row_block = 0; row_block < M; row_block += Mblock) { - for (int col_block = 0; col_block < N; col_block += Nblock) { - - ComputeType accum[Mblock][Nblock]; - - for (int j = 0; j < Nblock; j++) { - for (int i = 0; i < Mblock; i++) { - accum[i][j] = initial_accum; - } - } - - for (int k_block = 0; k_block < K; ++k_block) { - for (int j = 0; j < Nblock; j++) { - for (int i = 0; i < Mblock; i++) { - int row = row_block + i; - int col = col_block + j; - - if (row < M && col < N && - ( (fill_mode_c == FillMode::kLower && row >= col) || - (fill_mode_c == FillMode::kUpper && row <= col) ) - ) { - - // A x B^T (Symmetric) or A x B^H (Hermitian) - // complex conjugation on operandB (b_t) is function of blas3 computation - ElementA a = tensor_a.at(MatrixCoord(row, k_block)); - ElementB b_t = (blas_mode == BlasMode::kHermitian) ? - conj(tensor_b.at(MatrixCoord(col, k_block))) : - tensor_b.at(MatrixCoord(col, k_block)); - - ComputeType a_ik = ComputeType(a); - ComputeType b_jk = ComputeType(b_t); - - // complex conjugation is a function of operand layouts - if (transform_a == ComplexTransform::kConjugate) { - a_ik = conj(a_ik); - } - // complex conjugation is a function of operand layouts - if (transform_b == ComplexTransform::kConjugate) { - b_jk = conj(b_jk); - } - - accum[i][j] = inner_product_op(a_ik, b_jk, accum[i][j]); - } - } - } - } - - /* HER2K need two epilogues to handle complex alpha value */ - if ( blas_mode == BlasMode::kHermitian ) { - for (int j = 0; j < Nblock; j++) { - for (int i = 0; i < Mblock; i++) { - int row = row_block + i; - int col = col_block + j; - - MatrixCoord coord = MatrixCoord(row, col); - - if (row < M && col < N && - ((fill_mode_c == FillMode::kLower && row >= col) || - (fill_mode_c == FillMode::kUpper && row <= col)) - ) { - - ScalarType c = tensor_c.at(coord); - // The imaginary parts of the diagonal elements of - // a complex data type are assumed and set to zero - if (blas_mode == BlasMode::kHermitian) { - c = (row == col) ? real(c) : c; - } - - tensor_d.at(coord) = convert_op(alpha * - ScalarType(accum[i][j]) + - beta * c); - } - } - } - - /* Zeoring out accum for second HERK */ - for (int j = 0; j < Nblock; j++) { - for (int i = 0; i < Mblock; i++) { - accum[i][j] = initial_accum; - } - } - } - - for (int k_block = 0; k_block < K; ++k_block) { - for (int j = 0; j < Nblock; j++) { - for (int i = 0; i < Mblock; i++) { - int row = row_block + i; - int col = col_block + j; - - if (row < M && col < N && - ( (fill_mode_c == FillMode::kLower && row >= col) || - (fill_mode_c == FillMode::kUpper && row <= col) ) - ) { - - // B x A^T (Symmetric) or B x A^H (Hermitian) - // complex conjugation on operandB (a_t) is function of blas3 computation - ElementB b = tensor_b.at(MatrixCoord(row, k_block)); - ElementA a_t = (blas_mode == BlasMode::kHermitian) ? 
- conj(tensor_a.at(MatrixCoord(col, k_block))): - tensor_a.at(MatrixCoord(col, k_block)); - - ComputeType b_ik = ComputeType(b); - ComputeType a_jk = ComputeType(a_t); - - // complex conjugation here is a function of operand layouts - if (transform_b == ComplexTransform::kConjugate) { - b_ik = conj(b_ik); - } - // complex conjugation here is a function of operand layouts - if (transform_a == ComplexTransform::kConjugate) { - a_jk = conj(a_jk); - } - - accum[i][j] = inner_product_op(b_ik, a_jk, accum[i][j]); - } - } - } - } - - ScalarType alpha_hermitian = (blas_mode == BlasMode::kHermitian) ? - conj(alpha) : alpha; - ScalarType beta_hermitian = (blas_mode == BlasMode::kHermitian) ? - 1 : beta; - - for (int j = 0; j < Nblock; j++) { - for (int i = 0; i < Mblock; i++) { - int row = row_block + i; - int col = col_block + j; - - MatrixCoord coord = MatrixCoord(row, col); - - if (row < M && col < N && - ((fill_mode_c == FillMode::kLower && row >= col) || - (fill_mode_c == FillMode::kUpper && row <= col)) - ) { - - ScalarType d = (blas_mode == BlasMode::kHermitian) ? - tensor_d.at(coord) : tensor_c.at(coord); - - ScalarType tmp_d = convert_op( - alpha_hermitian * ScalarType(accum[i][j]) + - beta_hermitian * d); - - if (blas_mode == BlasMode::kHermitian && row == col ) { - tensor_d.at(coord) = real(tmp_d); - } else { - tensor_d.at(coord) = tmp_d; - } - } - } - } - - } // for (col_block) - } // for (row_block) - - tensor_a.add_pointer_offset(batch_stride_A); - tensor_b.add_pointer_offset(batch_stride_B); - tensor_c.add_pointer_offset(batch_stride_C); - tensor_d.add_pointer_offset(batch_stride_D); - - } // for (batch_idx) -} - -//////////////////////////////////////////////////////////////////////////////////////////////////// - -/// Computes a general matrix product among matrices (tensors of rank=2) pointed to by TensorRef -/// objects. -/// -/// This assumes the accumulator type is the same type as the scalars. -template < - typename ElementA, - typename LayoutA, - typename ElementB, - typename LayoutB, - typename ElementC, - typename LayoutC, - typename ScalarType -> -void Rank2KComplex( - gemm::GemmCoord problem_size, - ScalarType alpha, - TensorRef tensor_a, - ComplexTransform transform_a, - TensorRef tensor_b, - ComplexTransform transform_b, - ScalarType beta, - TensorRef tensor_c, - TensorRef tensor_d, - FillMode fill_mode_c, - BlasMode blas_mode) { - - Rank2KComplex( - problem_size, alpha, - tensor_a, transform_a, - tensor_b, transform_b, - beta, tensor_c, tensor_d, - ScalarType(0), - fill_mode_c, - blas_mode); -} - -//////////////////////////////////////////////////////////////////////////////////////////////////// - -} // namespace host -} // namespace reference -} // namespace cutlass diff --git a/csrc/sparse/cutlass/example/util/reference/host/rank_k_complex.h b/csrc/sparse/cutlass/example/util/reference/host/rank_k_complex.h deleted file mode 100644 index ef44270a314a4..0000000000000 --- a/csrc/sparse/cutlass/example/util/reference/host/rank_k_complex.h +++ /dev/null @@ -1,234 +0,0 @@ -/*************************************************************************************************** - * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. - * SPDX-License-Identifier: BSD-3-Clause - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * - * 1. 
Redistributions of source code must retain the above copyright notice, this - * list of conditions and the following disclaimer. - * - * 2. Redistributions in binary form must reproduce the above copyright notice, - * this list of conditions and the following disclaimer in the documentation - * and/or other materials provided with the distribution. - * - * 3. Neither the name of the copyright holder nor the names of its - * contributors may be used to endorse or promote products derived from - * this software without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" - * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE - * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE - * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL - * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR - * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER - * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, - * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - * - **************************************************************************************************/ -/*! \file - \brief Reference implementation for complex-valued Rank 2K update in host-side code. - - -*/ - -#pragma once - -#include "cutlass/blas3.h" -#include "cutlass/complex.h" -#include "cutlass/numeric_conversion.h" -#include "cutlass/tensor_view.h" -#include "cutlass/gemm/gemm.h" -#include - -namespace cutlass { -namespace reference { -namespace host { - -//////////////////////////////////////////////////////////////////////////////////////////////////// - -/// Computes a general matrix product among matrices (tensors of rank=2) pointed to by TensorRef -/// objects. -/// -/// Explicitly naming types needed by this template can be cumbersome, particularly for the -/// accumulator type, so a function argument 'initial_accum' is exposed. Passing -/// AccumulatorType(0) as the last function argument can be easier than naming all template -/// arguments explicitly. -template < - typename ElementA, - typename LayoutA, - typename ElementC, - typename LayoutC, - typename ScalarType, - typename ComputeType, - typename ConvertOp = NumericConverter, - typename InnerProductOp = multiply_add -> -void Rank2KComplex( - gemm::GemmCoord problem_size, - ScalarType alpha, - TensorRef tensor_a, - ComplexTransform transform_a, - ScalarType beta, - TensorRef tensor_c, - TensorRef tensor_d, - ComputeType initial_accum, - FillMode fill_mode_c, - BlasMode blas_mode, - int batch_count = 1, - int64_t batch_stride_A = 0, - int64_t batch_stride_C = 0, - int64_t batch_stride_D = 0) { - - static_assert( - LayoutA::kRank == 2 && - LayoutC::kRank == 2, "Tensors must be of rank 2"); - - // Note: batch is ignored. 
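Despite the file-level \brief and the reuse of the name Rank2KComplex, this header implements the rank-k update, which takes a single operand A. In the notation of the loops below, the symmetric (SYRK) case computes

    D = \alpha \, A A^{T} + \beta C,

and the Hermitian (HERK) case computes

    D = \alpha \, A A^{H} + \beta C,

with the diagonal of D kept real in the Hermitian case and only the triangle selected by fill_mode_c written.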
- int const M = problem_size.m(); - int const N = problem_size.n(); - int const K = problem_size.k(); - - // Rank2K update operates on A=NxK, B=NxK, and C=NxN - assert(M==N); - - // Blocking necessary to speedup reference implementation - int const Mblock = 16; - int const Nblock = 16; - - ConvertOp convert_op; - InnerProductOp inner_product_op; - - for (int batch_idx = 0; batch_idx < batch_count; ++batch_idx) { - - // Compute matrix product using blocks - for (int row_block = 0; row_block < M; row_block += Mblock) { - for (int col_block = 0; col_block < N; col_block += Nblock) { - - ComputeType accum[Mblock][Nblock]; - - for (int j = 0; j < Nblock; j++) { - for (int i = 0; i < Mblock; i++) { - accum[i][j] = initial_accum; - } - } - - for (int k_block = 0; k_block < K; ++k_block) { - for (int j = 0; j < Nblock; j++) { - for (int i = 0; i < Mblock; i++) { - int row = row_block + i; - int col = col_block + j; - - if (row < M && col < N && - ( (fill_mode_c == FillMode::kLower && row >= col) || - (fill_mode_c == FillMode::kUpper && row <= col) ) - ) { - - // A x A^T (Symmetric) or A x A^H (Hermitian) - // complex conjugation on operandB (a_t) (function of blas3 computation) - ElementA a = tensor_a.at(MatrixCoord(row, k_block)); - ElementA a_t = (blas_mode == BlasMode::kHermitian) ? - conj(tensor_a.at(MatrixCoord(col, k_block))) : - tensor_a.at(MatrixCoord(col, k_block)); - - ComputeType a_ik = ComputeType(a); - ComputeType b_jk = ComputeType(a_t); - - // complex conjugation (function of input layouts) - if (transform_a == ComplexTransform::kConjugate) { - a_ik = conj(a_ik); - } - // complex conjugation (function of input layouts) - if (transform_a == ComplexTransform::kConjugate) { - b_jk = conj(b_jk); - } - - accum[i][j] = inner_product_op(a_ik, b_jk, accum[i][j]); - - } - } - } - } - - for (int j = 0; j < Nblock; j++) { - for (int i = 0; i < Mblock; i++) { - int row = row_block + i; - int col = col_block + j; - - MatrixCoord coord = MatrixCoord(row, col); - - if (row < M && col < N && - ((fill_mode_c == FillMode::kLower && row >= col) || - (fill_mode_c == FillMode::kUpper && row <= col)) - ) { - - ScalarType c = tensor_c.at(coord); - // The imaginary parts of the diagonal elements of - // a complex data type are assumed and set to zero - if (blas_mode == BlasMode::kHermitian) { - c = (row == col) ? real(c) : c; - } - - ScalarType tmp_d = convert_op( - alpha * ScalarType(accum[i][j]) + - beta * c); - - if (blas_mode == BlasMode::kHermitian && row == col ) { - tensor_d.at(coord) = real(tmp_d); - } else { - tensor_d.at(coord) = tmp_d; - } - } - } - } - - } // for (col_block) - } // for (row_block) - - tensor_a.add_pointer_offset(batch_stride_A); - tensor_c.add_pointer_offset(batch_stride_C); - tensor_d.add_pointer_offset(batch_stride_D); - - } // for (batch_idx) -} - -//////////////////////////////////////////////////////////////////////////////////////////////////// - -/// Computes a general matrix product among matrices (tensors of rank=2) pointed to by TensorRef -/// objects. -/// -/// This assumes the accumulator type is the same type as the scalars. 
-template < - typename ElementA, - typename LayoutA, - typename ElementC, - typename LayoutC, - typename ScalarType -> -void RankKComplex( - gemm::GemmCoord problem_size, - ScalarType alpha, - TensorRef tensor_a, - ComplexTransform transform_a, - ScalarType beta, - TensorRef tensor_c, - TensorRef tensor_d, - FillMode fill_mode_c, - BlasMode blas_mode) { - - Rank2KComplex( - problem_size, alpha, - tensor_a, transform_a, - beta, tensor_c, tensor_d, - ScalarType(0), - fill_mode_c, - blas_mode); -} - -//////////////////////////////////////////////////////////////////////////////////////////////////// - -} // namespace host -} // namespace reference -} // namespace cutlass diff --git a/csrc/sparse/cutlass/example/util/reference/host/symm.h b/csrc/sparse/cutlass/example/util/reference/host/symm.h deleted file mode 100644 index a585caf73f64f..0000000000000 --- a/csrc/sparse/cutlass/example/util/reference/host/symm.h +++ /dev/null @@ -1,285 +0,0 @@ -/*************************************************************************************************** - * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. - * SPDX-License-Identifier: BSD-3-Clause - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * - * 1. Redistributions of source code must retain the above copyright notice, this - * list of conditions and the following disclaimer. - * - * 2. Redistributions in binary form must reproduce the above copyright notice, - * this list of conditions and the following disclaimer in the documentation - * and/or other materials provided with the distribution. - * - * 3. Neither the name of the copyright holder nor the names of its - * contributors may be used to endorse or promote products derived from - * this software without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" - * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE - * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE - * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL - * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR - * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER - * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, - * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - * - **************************************************************************************************/ -/*! \file - \brief Reference implementation for SYMM update in host-side code. - - - -*/ - -#pragma once - -#include "cutlass/blas3.h" -#include "cutlass/numeric_conversion.h" - -#include "cutlass/tensor_view.h" -#include "cutlass/gemm/gemm.h" -#include "cutlass/arch/mma.h" -#include "cutlass/util/host_tensor.h" -#include "cutlass/util/reference/host/gemm.h" - -namespace cutlass { -namespace reference { -namespace host { - -//////////////////////////////////////////////////////////////////////////////////////////////////// - -/// Computes a general matrix product among matrices (tensors of rank=2) pointed to by TensorRef -/// objects. 
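As a reading of the deleted compute_symm below (not an official statement of the routine's contract): for SideMode::kLeft it evaluates

    D = \alpha \, A B + \beta C,

and for SideMode::kRight

    D = \alpha \, B A + \beta C,

where A is symmetric and only its FillModeA triangle is stored. The reference reconstructs the full symmetric operand by visiting the stored triangle twice, once as stored including the diagonal and once transposed with the diagonal excluded.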
-template < - typename ElementA, - typename LayoutA, - SideMode SideModeA, - FillMode FillModeA, - typename ElementB, - typename LayoutB, - typename ElementC, - typename LayoutC, - typename ScalarType, - typename ComputeType, - typename InnerProductOp = multiply_add, - typename ConvertOp = NumericConverter -> -void compute_symm( - gemm::GemmCoord problem_size, - ScalarType alpha, - TensorRef tensor_a, - TensorRef tensor_b, - ScalarType beta, - TensorRef tensor_c, - TensorRef tensor_d, - ComputeType initial_accum) { - - static_assert( - LayoutA::kRank == 2 && - LayoutB::kRank == 2 && - LayoutC::kRank == 2, - "Tensors must be of rank 2"); - - static_assert(SideModeA != SideMode::kInvalid - , "Side Mode can either be Left or Right."); - - static_assert( - FillModeA == FillMode::kLower || - FillModeA == FillMode::kUpper, - "Fill Mode can either be Lower or Upper."); - - using CompareOp_w_diag = typename TrMatrixCompareOp::Type; - using CompareOp_wo_diag = typename TrMatrixCompareOp::Type; - - // Note: batch is ignored. - int const M = problem_size.m(); - int const N = problem_size.n(); - // Assuming correct k-dimension value is passed - int const K = problem_size.k(); - - // Blocking necessary to speedup reference implementation - int const Mblock = 16; - int const Nblock = 16; - - ConvertOp convert_op; - InnerProductOp inner_product_op; - CompareOp_w_diag compare_op_1; - CompareOp_wo_diag compare_op_2; - - for (int row_block = 0; row_block < M; row_block += Mblock) { - for (int col_block = 0; col_block < N; col_block += Nblock) { - - ComputeType accum[Mblock][Nblock]; - - for (int j = 0; j < Nblock; j++) { - for (int i = 0; i < Mblock; i++) { - accum[i][j] = initial_accum; - } - } - - for (int k_block = 0; k_block < K; ++k_block) { - for (int j = 0; j < Nblock; j++) { - for (int i = 0; i < Mblock; i++) { - int row = row_block + i; - int col = col_block + j; - - if (row < M && col < N) { - ElementA a_1 = ElementA(); - ElementB b_1 = ElementB(); - ElementA a_2 = ElementA(); - ElementB b_2 = ElementB(); - - // A x B or B x A (with diagonal) - if (SideModeA == SideMode::kLeft) { - a_1 = (compare_op_1(row, k_block)) ? - (tensor_a.at(MatrixCoord(row, k_block))) : ElementA(); - b_1 = tensor_b.at(MatrixCoord(k_block, col)); - } else if (SideModeA == SideMode::kRight) { - a_1 = tensor_b.at(MatrixCoord(row, k_block)); - b_1 = (compare_op_1(k_block, col)) ? - tensor_a.at(MatrixCoord(k_block, col)) : ElementA(); - } - - ComputeType compute_a_1(cast_if_scalar(a_1)); - ComputeType compute_b_1(cast_if_scalar(b_1)); - - accum[i][j] = inner_product_op(compute_a_1, compute_b_1, accum[i][j]); - - // A^T x B or B x A^T (without diagonal) - if (SideModeA == SideMode::kLeft) { - a_2 = (compare_op_2(k_block, row)) ? - (tensor_a.at(MatrixCoord(k_block, row))) : ElementA(); - b_2 = tensor_b.at(MatrixCoord(k_block, col)); - } else if (SideModeA == SideMode::kRight) { - a_2 = tensor_b.at(MatrixCoord(row, k_block)); - b_2 = (compare_op_2(col, k_block)) ? 
- tensor_a.at(MatrixCoord(col, k_block)) : ElementA(); - } - - ComputeType compute_a_2(cast_if_scalar(a_2)); - ComputeType compute_b_2(cast_if_scalar(b_2)); - - accum[i][j] = inner_product_op(compute_a_2, compute_b_2, accum[i][j]); - } - } - } - } - - for (int j = 0; j < Nblock; j++) { - for (int i = 0; i < Mblock; i++) { - int row = row_block + i; - int col = col_block + j; - - MatrixCoord coord = MatrixCoord(row, col); - - if (row < M && col < N) { - tensor_d.at(coord) = convert_op( - alpha * ScalarType(accum[i][j]) + - beta * ScalarType(tensor_c.at(coord))); - } - } - } - } - } -} - -//////////////////////////////////////////////////////////////////////////////////////////////////// - -/// Computes a general Symm update (tensors of rank=2) pointed to by TensorRef -/// objects. -template < - typename ElementA, - typename LayoutA, - SideMode SideModeA, - FillMode FillModeA, - typename ElementB, - typename LayoutB, - typename ElementC, - typename LayoutC, - typename ScalarType, - typename ComputeType, - typename InnerProductOp = multiply_add, - typename ConvertOp = NumericConverter -> -void compute_symm( - gemm::GemmCoord problem_size, - ScalarType alpha, - TensorRef tensor_a, - TensorRef tensor_b, - ScalarType beta, - TensorRef tensor_c, - ComputeType initial_accum) { - compute_symm( - problem_size, alpha, tensor_a, tensor_b, beta, tensor_c, tensor_c, - initial_accum); -} - -//////////////////////////////////////////////////////////////////////////////////////////////////// - -template < - typename ElementA, - typename LayoutA, - SideMode SideModeA, - FillMode FillModeA, - typename ElementB, - typename LayoutB, - typename ElementC, - typename LayoutC, - typename ScalarType, - typename ComputeType, - typename InnerProductOp = cutlass::arch::OpMultiplyAdd -> -struct Symm; - -//////////////////////////////////////////////////////////////////////////////////////////////////// - -/// Partial specialization for multiply-add -template -struct Symm { - - void operator()(gemm::GemmCoord problem_size, ScalarType alpha, - TensorRef tensor_a, - TensorRef tensor_b, ScalarType beta, - TensorRef tensor_c, - ComputeType initial_accum = ComputeType(0)) { - static_assert( - LayoutA::kRank == 2 && LayoutB::kRank == 2 && LayoutC::kRank == 2, - "Tensors must be of rank 2"); - - compute_symm>( - problem_size, alpha, tensor_a, tensor_b, beta, tensor_c, initial_accum); - } - - void operator()(gemm::GemmCoord problem_size, ScalarType alpha, - TensorRef tensor_a, - TensorRef tensor_b, ScalarType beta, - TensorRef tensor_c, - TensorRef tensor_d, - ComputeType initial_accum = ComputeType(0)) { - static_assert( - LayoutA::kRank == 2 && LayoutB::kRank == 2 && LayoutC::kRank == 2, - "Tensors must be of rank 2"); - - compute_symm>( - problem_size, alpha, tensor_a, tensor_b, beta, tensor_c, tensor_d, initial_accum); - } -}; - - -//////////////////////////////////////////////////////////////////////////////////////////////////// - -} // namespace host -} // namespace reference -} // namespace cutlass diff --git a/csrc/sparse/cutlass/example/util/reference/host/symm_complex.h b/csrc/sparse/cutlass/example/util/reference/host/symm_complex.h deleted file mode 100644 index 2618feaa70cee..0000000000000 --- a/csrc/sparse/cutlass/example/util/reference/host/symm_complex.h +++ /dev/null @@ -1,319 +0,0 @@ -/*************************************************************************************************** - * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
- * SPDX-License-Identifier: BSD-3-Clause - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * - * 1. Redistributions of source code must retain the above copyright notice, this - * list of conditions and the following disclaimer. - * - * 2. Redistributions in binary form must reproduce the above copyright notice, - * this list of conditions and the following disclaimer in the documentation - * and/or other materials provided with the distribution. - * - * 3. Neither the name of the copyright holder nor the names of its - * contributors may be used to endorse or promote products derived from - * this software without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" - * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE - * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE - * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL - * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR - * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER - * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, - * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - * - **************************************************************************************************/ -/*! \file - \brief Reference implementation for complex-valued SYMM update in host-side code. - - -*/ - -#pragma once - -#include "cutlass/blas3.h" -#include "cutlass/complex.h" -#include "cutlass/numeric_conversion.h" -#include "cutlass/tensor_view.h" -#include "cutlass/gemm/gemm.h" -#include - -namespace cutlass { -namespace reference { -namespace host { - -//////////////////////////////////////////////////////////////////////////////////////////////////// - -/// Computes a general matrix product among matrices (tensors of rank=2) pointed to by TensorRef -/// objects. -/// -/// Explicitly naming types needed by this template can be cumbersome, particularly for the -/// accumulator type, so a function argument 'initial_accum' is exposed. Passing -/// AccumulatorType(0) as the last function argument can be easier than naming all template -/// arguments explicitly. 
-template < - typename ElementA, - typename LayoutA, - SideMode SideModeA, - FillMode FillModeA, - typename ElementB, - typename LayoutB, - typename ElementC, - typename LayoutC, - typename ScalarType, - typename ComputeType, - BlasMode BlasMode_ = BlasMode::kSymmetric, - typename InnerProductOp = multiply_add, - typename ConvertOp = NumericConverter -> -void compute_symm_complex( - gemm::GemmCoord problem_size, - ScalarType alpha, - TensorRef tensor_a, - TensorRef tensor_b, - ScalarType beta, - TensorRef tensor_c, - TensorRef tensor_d, - ComputeType initial_accum, - int batch_count = 1, - int64_t batch_stride_A = 0, - int64_t batch_stride_B = 0, - int64_t batch_stride_C = 0, - int64_t batch_stride_D = 0) { - - static SideMode const kSideModeA = SideModeA; - static FillMode const kFillModeA = FillModeA; - static BlasMode const kBlasMode = BlasMode_; - - static_assert( - LayoutA::kRank == 2 && - LayoutB::kRank == 2 && - LayoutC::kRank == 2, "Tensors must be of rank 2"); - - static_assert(kSideModeA != SideMode::kInvalid - , "Side Mode can either be Left or Right."); - - static_assert( - kFillModeA == FillMode::kLower || - kFillModeA == FillMode::kUpper, - "Fill Mode can either be Lower or Upper."); - - using CompareOp_w_diag = typename TrMatrixCompareOp::Type; - using CompareOp_wo_diag = typename TrMatrixCompareOp::Type; - - // Note: batch is ignored. - int const M = problem_size.m(); - int const N = problem_size.n(); - // Assuming correct k-dimension value is passed - int const K = problem_size.k(); - - // Blocking necessary to speedup reference implementation - int const Mblock = 16; - int const Nblock = 16; - - ConvertOp convert_op; - InnerProductOp inner_product_op; - CompareOp_w_diag compare_op_1; - CompareOp_wo_diag compare_op_2; - - for (int batch_idx = 0; batch_idx < batch_count; ++batch_idx) { - - // Compute matrix product using blocks - for (int row_block = 0; row_block < M; row_block += Mblock) { - for (int col_block = 0; col_block < N; col_block += Nblock) { - - ComputeType accum[Mblock][Nblock]; - - for (int j = 0; j < Nblock; j++) { - for (int i = 0; i < Mblock; i++) { - accum[i][j] = initial_accum; - } - } - - for (int k_block = 0; k_block < K; ++k_block) { - for (int j = 0; j < Nblock; j++) { - for (int i = 0; i < Mblock; i++) { - int row = row_block + i; - int col = col_block + j; - - if (row < M && col < N) - { - ElementA a_1 = ElementA(); - ElementB b_1 = ElementB(); - ElementA a_2 = ElementA(); - ElementB b_2 = ElementB(); - - // A x B or B x A (with diagonal) - if (kSideModeA == SideMode::kLeft) { - a_1 = (compare_op_1(row, k_block)) ? - (tensor_a.at(MatrixCoord(row, k_block))) : ElementA(); - b_1 = tensor_b.at(MatrixCoord(k_block, col)); - } else if (kSideModeA == SideMode::kRight) { - a_1 = tensor_b.at(MatrixCoord(row, k_block)); - b_1 = (compare_op_1(k_block, col)) ? 
- tensor_a.at(MatrixCoord(k_block, col)) : ElementA(); - } - ComputeType compute_a_1 = ComputeType(a_1); - ComputeType compute_b_1 = ComputeType(b_1); - - // The imaginary parts of the diagonal elements of - // a complex data type are assumed and set to zero - if (kBlasMode == BlasMode::kHermitian && kSideModeA == SideMode::kLeft && row == k_block) { - compute_a_1 = real(compute_a_1); - } else if (kBlasMode == BlasMode::kHermitian && kSideModeA == SideMode::kRight && k_block == col) { - compute_b_1 = real(compute_b_1); - } - - accum[i][j] = inner_product_op(compute_a_1, compute_b_1, accum[i][j]); - - // A^T x B or B x A^T (without diagonal) - if (kSideModeA == SideMode::kLeft) { - a_2 = (compare_op_2(k_block, row)) ? - (tensor_a.at(MatrixCoord(k_block, row))) : ElementA(); - b_2 = tensor_b.at(MatrixCoord(k_block, col)); - if (kBlasMode == BlasMode::kHermitian) - a_2 = conj(a_2); - } else if (kSideModeA == SideMode::kRight) { - a_2 = tensor_b.at(MatrixCoord(row, k_block)); - b_2 = (compare_op_2(col, k_block)) ? - tensor_a.at(MatrixCoord(col, k_block)) : ElementA(); - if (kBlasMode == BlasMode::kHermitian) - b_2 = conj(b_2); - } - - ComputeType compute_a_2 = ComputeType(a_2); - ComputeType compute_b_2 = ComputeType(b_2); - - accum[i][j] = inner_product_op(compute_a_2, compute_b_2, accum[i][j]); - } - } - } - } - - for (int j = 0; j < Nblock; j++) { - for (int i = 0; i < Mblock; i++) { - int row = row_block + i; - int col = col_block + j; - - MatrixCoord coord = MatrixCoord(row, col); - - if (row < M && col < N) { - - ScalarType c = tensor_c.at(coord); - - tensor_d.at(coord) = convert_op( - alpha * ScalarType(accum[i][j]) + - beta * c); - } - } - } - - } // for (col_block) - } // for (row_block) - - tensor_a.add_pointer_offset(batch_stride_A); - tensor_b.add_pointer_offset(batch_stride_B); - tensor_c.add_pointer_offset(batch_stride_C); - tensor_d.add_pointer_offset(batch_stride_D); - - } // for (batch_idx) -} - -//////////////////////////////////////////////////////////////////////////////////////////////////// - -template < - typename ElementA, - typename LayoutA, - SideMode SideModeA, - FillMode FillModeA, - typename ElementB, - typename LayoutB, - typename ElementC, - typename LayoutC, - typename ScalarType, - typename ComputeType, - BlasMode BlasMode_ = cutlass::BlasMode::kSymmetric, - typename InnerProductOp = cutlass::arch::OpMultiplyAddComplex -> -struct SymmComplex; - -//////////////////////////////////////////////////////////////////////////////////////////////////// - -/// Partial specialization for multiply-add -template -struct SymmComplex { - - void operator()(gemm::GemmCoord problem_size, ScalarType alpha, - TensorRef tensor_a, - TensorRef tensor_b, ScalarType beta, - TensorRef tensor_c, - TensorRef tensor_d, - ComputeType initial_accum = ComputeType(0)) { - static_assert( - LayoutA::kRank == 2 && LayoutC::kRank == 2, - "Tensors must be of rank 2"); - - compute_symm_complex>( - problem_size, alpha, tensor_a, tensor_b, beta, tensor_c, tensor_d, initial_accum); - } -}; - -//////////////////////////////////////////////////////////////////////////////////////////////////// - -/// Partial specialization for gaussian multiply-add -template -struct SymmComplex { - - void operator()(gemm::GemmCoord problem_size, ScalarType alpha, - TensorRef tensor_a, - TensorRef tensor_b, ScalarType beta, - TensorRef tensor_c, - TensorRef tensor_d, - ComputeType initial_accum = ComputeType(0)) { - static_assert( - LayoutA::kRank == 2 && LayoutC::kRank == 2, - "Tensors must be of rank 2"); - - 
compute_symm_complex>( - problem_size, alpha, tensor_a, tensor_b, beta, tensor_c, tensor_d, initial_accum); - } -}; - -//////////////////////////////////////////////////////////////////////////////////////////////////// - -} // namespace host -} // namespace reference -} // namespace cutlass diff --git a/csrc/sparse/cutlass/example/util/reference/host/tensor_compare.h b/csrc/sparse/cutlass/example/util/reference/host/tensor_compare.h deleted file mode 100644 index df164a37e9297..0000000000000 --- a/csrc/sparse/cutlass/example/util/reference/host/tensor_compare.h +++ /dev/null @@ -1,423 +0,0 @@ -/*************************************************************************************************** - * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. - * SPDX-License-Identifier: BSD-3-Clause - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * - * 1. Redistributions of source code must retain the above copyright notice, this - * list of conditions and the following disclaimer. - * - * 2. Redistributions in binary form must reproduce the above copyright notice, - * this list of conditions and the following disclaimer in the documentation - * and/or other materials provided with the distribution. - * - * 3. Neither the name of the copyright holder nor the names of its - * contributors may be used to endorse or promote products derived from - * this software without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" - * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE - * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE - * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL - * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR - * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER - * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, - * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - * - **************************************************************************************************/ -/* \file - \brief Defines host-side elementwise operations on TensorView. 
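The compute_symm_complex reference above stores only one triangle of the symmetric (or Hermitian) operand and accumulates each output twice per k step: once from the stored triangle including the diagonal, and once from the mirrored read A(k,row), conjugated in Hermitian mode and with the diagonal's imaginary part dropped. A minimal sketch of the same idea for a real, left-side, lower-filled SYMM follows; it collapses the two-pass accumulation into a single mirrored read, and the row-major indexing helper is an illustration rather than part of the removed sources.

#include <cstdio>
#include <vector>

// Reads A(i, j) from a lower-filled symmetric matrix: entries above the
// diagonal are mirrored from the stored lower triangle.
static double symm_at(const std::vector<double>& a, int n, int i, int j) {
  return (i >= j) ? a[i * n + j] : a[j * n + i];
}

// D = alpha * A * B + beta * C with A symmetric (left side, lower fill).
static void symm_reference(int n, int m, double alpha,
                           const std::vector<double>& a,   // n x n, lower triangle stored
                           const std::vector<double>& b,   // n x m
                           double beta,
                           const std::vector<double>& c,   // n x m
                           std::vector<double>& d) {       // n x m
  for (int row = 0; row < n; ++row) {
    for (int col = 0; col < m; ++col) {
      double acc = 0.0;
      for (int k = 0; k < n; ++k) {
        acc += symm_at(a, n, row, k) * b[k * m + col];
      }
      d[row * m + col] = alpha * acc + beta * c[row * m + col];
    }
  }
}

int main() {
  int n = 3, m = 2;
  std::vector<double> a = {1, 0, 0,
                           2, 3, 0,
                           4, 5, 6};          // lower triangle of a symmetric A
  std::vector<double> b(n * m, 1.0), c(n * m, 0.0), d(n * m, 0.0);
  symm_reference(n, m, 1.0, a, b, 0.0, c, d);
  for (int i = 0; i < n; ++i) std::printf("%f %f\n", d[i * m], d[i * m + 1]);
  return 0;
}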
-*/ - -#pragma once - -// Standard Library includes -#include - -// Cutlass includes -#include "cutlass/cutlass.h" -#include "cutlass/relatively_equal.h" -#include "cutlass/tensor_view.h" -#include "cutlass/tensor_view_planar_complex.h" - -#include "cutlass/util/distribution.h" -#include "tensor_foreach.h" - -namespace cutlass { -namespace reference { -namespace host { - -/////////////////////////////////////////////////////////////////////////////////////////////////// -/////////////////////////////////////////////////////////////////////////////////////////////////// - -namespace detail { - -template < - typename Element, ///< Element type - typename Layout> ///< Layout function -struct TensorEqualsFunc { - - // - // Data members - // - - TensorView lhs; - TensorView rhs; - bool result; - - /// Ctor - TensorEqualsFunc(): result(true) { } - - /// Ctor - TensorEqualsFunc( - TensorView const &lhs_, - TensorView const &rhs_ - ) : - lhs(lhs_), rhs(rhs_), result(true) { } - - /// Visits a coordinate - void operator()(Coord const &coord) { - - Element lhs_ = lhs.at(coord); - Element rhs_ = rhs.at(coord); - - if (lhs_ != rhs_) { - result = false; - } - } - - /// Returns true if equal - operator bool() const { - return result; - } -}; - -template < - typename Element, ///< Element type - typename Layout> ///< Layout function -struct TensorRelativelyEqualsFunc { - - // - // Data members - // - - TensorView lhs; - TensorView rhs; - Element epsilon; - Element nonzero_floor; - bool result; - - /// Ctor - TensorRelativelyEqualsFunc( - TensorView const &lhs_, - TensorView const &rhs_, - Element epsilon_, - Element nonzero_floor_ - ) : - lhs(lhs_), - rhs(rhs_), - epsilon(epsilon_), - nonzero_floor(nonzero_floor_), - result(true) { } - - /// Visits a coordinate - void operator()(Coord const &coord) { - - Element lhs_ = lhs.at(coord); - Element rhs_ = rhs.at(coord); - - if (!relatively_equal(lhs_, rhs_, epsilon, nonzero_floor)) { - result = false; - } - } - - /// Returns true if equal - operator bool() const { - return result; - } -}; - -} // namespace detail - -/////////////////////////////////////////////////////////////////////////////////////////////////// - -/// Returns true if two tensor views are equal. -template < - typename Element, ///< Element type - typename Layout> ///< Layout function -bool TensorEquals( - TensorView const &lhs, - TensorView const &rhs) { - - // Extents must be identical - if (lhs.extent() != rhs.extent()) { - return false; - } - - detail::TensorEqualsFunc func(lhs, rhs); - TensorForEach( - lhs.extent(), - func - ); - - return bool(func); -} - -/// Returns true if two tensor views are equal. 
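TensorEquals above is assembled from two pieces: a functor (TensorEqualsFunc) that latches to false when any visited element differs, and TensorForEach, which walks every coordinate of the extent and invokes it. A stand-alone sketch of that pattern over a plain row-major 2-D buffer (the view type and names here are stand-ins, not CUTLASS types):

#include <cstdio>
#include <vector>

// Visits every (row, col) of a 2-D extent and invokes func(row, col); this is
// the role TensorForEach plays for the comparison functors in the removed header.
template <typename Func>
void for_each_coord(int rows, int cols, Func& func) {
  for (int r = 0; r < rows; ++r)
    for (int c = 0; c < cols; ++c)
      func(r, c);
}

// In the spirit of TensorEqualsFunc: starts true and latches to false when any
// visited element differs.
struct EqualsFunc {
  const std::vector<float>& lhs;
  const std::vector<float>& rhs;
  int cols;
  bool result;

  void operator()(int r, int c) {
    if (lhs[r * cols + c] != rhs[r * cols + c]) result = false;
  }
};

int main() {
  std::vector<float> a = {1, 2, 3, 4, 5, 6};   // 2 x 3, row-major
  std::vector<float> b = a;
  b[4] = -5;                                   // introduce one mismatch

  EqualsFunc func{a, b, 3, true};
  for_each_coord(2, 3, func);
  std::printf("equal: %s\n", func.result ? "true" : "false");   // prints "false"
  return 0;
}

As in the removed functor, the visit has no early exit; the comparison simply keeps latching false for the rest of the walk.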
-template < - typename Element, ///< Element type - typename Layout> ///< Layout function -bool TensorEquals( - TensorViewPlanarComplex const &lhs, - TensorViewPlanarComplex const &rhs) { - - // Extents must be identical - if (lhs.extent() != rhs.extent()) { - return false; - } - - detail::TensorEqualsFunc real_func( - {lhs.data(), lhs.layout(), lhs.extent()}, - {rhs.data(), rhs.layout(), rhs.extent()} - ); - - TensorForEach( - lhs.extent(), - real_func - ); - - if (!bool(real_func)) { - return false; - } - - detail::TensorEqualsFunc imag_func( - {lhs.data() + lhs.imaginary_stride(), lhs.layout(), lhs.extent()}, - {rhs.data() + rhs.imaginary_stride(), rhs.layout(), rhs.extent()} - ); - - TensorForEach( - lhs.extent(), - imag_func - ); - - return bool(imag_func); -} - -/////////////////////////////////////////////////////////////////////////////////////////////////// -/////////////////////////////////////////////////////////////////////////////////////////////////// - -/// Returns true if two tensor views are relatively equal. -template < - typename Element, ///< Element type - typename Layout> ///< Layout function -bool TensorRelativelyEquals( - TensorView const &lhs, - TensorView const &rhs, - Element epsilon, - Element nonzero_floor) { - - // Extents must be identical - if (lhs.extent() != rhs.extent()) { - return false; - } - - detail::TensorRelativelyEqualsFunc func(lhs, rhs, epsilon, nonzero_floor); - TensorForEach( - lhs.extent(), - func - ); - - return bool(func); -} - -/// Returns true if two tensor views are relatively equal. -template < - typename Element, ///< Element type - typename Layout> ///< Layout function -bool TensorRelativelyEquals( - TensorViewPlanarComplex const &lhs, - TensorViewPlanarComplex const &rhs, - Element epsilon, - Element nonzero_floor) { - - // Extents must be identical - if (lhs.extent() != rhs.extent()) { - return false; - } - - detail::TensorRelativelyEqualsFunc real_func( - {lhs.data(), lhs.layout(), lhs.extent()}, - {rhs.data(), rhs.layout(), rhs.extent()}, - epsilon, - nonzero_floor - ); - - TensorForEach( - lhs.extent(), - real_func - ); - - if (!bool(real_func)) { - return false; - } - - detail::TensorEqualsFunc imag_func( - {lhs.data() + lhs.imaginary_stride(), lhs.layout(), lhs.extent()}, - {rhs.data() + rhs.imaginary_stride(), rhs.layout(), rhs.extent()}, - epsilon, - nonzero_floor - ); - - TensorForEach( - lhs.extent(), - imag_func - ); - - return bool(imag_func); -} - -/////////////////////////////////////////////////////////////////////////////////////////////////// -/////////////////////////////////////////////////////////////////////////////////////////////////// - -/// Returns true if two tensor views are NOT equal. -template < - typename Element, ///< Element type - typename Layout> ///< Layout function -bool TensorNotEquals( - TensorView const &lhs, - TensorView const &rhs) { - - // Extents must be identical - if (lhs.extent() != rhs.extent()) { - return true; - } - - detail::TensorEqualsFunc func(lhs, rhs); - TensorForEach( - lhs.extent(), - func - ); - - return !bool(func); -} - -/// Returns true if two tensor views are equal. 
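TensorRelativelyEquals defers the per-element test to cutlass::relatively_equal, parameterized by an epsilon and a nonzero_floor; the exact semantics live in cutlass/relatively_equal.h and are not part of this diff. (The planar-complex overload above also appears to build detail::TensorEqualsFunc for the imaginary part while still passing epsilon and nonzero_floor, which looks inconsistent with the real part.) The sketch below shows one common reading of the two knobs, absolute tolerance below the floor and relative tolerance above it, purely as illustration:

#include <cmath>
#include <cstdio>

// One plausible reading of an (epsilon, nonzero_floor) comparison: values whose
// magnitudes sit below the floor are compared with an absolute tolerance,
// everything else with a relative one. This is an assumption for illustration,
// not the definition used by cutlass::relatively_equal.
bool relatively_equal_sketch(double a, double b, double epsilon, double nonzero_floor) {
  double diff = std::fabs(a - b);
  double magnitude = std::fmax(std::fabs(a), std::fabs(b));
  if (magnitude < nonzero_floor) {
    return diff <= epsilon * nonzero_floor;   // absolute regime near zero
  }
  return diff <= epsilon * magnitude;         // relative regime otherwise
}

int main() {
  std::printf("%d\n", relatively_equal_sketch(1.0000, 1.0001, 1e-3, 1e-6));  // 1
  std::printf("%d\n", relatively_equal_sketch(1e-9, 2e-9, 1e-3, 1e-6));      // 1 (below floor)
  std::printf("%d\n", relatively_equal_sketch(1.0, 1.1, 1e-3, 1e-6));        // 0
  return 0;
}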
-template < - typename Element, ///< Element type - typename Layout> ///< Layout function -bool TensorNotEquals( - TensorViewPlanarComplex const &lhs, - TensorViewPlanarComplex const &rhs) { - - return !TensorEquals(lhs, rhs); -} - -/////////////////////////////////////////////////////////////////////////////////////////////////// -/////////////////////////////////////////////////////////////////////////////////////////////////// - -namespace detail { - -template < - typename Element, ///< Element type - typename Layout> ///< Layout function -struct TensorContainsFunc { - - // - // Data members - // - - TensorView view; - Element value; - bool contains; - Coord location; - - // - // Methods - // - - /// Ctor - TensorContainsFunc(): contains(false) { } - - /// Ctor - TensorContainsFunc( - TensorView const &view_, - Element value_ - ) : - view(view_), value(value_), contains(false) { } - - /// Visits a coordinate - void operator()(Coord const &coord) { - - if (view.at(coord) == value) { - if (!contains) { - location = coord; - } - contains = true; - } - } - - /// Returns true if equal - operator bool() const { - return contains; - } -}; - -} // namespace detail - -/////////////////////////////////////////////////////////////////////////////////////////////////// - -/// Returns true if a value is present in a tensor -template < - typename Element, ///< Element type - typename Layout> ///< Layout function -bool TensorContains( - TensorView const & view, - Element value) { - - detail::TensorContainsFunc func( - view, - value - ); - - TensorForEach( - view.extent(), - func - ); - - return bool(func); -} - -/////////////////////////////////////////////////////////////////////////////////////////////////// - -/// Returns a pair containing a boolean of whether a value exists in a tensor and the location of -/// of the first occurrence. If the value is not contained in the tensor, the second element of the -/// pair is undefined. -template < - typename Element, ///< Element type - typename Layout> ///< Layout function -std::pair > TensorFind( - TensorView const & view, - Element value) { - - detail::TensorContainsFunc func( - view, - value - ); - - TensorForEach( - view.extent(), - func - ); - - return std::make_pair(bool(func), func.location); -} - -/////////////////////////////////////////////////////////////////////////////////////////////////// -/////////////////////////////////////////////////////////////////////////////////////////////////// - -} // namespace host -} // namespace reference -} // namespace cutlass diff --git a/csrc/sparse/cutlass/example/util/reference/host/tensor_compare.hpp b/csrc/sparse/cutlass/example/util/reference/host/tensor_compare.hpp deleted file mode 100644 index a1f3f5b14e6f0..0000000000000 --- a/csrc/sparse/cutlass/example/util/reference/host/tensor_compare.hpp +++ /dev/null @@ -1,101 +0,0 @@ -/*************************************************************************************************** - * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. - * SPDX-License-Identifier: BSD-3-Clause - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * - * 1. Redistributions of source code must retain the above copyright notice, this - * list of conditions and the following disclaimer. - * - * 2. 
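TensorContains and TensorFind above reuse the same visit-every-coordinate machinery: the functor records the first coordinate at which the value appears and keeps a boolean flag. A reduced version that scans a row-major buffer directly (and can return early, which the coordinate-visit version cannot) might look like:

#include <cstdio>
#include <utility>
#include <vector>

// Scans every (row, col) and remembers the first coordinate holding `value`,
// as TensorContainsFunc does while TensorForEach drives the visit.
static std::pair<bool, std::pair<int, int>>
find_first(const std::vector<int>& m, int rows, int cols, int value) {
  for (int r = 0; r < rows; ++r)
    for (int c = 0; c < cols; ++c)
      if (m[r * cols + c] == value) return {true, {r, c}};
  return {false, {-1, -1}};   // location is undefined when not found; (-1, -1) is a placeholder
}

int main() {
  std::vector<int> m = {3, 1, 4,
                        1, 5, 9};               // 2 x 3, row-major
  auto result = find_first(m, 2, 3, 5);
  std::printf("found: %d at (%d, %d)\n",
              result.first, result.second.first, result.second.second);
  return 0;
}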
Redistributions in binary form must reproduce the above copyright notice, - * this list of conditions and the following disclaimer in the documentation - * and/or other materials provided with the distribution. - * - * 3. Neither the name of the copyright holder nor the names of its - * contributors may be used to endorse or promote products derived from - * this software without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" - * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE - * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE - * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL - * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR - * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER - * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, - * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - * - **************************************************************************************************/ -/* \file - \brief Provides several functions for filling tensors with data. -*/ - -#pragma once - -// Standard Library includes -#include -#include -#include - -// Cute includes -#include "cute/tensor.hpp" - -// Cutlass includes -#include "cutlass/cutlass.h" -#include "cutlass/complex.h" -#include "cutlass/quaternion.h" -#include "cutlass/array.h" -#include "cutlass/numeric_types.h" - -/////////////////////////////////////////////////////////////////////////////////////////////////// - -namespace cutlass { -namespace reference { -namespace host { - -/////////////////////////////////////////////////////////////////////////////////////////////////// - -/// Returns true if two tensor views are equal. -template < - typename TensorL, - typename TensorR -> -bool TensorEquals( - TensorL lhs, - TensorR rhs) { - - // Extents must be identical - if (cute::size(lhs) != cute::size(rhs)) { - return false; - } - - for (int64_t idx = 0; idx < cute::size(lhs); ++idx) { - if (lhs(idx) != rhs(idx)) { - return false; - } - } - - return true; -} - -/// Returns true if two tensor views are NOT equal. -template < - typename TensorL, - typename TensorR -> -bool TensorNotEquals( - TensorL lhs, - TensorR rhs) { - - return TensorEquals(lhs, rhs); -} - -/////////////////////////////////////////////////////////////////////////////////////////////////// - -} // namespace host -} // namespace reference -} // namespace cutlass - -/////////////////////////////////////////////////////////////////////////////////////////////////// diff --git a/csrc/sparse/cutlass/example/util/reference/host/tensor_copy.h b/csrc/sparse/cutlass/example/util/reference/host/tensor_copy.h deleted file mode 100644 index 0b963b72e9152..0000000000000 --- a/csrc/sparse/cutlass/example/util/reference/host/tensor_copy.h +++ /dev/null @@ -1,256 +0,0 @@ -/*************************************************************************************************** - * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. - * SPDX-License-Identifier: BSD-3-Clause - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * - * 1. 
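The cute-based overloads in tensor_compare.hpp compare by flat linear index instead of by coordinate. Note that the removed TensorNotEquals there returns TensorEquals(lhs, rhs) without negating it, which looks like an oversight. A corrected stand-alone pair, with plain containers in place of cute tensors, would be:

#include <cstddef>
#include <cstdio>
#include <vector>

// Flat-index comparison in the spirit of the tensor_compare.hpp overloads.
template <typename TensorL, typename TensorR>
bool tensor_equals(const TensorL& lhs, const TensorR& rhs) {
  if (lhs.size() != rhs.size()) return false;
  for (std::size_t idx = 0; idx < lhs.size(); ++idx) {
    if (lhs[idx] != rhs[idx]) return false;
  }
  return true;
}

// The removed header returned tensor_equals(lhs, rhs) directly here, without the '!'.
template <typename TensorL, typename TensorR>
bool tensor_not_equals(const TensorL& lhs, const TensorR& rhs) {
  return !tensor_equals(lhs, rhs);
}

int main() {
  std::vector<int> a = {1, 2, 3};
  std::vector<int> b = {1, 2, 4};
  std::printf("equals: %d, not_equals: %d\n",
              tensor_equals(a, b), tensor_not_equals(a, b));
  return 0;
}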
Redistributions of source code must retain the above copyright notice, this - * list of conditions and the following disclaimer. - * - * 2. Redistributions in binary form must reproduce the above copyright notice, - * this list of conditions and the following disclaimer in the documentation - * and/or other materials provided with the distribution. - * - * 3. Neither the name of the copyright holder nor the names of its - * contributors may be used to endorse or promote products derived from - * this software without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" - * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE - * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE - * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL - * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR - * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER - * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, - * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - * - **************************************************************************************************/ -/* \file - \brief Defines host-side elementwise operations on TensorView. -*/ - -#pragma once - -// Standard Library includes -#include - -// Cutlass includes -#include "cutlass/cutlass.h" -#include "tensor_foreach.h" - -namespace cutlass { -namespace reference { -namespace host { - -/////////////////////////////////////////////////////////////////////////////////////////////////// - -namespace detail { - -/// Helper to convert between types -template < - typename DstElement, - typename SrcElement -> -struct TrivialConvert { - - TrivialConvert() { } - - DstElement operator()(SrcElement src) const { - return DstElement(src); - } -}; - -/// Helper to conditionally copy between tensor views. -template < - typename DstElement, - typename DstLayout, - typename SrcElement, - typename SrcLayout, - typename F -> -struct TensorCopyIf { - - using DstTensorView = TensorView; - using SrcTensorView = TensorView; - - // - // Data members - // - - DstTensorView dst; - SrcTensorView src; - F convert; - - // - // Methods - // - - TensorCopyIf() { } - - TensorCopyIf( - DstTensorView const &dst_, - SrcTensorView const &src_, - F const &convert_): dst(dst_), src(src_), convert(convert_) {} - - /// Copies based on destination and source bounds - void operator()(Coord const &coord) { - if (dst.contains(coord) && src.contains(coord)) { - dst.at(coord) = convert(src.at(coord)); - } - } -}; - -} // namespace detail - -/////////////////////////////////////////////////////////////////////////////////////////////////// - -/// Copies elements from one tensor view into another, satisfying bounds of each tensor. 
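TensorCopyIf above writes a coordinate only when both views contain it and routes every element through a conversion functor, which is what lets the TensorCopy overloads mix element types and mismatched extents. A reduced sketch with two row-major buffers of different shapes; it collapses the per-coordinate contains() checks into a minimum over the two extents, and the names are illustrative only:

#include <cstdio>
#include <vector>

// Copies src into dst wherever a coordinate is inside BOTH extents, converting
// element types along the way: the same rule TensorCopyIf applies per coordinate.
template <typename DstElement, typename SrcElement, typename ConvertOp>
void copy_if_in_bounds(std::vector<DstElement>& dst, int dst_rows, int dst_cols,
                       const std::vector<SrcElement>& src, int src_rows, int src_cols,
                       ConvertOp convert) {
  int rows = dst_rows < src_rows ? dst_rows : src_rows;
  int cols = dst_cols < src_cols ? dst_cols : src_cols;
  for (int r = 0; r < rows; ++r)
    for (int c = 0; c < cols; ++c)
      dst[r * dst_cols + c] = convert(src[r * src_cols + c]);
}

int main() {
  std::vector<float> src = {1.25f, 2.5f, 3.75f,
                            4.0f,  5.0f, 6.0f};      // 2 x 3
  std::vector<int> dst(2 * 2, -1);                   // 2 x 2, smaller extent

  // TrivialConvert-style cast from float to int.
  copy_if_in_bounds(dst, 2, 2, src, 2, 3, [](float x) { return static_cast<int>(x); });

  std::printf("%d %d / %d %d\n", dst[0], dst[1], dst[2], dst[3]);  // 1 2 / 4 5
  return 0;
}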
-template < - typename DstElement, /// Destination tensor's element type - typename DstLayout, /// Destination tensor's layout - typename SrcElement, /// Source tensor's element type - typename SrcLayout, /// Source tensor's layout - typename F /// Transformation functor -> -void TensorCopy( - TensorView dst, - TensorView src, - F const &transform) { - - using CopyIf = detail::TensorCopyIf< - DstElement, - DstLayout, - SrcElement, - SrcLayout, - F>; - - CopyIf copy_if(dst, src, transform); - - TensorForEach(dst.extent(), copy_if); -} - - -/////////////////////////////////////////////////////////////////////////////////////////////////// - -/// Copies elements from a TensorRef into a TensorView. Assumes source tensor has sufficient extent -/// to avoid out of bounds accesses. -template < - typename DstElement, /// Destination tensor's element type - typename DstLayout, /// Destination tensor's layout - typename SrcElement, /// Source tensor's element type - typename SrcLayout, /// Source tensor's layout - typename F /// Transformation functor -> -void TensorCopy( - TensorView dst, - TensorRef src, - F const &transform) { - - using CopyIf = detail::TensorCopyIf< - DstElement, - DstLayout, - SrcElement, - SrcLayout, - F>; - - TensorView src_view(src, dst.extent()); - - CopyIf copy_if(dst, src_view, transform); - - TensorForEach(dst.extent(), copy_if); -} - -/// Copies elements from a TensorRef into a TensorView. Assumes source tensor has sufficient extent -/// to avoid out of bounds accesses. -template < - typename DstElement, /// Destination tensor's element type - typename DstLayout, /// Destination tensor's layout - typename SrcElement, /// Source tensor's element type - typename SrcLayout, /// Source tensor's layout - typename F /// Transformation functor -> -void TensorCopy( - TensorRef dst, - TensorView src, - F const &transform) { - - using CopyIf = detail::TensorCopyIf< - DstElement, - DstLayout, - SrcElement, - SrcLayout, - F>; - - TensorView dst_view(dst, src.extent()); - - CopyIf copy_if(dst_view, src, transform); - - TensorForEach(src.extent(), copy_if); -} - -/////////////////////////////////////////////////////////////////////////////////////////////////// - -/// Copies elements from one tensor view into another, satisfying bounds of each tensor. Succeeds -/// if SrcElement can be converted to DstElement. -template < - typename DstElement, /// Destination tensor's element type - typename DstLayout, /// Destination tensor's layout - typename SrcElement, /// Source tensor's element type - typename SrcLayout /// Source tensor's layout -> -void TensorCopy( - TensorView dst, - TensorView src) { - - detail::TrivialConvert convert; - - TensorCopy(dst, src, convert); -} - -/////////////////////////////////////////////////////////////////////////////////////////////////// - -/// Copies elements from one tensor view into another, satisfying bounds of each tensor. Succeeds -/// if SrcElement can be converted to DstElement. 
-template < - typename DstElement, /// Destination tensor's element type - typename DstLayout, /// Destination tensor's layout - typename SrcElement, /// Source tensor's element type - typename SrcLayout, /// Source tensor's layout - typename F /// Transformation functor -> -void TensorCopy( - TensorView dst, - TensorRef src) { - - detail::TrivialConvert convert; - - TensorCopy(dst, src, convert); -} - -/////////////////////////////////////////////////////////////////////////////////////////////////// - -/// Copies elements from one tensor view into another, satisfying bounds of each tensor. Succeeds -/// if SrcElement can be converted to DstElement. -template < - typename DstElement, /// Destination tensor's element type - typename DstLayout, /// Destination tensor's layout - typename SrcElement, /// Source tensor's element type - typename SrcLayout /// Source tensor's layout -> -void TensorCopy( - TensorRef dst, - TensorView src) { - - detail::TrivialConvert convert; - - TensorCopy(dst, src, convert); -} - -/////////////////////////////////////////////////////////////////////////////////////////////////// - -} // namespace host -} // namespace reference -} // namespace cutlass diff --git a/csrc/sparse/cutlass/example/util/reference/host/tensor_elementwise.h b/csrc/sparse/cutlass/example/util/reference/host/tensor_elementwise.h deleted file mode 100644 index 42ce2183b6a24..0000000000000 --- a/csrc/sparse/cutlass/example/util/reference/host/tensor_elementwise.h +++ /dev/null @@ -1,341 +0,0 @@ -/*************************************************************************************************** - * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. - * SPDX-License-Identifier: BSD-3-Clause - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * - * 1. Redistributions of source code must retain the above copyright notice, this - * list of conditions and the following disclaimer. - * - * 2. Redistributions in binary form must reproduce the above copyright notice, - * this list of conditions and the following disclaimer in the documentation - * and/or other materials provided with the distribution. - * - * 3. Neither the name of the copyright holder nor the names of its - * contributors may be used to endorse or promote products derived from - * this software without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" - * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE - * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE - * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL - * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR - * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER - * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, - * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - * - **************************************************************************************************/ -/* \file - \brief Defines host-side elementwise operations on TensorView. 
-*/ - -#pragma once - -// Cutlass includes -#include "cutlass/cutlass.h" -#include "cutlass/functional.h" - -#include "tensor_foreach.h" - -namespace cutlass { -namespace reference { -namespace host { - -/////////////////////////////////////////////////////////////////////////////////////////////////// -/////////////////////////////////////////////////////////////////////////////////////////////////// - -namespace detail { - -/////////////////////////////////////////////////////////////////////////////////////////////////// - -/// Helper to apply a binary operator in place -template < - typename ElementA, - typename LayoutA, - typename ElementB, - typename LayoutB, - typename ElementD, - typename LayoutD, - typename BinaryFunc> -struct TensorFuncBinaryOp { - - // - // Data members - // - - /// View of left-hand-side tensor - TensorView view_d; - TensorRef view_a; - TensorRef view_b; - BinaryFunc func; - - // - // Methods - // - - /// Constructor - TensorFuncBinaryOp() { } - - /// Constructor - TensorFuncBinaryOp( - TensorView const & view_d_, - TensorRef const & view_a_, - TensorRef const & view_b_, - BinaryFunc func = BinaryFunc() - ): - view_d(view_d_), view_a(view_a_), view_b(view_b_), func(func) { } - - /// Equality check - void operator()(Coord const &coord) const { - view_d.at(coord) = func( - ElementD(view_a.at(coord)), - ElementD(view_b.at(coord)) - ); - } -}; - -} // namespace detail - -/////////////////////////////////////////////////////////////////////////////////////////////////// -/////////////////////////////////////////////////////////////////////////////////////////////////// - -/// Adds two tensors and stores in the destination tensor: d = a + b -template < - typename ElementD, - typename LayoutD, - typename ElementA, - typename LayoutA, - typename ElementB, - typename LayoutB -> -void TensorAdd( - TensorView d, ///< destination tensor view - TensorRef a, ///< A tensor reference - TensorRef b ///< B tensor reference -) { - - detail::TensorFuncBinaryOp< - ElementD, - LayoutD, - ElementA, - LayoutA, - ElementB, - LayoutB, - cutlass::plus - > func(d, a, b); - - TensorForEach( - d.extent(), - func); -} - -/// Adds a tensor in place: d = d .+ a -template < - typename ElementD, - typename LayoutD, - typename ElementA, - typename LayoutA -> -void TensorAdd( - TensorView d, ///< destination tensor view - TensorRef a ///< A tensor reference -) { - TensorAdd(d, d, a); -} - -/////////////////////////////////////////////////////////////////////////////////////////////////// - -/// Subtracts two tensors and stores in the destination tensor: d = a - b -template < - typename ElementD, - typename LayoutD, - typename ElementA, - typename LayoutA, - typename ElementB, - typename LayoutB -> -void TensorSub( - TensorView d, ///< destination tensor view - TensorRef a, ///< A tensor reference - TensorRef b ///< B tensor reference - ) { - - detail::TensorFuncBinaryOp< - ElementD, - LayoutD, - ElementA, - LayoutA, - ElementB, - LayoutB, - cutlass::minus - > func(d, a, b); - - TensorForEach( - d.extent(), - func); -} - -/// Subtracts two tensors in place: d = d .- a -template < - typename ElementD, - typename LayoutD, - typename ElementA, - typename LayoutA, - typename ElementB, - typename LayoutB -> -void TensorSub( - TensorView d, ///< destination tensor view - TensorRef a ///< A tensor reference - ) { - - TensorSub(d, d, a); -} - -/////////////////////////////////////////////////////////////////////////////////////////////////// - -/// Multiplies two tensors and stores in the destination 
tensor: d = a .* b -template < - typename ElementD, - typename LayoutD, - typename ElementA, - typename LayoutA, - typename ElementB, - typename LayoutB -> -void TensorMul( - TensorView d, ///< destination tensor view - TensorRef a, ///< A tensor reference - TensorRef b ///< B tensor reference -) { - - detail::TensorFuncBinaryOp< - ElementD, - LayoutD, - ElementA, - LayoutA, - ElementB, - LayoutB, - cutlass::multiplies - > func(d, a, b); - - TensorForEach( - d.extent(), - func); -} - -/// Multiplies tensors in place: d = d .* a -template < - typename ElementD, - typename LayoutD, - typename ElementA, - typename LayoutA -> -void TensorMul( - TensorView d, ///< destination tensor view - TensorRef a ///< A tensor reference -) { - TensorMul(d, d, a); -} - -/////////////////////////////////////////////////////////////////////////////////////////////////// - -/// Divides two tensors and stores in the destination tensor: d = a ./ b -template < - typename ElementD, - typename LayoutD, - typename ElementA, - typename LayoutA, - typename ElementB, - typename LayoutB -> -void TensorDiv( - TensorView d, ///< destination tensor view - TensorRef a, ///< A tensor reference - TensorRef b ///< B tensor reference -) { - - detail::TensorFuncBinaryOp< - ElementD, - LayoutD, - ElementA, - LayoutA, - ElementB, - LayoutB, - cutlass::divides - > func(d, a, b); - - TensorForEach( - d.extent(), - func); -} - -/// Divides tensors in place: d = d ./ a -template < - typename ElementD, - typename LayoutD, - typename ElementA, - typename LayoutA -> -void TensorDiv( - TensorView d, ///< destination tensor view - TensorRef a ///< A tensor reference -) { - TensorDiv(d, d, a); -} - - -/////////////////////////////////////////////////////////////////////////////////////////////////// - -/// Divides two tensors and stores in the destination tensor: d = a ./ b -template < - typename ElementD, - typename LayoutD, - typename ElementA, - typename LayoutA, - typename ElementB, - typename LayoutB -> -void TensorModulus( - TensorView d, ///< destination tensor view - TensorRef a, ///< A tensor reference - TensorRef b ///< B tensor reference -) { - - detail::TensorFuncBinaryOp< - ElementD, - LayoutD, - ElementA, - LayoutA, - ElementB, - LayoutB, - cutlass::divides - > func(d, a, b); - - TensorForEach( - d.extent(), - func); -} - -/// Divides tensors in place: d = d ./ a -template < - typename ElementD, - typename LayoutD, - typename ElementA, - typename LayoutA -> -void TensorModulus( - TensorView d, ///< destination tensor view - TensorRef a ///< A tensor reference -) { - TensorDiv(d, d, a); -} - -/////////////////////////////////////////////////////////////////////////////////////////////////// - -} // namespace host -} // namespace reference -} // namespace cutlass diff --git a/csrc/sparse/cutlass/example/util/reference/host/tensor_fill.h b/csrc/sparse/cutlass/example/util/reference/host/tensor_fill.h deleted file mode 100644 index b9f0c84d9a2a9..0000000000000 --- a/csrc/sparse/cutlass/example/util/reference/host/tensor_fill.h +++ /dev/null @@ -1,1718 +0,0 @@ -/*************************************************************************************************** - * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. - * SPDX-License-Identifier: BSD-3-Clause - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * - * 1. 
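Every helper in tensor_elementwise.h is the same TensorFuncBinaryOp functor instantiated with a different cutlass functional (plus, minus, multiplies, divides); only the binary operation changes. One detail worth flagging: both removed TensorModulus overloads still use cutlass::divides or forward to TensorDiv, so they compute a quotient rather than a remainder. A generic stand-alone version of the pattern, with std function objects standing in for the cutlass functionals:

#include <cstddef>
#include <cstdio>
#include <functional>
#include <vector>

// d[i] = func(a[i], b[i]) for every element: the role TensorFuncBinaryOp plays
// once TensorForEach walks the extent.
template <typename Element, typename BinaryFunc>
void tensor_binary_op(std::vector<Element>& d,
                      const std::vector<Element>& a,
                      const std::vector<Element>& b,
                      BinaryFunc func) {
  for (std::size_t i = 0; i < d.size(); ++i) d[i] = func(a[i], b[i]);
}

int main() {
  std::vector<int> a = {7, 8, 9};
  std::vector<int> b = {2, 3, 4};
  std::vector<int> d(3);

  tensor_binary_op(d, a, b, std::plus<int>());        // d = a + b
  std::printf("add: %d %d %d\n", d[0], d[1], d[2]);

  tensor_binary_op(d, a, b, std::modulus<int>());     // an actual elementwise modulus
  std::printf("mod: %d %d %d\n", d[0], d[1], d[2]);
  return 0;
}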
Redistributions of source code must retain the above copyright notice, this - * list of conditions and the following disclaimer. - * - * 2. Redistributions in binary form must reproduce the above copyright notice, - * this list of conditions and the following disclaimer in the documentation - * and/or other materials provided with the distribution. - * - * 3. Neither the name of the copyright holder nor the names of its - * contributors may be used to endorse or promote products derived from - * this software without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" - * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE - * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE - * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL - * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR - * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER - * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, - * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - * - **************************************************************************************************/ -/* \file - \brief Provides several functions for filling tensors with data. -*/ - -#pragma once - -// Standard Library includes -#include -#include -#include -#include -#include - -// Cutlass includes -#include "cutlass/cutlass.h" -#include "cutlass/complex.h" -#include "cutlass/quaternion.h" -#include "cutlass/array.h" -#include "cutlass/numeric_types.h" -#include "cutlass/subbyte_reference.h" -#include "cutlass/tensor_view.h" -#include "cutlass/tensor_view_planar_complex.h" -#include "cutlass/blas3.h" - -#include "cutlass/util/distribution.h" -#include "tensor_foreach.h" - -/////////////////////////////////////////////////////////////////////////////////////////////////// - -namespace cutlass { -namespace reference { -namespace host { - -/////////////////////////////////////////////////////////////////////////////////////////////////// -/////////////////////////////////////////////////////////////////////////////////////////////////// - -namespace detail { - -template < - typename Element, ///< Element type - typename Layout> ///< Layout function -struct TensorFillFunc { - - using TensorView = TensorView; - - // - // Data members - // - - TensorView view; - Element value; - - // - // Methods - // - - TensorFillFunc( - TensorView const &view_ = TensorView(), - Element value_ = Element(0) - ): view(view_), value(value_) { } - - void operator()(Coord const & coord) const { - view.at(coord) = value; - } -}; - -/// Returns a pair of values of the Gaussian distribution generated by the Box Muller method -struct BoxMullerFunc { - - BoxMullerFunc() {} - - void operator()( - double* rnd, ///< Size-2 vector to be filled with random values - double mean = 0, ///< Mean of the Gaussian distribution - double stddev = 1, ///< Standard deviation of the Gaussian distribution - double pi = std::acos(-1)) const { - - double u1 = double(std::rand()) / double(RAND_MAX); - double u2 = double(std::rand()) / double(RAND_MAX); - rnd[0] = std::sqrt(-2 * std::log(u1)) * std::cos(2 * pi * u2); - rnd[1] = std::sqrt(-2 * std::log(u1)) * std::sin(2 * pi * u2); - rnd[0] = mean + 
stddev * rnd[0]; - rnd[1] = mean + stddev * rnd[1]; - } -}; -} // namespace detail - -/////////////////////////////////////////////////////////////////////////////////////////////////// - -/// Fills a tensor with a uniform value -template < - typename Element, ///< Element type - typename Layout> ///< Layout function -void TensorFill( - TensorView dst, ///< destination tensor - Element val = Element(0)) { ///< value to uniformly fill it with - - detail::TensorFillFunc func(dst, val); - - TensorForEach( - dst.extent(), - func - ); -} - -/// Fills a tensor with a uniform value -template < - typename Element, ///< Element type - typename Layout> ///< Layout function -void TensorFill( - TensorViewPlanarComplex dst, ///< destination tensor - cutlass::complex val = cutlass::complex(0)) { ///< value to uniformly fill it with - - TensorFill(dst.view_real(), val.real()); - TensorFill(dst.view_imag(), val.imag()); -} - -/////////////////////////////////////////////////////////////////////////////////////////////////// -/////////////////////////////////////////////////////////////////////////////////////////////////// - -namespace detail { - -template -struct RandomGaussianFunc { - - uint64_t seed; - double mean; - double stddev; - int int_scale; - double pi; - double pnz; - bool exclude_zero; - - // - // Methods - // - RandomGaussianFunc( - uint64_t seed_ = 0, - double mean_ = 0, - double stddev_ = 1, - int int_scale_ = -1, - double pnz_ = 1.0, - bool exclude_zero_ = false - ): - seed(seed_), mean(mean_), stddev(stddev_), int_scale(int_scale_), pi(std::acos(-1)), pnz(pnz_), exclude_zero(exclude_zero_) { - std::srand((unsigned)seed); - } - - /// Compute random value and update RNG state - Element operator()() const { - - // Box-Muller transform to generate random numbers with Normal distribution - double u1 = double(std::rand()) / double(RAND_MAX); - double u2 = double(std::rand()) / double(RAND_MAX); - - // Compute Gaussian random value - double rnd = std::sqrt(-2 * std::log(u1)) * std::cos(2 * pi * u2); - rnd = mean + stddev * rnd; - - // Scale and convert final result - Element result; - - // Sample from the Bernoulli distribution, and use the result to sample from the Gaussian - std::random_device rnd_device; - std::mt19937 bernoulli_rnd(rnd_device()); - std::bernoulli_distribution bernoulli_dist(pnz); - bool bernoulli_result = bernoulli_dist(bernoulli_rnd); - - // Sample from the Gaussian distribution for a nonzero element - if (bernoulli_result) { - if (int_scale >= 0) { - rnd = double(std::llround(rnd * double(1 << int_scale))) / double(1 << int_scale); - result = static_cast(rnd); - } - else { - result = static_cast(rnd); - } - } - else { - result = static_cast(0); - } - - // Note that exclude_zero = true will disable the bernoulli_result above by unsetting zeros - if (exclude_zero && result == Element(0)) { - if (rnd > 0) { - rnd += 1; - } else { - rnd -= 1; - } - result = Element(rnd); - } - - return result; - } -}; - -/// Partial specialization for initializing a complex value. 
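BoxMullerFunc and the Gaussian fill functors above draw two uniform samples u1, u2 and map them to Gaussian values via sqrt(-2 ln u1) * cos(2 pi u2), with the matching sin term for the second value, then shift and scale by mean and stddev. A self-contained sketch of that transform; it swaps the std::rand calls for a seeded std::mt19937, which is an assumption rather than what the removed code used:

#include <cmath>
#include <cstdio>
#include <limits>
#include <random>

// Box-Muller: turns two independent U(0,1) samples into two independent
// N(mean, stddev^2) samples.
static void box_muller(std::mt19937& rng, double mean, double stddev, double out[2]) {
  const double pi = std::acos(-1.0);
  std::uniform_real_distribution<double> uniform(std::numeric_limits<double>::min(), 1.0);
  double u1 = uniform(rng);   // strictly positive so log(u1) stays finite
  double u2 = uniform(rng);
  double r = std::sqrt(-2.0 * std::log(u1));
  out[0] = mean + stddev * r * std::cos(2.0 * pi * u2);
  out[1] = mean + stddev * r * std::sin(2.0 * pi * u2);
}

int main() {
  std::mt19937 rng(2024);
  double sum = 0.0, sum_sq = 0.0;
  const int n = 100000;
  for (int i = 0; i < n; i += 2) {
    double pair[2];
    box_muller(rng, 0.0, 1.0, pair);
    sum += pair[0] + pair[1];
    sum_sq += pair[0] * pair[0] + pair[1] * pair[1];
  }
  double mean = sum / n;
  std::printf("mean ~ %f, variance ~ %f\n", mean, sum_sq / n - mean * mean);
  return 0;
}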
-template -struct RandomGaussianFunc > { - - uint64_t seed; - double mean; - double stddev; - int int_scale; - double pi; - double pnz; - bool exclude_zero; - - // - // Methods - // - RandomGaussianFunc( - uint64_t seed_ = 0, - double mean_ = 0, - double stddev_ = 1, - int int_scale_ = -1, - double pnz_ = 1.0, - bool exclude_zero_ = false - ): - seed(seed_), mean(mean_), stddev(stddev_), int_scale(int_scale_), pi(std::acos(-1)), pnz(pnz_), exclude_zero(exclude_zero_) { - std::srand((unsigned)seed); - } - - /// Compute random value and update RNG state - complex operator()() const { - - Element reals[2]; - - double rnd[2]; - detail::BoxMullerFunc func; - func(rnd, mean, stddev, pi); - - // Sample from the Bernoulli distribution, and use the result to sample from the Gaussian - std::random_device rnd_device; - std::mt19937 bernoulli_rnd(rnd_device()); - std::bernoulli_distribution bernoulli_dist(pnz); - bool bernoulli_result = bernoulli_dist(bernoulli_rnd); - - // Sample from the Gaussian distribution for a nonzero element - if (bernoulli_result) { - if (int_scale >= 0) { - rnd[0] = double(int(rnd[0] * double(1 << int_scale))); - rnd[1] = double(int(rnd[1] * double(1 << int_scale))); - reals[0] = from_real(rnd[0] / double(1 << int_scale)); - reals[1] = from_real(rnd[1] / double(1 << int_scale)); - } - else { - reals[0] = from_real(rnd[0]); - reals[1] = from_real(rnd[1]); - } - } - else { - reals[0] = from_real(0); - reals[1] = from_real(0); - } - - // Note that this will invalidate the above else statement because it unsets zero elements - if (exclude_zero && - reals[0] == from_real(0.0) && - reals[1] == from_real(0.0)) { - - if (rnd[0] > 0.0) { - rnd[0] += 1.0; - } else { - rnd[0] -= 1.0; - } - reals[0] = from_real(rnd[0]); - } - - return complex(reals[0], reals[1]); - } -}; - -/// Partial specialization for initializing a complex value. 
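Two knobs recur in all of the random fill functors: int_scale (the bits argument) snaps each sample onto a grid with 2^bits steps per unit, deliberately reducing the precision of the data so error checking can compare results exactly, and pnz draws a Bernoulli variable per element so only roughly that fraction of entries ends up nonzero. A sketch of both applied to a plain double sample:

#include <cmath>
#include <cstdio>
#include <random>

// Rounds x onto a grid with 2^bits steps per unit, the way int_scale snaps
// random values so reference comparisons can be exact.
static double quantize(double x, int bits) {
  if (bits < 0) return x;                       // negative means "leave untouched"
  double scale = double(1 << bits);
  return std::llround(x * scale) / scale;
}

int main() {
  std::mt19937 rng(7);
  std::normal_distribution<double> gauss(0.0, 1.0);
  std::bernoulli_distribution nonzero(0.25);    // pnz = 0.25: ~25% of entries nonzero

  for (int i = 0; i < 8; ++i) {
    double sample = nonzero(rng) ? quantize(gauss(rng), 2) : 0.0;
    std::printf("%g ", sample);                 // multiples of 0.25, mostly zeros
  }
  std::printf("\n");
  return 0;
}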
-template -struct RandomGaussianFunc > { - - uint64_t seed; - double mean; - double stddev; - int int_scale; - double pi; - double pnz; - bool exclude_zero; - - // - // Methods - // - RandomGaussianFunc( - uint64_t seed_ = 0, - double mean_ = 0, - double stddev_ = 1, - int int_scale_ = -1, - double pnz_ = 1.0, - bool exclude_zero_ = false - ): - seed(seed_), mean(mean_), stddev(stddev_), int_scale(int_scale_), pi(std::acos(-1)), pnz(pnz_), exclude_zero(exclude_zero_) { - std::srand((unsigned)seed); - } - - /// Compute random value and update RNG state - Quaternion operator()() const { - - Element reals[4]; - - double rnd1[2]; - double rnd2[2]; - detail::BoxMullerFunc func; - func(rnd1, mean, stddev, pi); - func(rnd2, mean, stddev, pi); - - // Sample from the Bernoulli distribution, and use the result to sample from the Gaussian - std::random_device rnd_device; - std::mt19937 bernoulli_rnd(rnd_device()); - std::bernoulli_distribution bernoulli_dist(pnz); - bool bernoulli_result = bernoulli_dist(bernoulli_rnd); - - // Sample from the Gaussian distribution for a nonzero element - if (bernoulli_result) { - if (int_scale >= 0) { - rnd1[0] = double(int(rnd1[0] * double(1 << int_scale))); - rnd1[1] = double(int(rnd1[1] * double(1 << int_scale))); - rnd2[0] = double(int(rnd2[0] * double(1 << int_scale))); - rnd2[1] = double(int(rnd2[1] * double(1 << int_scale))); - - reals[0] = from_real(rnd1[0] / double(1 << int_scale)); - reals[1] = from_real(rnd1[1] / double(1 << int_scale)); - reals[2] = from_real(rnd2[0] / double(1 << int_scale)); - reals[3] = from_real(rnd2[1] / double(1 << int_scale)); - } - else { - reals[0] = from_real(rnd1[0]); - reals[1] = from_real(rnd1[1]); - reals[2] = from_real(rnd2[0]); - reals[3] = from_real(rnd2[1]); - } - } - else { - reals[0] = from_real(0); - reals[1] = from_real(0); - reals[2] = from_real(0); - reals[3] = from_real(0); - } - - // Note that this will invalidate the above else statement because it unsets zero elements - if (exclude_zero && - reals[0] == from_real(0) && - reals[1] == from_real(0) && - reals[2] == from_real(0) && - reals[3] == from_real(0)) { - - if (rnd1[0] > 0.0) { - rnd1[0] += 1.0; - } else { - rnd1[0] -= 1.0; - } - reals[0] = from_real(rnd1[0]); - } - - return Quaternion(reals[0], reals[1], reals[2], reals[3]); - } -}; - -/// Computes a random Gaussian distribution -template < - typename Element, ///< Element type - typename Layout> ///< Layout function -struct TensorFillGaussianFunc { - - using TensorView = TensorView; - - // - // Data members - // - - TensorView view; - RandomGaussianFunc func; - - // - // Methods - // - - /// Construction of Gaussian RNG functor. - TensorFillGaussianFunc( - TensorView view_ = TensorView(), - RandomGaussianFunc func_ = RandomGaussianFunc() - ): - view(view_), func(func_) { - - } - - /// Compute random value and update RNG state - void operator()(Coord const &coord) const { - view.at(coord) = func(); - } -}; - -/// Computes a random Gaussian distribution for a rank-2 tensor -template < - typename Element, ///< Element type - typename Layout> ///< Layout function -struct TensorFillSymmetricGaussianFunc { - - using TensorView = TensorView; - - // - // Data members - // - - TensorView view; - RandomGaussianFunc func; - cutlass::FillMode fill_mode; - - // - // Methods - // - - /// Construction of Gaussian RNG functor. 
- TensorFillSymmetricGaussianFunc( - TensorView view_ = TensorView(), - RandomGaussianFunc func_ = RandomGaussianFunc(), - cutlass::FillMode fill_mode_ = cutlass::FillMode::kInvalid - ): - view(view_), func(func_), fill_mode(fill_mode_) { - - } - - /// Compute random value and update RNG state - void operator()(Coord const &coord) const { - // Fill half of matrix based on FillMode - if (Layout::kRank == 2 && - fill_mode == cutlass::FillMode::kLower && - coord[0] >= coord[1]) { - view.at(coord) = func(); - } else if (Layout::kRank == 2 && - fill_mode == cutlass::FillMode::kUpper && - coord[0] <= coord[1]) { - view.at(coord) = func(); - } - } -}; - -} // namespace detail - -/////////////////////////////////////////////////////////////////////////////////////////////////// - -/// Fills a tensor with random values with a Gaussian distribution. -template < - typename Element, ///< Element type - typename Layout> ///< Layout function -void TensorFillRandomGaussian( - TensorView dst, ///< destination tensor - uint64_t seed, ///< seed for RNG - double mean = 0, ///< Gaussian distribution's mean - double stddev = 1, ///< Gaussian distribution's standard deviation - int bits = -1, ///< If non-negative, specifies number of fractional bits that - double pnz = 1.0, /// are not truncated to zero. Permits reducing precision of - /// data. - bool exclude_zero = false) { ///< Exclude zeros from tensor init. - - detail::RandomGaussianFunc random_func(seed, mean, stddev, bits, pnz, exclude_zero); - - detail::TensorFillGaussianFunc func( - dst, - random_func - ); - - TensorForEach( - dst.extent(), - func - ); -} - -/// Fills a tensor with random values with a Gaussian distribution. -template < - typename Element, ///< Element type - typename Layout> ///< Layout function -void TensorFillRandomGaussian( - TensorViewPlanarComplex dst, ///< destination tensor - uint64_t seed, ///< seed for RNG - double mean = 0, ///< Gaussian distribution's mean - double stddev = 1, ///< Gaussian distribution's standard deviation - int bits = -1, ///< If non-negative, specifies number of fractional bits that - double pnz = 1.0, /// are not truncated to zero. Permits reducing precision of - /// data. - bool exclude_zero = false) { ///< Exclude zeros from tensor init. - - TensorFillRandomGaussian(dst.view_real(), seed, mean, stddev, bits, pnz); - TensorFillRandomGaussian(dst.view_imag(), ~seed, mean, stddev, bits, pnz); -} - -/////////////////////////////////////////////////////////////////////////////////////////////////// -/// Fills the upper or lower part of a symmetric rank-2 tensor with random values of a Gaussian distribution. -template < - typename Element, ///< Element type - typename Layout> ///< Layout function -void TensorFillSymmetricRandomGaussian( - TensorView dst, ///< destination tensor - uint64_t seed, ///< seed for RNG - cutlass::FillMode fill_mode, ///< FillMode for symmetric matrices - double mean = 0, ///< Gaussian distribution's mean - double stddev = 1, ///< Gaussian distribution's standard deviation - int bits = -1, ///< If non-negative, specifies number of fractional bits that - double pnz = 1.0) { /// are not truncated to zero. Permits reducing precision of - /// data. 
- - detail::RandomGaussianFunc random_func(seed, mean, stddev, bits, pnz); - - detail::TensorFillSymmetricGaussianFunc func( - dst, - random_func, - fill_mode - ); - - TensorForEach( - dst.extent(), - func - ); -} - -/////////////////////////////////////////////////////////////////////////////////////////////////// - -/// Fills a tensor with random values of a Gaussian distribution. -template < - typename Element ///< Element type -> -void BlockFillRandomGaussian( - Element *ptr, ///< destination buffer - size_t capacity, ///< number of elements - uint64_t seed, ///< seed for RNG - double mean = 0, ///< Gaussian distribution's mean - double stddev = 1, ///< Gaussian distribution's standard deviation - int bits = -1, ///< If non-negative, specifies number of fractional bits that - double pnz = 1.0) { /// are not truncated to zero. Permits reducing precision of - /// data. - - - detail::RandomGaussianFunc random_func(seed, mean, stddev, bits, pnz); - - for (size_t i = 0; i < capacity; ++i) { - ReferenceFactory::get(ptr, i) = random_func(); - } -} - -/////////////////////////////////////////////////////////////////////////////////////////////////// - -/////////////////////////////////////////////////////////////////////////////////////////////////// - -namespace detail { - -template -struct RandomUniformFunc { - - using Real = typename RealType::Type; - - uint64_t seed; - double range; - double min; - int int_scale; - - double pnan; -private: - using engine_type = std::mt19937; -public: - engine_type bernoulli_rnd; - std::bernoulli_distribution bernoulli_dist; - - bool exclude_zero; - - RandomUniformFunc( - uint64_t seed_ = 0, - double max = 1, - double min_ = 0, - int int_scale_ = -1, - double pnan_ = 0, - bool exclude_zero_ = false - ): - seed(seed_), range(max - min_), min(min_), int_scale(int_scale_), pnan(pnan_) - , bernoulli_rnd{static_cast(seed_)} - , bernoulli_dist(pnan_) - , exclude_zero(exclude_zero_) - { - std::srand((unsigned)seed); - - // Handle cases where min = 0 or max = 0 for excluding zeros - if (exclude_zero) { - min = (min == 0.0) ? min + 1: min; - range = (max == 0.0) ? range - 1: range; - } - } - - - /// Compute random value and update RNG state - Element operator()() { - - // Sample from NaN distribution. - if constexpr (std::numeric_limits::has_quiet_NaN) { - if (pnan > 0 && bernoulli_dist(bernoulli_rnd)) { - return Element(NAN); - } - } - - double rnd = double(std::rand()) / double(RAND_MAX); - - rnd = min + range * rnd; - - // Random values are cast to integer after scaling by a power of two to facilitate error - // testing - Element result; - if (int_scale >= 0) { - rnd = double(std::llround(rnd * double(1 << int_scale))) / double(1 << int_scale); - result = static_cast(Real(rnd)); - } - else { - result = static_cast(Real(rnd)); - } - - if (exclude_zero && result == Element(0)) { - if (rnd > 0.0) { - rnd = std::min(min + range, rnd + 1.0); - } else { - rnd = std::max(min, rnd - 1.0); - } - result = static_cast(Real(rnd)); - } - - return result; - } -}; - -/// Partial specialization for initializing a complex value. 
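RandomUniformFunc layers one more option on top of the range and bits handling: a pnan probability that, for element types with a quiet NaN, replaces the sample with NaN so NaN-propagation paths get exercised. A reduced version of that sampling step, again using <random> in place of std::rand:

#include <cmath>
#include <cstdio>
#include <limits>
#include <random>

// Uniform sample in [min, max) that becomes NaN with probability pnan, the same
// idea RandomUniformFunc uses to exercise NaN handling in tests.
template <typename Element>
Element random_uniform_or_nan(std::mt19937& rng, double min, double max, double pnan) {
  if (std::numeric_limits<Element>::has_quiet_NaN) {
    std::bernoulli_distribution make_nan(pnan);
    if (pnan > 0 && make_nan(rng)) {
      return std::numeric_limits<Element>::quiet_NaN();
    }
  }
  std::uniform_real_distribution<double> dist(min, max);
  return static_cast<Element>(dist(rng));
}

int main() {
  std::mt19937 rng(42);
  int nan_count = 0;
  for (int i = 0; i < 1000; ++i) {
    float x = random_uniform_or_nan<float>(rng, -1.0, 1.0, 0.1);
    if (std::isnan(x)) ++nan_count;
  }
  std::printf("NaN elements: %d of 1000 (expect ~100)\n", nan_count);
  return 0;
}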
-template -struct RandomUniformFunc > { - - using Real = typename RealType::Type; - - uint64_t seed; - double range; - double min; - int int_scale; - - double pnan; -private: - using engine_type = std::mt19937; -public: - engine_type bernoulli_rnd; - std::bernoulli_distribution bernoulli_dist; - - bool exclude_zero; - - // - // Methods - // - - RandomUniformFunc( - uint64_t seed_ = 0, - double max = 1, - double min_ = 0, - int int_scale_ = -1, - double pnan_ = 0, - bool exclude_zero_ = false - ): - seed(seed_), range(max - min_), min(min_), int_scale(int_scale_), pnan(pnan_) - , bernoulli_rnd{static_cast(seed_)} - , bernoulli_dist(pnan_) - , exclude_zero(exclude_zero_) { - std::srand((unsigned)seed); - - // Handle cases where min = 0 or max = 0 for excluding zeros - if (exclude_zero) { - min = (min == 0.0) ? min + 1: min; - range = (max == 0.0) ? range - 1: range; - } - } - - - /// Compute random value and update RNG state - complex operator()() { - - // Sample from NaN distribution. - if constexpr (std::numeric_limits::has_quiet_NaN) { - if (pnan > 0 && bernoulli_dist(bernoulli_rnd)) { - return Element(NAN); - } - } - - Element reals[2]; - - for (int i = 0; i < 2; ++i) { - double rnd = double(std::rand()) / double(RAND_MAX); - - rnd = min + range * rnd; - - // Random values are cast to integer after scaling by a power of two to facilitate error - // testing - - if (int_scale >= 0) { - rnd = double(int(rnd * double(1 << int_scale))); - reals[i] = from_real(Real(rnd / double(1 << int_scale))); - } - else { - reals[i] = from_real(Real(rnd)); - } - - if (exclude_zero && - i == 0 && - reals[0] == from_real(0.0)) { - - if (rnd > 0.0) { - rnd = std::min(min + range, rnd + 1.0); - } else { - rnd = std::max(min, rnd - 1.0); - } - reals[0] = from_real(Real(rnd)); - } - - } - - return complex(reals[0], reals[1]); - } -}; - -/// Partial specialization for initializing a Quaternion value. -template -struct RandomUniformFunc > { - - using Real = typename RealType::Type; - - uint64_t seed; - double range; - double min; - int int_scale; - - double pnan; -private: - using engine_type = std::mt19937; -public: - engine_type bernoulli_rnd; - std::bernoulli_distribution bernoulli_dist; - - // - // Methods - // - - RandomUniformFunc( - uint64_t seed_ = 0, - double max = 1, - double min_ = 0, - int int_scale_ = -1, - double pnan_ = 0 - ): - seed(seed_), range(max - min_), min(min_), int_scale(int_scale_), pnan(pnan_), - bernoulli_rnd{static_cast(seed_)}, - bernoulli_dist(pnan_) - { - std::srand((unsigned)seed); - } - - - /// Compute random value and update RNG state - Quaternion operator()() { - - // Sample from NaN distribution. 
- if constexpr (std::numeric_limits::has_quiet_NaN) { - if (pnan > 0 && bernoulli_dist(bernoulli_rnd)) { - return Element(NAN); - } - } - - Element reals[4]; - - for (int i = 0; i < 4; ++i) { - double rnd = double(std::rand()) / double(RAND_MAX); - - rnd = min + range * rnd; - - // Random values are cast to integer after scaling by a power of two to facilitate error - // testing - - if (int_scale >= 0) { - rnd = double(int(rnd * double(1 << int_scale))); - reals[i] = from_real(Real(rnd / double(1 << int_scale))); - } - else { - reals[i] = from_real(Real(rnd)); - } - } - - return make_Quaternion(reals[0], reals[1], reals[2], reals[3]); - } -}; - -/// Computes a random uniform distribution -template < - typename Element, ///< Element type - typename Layout> ///< Layout function -struct TensorFillRandomUniformFunc { - - using TensorView = TensorView; - - // - // Data members - // - - TensorView view; - RandomUniformFunc func; - - // - // Methods - // - - /// Construction of uniform RNG functor. - TensorFillRandomUniformFunc( - TensorView view_ = TensorView(), - RandomUniformFunc func_ = RandomUniformFunc() - ): - view(view_), func(func_) { - - } - - /// Compute random value and update RNG state - void operator()(Coord const &coord) { - - view.at(coord) = func(); - } -}; - -/// Fills the upper or lower part of a symmetric rank-2 tensor with random values of a uniform distribution. -template < - typename Element, ///< Element type - typename Layout> ///< Layout function -struct TensorFillSymmetricRandomUniformFunc { - - using TensorView = TensorView; - - // - // Data members - // - - TensorView view; - RandomUniformFunc func; - cutlass::FillMode fill_mode; - - // - // Methods - // - - /// Construction of uniform RNG functor. - TensorFillSymmetricRandomUniformFunc( - TensorView view_ = TensorView(), - RandomUniformFunc func_ = RandomUniformFunc(), - cutlass::FillMode fill_mode_ = cutlass::FillMode::kInvalid - ): - view(view_), func(func_), fill_mode(fill_mode_) { - - } - - /// Compute random value and update RNG state - void operator()(Coord const &coord) { - // Fill half of matrix based on FillMode - if (Layout::kRank == 2 && - fill_mode == cutlass::FillMode::kLower && - coord[0] >= coord[1]) { - view.at(coord) = func(); - } else if (Layout::kRank == 2 && - fill_mode == cutlass::FillMode::kUpper && - coord[0] <= coord[1]) { - view.at(coord) = func(); - } - } -}; - -/// Computes a random Uniform distribution and pads diagonal with zeros -template < - typename Element, ///< Element type - typename Layout> ///< Layout function -struct TensorFillPadDiagonalRandomUniformFunc { - - using TensorView = TensorView; - - // - // Data members - // - - TensorView view; - RandomUniformFunc func; - cutlass::FillMode fill_mode; - int alignment; - - // - // Methods - // - - /// Construction of uniform RNG functor. 
- TensorFillPadDiagonalRandomUniformFunc( - TensorView view_ = TensorView(), - RandomUniformFunc func_ = RandomUniformFunc(), - cutlass::FillMode fill_mode_ = cutlass::FillMode::kInvalid, - int alignment_ = 1 - ): - view(view_), func(func_), fill_mode(fill_mode_), alignment(alignment_) { - - } - - /// Compute random value and update RNG state - void operator()(Coord const &coord) { - // Fill half of matrix based on FillMode - if (Layout::kRank == 2 && - (fill_mode == cutlass::FillMode::kLower) && - (coord[0] >= coord[1]) || - ((coord[1] - coord[0]) >= alignment)) { - view.at(coord) = func(); - } else if (Layout::kRank == 2 && - fill_mode == cutlass::FillMode::kUpper && - (coord[0] <= coord[1]) || - ((coord[0] - coord[1]) >= alignment)) { - view.at(coord) = func(); - } - } -}; - -} // namespace detail - -/////////////////////////////////////////////////////////////////////////////////////////////////// - -/// Fills a tensor with random values of a uniform random distribution. -template < - typename Element, ///< Element type - typename Layout> ///< Layout function -void TensorFillRandomUniform( - TensorView dst, ///< destination tensor - uint64_t seed, ///< seed for RNG - double max = 1, ///< upper bound of distribution - double min = 0, ///< lower bound for distribution - int bits = -1, ///< If non-negative, specifies number of fractional bits that - /// are not truncated to zero. Permits reducing precision of - /// data. - double pnan = 0, ///< Percentage of NaN elements. - bool exclude_zero = false) { ///< Exclude zero from tensor init - detail::RandomUniformFunc random_func(seed, max, min, bits, pnan, exclude_zero); - - detail::TensorFillRandomUniformFunc func( - dst, - random_func - ); - - TensorForEach( - dst.extent(), - func - ); -} - -/// Fills a tensor with random values of a uniform random distribution. -template < - typename Element, ///< Element type - typename Layout> ///< Layout function -void TensorFillRandomUniform( - TensorViewPlanarComplex dst, ///< destination tensor - uint64_t seed, ///< seed for RNG - double max = 1, ///< upper bound of distribution - double min = 0, ///< lower bound for distribution - int bits = -1, ///< If non-negative, specifies number of fractional bits that - /// are not truncated to zero. Permits reducing precision of - /// data. - double pnan = 0, ///< Percentage of NaN elements. - bool exclude_zero = false) { ///< Exclude zero from tensor init - - TensorFillRandomUniform(dst.view_real(), seed, max, min, bits, pnan, exclude_zero); - TensorFillRandomUniform(dst.view_imag(), ~seed, max, min, bits, pnan, exclude_zero); -} - - -/// Fills a tensor with random values with a uniform random distribution. -template < - typename Element, ///< Element type - typename Layout> ///< Layout function -void TensorFillRandomUniform( - TensorView, Layout> dst, ///< destination tensor - uint64_t seed, ///< seed for RNG - double max = 1, ///< upper bound of distribution - double min = 0, ///< lower bound for distribution - int bits = -1) { ///< If non-negative, specifies number of fractional bits that - /// are not truncated to zero. Permits reducing precision of - /// data. - detail::RandomUniformFunc> random_func(seed, max, min, bits); - - detail::TensorFillRandomUniformFunc, Layout> func( - dst, - random_func - ); - - TensorForEach( - dst.extent(), - func - ); -} - -/////////////////////////////////////////////////////////////////////////////////////////////////// - -/// Fills a tensor with random values with a uniform random distribution. 
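The symmetric fill functors gate each write on the FillMode: with kLower only coordinates where row >= column are written, with kUpper only row <= column, so the other triangle keeps its initial contents. (The related TensorFillPadDiagonalRandomUniformFunc condition above relies on && binding tighter than ||, so its alignment clause is not guarded by the FillMode test; worth double-checking if this code is ever restored.) A small stand-alone triangular fill, with a local FillMode enum standing in for cutlass::FillMode:

#include <cstdio>
#include <vector>

enum class FillMode { kLower, kUpper };

// Writes value only into the selected triangle (diagonal included), the way the
// symmetric fill functors gate writes on FillMode.
static void fill_triangle(std::vector<int>& m, int n, FillMode mode, int value) {
  for (int row = 0; row < n; ++row) {
    for (int col = 0; col < n; ++col) {
      bool in_lower = row >= col;
      bool in_upper = row <= col;
      if ((mode == FillMode::kLower && in_lower) ||
          (mode == FillMode::kUpper && in_upper)) {
        m[row * n + col] = value;
      }
    }
  }
}

int main() {
  int n = 4;
  std::vector<int> m(n * n, 0);
  fill_triangle(m, n, FillMode::kLower, 7);
  for (int r = 0; r < n; ++r) {
    for (int c = 0; c < n; ++c) std::printf("%d ", m[r * n + c]);
    std::printf("\n");
  }
  return 0;
}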
-template < - typename Element, ///< Element type - typename Layout> ///< Layout function -void TensorFillSymmetricRandomUniform( - TensorView dst, ///< destination tensor - uint64_t seed, ///< seed for RNG - cutlass::FillMode fill_mode, ///< FillMode for symmetric matrices - double max = 1, ///< upper bound of distribution - double min = 0, ///< lower bound for distribution - int bits = -1) { ///< If non-negative, specifies number of fractional bits that - /// are not truncated to zero. Permits reducing precision of - /// data. - - detail::RandomUniformFunc random_func(seed, max, min, bits); - - detail::TensorFillSymmetricRandomUniformFunc func( - dst, - random_func, - fill_mode - ); - - TensorForEach( - dst.extent(), - func - ); -} - -/// Fills a tensor with random values with a uniform random distribution pads zeros along diagonal -template < - typename Element, ///< Element type - typename Layout> ///< Layout function -void TensorFillPadDiagonalRandomUniform( - TensorView dst, ///< destination tensor - uint64_t seed, ///< seed for RNG - cutlass::FillMode fill_mode, ///< FillMode for symmetric matrices - double max = 1, ///< upper bound of distribution - double min = 0, ///< lower bound for distribution - int bits = -1, ///< If non-negative, specifies number of fractional bits that - /// are not truncated to zero. Permits reducing precision of - /// data. - int alignment = 1 -) { - - detail::RandomUniformFunc random_func(seed, max, min, bits); - - detail::TensorFillPadDiagonalRandomUniformFunc func( - dst, - random_func, - fill_mode, - alignment - ); - - TensorForEach( - dst.extent(), - func - ); -} -/////////////////////////////////////////////////////////////////////////////////////////////////// - -/////////////////////////////////////////////////////////////////////////////////////////////////// - -/// Fills a tensor with a uniform value -template < - typename Element ///< Element type -> -void BlockFill( - Element *ptr, - size_t capacity, - Element val - ) { - for (size_t i = 0; i < capacity; ++i) { - ReferenceFactory::get(ptr, i) = val; - } -} - -/// Fills a tensor with random values with a uniform random distribution. -template < - typename Element ///< Element type -> -void BlockFillRandomUniform( - Element *ptr, - size_t capacity, - uint64_t seed, ///< seed for RNG - double max = 1, ///< upper bound of distribution - double min = 0, ///< lower bound for distribution - int bits = -1, ///< If non-negative, specifies number of fractional bits that - /// are not truncated to zero. Permits reducing precision of - /// data. - double pnan = 0) { ///< Percentage of NaN elements. 
- detail::RandomUniformFunc random_func(seed, max, min, bits, pnan); - - for (size_t i = 0; i < capacity; ++i) { - ReferenceFactory::get(ptr, i) = random_func(); - } -} - -/////////////////////////////////////////////////////////////////////////////////////////////////// - -namespace detail { - -template < - typename Element, ///< Element type - typename Layout> ///< Layout function -struct TensorFillDiagonalFunc { - - using TensorView = TensorView; - - // - // Data members - // - - TensorView view; - Element diag; - Element other; - - // - // Methods - // - - TensorFillDiagonalFunc( - TensorView const &view_ = TensorView(), - Element diag_ = Element(1), - Element other_ = Element(0) - ): - view(view_), diag(diag_), other(other_) { } - - void operator()(Coord const & coord) const { - bool is_diag = true; - - CUTLASS_PRAGMA_UNROLL - for (int i = 1; i < Layout::kRank; ++i) { - if (coord[i] != coord[i - 1]) { - is_diag = false; - break; - } - } - - view.at(coord) = (is_diag ? diag : other); - } -}; - -} // namespace detail - -/////////////////////////////////////////////////////////////////////////////////////////////////// - -/// Fills a tensor everywhere with a unique value for its diagonal. -template < - typename Element, ///< Element type - typename Layout> ///< Layout function -void TensorFillDiagonal( - TensorView dst, ///< destination tensor - Element diag = Element(1), ///< value to write in the diagonal - Element other = Element(0)) { ///< value to write off the diagonal - - detail::TensorFillDiagonalFunc func( - dst, - diag, - other - ); - - TensorForEach( - dst.extent(), - func - ); -} - -/////////////////////////////////////////////////////////////////////////////////////////////////// -/////////////////////////////////////////////////////////////////////////////////////////////////// - -/// Helper to fill a tensor's diagonal with 1 and 0 everywhere else. -template < - typename Element, ///< Element type - typename Layout> ///< Layout function -void TensorFillIdentity( - TensorView dst) { ///< destination tensor - - TensorFillDiagonal(dst, Element(1), Element(0)); -} - -/////////////////////////////////////////////////////////////////////////////////////////////////// -/////////////////////////////////////////////////////////////////////////////////////////////////// - -/// Writes a uniform value to the diagonal of a tensor without modifying off-diagonal elements. 
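The diagonal fills above visit every coordinate and test whether all of its indices agree before writing diag or other; TensorFillIdentity is just the diag = 1, other = 0 case. A standalone rank-2 sketch of the same predicate, using a hypothetical fill_diagonal helper rather than the CUTLASS types:

#include <cstddef>
#include <vector>

// Row-major M x N matrix; data.size() must be at least M * N.
// Writes `diag` where row == col and `other` elsewhere; diag = 1, other = 0
// reproduces the identity fill.
template <typename Element>
void fill_diagonal(std::vector<Element> &data, int M, int N,
                   Element diag, Element other) {
  for (int r = 0; r < M; ++r)
    for (int c = 0; c < N; ++c)
      data[std::size_t(r) * N + c] = (r == c) ? diag : other;
}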
-template < - typename Element, ///< Element type - typename Layout> ///< Layout function -void TensorUpdateDiagonal( - TensorView dst, ///< destination tensor - Element val = Element(1)) { - - typename Layout::Index extent = dst.extent().min(); - - for (typename Layout::Index i = 0; i < extent; ++i) { - Coord coord(i); - dst.at(coord) = val; - } -} - -/////////////////////////////////////////////////////////////////////////////////////////////////// -/////////////////////////////////////////////////////////////////////////////////////////////////// - -namespace detail { - -template < - typename Element, ///< Element type - typename Layout> ///< Layout function -struct TensorUpdateOffDiagonalFunc { - - using TensorView = TensorView; - - // - // Data members - // - - TensorView view; - Element other; - - // - // Methods - // - - TensorUpdateOffDiagonalFunc( - TensorView const &view_ = TensorView(), - Element other_ = Element(0) - ): - view(view_), other(other_) { } - - void operator()(Coord const & coord) const { - bool is_diag = true; - - CUTLASS_PRAGMA_UNROLL - for (int i = 1; i < Layout::kRank; ++i) { - if (coord[i] != coord[i - 1]) { - is_diag = false; - break; - } - } - - if (!is_diag) { - view.at(coord) = other; - } - } -}; - -} // namespace detail - -/////////////////////////////////////////////////////////////////////////////////////////////////// - -/// Writes a uniform value to all elements in the tensor without modifying diagonal elements. -template < - typename Element, ///< Element type - typename Layout> ///< Layout function -void TensorUpdateOffDiagonal( - TensorView dst, ///< destination tensor - Element other = Element(1)) { - - detail::TensorUpdateOffDiagonalFunc func( - dst, - other - ); - - TensorForEach( - dst.extent(), - func - ); -} - - -/////////////////////////////////////////////////////////////////////////////////////////////////// -/////////////////////////////////////////////////////////////////////////////////////////////////// - -namespace detail { - -template < - typename Element, ///< Element type - typename Layout> ///< Layout function -struct TensorFillLinearFunc { - - using TensorView = TensorView; - - // - // Data members - // - - TensorView view; - Array v; - Element s; - - // - // Methods - // - - TensorFillLinearFunc() { } - - /// Constructs functor - TensorFillLinearFunc( - TensorView const &view_, - Array const & v_, - Element s_ = Element(0) - ): - view(view_), v(v_), s(s_) { } - - /// Updates the tensor - void operator()(Coord const & coord) const { - - Element sum(s); - - CUTLASS_PRAGMA_UNROLL - for (int i = 0; i < Layout::kRank; ++i) { - sum += Element(coord[i]) * v[i]; - } - - view.at(coord) = sum; - } -}; - -} // namespace detail - -/////////////////////////////////////////////////////////////////////////////////////////////////// - -/// Fills tensor with a linear combination of its coordinate and another vector -template < - typename Element, ///< Element type - typename Layout> ///< Layout function -void TensorFillLinear( - TensorView dst, ///< destination tensor - Array const & v, - Element s = Element(0)) { - - detail::TensorFillLinearFunc func( - dst, - v, - s - ); - - TensorForEach( - dst.extent(), - func - ); -} - -/////////////////////////////////////////////////////////////////////////////////////////////////// - -/// Fills tensor with a linear combination of its coordinate and another vector -template < - typename Element, ///< Element type - typename Layout> ///< Layout function -void TensorFillSequential( - TensorView dst, ///< 
destination tensor - Element s = Element(0)) { - - Array stride; - - stride[0] = Element(1); - - CUTLASS_PRAGMA_UNROLL - for (int i = 1; i < Layout::kRank; ++i) { - stride[i] = stride[i - 1] * Element(dst.extent()[i - 1]); - } - - TensorFillLinear(dst, stride, s); -} - -/////////////////////////////////////////////////////////////////////////////////////////////////// -/////////////////////////////////////////////////////////////////////////////////////////////////// - -/// Fills a tensor with random values from a distribution. -template < - typename Element, ///< Element type - typename Layout> ///< Layout function -void TensorFillRandom( - TensorView view, ///< destination tensor - uint64_t seed, - Distribution dist, - bool exclude_zero = false ///< If true, excludes 0. - /// Note that setting this flag will result in more 1's, - /// as we use a simple mechanism to replace 0's by adding/subtracting 1's. -) { - - using Real = typename RealType::Type; - - if (dist.kind == Distribution::Gaussian) { - TensorFillRandomGaussian( - view, - seed, - dist.gaussian.mean, - dist.gaussian.stddev, - dist.int_scale, - dist.gaussian.pnz, - exclude_zero); - } else if (dist.kind == Distribution::Uniform) { - TensorFillRandomUniform( - view, - seed, - dist.uniform.max, - dist.uniform.min, - dist.int_scale, - dist.uniform.pnan, - exclude_zero); - } -} - -/////////////////////////////////////////////////////////////////////////////////////////////////// -/////////////////////////////////////////////////////////////////////////////////////////////////// - -/// Fills a block of data with sequential elements -template < - typename Element -> -void BlockFillSequential( - Element *ptr, - int64_t capacity, - Element v = Element(1), - Element s = Element(0)) { - int i = 0; - - while (i < capacity) { - cutlass::ReferenceFactory::value < - 8)>::get(ptr, i) = s; - - s = Element(s + v); - ++i; - } -} - -/// Fills a block of data with sequential elements -template < - typename Element -> -void BlockFillSequentialModN( - Element *ptr, - int64_t capacity, - int64_t mod, - int64_t v = int64_t(1), - int64_t s = int64_t(0)) { - int i = 0; - - while (i < capacity) { - cutlass::ReferenceFactory::value < - 8)>::get(ptr, i) = Element(s); - - s = int64_t(s + v) % mod; - ++i; - } -} - -/////////////////////////////////////////////////////////////////////////////////////////////////// -/////////////////////////////////////////////////////////////////////////////////////////////////// - -/// Fills a block of data with sequential elements -template < - typename Element -> -void BlockFillRandom( - Element *ptr, - size_t capacity, - uint64_t seed, - Distribution dist) { - - if (dist.kind == Distribution::Gaussian) { - BlockFillRandomGaussian( - ptr, - capacity, - seed, - dist.gaussian.mean, - dist.gaussian.stddev, - dist.int_scale, - dist.gaussian.pnz); - } - else if (dist.kind == Distribution::Uniform) { - BlockFillRandomUniform( - ptr, - capacity, - seed, - dist.uniform.max, - dist.uniform.min, - dist.int_scale, - dist.uniform.pnan); - } -} - -/////////////////////////////////////////////////////////////////////////////////////////////////// -/////////////////////////////////////////////////////////////////////////////////////////////////// - -namespace detail { - -template -struct RandomSparseMetaFunc { - - uint64_t seed; - int range; - int MetaSizeInBits; - - // - // Methods - // - - RandomSparseMetaFunc( - uint64_t seed_ = 0, - int MetaSizeInBits_ = 2 - ): - seed(seed_), MetaSizeInBits(MetaSizeInBits_) { - 
std::srand((unsigned)seed); - if (MetaSizeInBits_ == 2) { - range = 6; - } - else if (MetaSizeInBits_ == 4) { - range = 2; - } - else { - throw std::invalid_argument("Invalid MetaSizeInBits"); - } - } - - /// Compute random value and update RNG state - Element operator()() const { - Element FourToTwoMeta[6] = {0x4, 0x8, 0x9, 0xc, 0xd, 0xe}; - Element TwoToOneMeta[2] = {0x4, 0xe}; - - Element * MetaArray = (MetaSizeInBits == 2) ? FourToTwoMeta : TwoToOneMeta; - - Element result = 0x0; - - for (int i = 0; i < cutlass::sizeof_bits::value / 4; ++i) { - int rnd = std::rand() % range; - Element meta = MetaArray[rnd]; - - result = (Element)(result | ((Element)(meta << (i * 4)))); - } - - return result; - } -}; - -/// Computes a random sparse meta -template < - typename Element, ///< Element type - typename Layout> ///< Layout function -struct TensorFillRandomSparseMetaFunc { - - using TensorView = TensorView; - - // - // Data members - // - - TensorView view; - RandomSparseMetaFunc func; - - // - // Methods - // - - /// Construction of Gaussian RNG functor. - TensorFillRandomSparseMetaFunc( - TensorView view_ = TensorView(), - RandomSparseMetaFunc func_ = RandomSparseMetaFunc() - ): - view(view_), func(func_) { - - } - - /// Compute random value and update RNG state - void operator()(Coord const &coord) const { - - view.at(coord) = func(); - } -}; - -} // namespace detail - -/////////////////////////////////////////////////////////////////////////////////////////////////// - -/// Fills a tensor with random values with a uniform random distribution. -template < - typename Element, ///< Element type - typename Layout> ///< Layout function -void TensorFillRandomSparseMeta( - TensorView dst, ///< destination tensor - uint64_t seed, ///< seed for RNG - int MetaSizeInBits) { ///< 2 bit or 4 bit - - detail::RandomSparseMetaFunc random_func(seed, MetaSizeInBits); - - detail::TensorFillRandomSparseMetaFunc func( - dst, - random_func - ); - - TensorForEach( - dst.extent(), - func - ); -} - -/////////////////////////////////////////////////////////////////////////////////////////////////// - -/// Fills a tensor with random values with a uniform random distribution. -template < - typename Element ///< Element type -> -void BlockFillRandomSparseMeta( - Element *ptr, - size_t capacity, - uint64_t seed, ///< seed for RNG - int MetaSizeInBits) { ///< 2 bit or 4bit - - detail::RandomSparseMetaFunc random_func(seed, MetaSizeInBits); - - for (size_t i = 0; i < capacity; ++i) { - ptr[i] = random_func(); - } -} - -/////////////////////////////////////////////////////////////////////////////////////////////////// -/////////////////////////////////////////////////////////////////////////////////////////////////// - -/// Fills a ell block index matrix with random values with a uniform random distribution. 
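The metadata generator above packs one 4-bit code per group of four values; the six codes 0x4, 0x8, 0x9, 0xc, 0xd and 0xe each encode the two positions kept by 2:4 structured sparsity. A standalone sketch that fills 16-bit metadata words the same way (illustrative helper, not the CUTLASS API):

#include <cstddef>
#include <cstdint>
#include <cstdlib>
#include <vector>

// Each 16-bit word holds four 4-bit codes; each code names which two of four
// consecutive elements survive 2:4 pruning.
std::vector<uint16_t> random_sparse_meta(std::size_t count, unsigned seed) {
  static const uint16_t kCodes[6] = {0x4, 0x8, 0x9, 0xc, 0xd, 0xe};
  std::srand(seed);
  std::vector<uint16_t> meta(count, 0);
  for (uint16_t &word : meta) {
    for (int nibble = 0; nibble < 4; ++nibble) {  // 16 bits / 4 bits per code
      uint16_t code = kCodes[std::rand() % 6];
      word = uint16_t(word | uint16_t(code << (nibble * 4)));
    }
  }
  return meta;
}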
-template < - typename Element, ///< Element type - typename Layout> ///< Layout function -void TensorFillRandomEllIdx( - TensorView dst, ///< destination tensor - uint64_t seed, ///< seed for RNG - int rows, int ell_cols, int cols) { ///< dimension of the matrix - - std::srand((unsigned)seed); - - for (int i = 0; i < rows; ++i) { - int col_idx = std::rand() % cols; - - for (int j = 0; j < ell_cols; ++j) { - dst.at({i, j}) = col_idx; - - if (col_idx != -1) { - if (col_idx == (cols - 1)) { - col_idx = -1; - } else { - col_idx = std::rand() % (cols - col_idx - 1) + col_idx + 1; - } - } - } - } -} - -/////////////////////////////////////////////////////////////////////////////////////////////////// - -/// Copies a diagonal in from host memory without modifying off-diagonal elements. -template < - typename Element, ///< Element type - typename Layout> ///< Layout function -void TensorCopyDiagonalIn( - TensorView dst, ///< destination tensor - Element const *ptr) { ///< dense buffer of elements - - typename Layout::Index extent = dst.extent().min(); - - for (typename Layout::Index i = 0; i < extent; ++i) { - Coord coord(i); - dst.at(coord) = ReferenceFactory::get(ptr, i); - } -} - -/////////////////////////////////////////////////////////////////////////////////////////////////// -/////////////////////////////////////////////////////////////////////////////////////////////////// - -/// Copies the diagonal of a tensor into a dense buffer in host memory. -template < - typename Element, ///< Element type - typename Layout> ///< Layout function -void TensorCopyDiagonalOut( - Element *ptr, ///< dense buffer of elements - TensorView src) { ///< source tensor - - typename Layout::Index extent = src.extent().min(); - - for (typename Layout::Index i = 0; i < extent; ++i) { - Coord coord(i); - ReferenceFactory::get(ptr, i) = src.at(coord); - } -} - -/////////////////////////////////////////////////////////////////////////////////////////////////// -/////////////////////////////////////////////////////////////////////////////////////////////////// - -} // namespace host -} // namespace reference -} // namespace cutlass diff --git a/csrc/sparse/cutlass/example/util/reference/host/tensor_fill.hpp b/csrc/sparse/cutlass/example/util/reference/host/tensor_fill.hpp deleted file mode 100644 index 86a54e2ee06b7..0000000000000 --- a/csrc/sparse/cutlass/example/util/reference/host/tensor_fill.hpp +++ /dev/null @@ -1,432 +0,0 @@ -/*************************************************************************************************** - * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. - * SPDX-License-Identifier: BSD-3-Clause - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * - * 1. Redistributions of source code must retain the above copyright notice, this - * list of conditions and the following disclaimer. - * - * 2. Redistributions in binary form must reproduce the above copyright notice, - * this list of conditions and the following disclaimer in the documentation - * and/or other materials provided with the distribution. - * - * 3. Neither the name of the copyright holder nor the names of its - * contributors may be used to endorse or promote products derived from - * this software without specific prior written permission. 
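TensorFillRandomEllIdx above builds each row of an ELL block index matrix as a strictly increasing run of random column ids, padding with -1 once the columns are exhausted. A standalone sketch of the same scheme, with a hypothetical random_ell_idx helper:

#include <cstddef>
#include <cstdlib>
#include <vector>

// rows x ell_cols index matrix (row-major): each row gets up to ell_cols
// strictly increasing random column indices in [0, cols), then -1 padding.
std::vector<int> random_ell_idx(int rows, int ell_cols, int cols, unsigned seed) {
  std::srand(seed);
  std::vector<int> idx(std::size_t(rows) * ell_cols, -1);
  for (int i = 0; i < rows; ++i) {
    int col = std::rand() % cols;
    for (int j = 0; j < ell_cols && col != -1; ++j) {
      idx[std::size_t(i) * ell_cols + j] = col;
      // Advance to a strictly larger column, or stop at the right edge.
      col = (col == cols - 1) ? -1 : std::rand() % (cols - col - 1) + col + 1;
    }
  }
  return idx;
}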
- * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" - * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE - * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE - * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL - * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR - * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER - * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, - * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - * - **************************************************************************************************/ -/* \file - \brief Provides several functions for filling tensors with data. -*/ - -#pragma once - -// Standard Library includes -#include -#include -#include - -// Cute includes -#include "cute/tensor.hpp" - -// Cutlass includes -#include "cutlass/cutlass.h" -#include "cutlass/complex.h" -#include "cutlass/quaternion.h" -#include "cutlass/array.h" -#include "cutlass/numeric_types.h" - -/////////////////////////////////////////////////////////////////////////////////////////////////// - -namespace cutlass { -namespace reference { -namespace host { - -/////////////////////////////////////////////////////////////////////////////////////////////////// -// -// Uniform and procedural tensor fills -// -/////////////////////////////////////////////////////////////////////////////////////////////////// - -/// Fills a tensor with a scalar element -template -void TensorFill(Tensor dst, typename Tensor::value_type element) { - - for (int64_t idx = 0; idx < cute::size(dst); ++idx) { - dst(idx) = element; - } -} - -/// Fills a tensor with the contents of its layout -template -void TensorFillSequential(Tensor dst) { - - auto layout = dst.layout(); - - for (int64_t idx = 0; idx < cute::size(dst); ++idx) { - dst(idx) = layout(idx); - } -} - -/////////////////////////////////////////////////////////////////////////////////////////////////// -// -// Random uniform values -// -/////////////////////////////////////////////////////////////////////////////////////////////////// - -namespace detail { - -template -struct RandomUniformFunc { - - using Real = typename RealType::Type; - - uint64_t seed; - double range; - double min; - int int_scale; - - // - // Methods - // - - RandomUniformFunc( - uint64_t seed_ = 0, - double max = 1, - double min_ = 0, - int int_scale_ = -1 - ): - seed(seed_), range(max - min_), min(min_), int_scale(int_scale_) { - std::srand((unsigned)seed); - } - - - /// Compute random value and update RNG state - Element operator()() const { - - double rnd = double(std::rand()) / double(RAND_MAX); - - rnd = min + range * rnd; - - // Random values are cast to integer after scaling by a power of two to facilitate error - // testing - Element result; - - if (int_scale >= 0) { - rnd = double(int64_t(rnd * double(1 << int_scale))) / double(1 << int_scale); - result = static_cast(Real(rnd)); - } - else { - result = static_cast(Real(rnd)); - } - - return result; - } -}; - -/// Partial specialization for initializing a complex value. 
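TensorFillSequential above writes each element's own layout offset into that element, which makes the coordinate-to-memory mapping directly visible when the tensor is printed. A standalone sketch with a hand-written column-major layout (illustrative names, not the CuTe API):

#include <cstddef>
#include <cstdint>
#include <vector>

// Minimal stand-in for a packed column-major layout: (m, n) -> m + n * M.
struct ColumnMajorLayout {
  int M, N;
  int64_t operator()(int m, int n) const { return int64_t(m) + int64_t(n) * M; }
};

// Each storage location receives its own offset; data.size() must be M * N.
template <typename Element>
void fill_sequential(std::vector<Element> &data, ColumnMajorLayout layout) {
  for (int n = 0; n < layout.N; ++n)
    for (int m = 0; m < layout.M; ++m)
      data[static_cast<std::size_t>(layout(m, n))] = Element(layout(m, n));
}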
-template -struct RandomUniformFunc > { - - using Real = typename RealType::Type; - - uint64_t seed; - double range; - double min; - int int_scale; - - // - // Methods - // - - RandomUniformFunc( - uint64_t seed_ = 0, - double max = 1, - double min_ = 0, - int int_scale_ = -1 - ): - seed(seed_), range(max - min_), min(min_), int_scale(int_scale_) { - std::srand((unsigned)seed); - } - - - /// Compute random value and update RNG state - complex operator()() const { - - Element reals[2]; - - for (int i = 0; i < 2; ++i) { - double rnd = double(std::rand()) / double(RAND_MAX); - - rnd = min + range * rnd; - - // Random values are cast to integer after scaling by a power of two to facilitate error - // testing - - if (int_scale >= 0) { - rnd = double(int(rnd * double(1 << int_scale))); - reals[i] = from_real(Real(rnd / double(1 << int_scale))); - } - else { - reals[i] = from_real(Real(rnd)); - } - } - - return complex(reals[0], reals[1]); - } -}; - -/// Partial specialization for initializing a Quaternion value. -template -struct RandomUniformFunc > { - - using Real = typename RealType::Type; - - uint64_t seed; - double range; - double min; - int int_scale; - - // - // Methods - // - - RandomUniformFunc( - uint64_t seed_ = 0, - double max = 1, - double min_ = 0, - int int_scale_ = -1 - ): - seed(seed_), range(max - min_), min(min_), int_scale(int_scale_) { - std::srand((unsigned)seed); - } - - - /// Compute random value and update RNG state - Quaternion operator()() const { - - Element reals[4]; - - for (int i = 0; i < 4; ++i) { - double rnd = double(std::rand()) / double(RAND_MAX); - - rnd = min + range * rnd; - - // Random values are cast to integer after scaling by a power of two to facilitate error - // testing - - if (int_scale >= 0) { - rnd = double(int(rnd * double(1 << int_scale))); - reals[i] = from_real(Real(rnd / double(1 << int_scale))); - } - else { - reals[i] = from_real(Real(rnd)); - } - } - - return make_Quaternion(reals[0], reals[1], reals[2], reals[3]); - } -}; - -} // namespace detail - -/////////////////////////////////////////////////////////////////////////////////////////////////// - -/// Fills a tensor with random values with a uniform random distribution. -template ///< Tensor object -void TensorFillRandomUniform( - Tensor dst, ///< destination tensor - uint64_t seed, ///< seed for RNG - double max = 1, ///< upper bound of distribution - double min = 0, ///< lower bound for distribution - int bits = -1) { ///< If non-negative, specifies number of fractional bits that - /// are not truncated to zero. Permits reducing precision of - /// data. - - detail::RandomUniformFunc random_func(seed, max, min, bits); - - for (int64_t idx = 0; idx < cute::size(dst); ++idx) { - dst(idx) = random_func(); - } -} - -/// Fills a block with random values with a uniform random distribution. -template < - typename Element ///< Element type -> -void BlockFillRandomUniform( - Element *ptr, - size_t capacity, - uint64_t seed, ///< seed for RNG - double max = 1, ///< upper bound of distribution - double min = 0, ///< lower bound for distribution - int bits = -1) { ///< If non-negative, specifies number of fractional bits that - /// are not truncated to zero. Permits reducing precision of - /// data. 
- detail::RandomUniformFunc random_func(seed, max, min, bits); - - for (size_t i = 0; i < capacity; ++i) { - ptr[i] = random_func(); - } -} - -/////////////////////////////////////////////////////////////////////////////////////////////////// -// -// Random Gaussian -// -/////////////////////////////////////////////////////////////////////////////////////////////////// - -namespace detail { - -template -struct RandomGaussianFunc { - - uint64_t seed; - double mean; - double stddev; - int int_scale; - double pi; - - // - // Methods - // - RandomGaussianFunc( - uint64_t seed_ = 0, - double mean_ = 0, - double stddev_ = 1, - int int_scale_ = -1 - ): - seed(seed_), mean(mean_), stddev(stddev_), int_scale(int_scale_), pi(std::acos(-1)) { - std::srand((unsigned)seed); - } - - /// Compute random value and update RNG state - Element operator()() const { - - // Box-Muller transform to generate random numbers with Normal distribution - double u1 = double(std::rand()) / double(RAND_MAX); - double u2 = double(std::rand()) / double(RAND_MAX); - - // Compute Gaussian random value - double rnd = std::sqrt(-2 * std::log(u1)) * std::cos(2 * pi * u2); - rnd = mean + stddev * rnd; - - // Scale and convert final result - Element result; - - if (int_scale >= 0) { - rnd = double(int64_t(rnd * double(1 << int_scale))) / double(1 << int_scale); - result = static_cast(rnd); - } - else { - result = static_cast(rnd); - } - - return result; - } -}; - -} // namespace detail - -/////////////////////////////////////////////////////////////////////////////////////////////////// - -/// Fills a tensor with random values with a Gaussian distribution. -template < - typename Tensor -> -void TensorFillRandomGaussian( - Tensor dst, ///< destination tensor - uint64_t seed, ///< seed for RNG - double mean = 0, ///< Gaussian distribution's mean - double stddev = 1, ///< Gaussian distribution's standard deviation - int bits = -1) { ///< If non-negative, specifies number of fractional bits that - /// are not truncated to zero. Permits reducing precision of - /// data. - - detail::RandomGaussianFunc random_func(seed, mean, stddev, bits); - - for (int64_t idx = 0; idx < cute::size(dst); ++idx) { - dst(idx) = random_func(); - } -} - -/// Fills a block with random values with a Gaussian distribution. -template < - typename Element ///< Element type -> -void BlockFillRandomGaussian( - Element *ptr, ///< destination buffer - size_t capacity, ///< number of elements - uint64_t seed, ///< seed for RNG - double mean = 0, ///< Gaussian distribution's mean - double stddev = 1, ///< Gaussian distribution's standard deviation - int bits = -1) { ///< If non-negative, specifies number of fractional bits that - /// are not truncated to zero. Permits reducing precision of - /// data. 
- - detail::RandomGaussianFunc random_func(seed, mean, stddev, bits); - - for (size_t i = 0; i < capacity; ++i) { - ptr[i] = random_func(); - } -} - -/////////////////////////////////////////////////////////////////////////////////////////////////// - -/// Fills a block of data with sequential elements -template < - typename Element -> -void BlockFillSequential( - Element *ptr, - int64_t capacity, - Element v = Element(1), - Element s = Element(0)) { - int i = 0; - - while (i < capacity) { - - ptr[i] = Element(s + v); - ++i; - } -} - -/// Fills a block of data with sequential elements -template < - typename Element -> -void BlockFillSequentialModN( - Element *ptr, - int64_t capacity, - int64_t mod, - int64_t v = int64_t(1), - int64_t s = int64_t(0)) { - int i = 0; - - while (i < capacity) { - - ptr[i] = static_cast(int32_t(int64_t(s + v) % mod)); - ++i; - } -} - -/////////////////////////////////////////////////////////////////////////////////////////////////// - -} // namespace host -} // namespace reference -} // namespace cutlass - -/////////////////////////////////////////////////////////////////////////////////////////////////// diff --git a/csrc/sparse/cutlass/example/util/reference/host/tensor_foreach.h b/csrc/sparse/cutlass/example/util/reference/host/tensor_foreach.h deleted file mode 100644 index 43ff17362c21b..0000000000000 --- a/csrc/sparse/cutlass/example/util/reference/host/tensor_foreach.h +++ /dev/null @@ -1,134 +0,0 @@ -/*************************************************************************************************** - * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. - * SPDX-License-Identifier: BSD-3-Clause - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * - * 1. Redistributions of source code must retain the above copyright notice, this - * list of conditions and the following disclaimer. - * - * 2. Redistributions in binary form must reproduce the above copyright notice, - * this list of conditions and the following disclaimer in the documentation - * and/or other materials provided with the distribution. - * - * 3. Neither the name of the copyright holder nor the names of its - * contributors may be used to endorse or promote products derived from - * this software without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" - * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE - * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE - * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL - * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR - * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER - * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, - * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
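RandomGaussianFunc above produces normal samples with the Box-Muller transform and then optionally quantizes them via int_scale. A standalone sketch of just the sampling step (keeping u1 away from zero is an added guard, not in the original):

#include <cmath>
#include <cstdlib>

// Box-Muller transform: two independent uniform draws in (0, 1] become one
// normally distributed sample with the given mean and standard deviation.
double gaussian_sample(double mean, double stddev) {
  const double pi = std::acos(-1.0);
  double u1 = (double(std::rand()) + 1.0) / (double(RAND_MAX) + 1.0);  // avoid log(0)
  double u2 = double(std::rand()) / double(RAND_MAX);
  double z = std::sqrt(-2.0 * std::log(u1)) * std::cos(2.0 * pi * u2);
  return mean + stddev * z;
}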
- * - **************************************************************************************************/ -#pragma once - -#include -#include "cutlass/cutlass.h" - -namespace cutlass { -namespace reference { -namespace host { - -/////////////////////////////////////////////////////////////////////////////////////////////////// - -/// Defines several helpers -namespace detail { - -/// Helper to perform for-each operation -template -struct TensorForEachHelper { - - /// Index of the active rank - static int const kActiveRank = Rank - RankRemaining - 1; - - /// Constructor for general rank - TensorForEachHelper( - Func &func, - Coord const &extent, - Coord &coord) { - - for (int i = 0; i < extent.at(kActiveRank); ++i) { - coord[kActiveRank] = i; - TensorForEachHelper(func, extent, coord); - } - } -}; - -/// Helper to perform for-each operation -template -struct TensorForEachHelper { - - /// Index of the active rank - static int const kActiveRank = Rank - 1; - - /// Constructor for fastest changing rank - TensorForEachHelper( - Func &func, - Coord const &extent, - Coord &coord) { - - for (int i = 0; i < extent.at(kActiveRank); ++i) { - coord[kActiveRank] = i; - func(coord); - } - } -}; - -} // namespace detail - -/////////////////////////////////////////////////////////////////////////////////////////////////// - -/// Iterates over the index space of a tensor -template < - typename Func, ///< function applied to each point in a tensor's index space - int Rank> ///< rank of index space -void TensorForEach(Coord extent, Func & func) { - Coord coord; - detail::TensorForEachHelper(func, extent, coord); -} - -/////////////////////////////////////////////////////////////////////////////////////////////////// - -/// Iterates over the index space of a tensor and calls a C++ lambda -template < - typename Func, ///< function applied to each point in a tensor's index space - int Rank> ///< rank of index space -void TensorForEachLambda(Coord extent, Func func) { - Coord coord; - detail::TensorForEachHelper(func, extent, coord); -} - -/////////////////////////////////////////////////////////////////////////////////////////////////// - -template -struct BlockForEach { - - /// Constructor performs the operation. - BlockForEach( - Element *ptr, - size_t capacity, - typename Func::Params params = typename Func::Params()) { - - Func func(params); - - for (size_t index = 0; index < capacity; ++index) { - ptr[index] = func(); - } - } -}; - -/////////////////////////////////////////////////////////////////////////////////////////////////// - -} // namespace host -} // namespace reference -} // namespace cutlass - -/////////////////////////////////////////////////////////////////////////////////////////////////// diff --git a/csrc/sparse/cutlass/example/util/reference/host/tensor_norm.h b/csrc/sparse/cutlass/example/util/reference/host/tensor_norm.h deleted file mode 100644 index 8a7240665550d..0000000000000 --- a/csrc/sparse/cutlass/example/util/reference/host/tensor_norm.h +++ /dev/null @@ -1,42 +0,0 @@ -/*************************************************************************************************** - * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. - * SPDX-License-Identifier: BSD-3-Clause - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * - * 1. 
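TensorForEach above expands one loop level per rank at compile time, with the fastest-changing rank invoking the functor. A self-contained sketch of the same recursion over std::array coordinates (illustrative names, not the CUTLASS types):

#include <array>

// General rank: iterate the active rank, recurse on the remaining ranks.
template <typename Func, int Rank, int RankRemaining>
struct ForEachHelper {
  static void run(Func &func, const std::array<int, Rank> &extent,
                  std::array<int, Rank> &coord) {
    constexpr int kActiveRank = Rank - RankRemaining - 1;
    for (int i = 0; i < extent[kActiveRank]; ++i) {
      coord[kActiveRank] = i;
      ForEachHelper<Func, Rank, RankRemaining - 1>::run(func, extent, coord);
    }
  }
};

// Fastest-changing rank: invoke the functor for each coordinate.
template <typename Func, int Rank>
struct ForEachHelper<Func, Rank, 0> {
  static void run(Func &func, const std::array<int, Rank> &extent,
                  std::array<int, Rank> &coord) {
    constexpr int kActiveRank = Rank - 1;
    for (int i = 0; i < extent[kActiveRank]; ++i) {
      coord[kActiveRank] = i;
      func(coord);
    }
  }
};

template <int Rank, typename Func>
void for_each(const std::array<int, Rank> &extent, Func &func) {
  std::array<int, Rank> coord{};
  ForEachHelper<Func, Rank, Rank - 1>::run(func, extent, coord);
}

// Usage: count the points of a 3 x 4 extent.
//   int count = 0;
//   auto visit = [&](const std::array<int, 2> &) { ++count; };
//   for_each<2>(std::array<int, 2>{3, 4}, visit);   // count == 12 afterwards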
Redistributions of source code must retain the above copyright notice, this - * list of conditions and the following disclaimer. - * - * 2. Redistributions in binary form must reproduce the above copyright notice, - * this list of conditions and the following disclaimer in the documentation - * and/or other materials provided with the distribution. - * - * 3. Neither the name of the copyright holder nor the names of its - * contributors may be used to endorse or promote products derived from - * this software without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" - * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE - * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE - * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL - * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR - * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER - * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, - * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - * - **************************************************************************************************/ -#pragma once - - -#include "cutlass/cutlass.h" - -// The contents of this file have been moved to 'tensor_reduce' to cover other types of reductions. - -#include "cutlass/util/reference/host/tensor_reduce.h" - -/////////////////////////////////////////////////////////////////////////////////////////////////// - - diff --git a/csrc/sparse/cutlass/example/util/reference/host/tensor_reduce.h b/csrc/sparse/cutlass/example/util/reference/host/tensor_reduce.h deleted file mode 100644 index 048352ae29514..0000000000000 --- a/csrc/sparse/cutlass/example/util/reference/host/tensor_reduce.h +++ /dev/null @@ -1,203 +0,0 @@ -/*************************************************************************************************** - * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. - * SPDX-License-Identifier: BSD-3-Clause - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * - * 1. Redistributions of source code must retain the above copyright notice, this - * list of conditions and the following disclaimer. - * - * 2. Redistributions in binary form must reproduce the above copyright notice, - * this list of conditions and the following disclaimer in the documentation - * and/or other materials provided with the distribution. - * - * 3. Neither the name of the copyright holder nor the names of its - * contributors may be used to endorse or promote products derived from - * this software without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" - * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE - * DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE - * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL - * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR - * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER - * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, - * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - * - **************************************************************************************************/ -#pragma once - -#include - -#include "cutlass/cutlass.h" -#include "cutlass/complex.h" -#include "cutlass/tensor_ref.h" - -#include "cutlass/util/reference/detail/linear_to_coordinate.h" -#include "cutlass/core_io.h" - -namespace cutlass { -namespace reference { -namespace host { - -/////////////////////////////////////////////////////////////////////////////////////////////////// - -/// Transform-reduce operation over the elements of a tensor. This helper allocates the device-side -/// workspace -template < - typename Element, - typename Layout, - typename ComputeType, - typename ReduceOp, - typename TransformOp -> -ComputeType TensorTransformReduce( - TensorView view, - ComputeType identity, - ReduceOp reduce, - TransformOp transform -) { - - for (int64_t idx = 0; idx < int64_t(view.size()); ++idx) { - typename Layout::TensorCoord coord; - cutlass::reference::detail::LinearToCoordinate()(coord, idx, view.extent()); - - if (view.contains(coord)) { - Element x = view.at(coord); - identity = reduce(identity, transform(x)); - } - } - - return identity; -} - -/// Transform-reduce operation over the elements of a tensor. This helper allocates the device-side -/// workspace -template < - typename Element, - typename Layout, - typename ComputeType, - typename ReduceOp, - typename TransformOp -> -ComputeType TensorTransformReduce( - TensorView view_A, - TensorView view_B, - ComputeType identity, - ReduceOp reduce, - TransformOp transform) { - - if (view_A.extent() != view_B.extent()) { - throw std::runtime_error("Tensor extents must match."); - } - - for (int64_t idx = 0; idx < int64_t(view_A.size()); ++idx) { - - typename Layout::TensorCoord coord; - cutlass::reference::detail::LinearToCoordinate()(coord, idx, view_A.extent()); - - if (view_A.contains(coord)) { - Element a = view_A.at(coord); - Element b = view_B.at(coord); - identity = reduce(identity, transform(a, b)); - } - } - - return identity; -} - -/// Helper to compute the sum of the elements of a tensor -template < - typename Element, - typename Layout, - typename ComputeType = Element -> -ComputeType TensorSum( - TensorView view, - ComputeType identity = ComputeType() -) { - - plus reduce; - NumericConverter transform; - - return TensorTransformReduce( - view, identity, reduce, transform); -} - -/// Helper to compute the sum of the squares of the elements of a tensor -template < - typename Element, - typename Layout, - typename ComputeType = Element -> -ComputeType TensorSumSq( - TensorView view, - ComputeType identity = ComputeType() -) { - - plus reduce; - magnitude_squared transform; - - return TensorTransformReduce( - view, identity, reduce, transform); -} - -/// Helper to compute the norm of the elements of a tensor. 
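TensorSumSq above is the transform-reduce loop specialized with magnitude_squared and plus, and the norm helper declared next simply takes its square root. A standalone equivalent over a flat buffer, accumulating in double as the helpers do by default (hypothetical tensor_norm name):

#include <cmath>
#include <cstddef>

// Frobenius norm via transform-reduce: accumulate |x|^2, then take sqrt.
double tensor_norm(const float *data, std::size_t count) {
  double sum_sq = 0.0;
  for (std::size_t i = 0; i < count; ++i) {
    double x = double(data[i]);
    sum_sq += x * x;   // transform: magnitude squared; reduce: plus
  }
  return std::sqrt(sum_sq);
}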
-template < - typename Element, - typename Layout, - typename ComputeType = double -> -ComputeType TensorNorm( - TensorView view, - ComputeType identity = ComputeType() -) { - - return std::sqrt(TensorSumSq(view, identity)); -} - -/// Helper to compute the sum of the squares of the differences of two tensors -template < - typename Element, - typename Layout, - typename ComputeType = double -> -ComputeType TensorSumSqDiff( - TensorView view_A, - TensorView view_B, - ComputeType identity = ComputeType() -) { - - plus reduce; - magnitude_squared_difference transform; - - return TensorTransformReduce( - view_A, view_B, identity, reduce, transform); -} - - -/// Helper to compute the norm of the tensor computed as the difference of two tensors in memory -template < - typename Element, - typename Layout, - typename ComputeType = double -> -ComputeType TensorNormDiff( - TensorView view_A, - TensorView view_B, - ComputeType identity = ComputeType() -) { - - return std::sqrt(TensorSumSqDiff(view_A, view_B, identity)); -} - -/////////////////////////////////////////////////////////////////////////////////////////////////// - -} // namespace host -} // namespace reference -} // namespace cutlass - -/////////////////////////////////////////////////////////////////////////////////////////////////// diff --git a/csrc/sparse/cutlass/example/util/reference/host/tensor_reduce.hpp b/csrc/sparse/cutlass/example/util/reference/host/tensor_reduce.hpp deleted file mode 100644 index 5ea5154107fcb..0000000000000 --- a/csrc/sparse/cutlass/example/util/reference/host/tensor_reduce.hpp +++ /dev/null @@ -1,203 +0,0 @@ -/*************************************************************************************************** - * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. - * SPDX-License-Identifier: BSD-3-Clause - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * - * 1. Redistributions of source code must retain the above copyright notice, this - * list of conditions and the following disclaimer. - * - * 2. Redistributions in binary form must reproduce the above copyright notice, - * this list of conditions and the following disclaimer in the documentation - * and/or other materials provided with the distribution. - * - * 3. Neither the name of the copyright holder nor the names of its - * contributors may be used to endorse or promote products derived from - * this software without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" - * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE - * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE - * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL - * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR - * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER - * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, - * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - * - **************************************************************************************************/ -/* \file - \brief Provides several functions for filling tensors with data. 
-*/ - -#pragma once - -// Standard Library includes -#include -#include -#include - -// Cute includes -#include "cute/tensor.hpp" - -// Cutlass includes -#include "cutlass/cutlass.h" -#include "cutlass/complex.h" -#include "cutlass/functional.h" -#include "cutlass/numeric_conversion.h" -#include "cutlass/quaternion.h" -#include "cutlass/array.h" -#include "cutlass/numeric_types.h" - -/////////////////////////////////////////////////////////////////////////////////////////////////// - -namespace cutlass { -namespace reference { -namespace host { - -/////////////////////////////////////////////////////////////////////////////////////////////////// -// -// Tensor reductions -// -/////////////////////////////////////////////////////////////////////////////////////////////////// - -/// Transform-reduce operation over the elements of a tensor. This helper allocates the device-side -/// workspace -template < - typename Tensor, - typename ComputeType, - typename ReduceOp, - typename TransformOp -> -ComputeType TensorTransformReduce( - Tensor view, - ComputeType identity, - ReduceOp reduce, - TransformOp transform -) { - - for (int64_t idx = 0; idx < cute::size(view); ++idx) { - identity = reduce(identity, transform(view(idx))); - } - - return identity; -} - -/// Transform-reduce operation over the elements of a tensor. This helper allocates the device-side -/// workspace -template < - typename TensorA, - typename TensorB, - typename ComputeType, - typename ReduceOp, - typename TransformOp -> -ComputeType TensorTransformReduce( - TensorA view_A, - TensorB view_B, - ComputeType identity, - ReduceOp reduce, - TransformOp transform) { - - if (cute::size(view_A) != cute::size(view_B)) { - throw std::runtime_error("Tensor sizes must match."); - } - - for (int64_t idx = 0; idx < cute::size(view_A); ++idx) { - identity = reduce(identity, transform(view_A(idx), view_B(idx))); - } - - return identity; -} - -/// Helper to compute the sum of the elements of a tensor -template < - typename Tensor, - typename ComputeType = typename Tensor::value_type -> -ComputeType TensorSum( - Tensor view, - ComputeType identity = ComputeType() -) { - - plus reduce; - NumericConverter transform; - - return TensorTransformReduce( - view, identity, reduce, transform); -} - -/// Helper to compute the sum of the squares of the elements of a tensor -template < - typename Tensor, - typename ComputeType = typename Tensor::value_type -> -ComputeType TensorSumSq( - Tensor view, - ComputeType identity = ComputeType() -) { - - plus reduce; - magnitude_squared transform; - - return TensorTransformReduce( - view, identity, reduce, transform); -} - -/// Helper to compute the norm of the elements of a tensor. 
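The two-tensor overload above feeds element pairs through the transform before reducing, which is how the difference-norm helpers that follow measure how far a computed tensor strays from a reference. A standalone sketch of the resulting relative-error check (illustrative helper, not the CUTLASS API):

#include <cmath>
#include <cstddef>

// ||A - B|| / ||B||: the usual figure of merit when checking a kernel result
// against this host reference.
double relative_error(const float *a, const float *b, std::size_t count) {
  double diff_sq = 0.0, ref_sq = 0.0;
  for (std::size_t i = 0; i < count; ++i) {
    double d = double(a[i]) - double(b[i]);
    diff_sq += d * d;
    ref_sq += double(b[i]) * double(b[i]);
  }
  return ref_sq > 0.0 ? std::sqrt(diff_sq / ref_sq) : std::sqrt(diff_sq);
}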
-template < - typename Tensor, - typename ComputeType = double -> -ComputeType TensorNorm( - Tensor view, - ComputeType identity = ComputeType() -) { - - return std::sqrt(TensorSumSq(view, identity)); -} - -/// Helper to compute the sum of the squares of the differences of two tensors -template < - typename TensorA, - typename TensorB, - typename ComputeType = double -> -ComputeType TensorSumSqDiff( - TensorA view_A, - TensorB view_B, - ComputeType identity = ComputeType() -) { - - plus reduce; - magnitude_squared_difference transform; - - return TensorTransformReduce( - view_A, view_B, identity, reduce, transform); -} - - -/// Helper to compute the norm of the tensor computed as the difference of two tensors in memory -template < - typename TensorA, - typename TensorB, - typename ComputeType = double -> -ComputeType TensorNormDiff( - TensorA view_A, - TensorB view_B, - ComputeType identity = ComputeType() -) { - - return std::sqrt(TensorSumSqDiff(view_A, view_B, identity)); -} - -/////////////////////////////////////////////////////////////////////////////////////////////////// - -} // namespace host -} // namespace reference -} // namespace cutlass - -/////////////////////////////////////////////////////////////////////////////////////////////////// diff --git a/csrc/sparse/cutlass/example/util/reference/host/trmm.h b/csrc/sparse/cutlass/example/util/reference/host/trmm.h deleted file mode 100644 index 08b979254278c..0000000000000 --- a/csrc/sparse/cutlass/example/util/reference/host/trmm.h +++ /dev/null @@ -1,215 +0,0 @@ -/*************************************************************************************************** - * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. - * SPDX-License-Identifier: BSD-3-Clause - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * - * 1. Redistributions of source code must retain the above copyright notice, this - * list of conditions and the following disclaimer. - * - * 2. Redistributions in binary form must reproduce the above copyright notice, - * this list of conditions and the following disclaimer in the documentation - * and/or other materials provided with the distribution. - * - * 3. Neither the name of the copyright holder nor the names of its - * contributors may be used to endorse or promote products derived from - * this software without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" - * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE - * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE - * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL - * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR - * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER - * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, - * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - * - **************************************************************************************************/ -/*! \file - \brief Reference implementation for TRMM in host-side code. 
- - -*/ - -#pragma once - -#include "cutlass/blas3.h" -#include "cutlass/numeric_conversion.h" -#include "cutlass/tensor_view.h" -#include "cutlass/gemm/gemm.h" -#include "cutlass/arch/mma.h" -#include "cutlass/util/host_tensor.h" - -#include "cutlass/util/reference/host/gemm.h" - -namespace cutlass { -namespace reference { -namespace host { - -/// Computes a Triangular Matrix Multiplication (tensors of rank=2) pointed to by TensorRef -/// objects. -template < - typename ElementA, - typename LayoutA, - SideMode SideModeA, - FillMode FillModeA, - DiagType DiagTypeA, - typename ElementB, - typename LayoutB, - typename ElementC, - typename LayoutC, - typename ScalarType, - typename ComputeType, - typename InnerProductOp = multiply_add, - typename ConvertOp = NumericConverter -> -void compute_trmm( - gemm::GemmCoord problem_size, - ScalarType alpha, - TensorRef tensor_a, - TensorRef tensor_b, - TensorRef tensor_d, - ComputeType initial_accum) { - - static_assert( - LayoutA::kRank == 2 && - LayoutC::kRank == 2, "Tensors must be of rank 2"); - - static_assert(SideModeA != SideMode::kInvalid - , "Side Mode can either be Left or Right."); - - static_assert(FillModeA == FillMode::kLower || FillModeA == FillMode::kUpper - , "Fill Mode can either be Lower or Upper."); - - using CompareOp = typename TrMatrixCompareOp::Type; - - // Note: batch is ignored. - int const M = problem_size.m(); - int const N = problem_size.n(); - // Assuming correct k-dimension value is passed - int const K = problem_size.k(); - - // Blocking necessary to speedup reference implementation - int const Mblock = 16; - int const Nblock = 16; - - ConvertOp convert_op; - InnerProductOp inner_product_op; - CompareOp compare_op; - - for (int row_block = 0; row_block < M; row_block += Mblock) { - for (int col_block = 0; col_block < N; col_block += Nblock) { - - ComputeType accum[Mblock][Nblock]; - - for (int j = 0; j < Nblock; j++) { - for (int i = 0; i < Mblock; i++) { - accum[i][j] = initial_accum; - } - } - - for (int k_block = 0; k_block < K; ++k_block) { - for (int j = 0; j < Nblock; j++) { - for (int i = 0; i < Mblock; i++) { - int row = row_block + i; - int col = col_block + j; - - if (row < M && col < N) { - ElementA a = ElementA(); - ElementB b = ElementB(); - - if (SideModeA == SideMode::kLeft) { - a = (compare_op(row, k_block)) ? - (tensor_a.at(MatrixCoord(row, k_block))) : ElementA(0); - if (row == k_block && DiagTypeA == DiagType::kUnit) { - a = ElementA(1); - } - b = tensor_b.at(MatrixCoord(k_block, col)); - } else if (SideModeA == SideMode::kRight) { - a = tensor_b.at(MatrixCoord(row, k_block)); - b = (compare_op(k_block, col)) ? 
- tensor_a.at(MatrixCoord(k_block, col)) : ElementA(0); - if (k_block == col && DiagTypeA == DiagType::kUnit) { - b = ElementA(1); - } - } - - ComputeType compute_a(cast_if_scalar(a)); - ComputeType compute_b(cast_if_scalar(b)); - - accum[i][j] = inner_product_op(compute_a, compute_b, accum[i][j]); - } - } - } - } - - for (int j = 0; j < Nblock; j++) { - for (int i = 0; i < Mblock; i++) { - int row = row_block + i; - int col = col_block + j; - - MatrixCoord coord = MatrixCoord(row, col); - - if (row < M && col < N) { - tensor_d.at(coord) = convert_op( - alpha * ScalarType(accum[i][j])); - } - } - } - } - } -} - -//////////////////////////////////////////////////////////////////////////////////////////////////// - -template < - typename ElementA, - typename LayoutA, - SideMode SideModeA, - FillMode FillModeA, - DiagType DiagTypeA, - typename ElementB, - typename LayoutB, - typename ElementC, - typename LayoutC, - typename ScalarType, - typename ComputeType, - typename InnerProductOp = cutlass::arch::OpMultiplyAdd -> -struct Trmm; - -//////////////////////////////////////////////////////////////////////////////////////////////////// - -/// Partial specialization for multiply-add -template -struct Trmm { - - void operator()(gemm::GemmCoord problem_size, ScalarType alpha, - TensorRef tensor_a, - TensorRef tensor_b, - TensorRef tensor_d, - ComputeType initial_accum = ComputeType(0)) { - static_assert( - LayoutA::kRank == 2 && LayoutC::kRank == 2, - "Tensors must be of rank 2"); - - compute_trmm>( - problem_size, alpha, tensor_a, tensor_b, tensor_d, initial_accum); - } -}; - -//////////////////////////////////////////////////////////////////////////////////////////////////// - -} // namespace host -} // namespace reference -} // namespace cutlass diff --git a/csrc/sparse/cutlass/example/util/reference/host/trmm_complex.h b/csrc/sparse/cutlass/example/util/reference/host/trmm_complex.h deleted file mode 100644 index 86e58a035b481..0000000000000 --- a/csrc/sparse/cutlass/example/util/reference/host/trmm_complex.h +++ /dev/null @@ -1,262 +0,0 @@ -/*************************************************************************************************** - * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. - * SPDX-License-Identifier: BSD-3-Clause - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * - * 1. Redistributions of source code must retain the above copyright notice, this - * list of conditions and the following disclaimer. - * - * 2. Redistributions in binary form must reproduce the above copyright notice, - * this list of conditions and the following disclaimer in the documentation - * and/or other materials provided with the distribution. - * - * 3. Neither the name of the copyright holder nor the names of its - * contributors may be used to endorse or promote products derived from - * this software without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" - * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE - * DISCLAIMED. 
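The TRMM reference that closes above masks A with the triangular compare, substitutes ones on the diagonal for DiagType::kUnit, and blocks the loops in 16x16 tiles. Stripped of those variants, the left-side, lower-fill, non-unit-diagonal case it computes reduces to the following standalone sketch (row-major buffers, hypothetical helper name):

#include <cstddef>
#include <vector>

// D = alpha * A * B with A an M x M lower-triangular matrix; B and D are
// M x N. Only the lower part of A is read; the reference also supports the
// upper/right/unit-diagonal variants and blocks these loops for speed.
void trmm_lower_left(int M, int N, float alpha,
                     const std::vector<float> &A,   // M x M
                     const std::vector<float> &B,   // M x N
                     std::vector<float> &D) {       // M x N
  for (int row = 0; row < M; ++row) {
    for (int col = 0; col < N; ++col) {
      float acc = 0.f;
      for (int k = 0; k <= row; ++k)               // A(row, k) == 0 for k > row
        acc += A[std::size_t(row) * M + k] * B[std::size_t(k) * N + col];
      D[std::size_t(row) * N + col] = alpha * acc;
    }
  }
}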
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE - * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL - * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR - * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER - * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, - * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - * - **************************************************************************************************/ -/*! \file - \brief Reference implementation for complex-valued TRMM in host-side code. - - -*/ - -#pragma once - -#include "cutlass/blas3.h" -#include "cutlass/complex.h" -#include "cutlass/numeric_conversion.h" -#include "cutlass/tensor_view.h" -#include "cutlass/gemm/gemm.h" - -#include "cutlass/util/reference/host/gemm.h" - -namespace cutlass { -namespace reference { -namespace host { - -/// Computes a Triangular Matrix Multiplication (tensors of rank=2) pointed to by TensorRef -/// objects. -template < - typename ElementA, - typename LayoutA, - ComplexTransform TransformA, - SideMode SideModeA, - FillMode FillModeA, - DiagType DiagTypeA, - typename ElementB, - typename LayoutB, - ComplexTransform TransformB, - typename ElementC, - typename LayoutC, - typename ScalarType, - typename ComputeType, - typename InnerProductOp = multiply_add, - typename ConvertOp = NumericConverter -> -void compute_trmm_complex( - gemm::GemmCoord problem_size, - ScalarType alpha, - TensorRef tensor_a, - TensorRef tensor_b, - TensorRef tensor_d, - ComputeType initial_accum) { - - static_assert( - LayoutA::kRank == 2 && - LayoutC::kRank == 2, "Tensors must be of rank 2"); - - static_assert(SideModeA != SideMode::kInvalid - , "Side Mode can either be Left or Right."); - - static_assert(FillModeA == FillMode::kLower || FillModeA == FillMode::kUpper - , "Fill Mode can either be Lower or Upper."); - - using CompareOp = typename TrMatrixCompareOp::Type; - - // Note: batch is ignored. - int const M = problem_size.m(); - int const N = problem_size.n(); - // Assuming correct k-dimension value is passed - int const K = problem_size.k(); - - // Blocking necessary to speedup reference implementation - int const Mblock = 16; - int const Nblock = 16; - - ConvertOp convert_op; - InnerProductOp inner_product_op; - CompareOp compare_op; - - for (int row_block = 0; row_block < M; row_block += Mblock) { - for (int col_block = 0; col_block < N; col_block += Nblock) { - - ComputeType accum[Mblock][Nblock]; - - for (int j = 0; j < Nblock; j++) { - for (int i = 0; i < Mblock; i++) { - accum[i][j] = initial_accum; - } - } - - for (int k_block = 0; k_block < K; ++k_block) { - for (int j = 0; j < Nblock; j++) { - for (int i = 0; i < Mblock; i++) { - int row = row_block + i; - int col = col_block + j; - - if (row < M && col < N) { - ElementA a = ElementA(); - ElementB b = ElementB(); - - if (SideModeA == SideMode::kLeft) { - a = (compare_op(row, k_block)) ? - (tensor_a.at(MatrixCoord(row, k_block))) : ElementA(0); - if (row == k_block && DiagTypeA == DiagType::kUnit) { - a = ElementA(1); - } - b = tensor_b.at(MatrixCoord(k_block, col)); - } else if (SideModeA == SideMode::kRight) { - a = tensor_b.at(MatrixCoord(row, k_block)); - b = (compare_op(k_block, col)) ? 
- tensor_a.at(MatrixCoord(k_block, col)) : ElementA(0); - if (k_block == col && DiagTypeA == DiagType::kUnit) { - b = ElementA(1); - } - } - - ComputeType a_ik = ComputeType(a); - ComputeType b_kj = ComputeType(b); - - // Conjugate, and hence hermitian, is only allowed for the triangular matrix - if (SideModeA == SideMode::kLeft && TransformA == ComplexTransform::kConjugate) { - a_ik = conj(a_ik); - } else if (SideModeA == SideMode::kRight && TransformA == ComplexTransform::kConjugate) { - b_kj = conj(b_kj); - } - - accum[i][j] = inner_product_op(a_ik, b_kj, accum[i][j]); - } - } - } - } - - for (int j = 0; j < Nblock; j++) { - for (int i = 0; i < Mblock; i++) { - int row = row_block + i; - int col = col_block + j; - - MatrixCoord coord = MatrixCoord(row, col); - - if (row < M && col < N) { - tensor_d.at(coord) = convert_op( - alpha * ScalarType(accum[i][j])); - } - } - } - } - } -} - -//////////////////////////////////////////////////////////////////////////////////////////////////// - -template < - typename ElementA, - typename LayoutA, - ComplexTransform TransformA, - SideMode SideModeA, - FillMode FillModeA, - DiagType DiagTypeA, - typename ElementB, - typename LayoutB, - ComplexTransform TransformB, - typename ElementC, - typename LayoutC, - typename ScalarType, - typename ComputeType, - typename InnerProductOp = cutlass::arch::OpMultiplyAddComplex -> -struct TrmmComplex; - -//////////////////////////////////////////////////////////////////////////////////////////////////// - -/// Partial specialization for multiply-add -template -struct TrmmComplex { - - void operator()(gemm::GemmCoord problem_size, ScalarType alpha, - TensorRef tensor_a, - TensorRef tensor_b, - TensorRef tensor_d, - ComputeType initial_accum = ComputeType(0)) { - static_assert( - LayoutA::kRank == 2 && LayoutC::kRank == 2, - "Tensors must be of rank 2"); - - compute_trmm_complex>( - problem_size, alpha, tensor_a, tensor_b, tensor_d, initial_accum); - } -}; - -//////////////////////////////////////////////////////////////////////////////////////////////////// - -/// Partial specialization for gaussian multiply-add -template -struct TrmmComplex { - - void operator()(gemm::GemmCoord problem_size, ScalarType alpha, - TensorRef tensor_a, - TensorRef tensor_b, - TensorRef tensor_d, - ComputeType initial_accum = ComputeType(0)) { - static_assert( - LayoutA::kRank == 2 && LayoutC::kRank == 2, - "Tensors must be of rank 2"); - - compute_trmm_complex>( - problem_size, alpha, tensor_a, tensor_b, tensor_d, initial_accum); - } -}; - -//////////////////////////////////////////////////////////////////////////////////////////////////// - -} // namespace host -} // namespace reference -} // namespace cutlass diff --git a/csrc/sparse/cutlass/example/util/tensor_view_io.h b/csrc/sparse/cutlass/example/util/tensor_view_io.h deleted file mode 100644 index 4f6bdd686b8f0..0000000000000 --- a/csrc/sparse/cutlass/example/util/tensor_view_io.h +++ /dev/null @@ -1,270 +0,0 @@ -/*************************************************************************************************** - * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. - * SPDX-License-Identifier: BSD-3-Clause - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * - * 1. Redistributions of source code must retain the above copyright notice, this - * list of conditions and the following disclaimer. - * - * 2. 
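For orientation on the two reference headers removed above: they compute a host-side TRMM, D = alpha * op(A) * B, where A is triangular; each access to A is guarded by a fill-mode comparison, the diagonal is forced to one for unit-diagonal matrices, and the loops are blocked 16x16 purely to speed up the reference. The sketch below (not code from this repository; the function name and dense row-major layout are illustrative assumptions) shows the same computation for the simplest case, a left-side, lower-triangular, non-unit-diagonal A in plain C++:

```cpp
// Minimal sketch of what the deleted reference computes, assuming a left-side,
// lower-triangular, non-unit-diagonal A. "naive_trmm_lower_left" is a hypothetical name.
#include <cstdio>
#include <vector>

// D = alpha * A * B, where A is M x M lower triangular and B is M x N, all row-major.
void naive_trmm_lower_left(int M, int N, float alpha,
                           const std::vector<float>& A,
                           const std::vector<float>& B,
                           std::vector<float>& D) {
  for (int row = 0; row < M; ++row) {
    for (int col = 0; col < N; ++col) {
      float accum = 0.0f;
      // Only k <= row contributes for a lower-triangular A; the CUTLASS reference
      // expresses the same guard through its FillMode / CompareOp machinery.
      for (int k = 0; k <= row; ++k) {
        accum += A[row * M + k] * B[k * N + col];
      }
      D[row * N + col] = alpha * accum;
    }
  }
}

int main() {
  int M = 3, N = 2;
  std::vector<float> A = {1, 0, 0,
                          2, 3, 0,
                          4, 5, 6};   // lower triangular
  std::vector<float> B = {1, 1,
                          1, 1,
                          1, 1};
  std::vector<float> D(M * N);
  naive_trmm_lower_left(M, N, 1.0f, A, B, D);
  for (int i = 0; i < M; ++i) {
    std::printf("%6.1f %6.1f\n", D[i * N + 0], D[i * N + 1]);  // expect rows 1, 5, 15
  }
  return 0;
}
```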
diff --git a/csrc/sparse/cutlass/example/util/tensor_view_io.h b/csrc/sparse/cutlass/example/util/tensor_view_io.h
deleted file mode 100644
index 4f6bdd686b8f0..0000000000000
--- a/csrc/sparse/cutlass/example/util/tensor_view_io.h
+++ /dev/null
@@ -1,270 +0,0 @@
-/***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- * SPDX-License-Identifier: BSD-3-Clause
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials provided with the distribution.
- *
- * 3. Neither the name of the copyright holder nor the names of its
- * contributors may be used to endorse or promote products derived from
- * this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
- * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-*
-**************************************************************************************************/
-#pragma once
-
-#include "cutlass/core_io.h"
-#include "cutlass/tensor_view.h"
-#include "cutlass/tensor_view_planar_complex.h"
-#include "cutlass/complex.h"
-
-namespace cutlass {
-
-///////////////////////////////////////////////////////////////////////////////////////////////////
-
-namespace detail {
-
-/// Helper to write the least significant rank of a TensorView
-template <
-  typename Element,
-  typename Layout
->
-inline std::ostream & TensorView_WriteLeastSignificantRank(
-  std::ostream& out,
-  TensorView<Element, Layout> const& view,
-  Coord<Layout::kRank> const &start_coord,
-  int rank,
-  std::streamsize width) {
-
-  for (int idx = 0; idx < view.extent(rank); ++idx) {
-
-    Coord<Layout::kRank> coord(start_coord);
-    coord[rank] = idx;
-
-    if (idx) {
-      out.width(0);
-      out << ", ";
-    }
-    if (idx || coord) {
-      out.width(width);
-    }
-    out << ScalarIO<Element>(view.at(coord));
-  }
-
-  return out;
-}
-
-/// Helper to write a rank of a TensorView
-template <
-  typename Element,
-  typename Layout
->
-inline std::ostream & TensorView_WriteRank(
-  std::ostream& out,
-  TensorView<Element, Layout> const& view,
-  Coord<Layout::kRank> const &start_coord,
-  int rank,
-  std::streamsize width) {
-
-  // If called on the least significant rank, write the result as a row
-  if (rank + 1 == Layout::kRank) {
-    return TensorView_WriteLeastSignificantRank(out, view, start_coord, rank, width);
-  }
-
-  // Otherwise, write a sequence of rows and newlines
-  for (int idx = 0; idx < view.extent(rank); ++idx) {
-
-    Coord<Layout::kRank> coord(start_coord);
-    coord[rank] = idx;
-
-    if (rank + 2 == Layout::kRank) {
-      // Write least significant ranks as a matrix with rows delimited by "\n"
-      if (idx) {
-        out << ",\n";
-      }
-      TensorView_WriteLeastSignificantRank(out, view, coord, rank + 1, width);
-    }
-    else {
-      // Higher ranks are separated by newlines
-      if (idx) {
-        out << ",\n\n";
-      }
-      TensorView_WriteRank(out, view, coord, rank + 1, width);
-    }
-  }
-
-  return out;
-}
-
-/// Helper to write the least significant rank of a TensorView
-template <
-  typename Element,
-  typename Layout
->
-inline std::ostream & TensorViewPlanarComplex_WriteLeastSignificantRank(
-  std::ostream& out,
-  TensorViewPlanarComplex<Element, Layout> const& view,
-  Coord<Layout::kRank> const &start_coord,
-  int rank,
-  std::streamsize width) {
-
-  for (int idx = 0; idx < view.extent(rank); ++idx) {
-
-    Coord<Layout::kRank> coord(start_coord);
-    coord[rank] = idx;
-
-    if (idx) {
-      out.width(0);
-      out << ", ";
-    }
-    if (idx || coord) {
-      out.width(width);
-    }
-
-    complex<Element> x = view.at(coord);
-    out << x;
-  }
-
-  return out;
-}
-
-/// Helper to write a rank of a TensorView
-template <
-  typename Element,
-  typename Layout
->
-inline std::ostream & TensorViewPlanarComplex_WriteRank(
-  std::ostream& out,
-  TensorViewPlanarComplex<Element, Layout> const& view,
-  Coord<Layout::kRank> const &start_coord,
-  int rank,
-  std::streamsize width) {
-
-  // If called on the least significant rank, write the result as a row
-  if (rank + 1 == Layout::kRank) {
-    return TensorViewPlanarComplex_WriteLeastSignificantRank(out, view, start_coord, rank, width);
-  }
-
-  // Otherwise, write a sequence of rows and newlines
-  for (int idx = 0; idx < view.extent(rank); ++idx) {
-
-    Coord<Layout::kRank> coord(start_coord);
-    coord[rank] = idx;
-
-    if (rank + 2 == Layout::kRank) {
-      // Write least significant ranks as a matrix with rows delimited by ";\n"
-      if (idx) {
-        out << ";\n";
-      }
-      TensorViewPlanarComplex_WriteLeastSignificantRank(out, view, coord, rank + 1, width);
-    }
-    else {
-      // Higher ranks are separated by newlines
-      if (idx) {
-        out << "\n";
-      }
-      TensorViewPlanarComplex_WriteRank(out, view, coord, rank + 1, width);
-    }
-  }
-
-  return out;
-}
-
-} // namespace detail
-
-///////////////////////////////////////////////////////////////////////////////////////////////////
-
-/// Prints human-readable representation of a TensorView to an ostream
-template <
-  typename Element,
-  typename Layout
->
-inline std::ostream& TensorViewWrite(
-  std::ostream& out,
-  TensorView<Element, Layout> const& view) {
-
-  // Prints a TensorView according to the following conventions:
-  //   - least significant rank is printed as rows separated by ";\n"
-  //   - all greater ranks are delimited with newlines
-  //
-  // The result is effectively a whitespace-delimited series of 2D matrices.
-
-  return detail::TensorView_WriteRank(out, view, Coord<Layout::kRank>(), 0, out.width());
-}
-
-/// Prints human-readable representation of a TensorView to an ostream
-template <
-  typename Element,
-  typename Layout
->
-inline std::ostream& operator<<(
-  std::ostream& out,
-  TensorView<Element, Layout> const& view) {
-
-  // Prints a TensorView according to the following conventions:
-  //   - least significant rank is printed as rows separated by ";\n"
-  //   - all greater ranks are delimited with newlines
-  //
-  // The result is effectively a whitespace-delimited series of 2D matrices.
-
-  return TensorViewWrite(out, view);
-}
-
-///////////////////////////////////////////////////////////////////////////////////////////////////
-
-/// Prints human-readable representation of a TensorView to an ostream
-template <
-  typename Element,
-  typename Layout
->
-inline std::ostream& TensorViewWrite(
-  std::ostream& out,
-  TensorViewPlanarComplex<Element, Layout> const& view) {
-
-  // Prints a TensorView according to the following conventions:
-  //   - least significant rank is printed as rows separated by ";\n"
-  //   - all greater ranks are delimited with newlines
-  //
-  // The result is effectively a whitespace-delimited series of 2D matrices.
-
-  return detail::TensorViewPlanarComplex_WriteRank(out, view, Coord<Layout::kRank>(), 0, out.width());
-}
-
-/// Prints human-readable representation of a TensorView to an ostream
-template <
-  typename Element,
-  typename Layout
->
-inline std::ostream& operator<<(
-  std::ostream& out,
-  TensorViewPlanarComplex<Element, Layout> const& view) {
-
-  // Prints a TensorView according to the following conventions:
-  //   - least significant rank is printed as rows separated by ";\n"
-  //   - all greater ranks are delimited with newlines
-  //
-  // The result is effectively a whitespace-delimited series of 2D matrices.
-
-  return TensorViewWrite(out, view);
-}
-
-///////////////////////////////////////////////////////////////////////////////////////////////////
-
-} // namespace cutlass
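For context on the file removed above: the writer walks the view recursively, emits the least significant rank as a row, and separates higher ranks with newlines, so a rank-k view prints as a series of 2D matrices. The standalone sketch below reproduces the separators used by the non-planar helpers (", " between elements, ",\n" between rows, a blank line between higher ranks) for a densely stored rank-3 array; it is an illustration only, not CUTLASS code, and the helper name is hypothetical.

```cpp
// Sketch of the printing convention implemented by the deleted tensor_view_io.h,
// applied to a plain rank-3 tensor stored densely in a std::vector.
// "print_rank3" is a hypothetical name, not a CUTLASS API.
#include <iostream>
#include <vector>

void print_rank3(const std::vector<int>& data, int d0, int d1, int d2) {
  for (int i = 0; i < d0; ++i) {
    if (i) std::cout << ",\n\n";   // higher-rank boundary: comma, then a blank line
    for (int j = 0; j < d1; ++j) {
      if (j) std::cout << ",\n";   // row boundary within the least-significant matrix
      for (int k = 0; k < d2; ++k) {
        if (k) std::cout << ", ";  // elements within a row
        std::cout << data[(i * d1 + j) * d2 + k];
      }
    }
  }
  std::cout << "\n";
}

int main() {
  std::vector<int> data(2 * 2 * 3);
  for (int i = 0; i < static_cast<int>(data.size()); ++i) data[i] = i;
  print_rank3(data, 2, 2, 3);  // prints two 2x3 matrices separated by a blank line
  return 0;
}
```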