diff --git a/aten/src/ATen/SparseCsrTensorUtils.h b/aten/src/ATen/SparseCsrTensorUtils.h
index dfc7ff8813045..628b8ce61240a 100644
--- a/aten/src/ATen/SparseCsrTensorUtils.h
+++ b/aten/src/ATen/SparseCsrTensorUtils.h
@@ -143,12 +143,12 @@ inline int columnDimension(Layout layout, IntArrayRef size) {
   return size.size() - (isCompressedColumn(layout) ? 2 : 1);
 }
 
-inline int compressedDimension(Layout layout, IntArrayRef size) {
-  return size.size() - (isCompressedRow(layout) ? 2 : 1);
+inline int compressedDimension(Layout layout, IntArrayRef size, size_t dense_ndim=0) {
+  return size.size() - dense_ndim - (isCompressedRow(layout) ? 2 : 1);
 }
 
-inline int plainDimension(Layout layout, IntArrayRef size) {
-  return size.size() - (isCompressedRow(layout) ? 1 : 2);
+inline int plainDimension(Layout layout, IntArrayRef size, size_t dense_ndim=0) {
+  return size.size() - dense_ndim - (isCompressedRow(layout) ? 1 : 2);
 }
 
 } // namespace sparse_csr
diff --git a/aten/src/ATen/native/sparse/SparseCsrTensor.cpp b/aten/src/ATen/native/sparse/SparseCsrTensor.cpp
index 62d600dc0926d..77979f55647de 100644
--- a/aten/src/ATen/native/sparse/SparseCsrTensor.cpp
+++ b/aten/src/ATen/native/sparse/SparseCsrTensor.cpp
@@ -103,38 +103,52 @@ void _validate_sparse_compressed_tensor_args_worker(const Tensor& compressed_ind
       "number of dimensions of ", compressed_indices_name, " and ", plain_indices_name, " must be the same but got ",
       compressed_indices.dim(), " and ", plain_indices.dim(), ", respectively");
 
-  AT_DISPATCH_PLAIN_SPARSE_COMPRESSED_LAYOUTS(
+  int block_ndim = AT_DISPATCH_PLAIN_SPARSE_COMPRESSED_LAYOUTS(
       layout, "validate_sparse_compressed_tensor_args",
       [&] {
         TORCH_CHECK(
-            compressed_indices.dim() == values.dim(),
-            "number of dimensions of indices and values must be the same but got ",
-            compressed_indices.dim(), " and ", values.dim(), ", respectively");
+            compressed_indices.dim() <= values.dim(),
+            "number of dimensions of indices (=", compressed_indices.dim(),
+            ") must be equal or less than the number of dimensions of values (=", values.dim(), ")");
+        return 0;
       },
       [&] {
         TORCH_CHECK(
-            compressed_indices.dim() + 2 == values.dim(),
-            "number of dimensions of indices must be two less than the number of dimensions of the values but got ",
-            compressed_indices.dim(), " + 2 not equal to ", values.dim());
+            compressed_indices.dim() + 2 <= values.dim(),
+            "number of dimensions of indices (=", compressed_indices.dim(),
+            ") plus two must be equal or less than the number of dimensions of values (=", values.dim(), ")");
+        return 2;
       });
+  int dense_ndim = values.dim() - compressed_indices.dim() - block_ndim;
+  TORCH_CHECK(dense_ndim == 0, "non-zero dense dimensions (=", dense_ndim, ") is not supported for ", layout, " layout");
 
-  TORCH_CHECK(
-      static_cast<size_t>(compressed_indices.dim()) == size.size() - 1,
-      "number of dimensions of indices must be one less than the number of dimensions of the provided size but got ",
-      compressed_indices.dim(), " not equal to ", size.size(), " - 1");
+  int batch_ndim = size.size() - 2 - dense_ndim;
+  TORCH_INTERNAL_ASSERT(block_ndim >= 0 && dense_ndim >=0 && batch_ndim >= 0);
 
-  int block_ndim = AT_DISPATCH_PLAIN_SPARSE_COMPRESSED_LAYOUTS(layout, "validate_sparse_compressed_tensor_args", [&]{ return 0; }, [&]{ return 2; });
-  IntArrayRef block_size = values.sizes().slice(values.dim() - block_ndim, block_ndim);
-  int64_t numel_per_block = AT_DISPATCH_PLAIN_SPARSE_COMPRESSED_LAYOUTS(layout, "validate_sparse_compressed_tensor_args",
-                                                                        [&]() -> int64_t { return 1; }, [&]() -> int64_t { return block_size[0] * block_size[1]; });
-  int compressed_dim = compressedDimension(layout, size);
-  int plain_dim = plainDimension(layout, size);
+  TORCH_CHECK(
+      static_cast<size_t>(compressed_indices.dim()) == size.size() - 1 - dense_ndim,
+      "number of dimensions of indices must be one less than the number of dimensions of the provided size",
+      " (minus the number of dense dimensions) but got ",
+      compressed_indices.dim(), " not equal to ", size.size(), " - 1 - ", dense_ndim);
+
+  // For CSR/CSC formats, we define blocksize=(1, 1) so that checking
+  // the sparse compressed tensor invariants can be unified with the
+  // BSR/BSC invariants.
+  DimVector blocksize{
+      (block_ndim == 2 ? std::max<int64_t>(1, values.sizes()[values.dim() - dense_ndim - 2]) : 1),
+      (block_ndim == 2 ? std::max<int64_t>(1, values.sizes()[values.dim() - dense_ndim - 1]) : 1),
+  };
+  TORCH_INTERNAL_ASSERT(blocksize.size() == 2 && blocksize[0] > 0 && blocksize[1] > 0);
+
+  int64_t numel_per_block = blocksize[0] * blocksize[1];
+  int compressed_dim = compressedDimension(layout, size, dense_ndim);
+  int plain_dim = plainDimension(layout, size, dense_ndim);
 
   // All batch sizes must be the same
-  auto batch_size = size.slice(0, size.size() - 2);
-  auto compressed_indices_batch_size = compressed_indices.sizes().slice(0, compressed_indices.dim() - 1);
-  auto plain_indices_batch_size = plain_indices.sizes().slice(0, plain_indices.dim() - 1);
-  auto values_batch_size = values.sizes().slice(0, values.dim() - 1 - block_ndim);
+  DimVector batch_size = DimVector(size.slice(0, batch_ndim));
+  DimVector compressed_indices_batch_size = DimVector(compressed_indices.sizes().slice(0, compressed_indices.dim() - 1));
+  DimVector plain_indices_batch_size = DimVector(plain_indices.sizes().slice(0, plain_indices.dim() - 1));
+  DimVector values_batch_size = DimVector(values.sizes().slice(0, values.dim() - 1 - block_ndim - dense_ndim));
   TORCH_CHECK(
       batch_size == compressed_indices_batch_size &&
       batch_size == plain_indices_batch_size &&
@@ -143,34 +157,56 @@ void _validate_sparse_compressed_tensor_args_worker(const Tensor& compressed_ind
       compressed_indices_batch_size,", ",
       plain_indices_batch_size, "), and values (",
       values_batch_size,") must be the same.");
 
+  // A tensor constitutes of full blocks
+  for (int i=0; i= 1`
-  TORCH_CHECK(
-      compressed_indices.size(-1) == (size[compressed_dim] + 1),
-      compressed_indices_name, ".size(-1) must be equal to size[-", (size.size() - compressed_dim), "] + 1 (that is ",
-      size[compressed_dim] + 1, "), but got: ", compressed_indices.size(-1));
+  if (block_ndim == 2) {
+    TORCH_CHECK(
+        compressed_indices.size(-1) == (size[compressed_dim] / blocksize[compressed_dim - batch_ndim] + 1),
+        compressed_indices_name, ".size(-1) must be equal to size[-", (size.size() - compressed_dim),
+        "]/blocksize[", compressed_dim - batch_ndim, "] + 1 (that is ",
+        size[compressed_dim] / blocksize[compressed_dim - batch_ndim] + 1, "), but got: ", compressed_indices.size(-1));
+    TORCH_CHECK(
+        plain_indices.numel() * numel_per_block == values.numel(),
+        "number of ", plain_indices_name, " elements must be the same as the number of blocks in values, but got ",
+        plain_indices_name, ".numel() * numel_per_block: ", plain_indices.numel() * numel_per_block,
+        ", values.numel(): ", values.numel(),", numel_per_block: ", numel_per_block);
+  } else {
+    TORCH_CHECK(
+        compressed_indices.size(-1) == (size[compressed_dim] + 1),
+        compressed_indices_name, ".size(-1) must be equal to size[-", (size.size() - compressed_dim),
+        "] + 1 (that is ",
+        size[compressed_dim] + 1, "), but got: ", compressed_indices.size(-1));
+    TORCH_CHECK(
+        plain_indices.numel() == values.numel(),
+        "number of ", plain_indices_name, " elements must be the same number of elements, but got ",
+        plain_indices_name, ".numel(): ", plain_indices.numel(),
+        ", values.numel(): ", values.numel());
+  }
 
-  AT_DISPATCH_PLAIN_SPARSE_COMPRESSED_LAYOUTS(layout, "validate_sparse_compressed_tensor_args",
-      [&] {
-        TORCH_CHECK(
-            plain_indices.numel() == values.numel(),
-            plain_indices_name, " and values must have the same number of elements, but got ", plain_indices_name, ".numel(): ",
-            plain_indices.numel(), ", values.numel(): ", values.numel());
-      },
-      [&] {
-        TORCH_CHECK(
-            plain_indices.numel() * numel_per_block == values.numel(),
-            "number of ", plain_indices_name, " elements must be the same as the number of blocks in values, but got ",
-            plain_indices_name, ".numel() * numel_per_block: ", plain_indices.numel() * numel_per_block,
-            ", values.numel(): ", values.numel(),", numel_per_block: ", numel_per_block);
-      });
+  // Type Invariants
+  auto compressed_indices_type = compressed_indices.scalar_type();
+  auto plain_indices_type = plain_indices.scalar_type();
+  TORCH_CHECK(
+      compressed_indices_type == plain_indices_type,
+      "both ", compressed_indices_name, " and ", plain_indices_name, " should have the same type, bot got ",
+      compressed_indices_type, " and ", plain_indices_type, ", respectively");
+  TORCH_CHECK(
+      compressed_indices_type == kInt || compressed_indices_type == kLong,
+      compressed_indices_name, " and ", plain_indices_name, " must be an int32 or int64 type, but got: ",
+      compressed_indices_type);
 
   // Indices invariants
-  AT_DISPATCH_INDEX_TYPES(compressed_indices.scalar_type(), "validate_sparse_compressed_tensor_args",
+  AT_DISPATCH_INDEX_TYPES(compressed_indices_type, "validate_sparse_compressed_tensor_args",
       [&] {
         Tensor compressed_indices_cpu = compressed_indices.to(kCPU);
         auto compressed_indices_data_ptr = compressed_indices_cpu.data_ptr<index_t>();
         auto batch_stride = compressed_indices_cpu.dim() >= 2 ? compressed_indices_cpu.stride(-2) : 0;
-        auto compressed_dims = size[compressedDimension(layout, size)];
+        auto compressed_dims = (block_ndim == 0 ? size[compressed_dim] : size[compressed_dim] / blocksize[compressed_dim - batch_ndim]);
         for (const auto batch_id : c10::irange(batchCount(compressed_indices_cpu))) {
           TORCH_CHECK(
               compressed_indices_data_ptr[batch_id*batch_stride] == 0,
@@ -184,7 +220,8 @@ void _validate_sparse_compressed_tensor_args_worker(const Tensor& compressed_ind
           TORCH_CHECK(
               compressed_indices_data_ptr[batch_id*batch_stride + i - 1] <= compressed_indices_data_ptr[batch_id*batch_stride + i],
               "(Batch element ", batch_id, ") ",
-              "at position i = ", i, ", the condition ", compressed_indices_name, "[i - 1] <= ", compressed_indices_name, "[i] fails");
+              "at position i = ", i, ", the condition ", compressed_indices_name, "[i - 1] <= ", compressed_indices_name, "[i] fails, got ",
+              compressed_indices_data_ptr[batch_id*batch_stride + i - 1], " <= ", compressed_indices_data_ptr[batch_id*batch_stride + i]);
         }
       }
       if (plain_indices.numel() > 0) {
@@ -193,18 +230,6 @@ void _validate_sparse_compressed_tensor_args_worker(const Tensor& compressed_ind
         }
       });
 
-  // Type Invariants
-  auto compressed_indices_type = compressed_indices.scalar_type();
-  auto plain_indices_type = plain_indices.scalar_type();
-  TORCH_CHECK(
-      compressed_indices_type == plain_indices_type,
-      "both ", compressed_indices_name, " and ", plain_indices_name, " should have the same type, bot got ",
-      compressed_indices_type, " and ", plain_indices_type, ", respectively");
-  TORCH_CHECK(
-      compressed_indices_type == kInt || compressed_indices_type == kLong,
-      compressed_indices_name, " and ", plain_indices_name, " must be an int32 or int64 type, but got: ",
-      compressed_indices_type);
-
   // Device Invariants
   TORCH_CHECK(
       plain_indices.get_device() == compressed_indices.get_device(),
@@ -335,6 +360,12 @@ DimVector _estimate_sparse_compressed_tensor_size(
     const Tensor& plain_indices,
     const Tensor& values,
     Layout layout) {
+  int block_ndim = AT_DISPATCH_PLAIN_SPARSE_COMPRESSED_LAYOUTS(layout, "estimate_sparse_compressed_tensor_size", [&] { return 0; }, [&] { return 2; });
+  int dense_ndim = values.dim() - compressed_indices.dim() - block_ndim;
+  DimVector blocksize{
+      (block_ndim == 2 ? std::max<int64_t>(1, values.sizes()[values.dim() - dense_ndim - 2]) : 1),
+      (block_ndim == 2 ? std::max<int64_t>(1, values.sizes()[values.dim() - dense_ndim - 1]) : 1),
+  };
   DimVector size = DimVector(IntArrayRef(plain_indices.sizes().data(), plain_indices.dim() - 1));
   int64_t compressed_dim = (plain_indices.size(-1) > 0 ? compressed_indices.size(-1) - 1 : 0);
   int64_t plain_dim = AT_DISPATCH_INTEGRAL_TYPES(plain_indices.scalar_type(), "estimate_sparse_compressed_tensor_size",
@@ -347,13 +378,16 @@ DimVector _estimate_sparse_compressed_tensor_size(
       });
   AT_DISPATCH_ROW_SPARSE_COMPRESSED_LAYOUTS(layout, "estimate_sparse_compressed_tensor_size",
       [&]{
-        size.push_back(compressed_dim);
-        size.push_back(plain_dim);
+        size.push_back(compressed_dim * blocksize[0]);
+        size.push_back(plain_dim * blocksize[1]);
       },
       [&]{
-        size.push_back(plain_dim);
-        size.push_back(compressed_dim);
+        size.push_back(plain_dim * blocksize[0]);
+        size.push_back(compressed_dim * blocksize[1]);
       });
+  for (int i=0; i 0 for d in range(len(size))) or nnz == 0, 'invalid arguments'
         assert len(size) >= sparse_dim
-        if block_size:
-            assert len(block_size) == 2
+        if blocksize:
+            assert len(blocksize) == 2, (size, blocksize)
+            assert size[-2] % blocksize[0] == 0, (size, blocksize)
+            assert size[-1] % blocksize[1] == 0, (size, blocksize)
+            blocksize0, blocksize1 = blocksize
+        else:
+            blocksize0 = blocksize1 = 1
 
         def random_sparse_compressed(n_compressed_dims, n_plain_dims, nnz):
             compressed_indices = self._make_crow_indices(n_compressed_dims, n_plain_dims, nnz, device=device, dtype=index_dtype)
@@ -2064,20 +2069,21 @@ def random_sparse_compressed(n_compressed_dims, n_plain_dims, nnz):
                 torch.randperm(n_plain_dims, dtype=index_dtype, device=device)[:count])
             low = -1 if dtype != torch.uint8 else 0
             high = 1 if dtype != torch.uint8 else 2
-            values = make_tensor((nnz,) + block_size, device=device, dtype=dtype, low=low, high=high)
+            values = make_tensor((nnz,) + blocksize, device=device, dtype=dtype, low=low, high=high)
             return values, compressed_indices, plain_indices
 
         batch_shape = size[:-2]
         n_batch = reduce(mul, batch_shape, 1)
 
         if layout in {torch.sparse_csr, torch.sparse_bsr}:
-            n_compressed_dims, n_plain_dims = size[-2], size[-1]
+            n_compressed_dims, n_plain_dims = size[-2] // blocksize0, size[-1] // blocksize1
         else:
-            n_compressed_dims, n_plain_dims = size[-1], size[-2]
-        sparse_tensors = [random_sparse_compressed(n_compressed_dims, n_plain_dims, nnz) for _ in range(n_batch)]
+            n_compressed_dims, n_plain_dims = size[-1] // blocksize1, size[-2] // blocksize0
+        blocknnz = nnz // (blocksize0 * blocksize1)
+        sparse_tensors = [random_sparse_compressed(n_compressed_dims, n_plain_dims, blocknnz) for _ in range(n_batch)]
         sparse_tensors_it = map(list, zip(*sparse_tensors))
-        values = torch.stack(next(sparse_tensors_it)).reshape(*batch_shape, nnz, *block_size)
+        values = torch.stack(next(sparse_tensors_it)).reshape(*batch_shape, blocknnz, *blocksize)
         compressed_indices = torch.stack(next(sparse_tensors_it)).reshape(*batch_shape, -1)
         plain_indices = torch.stack(next(sparse_tensors_it)).reshape(*batch_shape, -1)
@@ -2086,21 +2092,21 @@ def random_sparse_compressed(n_compressed_dims, n_plain_dims, nnz):
 
     def genSparseCSRTensor(self, size, nnz, *, device, dtype, index_dtype):
         return self.genSparseCompressedTensor(size, nnz, layout=torch.sparse_csr, device=device,
-                                              dtype=dtype, index_dtype=index_dtype, block_size=())
+                                              dtype=dtype, index_dtype=index_dtype, blocksize=())
 
     def genSparseCSCTensor(self, size, nnz, *, device, dtype, index_dtype):
         return self.genSparseCompressedTensor(size, nnz, layout=torch.sparse_csc, device=device,
-                                              dtype=dtype, index_dtype=index_dtype, block_size=())
+                                              dtype=dtype, index_dtype=index_dtype, blocksize=())
 
-    def genSparseBSRTensor(self, size, block_size, nnz, *, device, dtype, index_dtype):
-        assert len(block_size) == 2
+    def genSparseBSRTensor(self, size, blocksize, nnz, *, device, dtype, index_dtype):
+        assert len(blocksize) == 2
         return self.genSparseCompressedTensor(size, nnz, layout=torch.sparse_bsr, device=device,
-                                              dtype=dtype, index_dtype=index_dtype, block_size=block_size)
+                                              dtype=dtype, index_dtype=index_dtype, blocksize=blocksize)
 
-    def genSparseBSCTensor(self, size, block_size, nnz, *, device, dtype, index_dtype):
-        assert len(block_size) == 2
+    def genSparseBSCTensor(self, size, blocksize, nnz, *, device, dtype, index_dtype):
+        assert len(blocksize) == 2
         return self.genSparseCompressedTensor(size, nnz, layout=torch.sparse_bsc, device=device,
-                                              dtype=dtype, index_dtype=index_dtype, block_size=block_size)
+                                              dtype=dtype, index_dtype=index_dtype, blocksize=blocksize)
 
     def genSparseTensor(self, size, sparse_dim, nnz, is_uncoalesced, device, dtype):
         # Assert not given impossible combination, where the sparse dims have
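
For reference, a minimal sketch (not part of the patch) of the block-sparse invariants that the updated validator enforces, assuming a PyTorch build that exposes torch.sparse_bsr_tensor: for a (4, 6) BSR tensor with blocksize (2, 3), crow_indices must have 4/2 + 1 = 3 entries and col_indices.numel() * 2 * 3 must equal values.numel().

import torch

# Illustrative example only; names and shapes are chosen for this sketch.
# A 4x6 BSR tensor with blocksize (2, 3): 2 block rows, 2 block columns,
# one stored block per block row.
crow_indices = torch.tensor([0, 1, 2], dtype=torch.int64)        # size[-2]/blocksize[0] + 1 entries
col_indices = torch.tensor([0, 1], dtype=torch.int64)            # one entry per stored block
values = torch.arange(12, dtype=torch.float64).reshape(2, 2, 3)  # (number of blocks, *blocksize)

bsr = torch.sparse_bsr_tensor(crow_indices, col_indices, values, size=(4, 6))
# col_indices.numel() * (2 * 3) == values.numel() == 12, matching the
# numel_per_block check in the validator above.
print(bsr)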