diff --git a/.vscode/launch.json b/.vscode/launch.json
index c3539d8..5bb5f67 100755
--- a/.vscode/launch.json
+++ b/.vscode/launch.json
@@ -1,19 +1,16 @@
 {
-    // Use IntelliSense to learn about possible attributes.
-    // Hover to view descriptions of existing attributes.
-    // For more information, visit: https://go.microsoft.com/fwlink/?linkid=830387
     "version": "0.2.0",
     "configurations": [
         {
             "name": "CUDA C++: Launch",
             "type": "cuda-gdb",
             "request": "launch",
-            "program": "${workspaceFolder}/build/TensorCore"
+            "program": "${workspaceFolder}/build/test/binarytensor_test"
         },
         {
             "name": "CUDA C++: Attach",
             "type": "cuda-gdb",
             "request": "attach"
-        }
+        },
     ]
 }
\ No newline at end of file
diff --git a/src/binary_tensor/core/tensor_blas.cu b/src/binary_tensor/core/tensor_blas.cu
index 98aedd4..b02ead4 100644
--- a/src/binary_tensor/core/tensor_blas.cu
+++ b/src/binary_tensor/core/tensor_blas.cu
@@ -161,7 +161,7 @@ namespace binary_tensor
         TensorBase base_b = b.get_buffer().change_device(this_cuda);
         const std::initializer_list shape_a = base_a.shape();
         const std::initializer_list shape_b = base_b.shape();
-        assert(shape_a.size() == 2 && shape_b.size() == 2 && shape_a.end()[-1] == shape_b.end()[-2]);
+        assert(shape_a.size() == 2 &&shape_b.size() == 2 && shape_a.end()[-1] == shape_b.end()[-2]);
         std::vector> temp;
         if (is_derive)
         {
@@ -196,7 +196,7 @@ namespace binary_tensor
             static_cast(base_b.data())
         );
 
-        TensorBase value_buf({ batch_size, shape_a.end()[-2] , shape_b.end()[-1] }, c_ptr, this_cuda);
+        TensorBase value_buf({ shape_a.end()[-2] , shape_b.end()[-1] }, c_ptr, this_cuda);
         cudaStat = cudaFree(c_ptr);
         return Tensor(std::move(value_buf), std::move(temp));
     }
@@ -212,7 +212,7 @@ namespace binary_tensor
         TensorBase base_b = b.get_buffer().change_device(this_cuda);
         const std::initializer_list shape_a = base_a.shape();
         const std::initializer_list shape_b = base_b.shape();
-        assert(shape_a.size() == shape_b.size() && std::memcmp(shape_a.begin(), shape_b.begin(), std::min(shape_a.size(), shape_b.size()) - 2) && shape_a.end()[-1] == shape_b.end()[-2]);
+        assert(shape_a.size() == shape_b.size() && std::memcmp(shape_a.begin(), shape_b.begin(), std::min(shape_a.size(), shape_b.size()) - 2) == 0 && shape_a.end()[-1] == shape_b.end()[-2]);
         std::vector> temp;
         if (is_derive)
         {
@@ -247,7 +247,10 @@ namespace binary_tensor
             static_cast(base_b.data())
         );
 
-        TensorBase value_buf({ batch_size, shape_a.end()[-2] , shape_b.end()[-1] }, c_ptr, this_cuda);
+        std::vector out_dims = shape_a;
+        out_dims[out_dims.size() - 1] = shape_b.end()[-1];
+
+        TensorBase value_buf(out_dims, c_ptr, this_cuda);
         cudaStat = cudaFree(c_ptr);
         return Tensor(std::move(value_buf), std::move(temp));
     }
diff --git a/test/main.cpp b/test/main.cpp
index 9300091..fe15c5f 100644
--- a/test/main.cpp
+++ b/test/main.cpp
@@ -6,7 +6,7 @@ using namespace binary_tensor::dtype;
 int main(int argc, char const *argv[])
 {
     /* code */
-    TensorArray<2, 2> a1 =
+    TensorArray<2, 3> a1 =
     {{
         {{
             1
@@ -15,7 +15,7 @@ int main(int argc, char const *argv[])
             1
         }}
     }};
-    TensorArray<2, 2> a2 =
+    TensorArray<3, 2> a2 =
     {{
         {{
             1, 1
         }};
@@ -23,7 +23,7 @@ int main(int argc, char const *argv[])
     }};
     Tensor a01 = Tensor(a1);
     Tensor a02 = Tensor(a2);
-    auto b = a01 + a02;
+    auto b = matmul(a01, a02);
     b.calc_grad(ones(b.get_buffer().shape()));
     std::cout << b << std::endl << a01.get_grad() << std::endl <<