diff --git a/ptypy/accelerate/cuda_common/batched_multiply.cu b/ptypy/accelerate/cuda_common/batched_multiply.cu
index 11394f68c..7976ef765 100644
--- a/ptypy/accelerate/cuda_common/batched_multiply.cu
+++ b/ptypy/accelerate/cuda_common/batched_multiply.cu
@@ -22,10 +22,16 @@ extern "C" __global__ void batched_multiply(const complex<IN_TYPE>* input,
   int gy = threadIdx.y + blockIdx.y * blockDim.y;
   int gz = threadIdx.z + blockIdx.z * blockDim.z;
 
-  if (gx > rows || gy > columns || gz > nBatches)
+  if (gx > rows - 1 || gy > columns - 1 || gz > nBatches)
     return;
 
   auto val = input[gz * rows * columns + gy * rows + gx];
+  //printf("gx = %d, gy = %d, gz = %d, val= %.1f +i%.1f\n", gz,gy,gz, val.real(), val.imag());
+  //printf("threads: x=%d y=%d z=%d\n", threadIdx.x, threadIdx.y, threadIdx.z);
+  //printf("blocks: x=%d y=%d z=%d\n", blockIdx.x, blockIdx.y, blockIdx.z);
+  //printf("grids: x=%d y=%d z=%d\n", blockDim.x, blockDim.y, blockDim.z);
+
+
   if (MPY_DO_FILT)  // set at compile-time
   {
     val *= filter[gy * rows + gx];
diff --git a/ptypy/accelerate/cuda_cupy/array_utils.py b/ptypy/accelerate/cuda_cupy/array_utils.py
index 9f14015a1..84c729b56 100644
--- a/ptypy/accelerate/cuda_cupy/array_utils.py
+++ b/ptypy/accelerate/cuda_cupy/array_utils.py
@@ -63,6 +63,37 @@ def dot(self, A: cp.ndarray, B: cp.ndarray, out: cp.ndarray = None) -> cp.ndarra
     def norm2(self, A, out=None):
         return self.dot(A, A, out)
 
+class BatchedMultiplyKernel:
+    def __init__(self, array, queue=None, math_type=np.complex64):
+        self.queue = queue
+        self.array_shape = array.shape[-2:]
+        self.batches = int(np.prod(array.shape[0:array.ndim-2]) if array.ndim > 2 else 1)
+        self.batched_multiply_cuda = load_kernel("batched_multiply", {
+            'MPY_DO_SCALE': 'true',
+            'MPY_DO_FILT': 'true',
+            'IN_TYPE': 'float' if array.dtype==np.complex64 else 'double',
+            'OUT_TYPE': 'float' if array.dtype==np.complex64 else 'double',
+            'MATH_TYPE': 'float' if math_type==np.complex64 else 'double'
+        })
+        self.block = (32,32,1)
+        self.grid = (
+            int((self.array_shape[0] + 31) // 32),
+            int((self.array_shape[1] + 31) // 32),
+            int(self.batches)
+        )
+
+    def multiply(self, x,y, scale=1.):
+        assert x.dtype == y.dtype, "Input arrays must be of same data type"
+        assert x.shape[-2:] == y.shape[-2:], "Input arrays must be of the same size in last 2 dims"
+        if self.queue is not None:
+            self.queue.use()
+        self.batched_multiply_cuda(self.grid,
+                                   self.block,
+                                   args=(x,x,y,
+                                   np.float32(scale),
+                                   np.int32(self.batches),
+                                   np.int32(self.array_shape[0]),
+                                   np.int32(self.array_shape[1])))
 
 class TransposeKernel:
 
diff --git a/ptypy/accelerate/cuda_pycuda/array_utils.py b/ptypy/accelerate/cuda_pycuda/array_utils.py
index 2879739e3..7fb730516 100644
--- a/ptypy/accelerate/cuda_pycuda/array_utils.py
+++ b/ptypy/accelerate/cuda_pycuda/array_utils.py
@@ -66,6 +66,38 @@ def dot(self, A, B, out=None):
     def norm2(self, A, out=None):
         return self.dot(A, A, out)
 
+class BatchedMultiplyKernel:
+    def __init__(self, array, queue=None, math_type=np.complex64):
+        self.queue = queue
+        self.array_shape = array.shape[-2:]
+        self.batches = int(np.prod(array.shape[0:array.ndim-2]) if array.ndim > 2 else 1)
+        self.batched_multiply_cuda = load_kernel("batched_multiply", {
+            'MPY_DO_SCALE': 'true',
+            'MPY_DO_FILT': 'true',
+            'IN_TYPE': 'float' if array.dtype==np.complex64 else 'double',
+            'OUT_TYPE': 'float' if array.dtype==np.complex64 else 'double',
+            'MATH_TYPE': 'float' if math_type==np.complex64 else 'double'
+        })
+        self.block = (32,32,1)
+        self.grid = (
+            int((self.array_shape[0] + 31) // 32),
+            int((self.array_shape[1] + 31) // 32),
+            int(self.batches)
+        )
+
+    def multiply(self, x,y, scale=1.):
+        assert x.dtype == y.dtype, "Input arrays must be of same data type"
+        assert x.shape[-2:] == y.shape[-2:], "Input arrays must be of the same size in last 2 dims"
+        self.batched_multiply_cuda(x,x,y,
+                                   np.float32(scale),
+                                   np.int32(self.batches),
+                                   np.int32(self.array_shape[0]),
+                                   np.int32(self.array_shape[1]),
+                                   block=self.block,
+                                   grid=self.grid,
+                                   stream=self.queue)
+
+
 class TransposeKernel:
 
     def __init__(self, queue=None):
diff --git a/test/accelerate_tests/cuda_cupy_tests/array_utils_test.py b/test/accelerate_tests/cuda_cupy_tests/array_utils_test.py
index a7a92d7b9..23cfc0156 100644
--- a/test/accelerate_tests/cuda_cupy_tests/array_utils_test.py
+++ b/test/accelerate_tests/cuda_cupy_tests/array_utils_test.py
@@ -79,6 +79,23 @@ def test_dot_performance(self):
         AU = gau.ArrayUtilsKernel(acc_dtype=np.float64)
         AU.dot(A_dev, A_dev)
 
+    def test_batched_multiply(self):
+        # Arrange
+        sh = (3,14,24)
+        ksh = (14,24)
+        data = (np.random.random(sh) + 1j* np.random.random(sh)).astype(np.complex64)
+        kernel = (np.random.random(ksh) + 1j* np.random.random(ksh)).astype(np.complex64)
+        data_dev = cp.asarray(data)
+        kernel_dev = cp.asarray(kernel)
+
+        # Act
+        BM = gau.BatchedMultiplyKernel(data_dev)
+        BM.multiply(data_dev, kernel_dev, scale=2.)
+
+        # Assert
+        expected = data * kernel * 2.
+        np.testing.assert_array_almost_equal(data_dev.get(), expected)
+
     def test_transpose_2D(self):
         # Arrange
         inp, _ = np.indices((5, 3), dtype=np.int32)
@@ -584,8 +601,6 @@ def test_fft_filter_batched_UNITY(self):
         FF.apply_filter(data_dev)
 
         output = au.fft_filter(data, kernel, prefactor, postfactor)
-        print(data_dev.get())
-
         np.testing.assert_allclose(output, data_dev.get(), rtol=1e-5, atol=1e-6)
 
     def test_complex_gaussian_filter_fft_little_blurring_UNITY(self):
diff --git a/test/accelerate_tests/cuda_pycuda_tests/array_utils_test.py b/test/accelerate_tests/cuda_pycuda_tests/array_utils_test.py
index e46230953..be82cc292 100644
--- a/test/accelerate_tests/cuda_pycuda_tests/array_utils_test.py
+++ b/test/accelerate_tests/cuda_pycuda_tests/array_utils_test.py
@@ -82,6 +82,23 @@ def test_dot_performance(self):
         AU = gau.ArrayUtilsKernel(acc_dtype=np.float64)
         out_dev = AU.dot(A_dev, A_dev)
 
+    def test_batched_multiply(self):
+        # Arrange
+        sh = (3,14,24)
+        ksh = (14,24)
+        data = (np.random.random(sh) + 1j* np.random.random(sh)).astype(np.complex64)
+        kernel = (np.random.random(ksh) + 1j* np.random.random(ksh)).astype(np.complex64)
+        data_dev = gpuarray.to_gpu(data)
+        kernel_dev = gpuarray.to_gpu(kernel)
+
+        # Act
+        BM = gau.BatchedMultiplyKernel(data_dev)
+        BM.multiply(data_dev, kernel_dev, scale=2.)
+
+        # Assert
+        expected = data * kernel * 2.
+        np.testing.assert_array_almost_equal(data_dev.get(), expected)
+
     def test_transpose_2D(self):
         ## Arrange
         inp,_ = np.indices((5,3), dtype=np.int32)