From 4a4b038c62f287cad761016f7798cd64666c25f7 Mon Sep 17 00:00:00 2001
From: asistradition <asistradition@users.noreply.github.com>
Date: Wed, 28 Feb 2024 15:26:41 -0500
Subject: [PATCH] v0.9.1

---
 CHANGELOG.md                               |   4 +
 setup.py                                   |   2 +-
 sparse_dot_mkl/_sparse_sparse.py           |  68 +++++++-------
 sparse_dot_mkl/sparse_dot.py               |  11 +--
 sparse_dot_mkl/tests/test_sparse_sparse.py | 103 +++++++++++++++++++--
 5 files changed, 142 insertions(+), 46 deletions(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 7544880..d635c03 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -1,3 +1,7 @@
+### Version 0.9.1
+
+* Support for `out` parameter with sparse-sparse multiplication when `dense=True`
+
 ### Version 0.9.0
 
 * Support for scipy sparse arrays (introduced in scipy 1.11)
diff --git a/setup.py b/setup.py
index 4ea8940..efe7ae4 100644
--- a/setup.py
+++ b/setup.py
@@ -2,7 +2,7 @@
 from setuptools import setup, find_packages
 
 DISTNAME = 'sparse_dot_mkl'
-VERSION = '0.9.0'
+VERSION = '0.9.1'
 DESCRIPTION = "Intel MKL wrapper for sparse matrix multiplication"
 MAINTAINER = 'Chris Jackson'
 MAINTAINER_EMAIL = 'cj59@nyu.edu'
diff --git a/sparse_dot_mkl/_sparse_sparse.py b/sparse_dot_mkl/_sparse_sparse.py
index e34cfae..5fd70ab 100644
--- a/sparse_dot_mkl/_sparse_sparse.py
+++ b/sparse_dot_mkl/_sparse_sparse.py
@@ -2,7 +2,6 @@
     MKL,
     sparse_matrix_t,
     _create_mkl_sparse,
-    debug_print,
     debug_timer,
     _export_mkl,
     _order_mkl_handle,
@@ -13,10 +12,10 @@
     _is_allowed_sparse_format,
     _check_return_value,
     _output_dtypes,
-    sparse_output_type
+    sparse_output_type,
+    _out_matrix
 )
 import ctypes as _ctypes
-import numpy as np
 
 
 def _matmul_mkl(sp_ref_a, sp_ref_b):
@@ -59,6 +58,7 @@ def _matmul_mkl_dense(
     sp_ref_b,
     output_shape,
     double_precision,
+    out=None,
     complex_type=False
 ):
     """
@@ -81,9 +81,10 @@ def _matmul_mkl_dense(
     """
 
     # Allocate an array for outputs
-    output_arr = np.empty(
+    output_arr = _out_matrix(
         output_shape,
-        dtype=_output_dtypes[(double_precision, complex_type)]
+        _output_dtypes[(double_precision, complex_type)],
+        out_arr=out
     )
 
     # Set types
@@ -110,7 +111,8 @@ def _sparse_dot_sparse(
     matrix_b,
     cast=False,
     reorder_output=False,
-    dense=False
+    dense=False,
+    out=None
 ):
     """
     Multiply together two scipy sparse matrixes using the intel
@@ -153,30 +155,31 @@ def _sparse_dot_sparse(
             "COO is not supported"
         )
 
-    default_output, output_type = sparse_output_type(matrix_a)
+    if out is not None and not dense:
+        raise ValueError(
+            "out argument cannot be used with sparse (dot) sparse "
+            "matrix multiplication unless dense=True"
+        )
 
-    # Override output if dense flag is set
-    default_output = default_output if not dense else np.zeros
+    default_output, output_type = sparse_output_type(matrix_a)
 
     # Check to make sure that this multiplication can work and check dtypes
     _sanity_check(matrix_a, matrix_b)
 
     # Check for edge condition inputs which result in empty outputs
     if _empty_output_check(matrix_a, matrix_b):
-        debug_print(
-            "Skipping multiplication because A (dot) B must yield an "
-            "empty matrix"
-        )
 
-        if matrix_a.dtype != matrix_b.dtype or matrix_a.dtype != np.float32:
-            final_dtype = np.float64
+        if dense:
+            return _out_matrix(
+                (matrix_a.shape[0], matrix_b.shape[1]),
+                matrix_a.dtype,
+                out_arr=out
+            )
         else:
-            final_dtype = np.float32
-
-        return default_output(
-            (matrix_a.shape[0], matrix_b.shape[1]),
-            dtype=final_dtype
-        )
+            return default_output(
+                (matrix_a.shape[0], matrix_b.shape[1]),
+                dtype=matrix_a.dtype
+            )
 
     # Check dtypes
     matrix_a, matrix_b = _type_check(matrix_a, matrix_b, cast=cast)
@@ -191,18 +194,21 @@ def _sparse_dot_sparse(
 
     # Call spmmd for dense output directly if the dense flag is set
     if dense:
-        dense_arr = _matmul_mkl_dense(
-            mkl_a,
-            mkl_b,
-            (matrix_a.shape[0], matrix_b.shape[1]),
-            a_dbl or b_dbl,
-            complex_type=a_cplx,
-        )
 
-        debug_timer("Multiplied matrices", t)
+        try:
+            dense_arr = _matmul_mkl_dense(
+                mkl_a,
+                mkl_b,
+                (matrix_a.shape[0], matrix_b.shape[1]),
+                a_dbl or b_dbl,
+                complex_type=a_cplx,
+                out=out
+            )
+        finally:
+            _destroy_mkl_handle(mkl_a)
+            _destroy_mkl_handle(mkl_b)
 
-        _destroy_mkl_handle(mkl_a)
-        _destroy_mkl_handle(mkl_b)
+        debug_timer("Multiplied matrices", t)
 
         return dense_arr
 
diff --git a/sparse_dot_mkl/sparse_dot.py b/sparse_dot_mkl/sparse_dot.py
index 0221a29..450e30a 100644
--- a/sparse_dot_mkl/sparse_dot.py
+++ b/sparse_dot_mkl/sparse_dot.py
@@ -82,19 +82,14 @@ def dot_product_mkl(
     ))
 
     # SPARSE (DOT) SPARSE #
-    if num_sparse == 2 and out is not None:
-        raise ValueError(
-            "out argument cannot be used with sparse (dot) sparse "
-            "matrix multiplication"
-        )
-
-    elif num_sparse == 2:
+    if num_sparse == 2:
         return _sds(
             matrix_a,
             matrix_b,
             cast=cast,
             reorder_output=reorder_output,
-            dense=dense
+            dense=dense,
+            out=out
         )
 
     # SPARSE (DOT) VECTOR #
diff --git a/sparse_dot_mkl/tests/test_sparse_sparse.py b/sparse_dot_mkl/tests/test_sparse_sparse.py
index 90dd836..73a47a7 100644
--- a/sparse_dot_mkl/tests/test_sparse_sparse.py
+++ b/sparse_dot_mkl/tests/test_sparse_sparse.py
@@ -279,10 +279,23 @@ def test_float32_CSR(self):
         )
         mat3_np = np.dot(d1.toarray(), d2.toarray())
 
-        mat3 = dot_product_mkl(d1, d2, copy=True, dense=True)
+        mat3 = dot_product_mkl(d1, d2, dense=True)
 
         np_almost_equal(mat3_np, mat3)
 
+    def test_float32_CSR_out(self):
+        d1, d2 = self.mat1.astype(self.single_dtype), self.mat2.astype(
+            self.single_dtype
+        )
+        mat3_np = np.dot(d1.toarray(), d2.toarray())
+        out_arr = np.empty_like(mat3_np)
+        mat3 = dot_product_mkl(d1, d2, dense=True, out=out_arr)
+
+        np_almost_equal(mat3_np, out_arr)
+        self.assertEqual(
+            id(mat3), id(out_arr)
+        )
+
     def test_float32_CSC(self):
         d1, d2 = (
             self.mat1.astype(self.single_dtype).tocsc(),
@@ -290,45 +303,123 @@ def test_float32_CSC(self):
         )
         mat3_np = np.dot(d1.toarray(), d2.toarray())
 
-        mat3 = dot_product_mkl(d1, d2, copy=True, dense=True)
+        mat3 = dot_product_mkl(d1, d2, dense=True)
 
         np_almost_equal(mat3_np, mat3)
 
+    def test_float32_CSC_out(self):
+        d1, d2 = (
+            self.mat1.astype(self.single_dtype).tocsc(),
+            self.mat2.astype(self.single_dtype).tocsc(),
+        )
+        mat3_np = np.dot(d1.toarray(), d2.toarray())
+        out_arr = np.empty_like(mat3_np)
+        mat3 = dot_product_mkl(d1, d2, dense=True, out=out_arr)
+
+        np_almost_equal(mat3_np, out_arr)
+        self.assertEqual(
+            id(mat3), id(out_arr)
+        )
+
     def test_float64_CSR(self):
         d1, d2 = self.mat1, self.mat2
         mat3_np = np.dot(d1.toarray(), d2.toarray())
 
-        mat3 = dot_product_mkl(d1, d2, copy=True, dense=True)
+        mat3 = dot_product_mkl(d1, d2, dense=True)
 
         np_almost_equal(mat3_np, mat3)
 
+    def test_float64_CSR_out(self):
+        d1, d2 = self.mat1, self.mat2
+        mat3_np = np.dot(d1.toarray(), d2.toarray())
+        out_arr = np.empty_like(mat3_np)
+        mat3 = dot_product_mkl(d1, d2, dense=True, out=out_arr)
+
+        np_almost_equal(mat3_np, out_arr)
+        self.assertEqual(
+            id(mat3), id(out_arr)
+        )
+
     def test_float64_CSC(self):
         d1, d2 = self.mat1.tocsc(), self.mat2.tocsc()
         mat3_np = np.dot(d1.toarray(), d2.toarray())
 
-        mat3 = dot_product_mkl(d1, d2, copy=True, dense=True)
+        mat3 = dot_product_mkl(d1, d2, dense=True)
 
         np_almost_equal(mat3_np, mat3)
 
+    def test_float64_CSC_out(self):
+        d1, d2 = self.mat1.tocsc(), self.mat2.tocsc()
+        mat3_np = np.dot(d1.toarray(), d2.toarray())
+        out_arr = np.empty_like(mat3_np)
+        mat3 = dot_product_mkl(d1, d2, dense=True, out=out_arr)
+
+        np_almost_equal(mat3_np, out_arr)
+        self.assertEqual(
+            id(mat3), id(out_arr)
+        )
+
     def test_float64_BSR(self):
         d1, d2 = self.mat1.tobsr(blocksize=(10, 10)), self.mat2.tobsr(
             blocksize=(10, 10)
         )
         mat3_np = np.dot(d1.toarray(), d2.toarray())
 
-        mat3 = dot_product_mkl(d1, d2, copy=True, dense=True)
+        mat3 = dot_product_mkl(d1, d2, dense=True)
 
         np_almost_equal(mat3_np, mat3)
 
+    def test_float64_BSR_out(self):
+        d1, d2 = self.mat1.tobsr(blocksize=(10, 10)), self.mat2.tobsr(
+            blocksize=(10, 10)
+        )
+        mat3_np = np.dot(d1.toarray(), d2.toarray())
+        out_arr = np.empty_like(mat3_np)
+        mat3 = dot_product_mkl(d1, d2, dense=True, out=out_arr)
+
+        np_almost_equal(mat3_np, out_arr)
+        self.assertEqual(
+            id(mat3), id(out_arr)
+        )
+
     def test_float32_BSR(self):
         d1 = self.mat1.astype(self.single_dtype).tobsr(blocksize=(10, 10))
         d2 = self.mat2.astype(self.single_dtype).tobsr(blocksize=(10, 10))
         mat3_np = np.dot(d1.toarray(), d2.toarray())
 
-        mat3 = dot_product_mkl(d1, d2, copy=True, dense=True)
+        mat3 = dot_product_mkl(d1, d2, dense=True)
 
         np_almost_equal(mat3_np, mat3)
 
+    def test_float32_BSR_out(self):
+        d1 = self.mat1.astype(self.single_dtype).tobsr(blocksize=(10, 10))
+        d2 = self.mat2.astype(self.single_dtype).tobsr(blocksize=(10, 10))
+        mat3_np = np.dot(d1.toarray(), d2.toarray())
+        out_arr = np.empty_like(mat3_np)
+        mat3 = dot_product_mkl(d1, d2, dense=True, out=out_arr)
+
+        np_almost_equal(mat3_np, out_arr)
+        self.assertEqual(
+            id(mat3), id(out_arr)
+        )
+
+    def test_CSR_bad_outs(self):
+        d1, d2 = self.mat1, self.mat2
+        mat3_np = np.dot(d1.toarray(), d2.toarray())
+
+        out_arr = np.empty_like(mat3_np, dtype=np.float32)
+
+        with self.assertRaises(ValueError):
+            _ = dot_product_mkl(d1, d2, dense=True, out=out_arr)
+
+        out_arr = np.empty_like(mat3_np, order='F')
+        with self.assertRaises(ValueError):
+            _ = dot_product_mkl(d1, d2, dense=True, out=out_arr)
+
+        out_arr = np.empty((1, 1), dtype=np.float64)
+        with self.assertRaises(ValueError):
+            _ = dot_product_mkl(d1, d2, dense=True, out=out_arr)
+
 
 class _ComplexMixin:
     double_dtype = np.cdouble