From 42b820795506428c1e9a7025cd1ff5e41ac0f0ab Mon Sep 17 00:00:00 2001
From: mvandelle <marc.vandelle@proton.me>
Date: Wed, 20 Nov 2024 17:05:29 -0500
Subject: [PATCH 1/3] Parallelize apply_Sparse_Matrix with std::thread and add thread-count test

---
 .../lightning_qubit/utils/SparseLinAlg.hpp    | 89 +++++++++++++++----
 .../utils/tests/Test_SparseLinAlg.cpp         | 13 +++
 2 files changed, 87 insertions(+), 15 deletions(-)
diff --git a/pennylane_lightning/core/src/simulators/lightning_qubit/utils/SparseLinAlg.hpp b/pennylane_lightning/core/src/simulators/lightning_qubit/utils/SparseLinAlg.hpp
index df8b616303..2e49071ecf 100644
--- a/pennylane_lightning/core/src/simulators/lightning_qubit/utils/SparseLinAlg.hpp
+++ b/pennylane_lightning/core/src/simulators/lightning_qubit/utils/SparseLinAlg.hpp
@@ -20,10 +20,43 @@
 #pragma once
 #include <complex>
 #include <vector>
+#include <thread>
 
 namespace Pennylane::LightningQubit::Util {
+
+/**
+ * @brief Worker function to compute a segment of the matrix-vector multiplication for a sparse matrix.
+ *
+ * @tparam fp_precision data float point precision.
+ * @tparam index_type integer type used as indices of the sparse matrix.
+ * @param vector_ptr pointer to the vector.
+ * @param row_map_ptr Pointer to the row_map array. Element i of this array is
+ * the number of non-zero terms in all rows before row i.
+ * @param entries_ptr pointer to the column indices of the non-zero elements.
+ * @param values_ptr non-zero elements.
+ * @param result Reference to the output vector where results are stored.
+ * @param start Index of the first row to process.
+ * @param end Index one past the last row to process (exclusive upper bound).
+ */
+template <class fp_precision, class index_type>
+void sparse_worker(const std::complex<fp_precision> *vector_ptr,
+                   const index_type *row_map_ptr, 
+                   const index_type *entries_ptr,
+                   const std::complex<fp_precision> *values_ptr,
+                   std::vector<std::complex<fp_precision>> &result,
+                   index_type start, index_type end) {
+    for (index_type i = start; i < end; i++) {
+        std::complex<fp_precision> temp = 0.0;
+        // Loop through all non-zero elements in row `i`
+        for (index_type j = row_map_ptr[i]; j < row_map_ptr[i + 1]; j++) {
+            temp += values_ptr[j] * vector_ptr[entries_ptr[j]];
+        }
+        result[i] = temp; // Store the computed value in the result vector
+    }
+}
+
 /**
- * @brief Apply a sparse matrix to a vector.
+ * @brief Applies a sparse matrix to a vector using multi-threading.
  *
  * @tparam fp_precision data float point precision.
  * @tparam index_type integer type used as indices of the sparse matrix.
@@ -40,21 +73,47 @@ namespace Pennylane::LightningQubit::Util {
 template <class fp_precision, class index_type>
 std::vector<std::complex<fp_precision>>
 apply_Sparse_Matrix(const std::complex<fp_precision> *vector_ptr,
-                    const index_type vector_size, const index_type *row_map_ptr,
-                    [[maybe_unused]] const index_type row_map_size,
-                    const index_type *entries_ptr,
-                    const std::complex<fp_precision> *values_ptr,
-                    [[maybe_unused]] const index_type numNNZ) {
-    std::vector<std::complex<fp_precision>> result;
-    result.resize(vector_size);
-    std::size_t count = 0;
-    for (index_type i = 0; i < vector_size; i++) {
-        result[i] = 0.0;
-        for (index_type j = 0; j < row_map_ptr[i + 1] - row_map_ptr[i]; j++) {
-            result[i] += values_ptr[count] * vector_ptr[entries_ptr[count]];
-            count++;
+                            const index_type vector_size, 
+                            const index_type *row_map_ptr,
+                            [[maybe_unused]] const index_type row_map_size,
+                            const index_type *entries_ptr,
+                            const std::complex<fp_precision> *values_ptr,
+                            [[maybe_unused]] const index_type numNNZ,
+                            index_type num_threads = 0) {
+    // Output vector initialized to zero
+    std::vector<std::complex<fp_precision>> result(vector_size, std::complex<fp_precision>(0.0));
+
+    // Determine the number of threads to use
+    if (num_threads <= 0) {
+        const int max_threads = std::thread::hardware_concurrency();
+        num_threads = std::min(vector_size, static_cast<index_type>(max_threads));
+    }
+    
+    // Divide the rows approximately evenly among the threads    
+    index_type chunk_size = (vector_size + num_threads - 1) / num_threads;
+    std::vector<std::thread> threads;
+
+    // Create and launch threads
+    for (index_type t = 0; t < num_threads; ++t) {
+        index_type start = t * chunk_size;
+        index_type end = std::min(start + chunk_size, vector_size);
+
+        // Only launch threads if there are rows to process
+        if (start < vector_size) {
+            threads.emplace_back(sparse_worker<fp_precision, index_type>,
+                                 vector_ptr, row_map_ptr, entries_ptr, values_ptr,
+                                 std::ref(result), start, end);
         }
     }
+
+    // Wait for all threads to complete
+    for (auto &th : threads) {
+        if (th.joinable()) {
+            th.join();
+        }
+    }
+
     return result;
 };
-} // namespace Pennylane::LightningQubit::Util
+
+} // namespace Pennylane::LightningQubit::Util
\ No newline at end of file
diff --git a/pennylane_lightning/core/src/simulators/lightning_qubit/utils/tests/Test_SparseLinAlg.cpp b/pennylane_lightning/core/src/simulators/lightning_qubit/utils/tests/Test_SparseLinAlg.cpp
index 56c41a0df5..22008e4fd4 100644
--- a/pennylane_lightning/core/src/simulators/lightning_qubit/utils/tests/Test_SparseLinAlg.cpp
+++ b/pennylane_lightning/core/src/simulators/lightning_qubit/utils/tests/Test_SparseLinAlg.cpp
@@ -81,4 +81,17 @@ TEMPLATE_TEST_CASE("apply_Sparse_Matrix", "[Sparse]", float, double) {
             REQUIRE(result_refs[vec] == approx(result).margin(1e-6));
         };
     }
+
+    SECTION("Testing with different number of threads") {
+        for (size_t num_threads : { 1, 2, 4, 8, 16, 32}) {
+            for (size_t vec = 0; vec < vectors.size(); vec++) {
+                auto result = apply_Sparse_Matrix(
+                    vectors[vec].data(), vectors[vec].size(), row_map.data(),
+                    row_map.size(), entries.data(), values.data(), values.size(),
+                    num_threads);
+
+                REQUIRE(result_refs[vec] == approx(result).margin(1e-6));
+            }
+        }
+    }
 }
\ No newline at end of file

From e8af71ebf0b3b3219c7d051c547634208dfe2fa9 Mon Sep 17 00:00:00 2001
From: mvandelle <marc.vandelle@proton.me>
Date: Fri, 22 Nov 2024 15:12:31 -0500
Subject: [PATCH 2/3] Format SparseLinAlg and Test

---
 .../lightning_qubit/utils/SparseLinAlg.hpp    | 37 +++++++++----------
 .../utils/tests/Test_SparseLinAlg.cpp         |  6 +--
 2 files changed, 21 insertions(+), 22 deletions(-)

diff --git a/pennylane_lightning/core/src/simulators/lightning_qubit/utils/SparseLinAlg.hpp b/pennylane_lightning/core/src/simulators/lightning_qubit/utils/SparseLinAlg.hpp
index 2e49071ecf..8ae59f1f6a 100644
--- a/pennylane_lightning/core/src/simulators/lightning_qubit/utils/SparseLinAlg.hpp
+++ b/pennylane_lightning/core/src/simulators/lightning_qubit/utils/SparseLinAlg.hpp
@@ -19,13 +19,14 @@
 
 #pragma once
 #include <complex>
-#include <vector>
 #include <thread>
+#include <vector>
 
 namespace Pennylane::LightningQubit::Util {
 
 /**
- * @brief Worker function to compute a segment of the matrix-vector multiplication for a sparse matrix.
+ * @brief Worker function to compute a segment of the matrix-vector
+ * multiplication for a sparse matrix.
  *
  * @tparam fp_precision data float point precision.
  * @tparam index_type integer type used as indices of the sparse matrix.
@@ -40,8 +41,7 @@ namespace Pennylane::LightningQubit::Util {
  */
 template <class fp_precision, class index_type>
 void sparse_worker(const std::complex<fp_precision> *vector_ptr,
-                   const index_type *row_map_ptr, 
-                   const index_type *entries_ptr,
+                   const index_type *row_map_ptr, const index_type *entries_ptr,
                    const std::complex<fp_precision> *values_ptr,
                    std::vector<std::complex<fp_precision>> &result,
                    index_type start, index_type end) {
@@ -71,25 +71,24 @@ void sparse_worker(const std::complex<fp_precision> *vector_ptr,
  * @return result       result of the matrix vector multiplication.
  */
 template <class fp_precision, class index_type>
-std::vector<std::complex<fp_precision>>
-apply_Sparse_Matrix(const std::complex<fp_precision> *vector_ptr,
-                            const index_type vector_size, 
-                            const index_type *row_map_ptr,
-                            [[maybe_unused]] const index_type row_map_size,
-                            const index_type *entries_ptr,
-                            const std::complex<fp_precision> *values_ptr,
-                            [[maybe_unused]] const index_type numNNZ,
-                            index_type num_threads = 0) {
+std::vector<std::complex<fp_precision>> apply_Sparse_Matrix(
+    const std::complex<fp_precision> *vector_ptr, const index_type vector_size,
+    const index_type *row_map_ptr,
+    [[maybe_unused]] const index_type row_map_size,
+    const index_type *entries_ptr, const std::complex<fp_precision> *values_ptr,
+    [[maybe_unused]] const index_type numNNZ, index_type num_threads = 0) {
     // Output vector initialized to zero
-    std::vector<std::complex<fp_precision>> result(vector_size, std::complex<fp_precision>(0.0));
+    std::vector<std::complex<fp_precision>> result(
+        vector_size, std::complex<fp_precision>(0.0));
 
     // Determine the number of threads to use
     if (num_threads <= 0) {
         const int max_threads = std::thread::hardware_concurrency();
-        num_threads = std::min(vector_size, static_cast<index_type>(max_threads));
+        num_threads =
+            std::min(vector_size, static_cast<index_type>(max_threads));
     }
-    
-    // Divide the rows approximately evenly among the threads    
+
+    // Divide the rows approximately evenly among the threads
     index_type chunk_size = (vector_size + num_threads - 1) / num_threads;
     std::vector<std::thread> threads;
 
@@ -101,8 +100,8 @@ apply_Sparse_Matrix(const std::complex<fp_precision> *vector_ptr,
         // Only launch threads if there are rows to process
         if (start < vector_size) {
             threads.emplace_back(sparse_worker<fp_precision, index_type>,
-                                 vector_ptr, row_map_ptr, entries_ptr, values_ptr,
-                                 std::ref(result), start, end);
+                                 vector_ptr, row_map_ptr, entries_ptr,
+                                 values_ptr, std::ref(result), start, end);
         }
     }
 
diff --git a/pennylane_lightning/core/src/simulators/lightning_qubit/utils/tests/Test_SparseLinAlg.cpp b/pennylane_lightning/core/src/simulators/lightning_qubit/utils/tests/Test_SparseLinAlg.cpp
index 22008e4fd4..76eeb5f458 100644
--- a/pennylane_lightning/core/src/simulators/lightning_qubit/utils/tests/Test_SparseLinAlg.cpp
+++ b/pennylane_lightning/core/src/simulators/lightning_qubit/utils/tests/Test_SparseLinAlg.cpp
@@ -83,12 +83,12 @@ TEMPLATE_TEST_CASE("apply_Sparse_Matrix", "[Sparse]", float, double) {
     }
 
     SECTION("Testing with different number of threads") {
-        for (size_t num_threads : { 1, 2, 4, 8, 16, 32}) {
+        for (size_t num_threads : {1, 2, 4, 8, 16, 32}) {
             for (size_t vec = 0; vec < vectors.size(); vec++) {
                 auto result = apply_Sparse_Matrix(
                     vectors[vec].data(), vectors[vec].size(), row_map.data(),
-                    row_map.size(), entries.data(), values.data(), values.size(),
-                    num_threads);
+                    row_map.size(), entries.data(), values.data(),
+                    values.size(), num_threads);
 
                 REQUIRE(result_refs[vec] == approx(result).margin(1e-6));
             }

From 8e27978d1a8f99dea8df3db2497515064832e5f0 Mon Sep 17 00:00:00 2001
From: ringo-but-quantum <github-ringo-but-quantum@xanadu.ai>
Date: Fri, 22 Nov 2024 20:13:58 +0000
Subject: [PATCH 3/3] Auto update version from '0.40.0-dev13' to '0.40.0-dev15'

---
 pennylane_lightning/core/_version.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pennylane_lightning/core/_version.py b/pennylane_lightning/core/_version.py
index fec481f46d..e3474d59e8 100644
--- a/pennylane_lightning/core/_version.py
+++ b/pennylane_lightning/core/_version.py
@@ -16,4 +16,4 @@
    Version number (major.minor.patch[-label])
 """
 
-__version__ = "0.40.0-dev13"
+__version__ = "0.40.0-dev15"