diff --git a/pennylane_lightning/core/_version.py b/pennylane_lightning/core/_version.py index 46600effac..e3474d59e8 100644 --- a/pennylane_lightning/core/_version.py +++ b/pennylane_lightning/core/_version.py @@ -16,4 +16,4 @@ Version number (major.minor.patch[-label]) """ -__version__ = "0.40.0-dev14" +__version__ = "0.40.0-dev15" diff --git a/pennylane_lightning/core/src/simulators/lightning_qubit/utils/SparseLinAlg.hpp b/pennylane_lightning/core/src/simulators/lightning_qubit/utils/SparseLinAlg.hpp index df8b616303..8ae59f1f6a 100644 --- a/pennylane_lightning/core/src/simulators/lightning_qubit/utils/SparseLinAlg.hpp +++ b/pennylane_lightning/core/src/simulators/lightning_qubit/utils/SparseLinAlg.hpp @@ -19,11 +19,44 @@ #pragma once #include +#include #include namespace Pennylane::LightningQubit::Util { + +/** + * @brief Worker function to compute a segment of the matrix-vector + * multiplication for a sparse matrix. + * + * @tparam fp_precision data float point precision. + * @tparam index_type integer type used as indices of the sparse matrix. + * @param vector_ptr pointer to the vector. + * @param row_map_ptr Pointer to the row_map array. Elements of this array + * return the number of non-zero terms in all rows before it. + * @param entries_ptr pointer to the column indices of the non-zero elements. + * @param values_ptr non-zero elements. + * @param result Reference to the output vector where results are stored. + * @param start Index of the first row to process. + * @param end Index of the last row (exclusive) to process. + */ +template +void sparse_worker(const std::complex *vector_ptr, + const index_type *row_map_ptr, const index_type *entries_ptr, + const std::complex *values_ptr, + std::vector> &result, + index_type start, index_type end) { + for (index_type i = start; i < end; i++) { + std::complex temp = 0.0; + // Loop through all non-zero elements in row `i` + for (index_type j = row_map_ptr[i]; j < row_map_ptr[i + 1]; j++) { + temp += values_ptr[j] * vector_ptr[entries_ptr[j]]; + } + result[i] = temp; // Store the computed value in the result vector + } +} + /** - * @brief Apply a sparse matrix to a vector. + * @brief Applies a sparse matrix to a vector using multi-threading. * * @tparam fp_precision data float point precision. * @tparam index_type integer type used as indices of the sparse matrix. @@ -38,23 +71,48 @@ namespace Pennylane::LightningQubit::Util { * @return result result of the matrix vector multiplication. */ template -std::vector> -apply_Sparse_Matrix(const std::complex *vector_ptr, - const index_type vector_size, const index_type *row_map_ptr, - [[maybe_unused]] const index_type row_map_size, - const index_type *entries_ptr, - const std::complex *values_ptr, - [[maybe_unused]] const index_type numNNZ) { - std::vector> result; - result.resize(vector_size); - std::size_t count = 0; - for (index_type i = 0; i < vector_size; i++) { - result[i] = 0.0; - for (index_type j = 0; j < row_map_ptr[i + 1] - row_map_ptr[i]; j++) { - result[i] += values_ptr[count] * vector_ptr[entries_ptr[count]]; - count++; +std::vector> apply_Sparse_Matrix( + const std::complex *vector_ptr, const index_type vector_size, + const index_type *row_map_ptr, + [[maybe_unused]] const index_type row_map_size, + const index_type *entries_ptr, const std::complex *values_ptr, + [[maybe_unused]] const index_type numNNZ, index_type num_threads = 0) { + // Output vector initialized to zero + std::vector> result( + vector_size, std::complex(0.0)); + + // Determine the number of threads to use + if (num_threads <= 0) { + const int max_threads = std::thread::hardware_concurrency(); + num_threads = + std::min(vector_size, static_cast(max_threads)); + } + + // Divide the rows approximately evenly among the threads + index_type chunk_size = (vector_size + num_threads - 1) / num_threads; + std::vector threads; + + // Create and launch threads + for (index_type t = 0; t < num_threads; ++t) { + index_type start = t * chunk_size; + index_type end = std::min(start + chunk_size, vector_size); + + // Only launch threads if there are rows to process + if (start < vector_size) { + threads.emplace_back(sparse_worker, + vector_ptr, row_map_ptr, entries_ptr, + values_ptr, std::ref(result), start, end); } } + + // Wait for all threads to complete + for (auto &th : threads) { + if (th.joinable()) { + th.join(); + } + } + return result; }; -} // namespace Pennylane::LightningQubit::Util + +} // namespace Pennylane::LightningQubit::Util \ No newline at end of file diff --git a/pennylane_lightning/core/src/simulators/lightning_qubit/utils/tests/Test_SparseLinAlg.cpp b/pennylane_lightning/core/src/simulators/lightning_qubit/utils/tests/Test_SparseLinAlg.cpp index 56c41a0df5..76eeb5f458 100644 --- a/pennylane_lightning/core/src/simulators/lightning_qubit/utils/tests/Test_SparseLinAlg.cpp +++ b/pennylane_lightning/core/src/simulators/lightning_qubit/utils/tests/Test_SparseLinAlg.cpp @@ -81,4 +81,17 @@ TEMPLATE_TEST_CASE("apply_Sparse_Matrix", "[Sparse]", float, double) { REQUIRE(result_refs[vec] == approx(result).margin(1e-6)); }; } + + SECTION("Testing with different number of threads") { + for (size_t num_threads : {1, 2, 4, 8, 16, 32}) { + for (size_t vec = 0; vec < vectors.size(); vec++) { + auto result = apply_Sparse_Matrix( + vectors[vec].data(), vectors[vec].size(), row_map.data(), + row_map.size(), entries.data(), values.data(), + values.size(), num_threads); + + REQUIRE(result_refs[vec] == approx(result).margin(1e-6)); + } + } + } } \ No newline at end of file