From b6655b66a68761b2694d815f42e5b20dd2269c32 Mon Sep 17 00:00:00 2001 From: Ben Howe <141149032+bmhowe23@users.noreply.github.com> Date: Mon, 13 Jan 2025 18:03:21 -0800 Subject: [PATCH 1/4] Make Python CUDA dependencies resolve for arm64, too (#2501) See also #1602 --- pyproject.toml | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 91dc417078..a1a0b60b1b 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -22,10 +22,10 @@ dependencies = [ 'numpy >= 1.24', 'scipy >= 1.10.1', 'requests >= 2.31', - 'nvidia-cublas-cu12 ~= 12.0; platform_machine == "x86_64"', - 'nvidia-cuda-runtime-cu12 ~= 12.0; platform_machine == "x86_64"', - 'nvidia-cusolver-cu12 ~= 11.4; platform_machine == "x86_64"', - 'nvidia-cuda-nvrtc-cu12 ~= 12.0; platform_machine == "x86_64"' + 'nvidia-cublas-cu12 ~= 12.0', + 'nvidia-cuda-runtime-cu12 ~= 12.0', + 'nvidia-cusolver-cu12 ~= 11.4', + 'nvidia-cuda-nvrtc-cu12 ~= 12.0' ] classifiers = [ 'Intended Audience :: Science/Research', From a9b50fc1d48083420ed4628e3198dbd7fe52a69a Mon Sep 17 00:00:00 2001 From: Ben Howe <141149032+bmhowe23@users.noreply.github.com> Date: Mon, 13 Jan 2025 21:16:40 -0800 Subject: [PATCH 2/4] [ci] Update docker_images jobs to use more CPUs for build (#2505) --- .github/workflows/docker_images.yml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/.github/workflows/docker_images.yml b/.github/workflows/docker_images.yml index 0e62b2a302..1c6d4b65bf 100644 --- a/.github/workflows/docker_images.yml +++ b/.github/workflows/docker_images.yml @@ -61,7 +61,7 @@ jobs: run: | if [ -n "$(echo ${{ inputs.platforms }} | grep ',')" ]; then # multi-platform builds get no platform tag - echo "runner=linux-amd64-cpu8" >> $GITHUB_OUTPUT + echo "runner=linux-amd64-cpu16" >> $GITHUB_OUTPUT echo "build_docs=${{ inputs.build_docs != 'false' }}" >> $GITHUB_OUTPUT is_versioned=${{ github.ref_type == 'tag' || startsWith(github.ref_name, 'releases/') || 
startsWith(github.ref_name, 'staging/') }} has_continuous_deployment=${{ startsWith(github.ref_name, 'experimental/') || github.ref_name == 'main' }} @@ -71,12 +71,12 @@ jobs: elif [ -n "$(echo ${{ inputs.platforms }} | grep -i arm)" ]; then platform_tag=`echo ${{ inputs.platforms }} | sed 's/linux\///g' | tr -d ' '` echo "platform_tag=$platform_tag" >> $GITHUB_OUTPUT - echo "runner=linux-arm64-cpu8" >> $GITHUB_OUTPUT + echo "runner=linux-arm64-cpu16" >> $GITHUB_OUTPUT echo "build_docs=${{ inputs.build_docs == 'true' }}" >> $GITHUB_OUTPUT else platform_tag=`echo ${{ inputs.platforms }} | sed 's/linux\///g' | tr -d ' '` echo "platform_tag=$platform_tag" >> $GITHUB_OUTPUT - echo "runner=linux-amd64-cpu8" >> $GITHUB_OUTPUT + echo "runner=linux-amd64-cpu16" >> $GITHUB_OUTPUT echo "build_docs=${{ inputs.build_docs != 'false' }}" >> $GITHUB_OUTPUT fi From 2c0fb3f05862f3250bd1719bafb56c4ff14b8da2 Mon Sep 17 00:00:00 2001 From: Thien Nguyen <58006629+1tnguyen@users.noreply.github.com> Date: Wed, 15 Jan 2025 11:05:01 +1100 Subject: [PATCH 3/4] Enable trajectory simulation for the `nvidia` target (#2466) * Docs for trajectory simulation Signed-off-by: Thien Nguyen * Fix spelling and code format Signed-off-by: Thien Nguyen * Update mgpu hash: include fixes for 2434 Signed-off-by: Thien Nguyen * Update docs/sphinx/snippets/cpp/using/backends/trajectory.cpp Co-authored-by: Eric Schweitz Signed-off-by: Thien Nguyen <58006629+1tnguyen@users.noreply.github.com> * Update docs/sphinx/snippets/cpp/using/backends/trajectory.cpp Co-authored-by: Eric Schweitz Signed-off-by: Thien Nguyen <58006629+1tnguyen@users.noreply.github.com> * Update docs/sphinx/snippets/cpp/using/backends/trajectory_observe.cpp Co-authored-by: Eric Schweitz Signed-off-by: Thien Nguyen <58006629+1tnguyen@users.noreply.github.com> * Update docs/sphinx/snippets/cpp/using/backends/trajectory_observe.cpp Co-authored-by: Eric Schweitz Signed-off-by: Thien Nguyen <58006629+1tnguyen@users.noreply.github.com> * Update 
python/cudaq/runtime/observe.py Co-authored-by: Eric Schweitz Signed-off-by: Thien Nguyen <58006629+1tnguyen@users.noreply.github.com> --------- Signed-off-by: Thien Nguyen Signed-off-by: Thien Nguyen <58006629+1tnguyen@users.noreply.github.com> Co-authored-by: Eric Schweitz --- .github/workflows/config/gitlab_commits.txt | 2 +- .../cpp/using/backends/trajectory.cpp | 40 ++++++++ .../cpp/using/backends/trajectory_observe.cpp | 45 +++++++++ .../python/using/backends/trajectory.py | 43 ++++++++ .../using/backends/trajectory_observe.py | 44 +++++++++ docs/sphinx/using/backends/simulators.rst | 99 +++++++++++++++++++ python/cudaq/runtime/observe.py | 7 ++ python/runtime/common/py_ExecutionContext.cpp | 2 + 8 files changed, 281 insertions(+), 1 deletion(-) create mode 100644 docs/sphinx/snippets/cpp/using/backends/trajectory.cpp create mode 100644 docs/sphinx/snippets/cpp/using/backends/trajectory_observe.cpp create mode 100644 docs/sphinx/snippets/python/using/backends/trajectory.py create mode 100644 docs/sphinx/snippets/python/using/backends/trajectory_observe.py diff --git a/.github/workflows/config/gitlab_commits.txt b/.github/workflows/config/gitlab_commits.txt index 16e13926a6..6d7fc25b84 100644 --- a/.github/workflows/config/gitlab_commits.txt +++ b/.github/workflows/config/gitlab_commits.txt @@ -1,2 +1,2 @@ nvidia-mgpu-repo: cuda-quantum/cuquantum-mgpu.git -nvidia-mgpu-commit: dadce3edc10564e94cd260590344d5840880087a +nvidia-mgpu-commit: 806e7fe5c459f52296ae0d3bd8bc57c3ea806152 diff --git a/docs/sphinx/snippets/cpp/using/backends/trajectory.cpp b/docs/sphinx/snippets/cpp/using/backends/trajectory.cpp new file mode 100644 index 0000000000..eb08f78b9f --- /dev/null +++ b/docs/sphinx/snippets/cpp/using/backends/trajectory.cpp @@ -0,0 +1,40 @@ +/******************************************************************************* + * Copyright (c) 2022 - 2025 NVIDIA Corporation & Affiliates. * + * All rights reserved. 
* + * * + * This source code and the accompanying materials are made available under * + * the terms of the Apache License 2.0 which accompanies this distribution. * + ******************************************************************************/ + +// [Begin Documentation] +#include + +struct xOp { + void operator()(int qubit_count) __qpu__ { + cudaq::qvector q(qubit_count); + x(q); + mz(q); + } +}; + +int main() { + // Add a simple bit-flip noise channel to X gate + const double error_probability = 0.1; + + cudaq::bit_flip_channel bit_flip(error_probability); + // Add noise channels to our noise model. + cudaq::noise_model noise_model; + // Apply the bitflip channel to any X-gate on any qubits + noise_model.add_all_qubit_channel(bit_flip); + + const int qubit_count = 2; + // Due to the impact of noise, our measurements will no longer be uniformly in + // the |11> state. + auto counts = + cudaq::sample({.shots = 1000, .noise = noise_model}, xOp{}, qubit_count); + + // The probability that we get the perfect result (11) should be ~ 0.9 * 0.9 = + // 0.81 + counts.dump(); + return 0; +} diff --git a/docs/sphinx/snippets/cpp/using/backends/trajectory_observe.cpp b/docs/sphinx/snippets/cpp/using/backends/trajectory_observe.cpp new file mode 100644 index 0000000000..c4579f3e19 --- /dev/null +++ b/docs/sphinx/snippets/cpp/using/backends/trajectory_observe.cpp @@ -0,0 +1,45 @@ +/******************************************************************************* + * Copyright (c) 2022 - 2025 NVIDIA Corporation & Affiliates. * + * All rights reserved. * + * * + * This source code and the accompanying materials are made available under * + * the terms of the Apache License 2.0 which accompanies this distribution. 
* + ******************************************************************************/ + +#include + +// [Begin Documentation] +#include + +struct xOp { + void operator()() __qpu__ { + cudaq::qubit q; + x(q); + } +}; + +int main() { + // Add a simple bit-flip noise channel to X gate + const double error_probability = 0.1; + + cudaq::bit_flip_channel bit_flip(error_probability); + // Add noise channels to our noise model. + cudaq::noise_model noise_model; + // Apply the bitflip channel to any X-gate on any qubits + noise_model.add_all_qubit_channel(bit_flip); + + double noisy_exp_val = + cudaq::observe({.noise = noise_model, .num_trajectories = 1024}, xOp{}, + cudaq::spin::z(0)); + + // True expectation: 0.1 - 0.9 = -0.8 (|1> has <Z> of -1 and |0> has <Z> of + // +1) + std::cout << "Noisy with 1024 trajectories = " << noisy_exp_val << "\n"; + + // Rerun with a higher number of trajectories + noisy_exp_val = + cudaq::observe({.noise = noise_model, .num_trajectories = 8192}, xOp{}, + cudaq::spin::z(0)); + std::cout << "Noisy with 8192 trajectories = " << noisy_exp_val << "\n"; + return 0; +} diff --git a/docs/sphinx/snippets/python/using/backends/trajectory.py b/docs/sphinx/snippets/python/using/backends/trajectory.py new file mode 100644 index 0000000000..f64fed1535 --- /dev/null +++ b/docs/sphinx/snippets/python/using/backends/trajectory.py @@ -0,0 +1,43 @@ +# ============================================================================ # +# Copyright (c) 2022 - 2024 NVIDIA Corporation & Affiliates. # +# All rights reserved. # +# # +# This source code and the accompanying materials are made available under # +# the terms of the Apache License 2.0 which accompanies this distribution. # +# ============================================================================ # + +#[Begin Docs] +import cudaq + +# Use the `nvidia` target +cudaq.set_target("nvidia") + +# Let's define a simple kernel that we will add noise to. 
+qubit_count = 2 + + +@cudaq.kernel +def kernel(qubit_count: int): + qvector = cudaq.qvector(qubit_count) + x(qvector) + mz(qvector) + + +# Add a simple bit-flip noise channel to X gate +error_probability = 0.1 +bit_flip = cudaq.BitFlipChannel(error_probability) + +# Add noise channels to our noise model. +noise_model = cudaq.NoiseModel() +# Apply the bit-flip channel to any X-gate on any qubits +noise_model.add_all_qubit_channel("x", bit_flip) + +# Due to the impact of noise, our measurements will no longer be uniformly +# in the |11> state. +noisy_counts = cudaq.sample(kernel, + qubit_count, + noise_model=noise_model, + shots_count=1000) + +# The probability that we get the perfect result (11) should be ~ 0.9 * 0.9 = 0.81 +noisy_counts.dump() diff --git a/docs/sphinx/snippets/python/using/backends/trajectory_observe.py b/docs/sphinx/snippets/python/using/backends/trajectory_observe.py new file mode 100644 index 0000000000..b025640e23 --- /dev/null +++ b/docs/sphinx/snippets/python/using/backends/trajectory_observe.py @@ -0,0 +1,44 @@ +# ============================================================================ # +# Copyright (c) 2022 - 2024 NVIDIA Corporation & Affiliates. # +# All rights reserved. # +# # +# This source code and the accompanying materials are made available under # +# the terms of the Apache License 2.0 which accompanies this distribution. # +# ============================================================================ # + +#[Begin Docs] +import cudaq +from cudaq import spin + +# Use the `nvidia` target +cudaq.set_target("nvidia") + + +@cudaq.kernel +def kernel(): + q = cudaq.qubit() + x(q) + + +# Add a simple bit-flip noise channel to X gate +error_probability = 0.1 +bit_flip = cudaq.BitFlipChannel(error_probability) + +# Add noise channels to our noise model. 
+noise_model = cudaq.NoiseModel() +# Apply the bit-flip channel to any X-gate on any qubits +noise_model.add_all_qubit_channel("x", bit_flip) + +noisy_exp_val = cudaq.observe(kernel, + spin.z(0), + noise_model=noise_model, + num_trajectories=1024).expectation() +# True expectation: 0.1 - 0.9 = -0.8 (|1> has <Z> of -1 and |0> has <Z> of +1) +print("Noisy with 1024 trajectories =", noisy_exp_val) + +# Rerun with a higher number of trajectories +noisy_exp_val = cudaq.observe(kernel, + spin.z(0), + noise_model=noise_model, + num_trajectories=8192).expectation() +print("Noisy with 8192 trajectories =", noisy_exp_val) diff --git a/docs/sphinx/using/backends/simulators.rst b/docs/sphinx/using/backends/simulators.rst index a3dba35d92..59743c366b 100644 --- a/docs/sphinx/using/backends/simulators.rst +++ b/docs/sphinx/using/backends/simulators.rst @@ -33,6 +33,11 @@ and multi-QPU (`mqpu` :ref:`platform `) distribution whereby each Host CPU memory can be leveraged in addition to GPU memory to accommodate the state vector (i.e., maximizing the number of qubits to be simulated). +* Trajectory simulation for noisy quantum circuits + +The :code:`nvidia` target supports noisy quantum circuit simulations using the quantum trajectory method across all configurations: single GPU, multi-node multi-GPU, and with host memory. +When simulating many trajectories with small state vectors, the simulation is batched for optimal performance. + .. _cuQuantum single-GPU: @@ -266,6 +271,100 @@ environment variable to another integer value as shown below. nvq++ --target nvidia --target-option mgpu,fp64 program.cpp [...] -o program.x CUDAQ_MGPU_FUSE=5 mpiexec -np 2 ./program.x + +Trajectory Noisy Simulation +++++++++++++++++++++++++++++++++++ + +When a :code:`noise_model` is provided to CUDA-Q, the :code:`nvidia` target will incorporate quantum noise into the quantum circuit simulation according to the noise model specified. + + +.. tab:: Python + + .. 
literalinclude:: ../../snippets/python/using/backends/trajectory.py + :language: python + :start-after: [Begin Docs] + + .. code:: bash + + python3 program.py + { 00:15 01:92 10:81 11:812 } + +.. tab:: C++ + + .. literalinclude:: ../../snippets/cpp/using/backends/trajectory.cpp + :language: cpp + :start-after: [Begin Documentation] + + .. code:: bash + + nvq++ --target nvidia program.cpp [...] -o program.x + ./program.x + { 00:15 01:92 10:81 11:812 } + + +In the case of bit-string measurement sampling as in the above example, each measurement 'shot' is executed as a trajectory, whereby Kraus operators specified in the noise model are sampled. + +For observable expectation value estimation, the statistical error scales asymptotically as :math:`1/\sqrt{N_{trajectories}}`, where :math:`N_{trajectories}` is the number of trajectories. +Hence, depending on the required level of accuracy, the number of trajectories can be specified accordingly. + +.. tab:: Python + + .. literalinclude:: ../../snippets/python/using/backends/trajectory_observe.py + :language: python + :start-after: [Begin Docs] + + .. code:: bash + + python3 program.py + Noisy with 1024 trajectories = -0.810546875 + Noisy with 8192 trajectories = -0.800048828125 + +.. tab:: C++ + + .. literalinclude:: ../../snippets/cpp/using/backends/trajectory_observe.cpp + :language: cpp + :start-after: [Begin Documentation] + + .. code:: bash + + nvq++ --target nvidia program.cpp [...] -o program.x + ./program.x + Noisy with 1024 trajectories = -0.810547 + Noisy with 8192 trajectories = -0.800049 + + +The following environment variable options are applicable to the :code:`nvidia` target for trajectory noisy simulation. Any environment variables must be set +prior to setting the target. + +.. 
list-table:: **Additional environment variable options for trajectory simulation** + :widths: 20 30 50 + + * - Option + - Value + - Description + * - ``CUDAQ_OBSERVE_NUM_TRAJECTORIES`` + - positive integer + - The default number of trajectories for observe simulation if none was provided in the `observe` call. The default value is 1000. + * - ``CUDAQ_BATCH_SIZE`` + - positive integer or `NONE` + - The number of state vectors in the batched mode. If `NONE`, the batch size will be calculated based on the available device memory. Default is `NONE`. + * - ``CUDAQ_BATCHED_SIM_MAX_BRANCHES`` + - positive integer + - The number of trajectory branches to be tracked simultaneously in the gate fusion. Default is 16. + * - ``CUDAQ_BATCHED_SIM_MAX_QUBITS`` + - positive integer + - The max number of qubits for batching. If the qubit count in the circuit is more than this value, batched trajectory simulation will be disabled. The default value is 20. + * - ``CUDAQ_BATCHED_SIM_MIN_BATCH_SIZE`` + - positive integer + - The minimum number of trajectories for batching. If the number of trajectories is less than this value, batched trajectory simulation will be disabled. Default value is 4. + +.. note:: + + Batched trajectory simulation is only available on the single-GPU execution mode of the :code:`nvidia` target. + + If batched trajectory simulation is not activated, e.g., due to problem size, number of trajectories, or the nature of the circuit (dynamic circuits with mid-circuit measurements and conditional branching), the required number of trajectories will be executed sequentially. + + .. 
_OpenMP CPU-only: OpenMP CPU-only diff --git a/python/cudaq/runtime/observe.py b/python/cudaq/runtime/observe.py index 1a3fd64059..befd1c0e29 100644 --- a/python/cudaq/runtime/observe.py +++ b/python/cudaq/runtime/observe.py @@ -43,6 +43,7 @@ def observe(kernel, *args, shots_count=0, noise_model=None, + num_trajectories=None, execution=None): """Compute the expected value of the `spin_operator` with respect to the `kernel`. If the input `spin_operator` is a list of `SpinOperator` then compute @@ -67,6 +68,7 @@ def observe(kernel, noise_model (Optional[`NoiseModel`]): The optional :class:`NoiseModel` to add noise to the kernel execution on the simulator. Defaults to an empty noise model. + `num_trajectories` (Optional[int]): The optional number of trajectories for noisy simulation. Only valid if a noise model is provided. Key-word only. Returns: :class:`ObserveResult`: @@ -123,6 +125,11 @@ def observe(kernel, else: ctx = cudaq_runtime.ExecutionContext('observe', shots_count) ctx.setSpinOperator(localOp) + if num_trajectories is not None: + if noise_model is None: + raise RuntimeError( + "num_trajectories is provided without a noise_model.") + ctx.numberTrajectories = num_trajectories cudaq_runtime.setExecutionContext(ctx) kernel(*args) res = ctx.result diff --git a/python/runtime/common/py_ExecutionContext.cpp b/python/runtime/common/py_ExecutionContext.cpp index 40215794fd..7d863480e5 100644 --- a/python/runtime/common/py_ExecutionContext.cpp +++ b/python/runtime/common/py_ExecutionContext.cpp @@ -28,6 +28,8 @@ void bindExecutionContext(py::module &mod) { .def_readwrite("totalIterations", &cudaq::ExecutionContext::totalIterations) .def_readwrite("batchIteration", &cudaq::ExecutionContext::batchIteration) + .def_readwrite("numberTrajectories", + &cudaq::ExecutionContext::numberTrajectories) .def("setSpinOperator", [](cudaq::ExecutionContext &ctx, cudaq::spin_op &spin) { ctx.spin = &spin; }) .def("getExpectationValue", From 742a31dee48f7fa6a9d274528f6f2875c6312f7b 
Mon Sep 17 00:00:00 2001 From: Thien Nguyen <58006629+1tnguyen@users.noreply.github.com> Date: Wed, 15 Jan 2025 16:19:18 +1100 Subject: [PATCH 4/4] Make controlled rank a configurable setting (#2446) For small controlled ops, i.e., singly controlled, expand the gate matrix and use cutensornetStateApplyTensorOperator. Add CUDAQ_TENSORNET_CONTROLLED_RANK threshold to determine when cutensornetStateApplyControlledTensorOperator is used. For MPS, this is fixed at 1 as it cannot handle gate ops with more than 2 qubits. Add doc for the setting and also remove some stale notes about random seeds in the docs. Signed-off-by: Thien Nguyen --- docs/sphinx/using/backends/simulators.rst | 9 +--- .../cutensornet/simulator_cutensornet.cpp | 46 +++++++++++++++---- .../nvqir/cutensornet/simulator_cutensornet.h | 7 +++ .../simulator_tensornet_register.cpp | 18 ++++++++ runtime/nvqir/cutensornet/tensornet_state.cpp | 3 +- 5 files changed, 65 insertions(+), 18 deletions(-) diff --git a/docs/sphinx/using/backends/simulators.rst b/docs/sphinx/using/backends/simulators.rst index 59743c366b..225aa3c0c0 100644 --- a/docs/sphinx/using/backends/simulators.rst +++ b/docs/sphinx/using/backends/simulators.rst @@ -481,6 +481,7 @@ Specific aspects of the simulation can be configured by setting the following of * **`CUDA_VISIBLE_DEVICES=X`**: Makes the process only see GPU X on multi-GPU nodes. Each MPI process must only see its own dedicated GPU. For example, if you run 8 MPI processes on a DGX system with 8 GPUs, each MPI process should be assigned its own dedicated GPU via `CUDA_VISIBLE_DEVICES` when invoking `mpiexec` (or `mpirun`) commands. * **`OMP_PLACES=cores`**: Set this environment variable to improve CPU parallelization. * **`OMP_NUM_THREADS=X`**: To enable CPU parallelization, set X to `NUMBER_OF_CORES_PER_NODE/NUMBER_OF_GPUS_PER_NODE`. +* **`CUDAQ_TENSORNET_CONTROLLED_RANK=X`**: Specify the number of controlled qubits whereby the full tensor body of the controlled gate is expanded. 
If the number of controlled qubits is greater than this value, the gate is applied as a controlled tensor operator to the tensor network state. Default value is 1. .. note:: @@ -488,10 +489,6 @@ Specific aspects of the simulation can be configured by setting the following of If you do not have these dependencies installed, you may encounter an error stating `Invalid simulator requested`. See the section :ref:`dependencies-and-compatibility` for more information about how to install dependencies. -.. note:: - - Setting random seed, via :code:`cudaq::set_random_seed`, is not supported for this backend due to a limitation of the :code:`cuTensorNet` library. This will be fixed in future release once this feature becomes available. - Matrix product state +++++++++++++++++++++++++++++++++++ @@ -535,10 +532,6 @@ Specific aspects of the simulation can be configured by defining the following e If you do not have these dependencies installed, you may encounter an error stating `Invalid simulator requested`. See the section :ref:`dependencies-and-compatibility` for more information about how to install dependencies. -.. note:: - - Setting random seed, via :code:`cudaq::set_random_seed`, is not supported for this backend due to a limitation of the :code:`cuTensorNet` library. This will be fixed in future release once this feature becomes available. - .. note:: The parallelism of Jacobi method (the default `CUDAQ_MPS_SVD_ALGO` setting) gives GPU better performance on small and medium size matrices. If you expect a large number of singular values (e.g., increasing the `CUDAQ_MPS_MAX_BOND` setting), please adjust the `CUDAQ_MPS_SVD_ALGO` setting accordingly. 
diff --git a/runtime/nvqir/cutensornet/simulator_cutensornet.cpp b/runtime/nvqir/cutensornet/simulator_cutensornet.cpp index 983048f612..676c9a38c5 100644 --- a/runtime/nvqir/cutensornet/simulator_cutensornet.cpp +++ b/runtime/nvqir/cutensornet/simulator_cutensornet.cpp @@ -90,17 +90,45 @@ void SimulatorTensorNetBase::applyGate(const GateApplicationTask &task) { } return paramsSs.str() + "__" + std::to_string(vecComplexHash(task.matrix)); }(); - const auto iter = m_gateDeviceMemCache.find(gateKey); - // This is the first time we see this gate, allocate device mem and cache it. - if (iter == m_gateDeviceMemCache.end()) { - void *dMem = allocateGateMatrix(task.matrix); - m_gateDeviceMemCache[gateKey] = dMem; + if (controls.size() <= m_maxControlledRankForFullTensorExpansion) { + // If the number of control qubits is at most the threshold, expand the + // full matrix and apply it as a single tensor operation. + // Qubit operands are now both control and target qubits. + std::vector qubitOperands(controls.begin(), controls.end()); + qubitOperands.insert(qubitOperands.end(), targets.begin(), targets.end()); + // Use a different key for expanded gate matrix (reflecting the number of + // control qubits) + const auto expandedMatKey = + gateKey + "_c(" + std::to_string(controls.size()) + ")"; + const auto iter = m_gateDeviceMemCache.find(expandedMatKey); + if (iter != m_gateDeviceMemCache.end()) { + m_state->applyGate(/*controlQubits=*/{}, qubitOperands, iter->second); + } else { + // If this is the first time seeing this (gate + number of control qubits) + // combo, compute the expanded matrix. + const auto expandedGateMat = + generateFullGateTensor(controls.size(), task.matrix); + void *dMem = allocateGateMatrix(expandedGateMat); + m_gateDeviceMemCache[expandedMatKey] = dMem; + m_state->applyGate(/*controlQubits=*/{}, qubitOperands, dMem); + } + } else { + // Propagates control qubits to cutensornet. 
+ const auto iter = m_gateDeviceMemCache.find(gateKey); + // This is the first time we see this gate, allocate device mem and cache + // it. + if (iter == m_gateDeviceMemCache.end()) { + void *dMem = allocateGateMatrix(task.matrix); + m_gateDeviceMemCache[gateKey] = dMem; + } + // Type conversion + const std::vector ctrlQubits(controls.begin(), + controls.end()); + const std::vector targetQubits(targets.begin(), + targets.end()); + m_state->applyGate(ctrlQubits, targetQubits, m_gateDeviceMemCache[gateKey]); } - // Type conversion - const std::vector ctrlQubits(controls.begin(), controls.end()); - const std::vector targetQubits(targets.begin(), targets.end()); - m_state->applyGate(ctrlQubits, targetQubits, m_gateDeviceMemCache[gateKey]); } /// @brief Reset the state of a given qubit to zero diff --git a/runtime/nvqir/cutensornet/simulator_cutensornet.h b/runtime/nvqir/cutensornet/simulator_cutensornet.h index 8165996d4a..666411ac11 100644 --- a/runtime/nvqir/cutensornet/simulator_cutensornet.h +++ b/runtime/nvqir/cutensornet/simulator_cutensornet.h @@ -96,6 +96,13 @@ class SimulatorTensorNetBase : public nvqir::CircuitSimulatorBase { // Random number generator for generating 32-bit numbers with a state size of // 19937 bits for measurements. std::mt19937 m_randomEngine; + // Max number of controlled ranks (qubits) that the full matrix of the + // controlled gate is used as tensor op. + // Default is 1. + // MPS only supports 1 (higher number of controlled ranks must use + // cutensornetStateApplyControlledTensorOperator). Tensornet supports + // arbitrary values. 
+ std::size_t m_maxControlledRankForFullTensorExpansion = 1; }; } // end namespace nvqir diff --git a/runtime/nvqir/cutensornet/simulator_tensornet_register.cpp b/runtime/nvqir/cutensornet/simulator_tensornet_register.cpp index c6a2145e4e..132e80f3b4 100644 --- a/runtime/nvqir/cutensornet/simulator_tensornet_register.cpp +++ b/runtime/nvqir/cutensornet/simulator_tensornet_register.cpp @@ -28,7 +28,25 @@ class SimulatorTensorNet : public SimulatorTensorNetBase { initCuTensornetComm(m_cutnHandle); m_cutnMpiInitialized = true; } + + // Retrieve user-defined controlled rank setting if provided. + if (auto *maxControlledRankEnvVar = + std::getenv("CUDAQ_TENSORNET_CONTROLLED_RANK")) { + auto maxControlledRank = std::atoi(maxControlledRankEnvVar); + if (maxControlledRank <= 0) + throw std::runtime_error( + fmt::format("Invalid CUDAQ_TENSORNET_CONTROLLED_RANK environment " + "variable setting. Expecting a " + "positive integer value, got '{}'.", + maxControlledRank)); + + cudaq::info("Setting max controlled rank for full tensor expansion from " + "{} to {}.", + m_maxControlledRankForFullTensorExpansion, maxControlledRank); + m_maxControlledRankForFullTensorExpansion = maxControlledRank; + } } + // Nothing to do for state preparation virtual void prepareQubitTensorState() override {} virtual std::string name() const override { return "tensornet"; } diff --git a/runtime/nvqir/cutensornet/tensornet_state.cpp b/runtime/nvqir/cutensornet/tensornet_state.cpp index 6df0649c74..94e662345c 100644 --- a/runtime/nvqir/cutensornet/tensornet_state.cpp +++ b/runtime/nvqir/cutensornet/tensornet_state.cpp @@ -53,7 +53,8 @@ std::unique_ptr TensorNetState::clone() const { void TensorNetState::applyGate(const std::vector &controlQubits, const std::vector &targetQubits, void *gateDeviceMem, bool adjoint) { - LOG_API_TIME(); + ScopedTraceWithContext("TensorNetState::applyGate", controlQubits.size(), + targetQubits.size()); if (controlQubits.empty()) { 
HANDLE_CUTN_ERROR(cutensornetStateApplyTensorOperator( m_cutnHandle, m_quantumState, targetQubits.size(), targetQubits.data(),