Merge branch 'main' into tnguyen/dynamics-mgmn

NVIDIA · Jan 15, 2025 · c5927c8 · c5927c8
2 parents 82d7a79 + 742a31d
commit c5927c8
Show file tree

Hide file tree

Showing 14 changed files with 353 additions and 26 deletions.
diff --git a/.github/workflows/config/gitlab_commits.txt b/.github/workflows/config/gitlab_commits.txt
@@ -1,2 +1,2 @@
 nvidia-mgpu-repo: cuda-quantum/cuquantum-mgpu.git
-nvidia-mgpu-commit: dadce3edc10564e94cd260590344d5840880087a
+nvidia-mgpu-commit: 806e7fe5c459f52296ae0d3bd8bc57c3ea806152
diff --git a/.github/workflows/docker_images.yml b/.github/workflows/docker_images.yml
@@ -61,7 +61,7 @@ jobs:
         run: |
           if [ -n "$(echo ${{ inputs.platforms }} | grep ',')" ]; then
             # multi-platform builds get no platform tag
-            echo "runner=linux-amd64-cpu8" >> $GITHUB_OUTPUT
+            echo "runner=linux-amd64-cpu16" >> $GITHUB_OUTPUT
             echo "build_docs=${{ inputs.build_docs != 'false' }}" >> $GITHUB_OUTPUT
             is_versioned=${{ github.ref_type == 'tag' || startsWith(github.ref_name, 'releases/') || startsWith(github.ref_name, 'staging/') }}
             has_continuous_deployment=${{ startsWith(github.ref_name, 'experimental/') || github.ref_name == 'main' }}
@@ -71,12 +71,12 @@ jobs:
           elif [ -n "$(echo ${{ inputs.platforms }} | grep -i arm)" ]; then
             platform_tag=`echo ${{ inputs.platforms }} | sed 's/linux\///g' | tr -d ' '`
             echo "platform_tag=$platform_tag" >> $GITHUB_OUTPUT
-            echo "runner=linux-arm64-cpu8" >> $GITHUB_OUTPUT
+            echo "runner=linux-arm64-cpu16" >> $GITHUB_OUTPUT
             echo "build_docs=${{ inputs.build_docs == 'true' }}" >> $GITHUB_OUTPUT
           else
             platform_tag=`echo ${{ inputs.platforms }} | sed 's/linux\///g' | tr -d ' '`
             echo "platform_tag=$platform_tag" >> $GITHUB_OUTPUT
-            echo "runner=linux-amd64-cpu8" >> $GITHUB_OUTPUT
+            echo "runner=linux-amd64-cpu16" >> $GITHUB_OUTPUT
             echo "build_docs=${{ inputs.build_docs != 'false' }}" >> $GITHUB_OUTPUT
           fi
 

diff --git a/docs/sphinx/snippets/cpp/using/backends/trajectory.cpp b/docs/sphinx/snippets/cpp/using/backends/trajectory.cpp
@@ -0,0 +1,40 @@
+/*******************************************************************************
+ * Copyright (c) 2022 - 2025 NVIDIA Corporation & Affiliates.                  *
+ * All rights reserved.                                                        *
+ *                                                                             *
+ * This source code and the accompanying materials are made available under    *
+ * the terms of the Apache License 2.0 which accompanies this distribution.    *
+ ******************************************************************************/
+
+// [Begin Documentation]
+#include <cudaq.h>
+
+struct xOp {
+  void operator()(int qubit_count) __qpu__ {
+    cudaq::qvector q(qubit_count);
+    x(q);
+    mz(q);
+  }
+};
+
+int main() {
+  // Add a simple bit-flip noise channel to X gate
+  const double error_probability = 0.1;
+
+  cudaq::bit_flip_channel bit_flip(error_probability);
+  // Add noise channels to our noise model.
+  cudaq::noise_model noise_model;
+  // Apply the bit-flip channel to any X-gate on any qubits
+  noise_model.add_all_qubit_channel<cudaq::types::x>(bit_flip);
+
+  const int qubit_count = 2;
+  // Due to the impact of noise, our measurements will no longer be uniformly in
+  // the |11> state.
+  auto counts =
+      cudaq::sample({.shots = 1000, .noise = noise_model}, xOp{}, qubit_count);
+
+  // The probability that we get the perfect result (11) should be ~ 0.9 * 0.9 =
+  // 0.81
+  counts.dump();
+  return 0;
+}
diff --git a/docs/sphinx/snippets/cpp/using/backends/trajectory_observe.cpp b/docs/sphinx/snippets/cpp/using/backends/trajectory_observe.cpp
@@ -0,0 +1,45 @@
+/*******************************************************************************
+ * Copyright (c) 2022 - 2025 NVIDIA Corporation & Affiliates.                  *
+ * All rights reserved.                                                        *
+ *                                                                             *
+ * This source code and the accompanying materials are made available under    *
+ * the terms of the Apache License 2.0 which accompanies this distribution.    *
+ ******************************************************************************/
+
+#include <iostream>
+
+// [Begin Documentation]
+#include <cudaq.h>
+
+struct xOp {
+  void operator()() __qpu__ {
+    cudaq::qubit q;
+    x(q);
+  }
+};
+
+int main() {
+  // Add a simple bit-flip noise channel to X gate
+  const double error_probability = 0.1;
+
+  cudaq::bit_flip_channel bit_flip(error_probability);
+  // Add noise channels to our noise model.
+  cudaq::noise_model noise_model;
+  // Apply the bit-flip channel to any X-gate on any qubits
+  noise_model.add_all_qubit_channel<cudaq::types::x>(bit_flip);
+
+  double noisy_exp_val =
+      cudaq::observe({.noise = noise_model, .num_trajectories = 1024}, xOp{},
+                     cudaq::spin::z(0));
+
+  // True expectation: 0.1 - 0.9 = -0.8 (|1> has <Z> of -1 and |0> has <Z> of
+  // +1)
+  std::cout << "Noisy <Z> with 1024 trajectories = " << noisy_exp_val << "\n";
+
+  // Rerun with a higher number of trajectories
+  noisy_exp_val =
+      cudaq::observe({.noise = noise_model, .num_trajectories = 8192}, xOp{},
+                     cudaq::spin::z(0));
+  std::cout << "Noisy <Z> with 8192 trajectories = " << noisy_exp_val << "\n";
+  return 0;
+}
diff --git a/docs/sphinx/snippets/python/using/backends/trajectory.py b/docs/sphinx/snippets/python/using/backends/trajectory.py
@@ -0,0 +1,43 @@
+# ============================================================================ #
+# Copyright (c) 2022 - 2024 NVIDIA Corporation & Affiliates.                   #
+# All rights reserved.                                                         #
+#                                                                              #
+# This source code and the accompanying materials are made available under     #
+# the terms of the Apache License 2.0 which accompanies this distribution.     #
+# ============================================================================ #
+
+#[Begin Docs]
+import cudaq
+
+# Use the `nvidia` target
+cudaq.set_target("nvidia")
+
+# Let's define a simple kernel that we will add noise to.
+qubit_count = 2
+
+
+@cudaq.kernel
+def kernel(qubit_count: int):
+    qvector = cudaq.qvector(qubit_count)
+    x(qvector)
+    mz(qvector)
+
+
+# Add a simple bit-flip noise channel to X gate
+error_probability = 0.1
+bit_flip = cudaq.BitFlipChannel(error_probability)
+
+# Add noise channels to our noise model.
+noise_model = cudaq.NoiseModel()
+# Apply the bit-flip channel to any X-gate on any qubits
+noise_model.add_all_qubit_channel("x", bit_flip)
+
+# Due to the impact of noise, our measurements will no longer be uniformly
+# in the |11> state.
+noisy_counts = cudaq.sample(kernel,
+                            qubit_count,
+                            noise_model=noise_model,
+                            shots_count=1000)
+
+# The probability that we get the perfect result (11) should be ~ 0.9 * 0.9 = 0.81
+noisy_counts.dump()
diff --git a/docs/sphinx/snippets/python/using/backends/trajectory_observe.py b/docs/sphinx/snippets/python/using/backends/trajectory_observe.py
@@ -0,0 +1,44 @@
+# ============================================================================ #
+# Copyright (c) 2022 - 2024 NVIDIA Corporation & Affiliates.                   #
+# All rights reserved.                                                         #
+#                                                                              #
+# This source code and the accompanying materials are made available under     #
+# the terms of the Apache License 2.0 which accompanies this distribution.     #
+# ============================================================================ #
+
+#[Begin Docs]
+import cudaq
+from cudaq import spin
+
+# Use the `nvidia` target
+cudaq.set_target("nvidia")
+
+
+@cudaq.kernel
+def kernel():
+    q = cudaq.qubit()
+    x(q)
+
+
+# Add a simple bit-flip noise channel to X gate
+error_probability = 0.1
+bit_flip = cudaq.BitFlipChannel(error_probability)
+
+# Add noise channels to our noise model.
+noise_model = cudaq.NoiseModel()
+# Apply the bit-flip channel to any X-gate on any qubits
+noise_model.add_all_qubit_channel("x", bit_flip)
+
+noisy_exp_val = cudaq.observe(kernel,
+                              spin.z(0),
+                              noise_model=noise_model,
+                              num_trajectories=1024).expectation()
+# True expectation: 0.1 - 0.9 = -0.8 (|1> has <Z> of -1 and |0> has <Z> of +1)
+print("Noisy <Z> with 1024 trajectories =", noisy_exp_val)
+
+# Rerun with a higher number of trajectories
+noisy_exp_val = cudaq.observe(kernel,
+                              spin.z(0),
+                              noise_model=noise_model,
+                              num_trajectories=8192).expectation()
+print("Noisy <Z> with 8192 trajectories =", noisy_exp_val)
diff --git a/docs/sphinx/using/backends/simulators.rst b/docs/sphinx/using/backends/simulators.rst
@@ -33,6 +33,11 @@ and multi-QPU (`mqpu` :ref:`platform <mqpu-platform>`) distribution whereby each
 Host CPU memory can be leveraged in addition to GPU memory to accommodate the state vector 
 (i.e., maximizing the number of qubits to be simulated).
 
+* Trajectory simulation for noisy quantum circuits
+
+The :code:`nvidia` target supports noisy quantum circuit simulations using the quantum trajectory method across all configurations: single GPU, multi-node multi-GPU, and with host memory.
+When simulating many trajectories with small state vectors, the simulation is batched for optimal performance.
+
 .. _cuQuantum single-GPU:
 
 
@@ -266,6 +271,100 @@ environment variable to another integer value as shown below.
         nvq++ --target nvidia --target-option mgpu,fp64 program.cpp [...] -o program.x
         CUDAQ_MGPU_FUSE=5 mpiexec -np 2 ./program.x
 
+
+Trajectory Noisy Simulation
+++++++++++++++++++++++++++++++++++
+
+When a :code:`noise_model` is provided to CUDA-Q, the :code:`nvidia` target will incorporate quantum noise into the quantum circuit simulation according to the noise model specified.
+
+
+.. tab:: Python
+
+    .. literalinclude:: ../../snippets/python/using/backends/trajectory.py
+        :language: python
+        :start-after: [Begin Docs]
+
+    .. code:: bash 
+        
+        python3 program.py
+        { 00:15 01:92 10:81 11:812 }
+
+.. tab:: C++
+
+    .. literalinclude:: ../../snippets/cpp/using/backends/trajectory.cpp
+        :language: cpp
+        :start-after: [Begin Documentation]
+
+    .. code:: bash 
+
+        nvq++ --target nvidia program.cpp [...] -o program.x
+        ./program.x
+        { 00:15 01:92 10:81 11:812 }
+
+
+In the case of bit-string measurement sampling as in the above example, each measurement 'shot' is executed as a trajectory, whereby Kraus operators specified in the noise model are sampled.
+
+For observable expectation value estimation, the statistical error scales asymptotically as :math:`1/\sqrt{N_{trajectories}}`, where :math:`N_{trajectories}` is the number of trajectories.
+Hence, depending on the required level of accuracy, the number of trajectories can be specified accordingly.
+
+.. tab:: Python
+
+    .. literalinclude:: ../../snippets/python/using/backends/trajectory_observe.py
+        :language: python
+        :start-after: [Begin Docs]
+
+    .. code:: bash 
+        
+        python3 program.py
+        Noisy <Z> with 1024 trajectories = -0.810546875
+        Noisy <Z> with 8192 trajectories = -0.800048828125
+
+.. tab:: C++
+
+    .. literalinclude:: ../../snippets/cpp/using/backends/trajectory_observe.cpp
+        :language: cpp
+        :start-after: [Begin Documentation]
+
+    .. code:: bash 
+
+        nvq++ --target nvidia program.cpp [...] -o program.x
+        ./program.x
+        Noisy <Z> with 1024 trajectories = -0.810547
+        Noisy <Z> with 8192 trajectories = -0.800049
+
+
+The following environment variable options are applicable to the :code:`nvidia` target for trajectory noisy simulation. Any environment variables must be set
+prior to setting the target.
+
+.. list-table:: **Additional environment variable options for trajectory simulation**
+  :widths: 20 30 50
+
+  * - Option
+    - Value
+    - Description
+  * - ``CUDAQ_OBSERVE_NUM_TRAJECTORIES``
+    - positive integer
+    - The default number of trajectories for observe simulation if none was provided in the `observe` call. The default value is 1000.
+  * - ``CUDAQ_BATCH_SIZE``
+    - positive integer or `NONE`
+    - The number of state vectors in the batched mode. If `NONE`, the batch size will be calculated based on the available device memory. Default is `NONE`.
+  * - ``CUDAQ_BATCHED_SIM_MAX_BRANCHES``
+    - positive integer
+    - The number of trajectory branches to be tracked simultaneously in the gate fusion. Default is 16. 
+  * - ``CUDAQ_BATCHED_SIM_MAX_QUBITS``
+    - positive integer
+    - The max number of qubits for batching. If the qubit count in the circuit is more than this value, batched trajectory simulation will be disabled. The default value is 20.
+  * - ``CUDAQ_BATCHED_SIM_MIN_BATCH_SIZE``
+    - positive integer
+    - The minimum number of trajectories for batching. If the number of trajectories is less than this value, batched trajectory simulation will be disabled. Default value is 4.
+
+.. note::
+    
+    Batched trajectory simulation is only available in the single-GPU execution mode of the :code:`nvidia` target. 
+    
+    If batched trajectory simulation is not activated, e.g., due to problem size, number of trajectories, or the nature of the circuit (dynamic circuits with mid-circuit measurements and conditional branching), the required number of trajectories will be executed sequentially.  
+
+
 .. _OpenMP CPU-only:
 
 OpenMP CPU-only
@@ -382,17 +481,14 @@ Specific aspects of the simulation can be configured by setting the following of
 * **`CUDA_VISIBLE_DEVICES=X`**: Makes the process only see GPU X on multi-GPU nodes. Each MPI process must only see its own dedicated GPU. For example, if you run 8 MPI processes on a DGX system with 8 GPUs, each MPI process should be assigned its own dedicated GPU via `CUDA_VISIBLE_DEVICES` when invoking `mpiexec` (or `mpirun`) commands. 
 * **`OMP_PLACES=cores`**: Set this environment variable to improve CPU parallelization.
 * **`OMP_NUM_THREADS=X`**: To enable CPU parallelization, set X to `NUMBER_OF_CORES_PER_NODE/NUMBER_OF_GPUS_PER_NODE`.
+* **`CUDAQ_TENSORNET_CONTROLLED_RANK=X`**: Specify the number of controlled qubits whereby the full tensor body of the controlled gate is expanded. If the number of controlled qubits is greater than this value, the gate is applied as a controlled tensor operator to the tensor network state. Default value is 1.
 
 .. note:: 
 
   This backend requires an NVIDIA GPU and CUDA runtime libraries. 
   If you do not have these dependencies installed, you may encounter an error stating `Invalid simulator requested`. 
   See the section :ref:`dependencies-and-compatibility` for more information about how to install dependencies.
 
-.. note::
-
-  Setting random seed, via :code:`cudaq::set_random_seed`, is not supported for this backend due to a limitation of the :code:`cuTensorNet` library. This will be fixed in future release once this feature becomes available.
-
 
 Matrix product state 
 +++++++++++++++++++++++++++++++++++
@@ -436,10 +532,6 @@ Specific aspects of the simulation can be configured by defining the following e
   If you do not have these dependencies installed, you may encounter an error stating `Invalid simulator requested`. 
   See the section :ref:`dependencies-and-compatibility` for more information about how to install dependencies.
 
-.. note::
-
-  Setting random seed, via :code:`cudaq::set_random_seed`, is not supported for this backend due to a limitation of the :code:`cuTensorNet` library. This will be fixed in future release once this feature becomes available.
-
 .. note::
     The parallelism of Jacobi method (the default `CUDAQ_MPS_SVD_ALGO` setting) gives GPU better performance on small and medium size matrices.
     If you expect a large number of singular values (e.g., increasing the `CUDAQ_MPS_MAX_BOND` setting), please adjust the `CUDAQ_MPS_SVD_ALGO` setting accordingly.  

diff --git a/pyproject.toml b/pyproject.toml
@@ -22,10 +22,10 @@ dependencies = [
   'numpy >= 1.24',
   'scipy >= 1.10.1',
   'requests >= 2.31',
-  'nvidia-cublas-cu12 ~= 12.0; platform_machine == "x86_64"',
-  'nvidia-cuda-runtime-cu12 ~= 12.0; platform_machine == "x86_64"',
-  'nvidia-cusolver-cu12 ~= 11.4; platform_machine == "x86_64"',
-  'nvidia-cuda-nvrtc-cu12 ~= 12.0; platform_machine == "x86_64"'
+  'nvidia-cublas-cu12 ~= 12.0',
+  'nvidia-cuda-runtime-cu12 ~= 12.0',
+  'nvidia-cusolver-cu12 ~= 11.4',
+  'nvidia-cuda-nvrtc-cu12 ~= 12.0'
 ]
 classifiers = [
     'Intended Audience :: Science/Research',

diff --git a/python/cudaq/runtime/observe.py b/python/cudaq/runtime/observe.py
@@ -43,6 +43,7 @@ def observe(kernel,
             *args,
             shots_count=0,
             noise_model=None,
+            num_trajectories=None,
             execution=None):
     """Compute the expected value of the `spin_operator` with respect to 
 the `kernel`. If the input `spin_operator` is a list of `SpinOperator` then compute 
@@ -67,6 +68,7 @@ def observe(kernel,
   noise_model (Optional[`NoiseModel`]): The optional :class:`NoiseModel` to add 
     noise to the kernel execution on the simulator. Defaults to an empty 
     noise model.
+  `num_trajectories` (Optional[int]): The optional number of trajectories for noisy simulation. Only valid if a noise model is provided. Keyword-only.
 
 Returns:
   :class:`ObserveResult`: 
@@ -123,6 +125,11 @@ def observe(kernel,
     else:
         ctx = cudaq_runtime.ExecutionContext('observe', shots_count)
         ctx.setSpinOperator(localOp)
+        if num_trajectories is not None:
+            if noise_model is None:
+                raise RuntimeError(
+                    "num_trajectories is provided without a noise_model.")
+            ctx.numberTrajectories = num_trajectories
         cudaq_runtime.setExecutionContext(ctx)
         kernel(*args)
         res = ctx.result

diff --git a/python/runtime/common/py_ExecutionContext.cpp b/python/runtime/common/py_ExecutionContext.cpp
@@ -28,6 +28,8 @@ void bindExecutionContext(py::module &mod) {
       .def_readwrite("totalIterations",
                      &cudaq::ExecutionContext::totalIterations)
       .def_readwrite("batchIteration", &cudaq::ExecutionContext::batchIteration)
+      .def_readwrite("numberTrajectories",
+                     &cudaq::ExecutionContext::numberTrajectories)
       .def("setSpinOperator", [](cudaq::ExecutionContext &ctx,
                                  cudaq::spin_op &spin) { ctx.spin = &spin; })
       .def("getExpectationValue",