From b6655b66a68761b2694d815f42e5b20dd2269c32 Mon Sep 17 00:00:00 2001
From: Ben Howe <141149032+bmhowe23@users.noreply.github.com>
Date: Mon, 13 Jan 2025 18:03:21 -0800
Subject: [PATCH 1/4] Make Python CUDA dependencies resolve for arm64, too
 (#2501)

See also #1602
---
 pyproject.toml | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/pyproject.toml b/pyproject.toml
index 91dc417078..a1a0b60b1b 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -22,10 +22,10 @@ dependencies = [
   'numpy >= 1.24',
   'scipy >= 1.10.1',
   'requests >= 2.31',
-  'nvidia-cublas-cu12 ~= 12.0; platform_machine == "x86_64"',
-  'nvidia-cuda-runtime-cu12 ~= 12.0; platform_machine == "x86_64"',
-  'nvidia-cusolver-cu12 ~= 11.4; platform_machine == "x86_64"',
-  'nvidia-cuda-nvrtc-cu12 ~= 12.0; platform_machine == "x86_64"'
+  'nvidia-cublas-cu12 ~= 12.0',
+  'nvidia-cuda-runtime-cu12 ~= 12.0',
+  'nvidia-cusolver-cu12 ~= 11.4',
+  'nvidia-cuda-nvrtc-cu12 ~= 12.0'
 ]
 classifiers = [
     'Intended Audience :: Science/Research',

From a9b50fc1d48083420ed4628e3198dbd7fe52a69a Mon Sep 17 00:00:00 2001
From: Ben Howe <141149032+bmhowe23@users.noreply.github.com>
Date: Mon, 13 Jan 2025 21:16:40 -0800
Subject: [PATCH 2/4] [ci] Update docker_images jobs to use more CPUs for build
 (#2505)

---
 .github/workflows/docker_images.yml | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/.github/workflows/docker_images.yml b/.github/workflows/docker_images.yml
index 0e62b2a302..1c6d4b65bf 100644
--- a/.github/workflows/docker_images.yml
+++ b/.github/workflows/docker_images.yml
@@ -61,7 +61,7 @@ jobs:
         run: |
           if [ -n "$(echo ${{ inputs.platforms }} | grep ',')" ]; then
             # multi-platform builds get no platform tag
-            echo "runner=linux-amd64-cpu8" >> $GITHUB_OUTPUT
+            echo "runner=linux-amd64-cpu16" >> $GITHUB_OUTPUT
             echo "build_docs=${{ inputs.build_docs != 'false' }}" >> $GITHUB_OUTPUT
             is_versioned=${{ github.ref_type == 'tag' || startsWith(github.ref_name, 'releases/') || startsWith(github.ref_name, 'staging/') }}
             has_continuous_deployment=${{ startsWith(github.ref_name, 'experimental/') || github.ref_name == 'main' }}
@@ -71,12 +71,12 @@ jobs:
           elif [ -n "$(echo ${{ inputs.platforms }} | grep -i arm)" ]; then
             platform_tag=`echo ${{ inputs.platforms }} | sed 's/linux\///g' | tr -d ' '`
             echo "platform_tag=$platform_tag" >> $GITHUB_OUTPUT
-            echo "runner=linux-arm64-cpu8" >> $GITHUB_OUTPUT
+            echo "runner=linux-arm64-cpu16" >> $GITHUB_OUTPUT
             echo "build_docs=${{ inputs.build_docs == 'true' }}" >> $GITHUB_OUTPUT
           else
             platform_tag=`echo ${{ inputs.platforms }} | sed 's/linux\///g' | tr -d ' '`
             echo "platform_tag=$platform_tag" >> $GITHUB_OUTPUT
-            echo "runner=linux-amd64-cpu8" >> $GITHUB_OUTPUT
+            echo "runner=linux-amd64-cpu16" >> $GITHUB_OUTPUT
             echo "build_docs=${{ inputs.build_docs != 'false' }}" >> $GITHUB_OUTPUT
           fi
 

From 2c0fb3f05862f3250bd1719bafb56c4ff14b8da2 Mon Sep 17 00:00:00 2001
From: Thien Nguyen <58006629+1tnguyen@users.noreply.github.com>
Date: Wed, 15 Jan 2025 11:05:01 +1100
Subject: [PATCH 3/4] Enable trajectory simulation for the `nvidia` target
 (#2466)

* Docs for trajectory simulation

Signed-off-by: Thien Nguyen <thiennguyen@nvidia.com>

* Fix spelling and code format

Signed-off-by: Thien Nguyen <thiennguyen@nvidia.com>

* Update mgpu hash: include fixes for 2434

Signed-off-by: Thien Nguyen <thiennguyen@nvidia.com>

* Update docs/sphinx/snippets/cpp/using/backends/trajectory.cpp

Co-authored-by: Eric Schweitz <eschweitz@nvidia.com>
Signed-off-by: Thien Nguyen <58006629+1tnguyen@users.noreply.github.com>

* Update docs/sphinx/snippets/cpp/using/backends/trajectory.cpp

Co-authored-by: Eric Schweitz <eschweitz@nvidia.com>
Signed-off-by: Thien Nguyen <58006629+1tnguyen@users.noreply.github.com>

* Update docs/sphinx/snippets/cpp/using/backends/trajectory_observe.cpp

Co-authored-by: Eric Schweitz <eschweitz@nvidia.com>
Signed-off-by: Thien Nguyen <58006629+1tnguyen@users.noreply.github.com>

* Update docs/sphinx/snippets/cpp/using/backends/trajectory_observe.cpp

Co-authored-by: Eric Schweitz <eschweitz@nvidia.com>
Signed-off-by: Thien Nguyen <58006629+1tnguyen@users.noreply.github.com>

* Update python/cudaq/runtime/observe.py

Co-authored-by: Eric Schweitz <eschweitz@nvidia.com>
Signed-off-by: Thien Nguyen <58006629+1tnguyen@users.noreply.github.com>

---------

Signed-off-by: Thien Nguyen <thiennguyen@nvidia.com>
Signed-off-by: Thien Nguyen <58006629+1tnguyen@users.noreply.github.com>
Co-authored-by: Eric Schweitz <eschweitz@nvidia.com>
---
 .github/workflows/config/gitlab_commits.txt   |  2 +-
 .../cpp/using/backends/trajectory.cpp         | 40 ++++++++
 .../cpp/using/backends/trajectory_observe.cpp | 45 +++++++++
 .../python/using/backends/trajectory.py       | 43 ++++++++
 .../using/backends/trajectory_observe.py      | 44 +++++++++
 docs/sphinx/using/backends/simulators.rst     | 99 +++++++++++++++++++
 python/cudaq/runtime/observe.py               |  7 ++
 python/runtime/common/py_ExecutionContext.cpp |  2 +
 8 files changed, 281 insertions(+), 1 deletion(-)
 create mode 100644 docs/sphinx/snippets/cpp/using/backends/trajectory.cpp
 create mode 100644 docs/sphinx/snippets/cpp/using/backends/trajectory_observe.cpp
 create mode 100644 docs/sphinx/snippets/python/using/backends/trajectory.py
 create mode 100644 docs/sphinx/snippets/python/using/backends/trajectory_observe.py

diff --git a/.github/workflows/config/gitlab_commits.txt b/.github/workflows/config/gitlab_commits.txt
index 16e13926a6..6d7fc25b84 100644
--- a/.github/workflows/config/gitlab_commits.txt
+++ b/.github/workflows/config/gitlab_commits.txt
@@ -1,2 +1,2 @@
 nvidia-mgpu-repo: cuda-quantum/cuquantum-mgpu.git
-nvidia-mgpu-commit: dadce3edc10564e94cd260590344d5840880087a
+nvidia-mgpu-commit: 806e7fe5c459f52296ae0d3bd8bc57c3ea806152
diff --git a/docs/sphinx/snippets/cpp/using/backends/trajectory.cpp b/docs/sphinx/snippets/cpp/using/backends/trajectory.cpp
new file mode 100644
index 0000000000..eb08f78b9f
--- /dev/null
+++ b/docs/sphinx/snippets/cpp/using/backends/trajectory.cpp
@@ -0,0 +1,40 @@
+/*******************************************************************************
+ * Copyright (c) 2022 - 2025 NVIDIA Corporation & Affiliates.                  *
+ * All rights reserved.                                                        *
+ *                                                                             *
+ * This source code and the accompanying materials are made available under    *
+ * the terms of the Apache License 2.0 which accompanies this distribution.    *
+ ******************************************************************************/
+
+// [Begin Documentation]
+#include <cudaq.h>
+
+struct xOp {
+  void operator()(int qubit_count) __qpu__ {
+    cudaq::qvector q(qubit_count);
+    x(q);
+    mz(q);
+  }
+};
+
+int main() {
+  // Add a simple bit-flip noise channel to X gate
+  const double error_probability = 0.1;
+
+  cudaq::bit_flip_channel bit_flip(error_probability);
+  // Add noise channels to our noise model.
+  cudaq::noise_model noise_model;
+  // Apply the bitflip channel to any X-gate on any qubits
+  noise_model.add_all_qubit_channel<cudaq::types::x>(bit_flip);
+
+  const int qubit_count = 2;
+  // Due to the impact of noise, our measurements will no longer be uniformly in
+  // the |11> state.
+  auto counts =
+      cudaq::sample({.shots = 1000, .noise = noise_model}, xOp{}, qubit_count);
+
+  // The probability that we get the perfect result (11) should be ~ 0.9 * 0.9 =
+  // 0.81
+  counts.dump();
+  return 0;
+}
diff --git a/docs/sphinx/snippets/cpp/using/backends/trajectory_observe.cpp b/docs/sphinx/snippets/cpp/using/backends/trajectory_observe.cpp
new file mode 100644
index 0000000000..c4579f3e19
--- /dev/null
+++ b/docs/sphinx/snippets/cpp/using/backends/trajectory_observe.cpp
@@ -0,0 +1,45 @@
+/*******************************************************************************
+ * Copyright (c) 2022 - 2025 NVIDIA Corporation & Affiliates.                  *
+ * All rights reserved.                                                        *
+ *                                                                             *
+ * This source code and the accompanying materials are made available under    *
+ * the terms of the Apache License 2.0 which accompanies this distribution.    *
+ ******************************************************************************/
+
+#include <iostream>
+
+// [Begin Documentation]
+#include <cudaq.h>
+
+struct xOp {
+  void operator()() __qpu__ {
+    cudaq::qubit q;
+    x(q);
+  }
+};
+
+int main() {
+  // Add a simple bit-flip noise channel to X gate
+  const double error_probability = 0.1;
+
+  cudaq::bit_flip_channel bit_flip(error_probability);
+  // Add noise channels to our noise model.
+  cudaq::noise_model noise_model;
+  // Apply the bitflip channel to any X-gate on any qubits
+  noise_model.add_all_qubit_channel<cudaq::types::x>(bit_flip);
+
+  double noisy_exp_val =
+      cudaq::observe({.noise = noise_model, .num_trajectories = 1024}, xOp{},
+                     cudaq::spin::z(0));
+
+  // True expectation: 0.1 - 0.9 = -0.8 (|1> has <Z> of -1 and |0> has <Z> of
+  // +1)
+  std::cout << "Noisy <Z> with 1024 trajectories = " << noisy_exp_val << "\n";
+
+  // Rerun with a higher number of trajectories
+  noisy_exp_val =
+      cudaq::observe({.noise = noise_model, .num_trajectories = 8192}, xOp{},
+                     cudaq::spin::z(0));
+  std::cout << "Noisy <Z> with 8192 trajectories = " << noisy_exp_val << "\n";
+  return 0;
+}
diff --git a/docs/sphinx/snippets/python/using/backends/trajectory.py b/docs/sphinx/snippets/python/using/backends/trajectory.py
new file mode 100644
index 0000000000..f64fed1535
--- /dev/null
+++ b/docs/sphinx/snippets/python/using/backends/trajectory.py
@@ -0,0 +1,43 @@
+# ============================================================================ #
+# Copyright (c) 2022 - 2024 NVIDIA Corporation & Affiliates.                   #
+# All rights reserved.                                                         #
+#                                                                              #
+# This source code and the accompanying materials are made available under     #
+# the terms of the Apache License 2.0 which accompanies this distribution.     #
+# ============================================================================ #
+
+#[Begin Docs]
+import cudaq
+
+# Use the `nvidia` target
+cudaq.set_target("nvidia")
+
+# Let's define a simple kernel that we will add noise to.
+qubit_count = 2
+
+
+@cudaq.kernel
+def kernel(qubit_count: int):
+    qvector = cudaq.qvector(qubit_count)
+    x(qvector)
+    mz(qvector)
+
+
+# Add a simple bit-flip noise channel to X gate
+error_probability = 0.1
+bit_flip = cudaq.BitFlipChannel(error_probability)
+
+# Add noise channels to our noise model.
+noise_model = cudaq.NoiseModel()
+# Apply the bit-flip channel to any X-gate on any qubits
+noise_model.add_all_qubit_channel("x", bit_flip)
+
+# Due to the impact of noise, our measurements will no longer be uniformly
+# in the |11> state.
+noisy_counts = cudaq.sample(kernel,
+                            qubit_count,
+                            noise_model=noise_model,
+                            shots_count=1000)
+
+# The probability that we get the perfect result (11) should be ~ 0.9 * 0.9 = 0.81
+noisy_counts.dump()
diff --git a/docs/sphinx/snippets/python/using/backends/trajectory_observe.py b/docs/sphinx/snippets/python/using/backends/trajectory_observe.py
new file mode 100644
index 0000000000..b025640e23
--- /dev/null
+++ b/docs/sphinx/snippets/python/using/backends/trajectory_observe.py
@@ -0,0 +1,44 @@
+# ============================================================================ #
+# Copyright (c) 2022 - 2024 NVIDIA Corporation & Affiliates.                   #
+# All rights reserved.                                                         #
+#                                                                              #
+# This source code and the accompanying materials are made available under     #
+# the terms of the Apache License 2.0 which accompanies this distribution.     #
+# ============================================================================ #
+
+#[Begin Docs]
+import cudaq
+from cudaq import spin
+
+# Use the `nvidia` target
+cudaq.set_target("nvidia")
+
+
+@cudaq.kernel
+def kernel():
+    q = cudaq.qubit()
+    x(q)
+
+
+# Add a simple bit-flip noise channel to X gate
+error_probability = 0.1
+bit_flip = cudaq.BitFlipChannel(error_probability)
+
+# Add noise channels to our noise model.
+noise_model = cudaq.NoiseModel()
+# Apply the bit-flip channel to any X-gate on any qubits
+noise_model.add_all_qubit_channel("x", bit_flip)
+
+noisy_exp_val = cudaq.observe(kernel,
+                              spin.z(0),
+                              noise_model=noise_model,
+                              num_trajectories=1024).expectation()
+# True expectation: 0.1 - 0.9 = -0.8 (|1> has <Z> of -1 and |0> has <Z> of +1)
+print("Noisy <Z> with 1024 trajectories =", noisy_exp_val)
+
+# Rerun with a higher number of trajectories
+noisy_exp_val = cudaq.observe(kernel,
+                              spin.z(0),
+                              noise_model=noise_model,
+                              num_trajectories=8192).expectation()
+print("Noisy <Z> with 8192 trajectories =", noisy_exp_val)
diff --git a/docs/sphinx/using/backends/simulators.rst b/docs/sphinx/using/backends/simulators.rst
index a3dba35d92..59743c366b 100644
--- a/docs/sphinx/using/backends/simulators.rst
+++ b/docs/sphinx/using/backends/simulators.rst
@@ -33,6 +33,11 @@ and multi-QPU (`mqpu` :ref:`platform <mqpu-platform>`) distribution whereby each
 Host CPU memory can be leveraged in addition to GPU memory to accommodate the state vector 
 (i.e., maximizing the number of qubits to be simulated).
 
+* Trajectory simulation for noisy quantum circuits
+
+The :code:`nvidia` target supports noisy quantum circuit simulations using the quantum trajectory method across all configurations: single GPU, multi-node multi-GPU, and with host memory.
+When simulating many trajectories with small state vectors, the simulation is batched for optimal performance.
+
 .. _cuQuantum single-GPU:
 
 
@@ -266,6 +271,100 @@ environment variable to another integer value as shown below.
         nvq++ --target nvidia --target-option mgpu,fp64 program.cpp [...] -o program.x
         CUDAQ_MGPU_FUSE=5 mpiexec -np 2 ./program.x
 
+
+Trajectory Noisy Simulation
+++++++++++++++++++++++++++++++++++
+
+When a :code:`noise_model` is provided to CUDA-Q, the :code:`nvidia` target will incorporate quantum noise into the quantum circuit simulation according to the noise model specified.
+
+
+.. tab:: Python
+
+    .. literalinclude:: ../../snippets/python/using/backends/trajectory.py
+        :language: python
+        :start-after: [Begin Docs]
+
+    .. code:: bash 
+        
+        python3 program.py
+        { 00:15 01:92 10:81 11:812 }
+
+.. tab:: C++
+
+    .. literalinclude:: ../../snippets/cpp/using/backends/trajectory.cpp
+        :language: cpp
+        :start-after: [Begin Documentation]
+
+    .. code:: bash 
+
+        nvq++ --target nvidia program.cpp [...] -o program.x
+        ./program.x
+        { 00:15 01:92 10:81 11:812 }
+
+
+In the case of bit-string measurement sampling as in the above example, each measurement 'shot' is executed as a trajectory, whereby Kraus operators specified in the noise model are sampled.
+
+For observable expectation value estimation, the statistical error scales asymptotically as :math:`1/\sqrt{N_{trajectories}}`, where :math:`N_{trajectories}` is the number of trajectories.
+Hence, depending on the required level of accuracy, the number of trajectories can be specified accordingly.
+
+.. tab:: Python
+
+    .. literalinclude:: ../../snippets/python/using/backends/trajectory_observe.py
+        :language: python
+        :start-after: [Begin Docs]
+
+    .. code:: bash 
+        
+        python3 program.py
+        Noisy <Z> with 1024 trajectories = -0.810546875
+        Noisy <Z> with 8192 trajectories = -0.800048828125
+
+.. tab:: C++
+
+    .. literalinclude:: ../../snippets/cpp/using/backends/trajectory_observe.cpp
+        :language: cpp
+        :start-after: [Begin Documentation]
+
+    .. code:: bash 
+
+        nvq++ --target nvidia program.cpp [...] -o program.x
+        ./program.x
+        Noisy <Z> with 1024 trajectories = -0.810547
+        Noisy <Z> with 8192 trajectories = -0.800049
+
+
+The following environment variable options are applicable to the :code:`nvidia` target for trajectory noisy simulation. Any environment variables must be set
+prior to setting the target.
+
+.. list-table:: **Additional environment variable options for trajectory simulation**
+  :widths: 20 30 50
+
+  * - Option
+    - Value
+    - Description
+  * - ``CUDAQ_OBSERVE_NUM_TRAJECTORIES``
+    - positive integer
+    - The default number of trajectories for observe simulation if none was provided in the `observe` call. The default value is 1000.
+  * - ``CUDAQ_BATCH_SIZE``
+    - positive integer or `NONE`
+    - The number of state vectors in the batched mode. If `NONE`, the batch size will be calculated based on the available device memory. Default is `NONE`.
+  * - ``CUDAQ_BATCHED_SIM_MAX_BRANCHES``
+    - positive integer
+    - The number of trajectory branches to be tracked simultaneously in the gate fusion. Default is 16. 
+  * - ``CUDAQ_BATCHED_SIM_MAX_QUBITS``
+    - positive integer
+    - The max number of qubits for batching. If the qubit count in the circuit is more than this value, batched trajectory simulation will be disabled. The default value is 20.
+  * - ``CUDAQ_BATCHED_SIM_MIN_BATCH_SIZE``
+    - positive integer
+    - The minimum number of trajectories for batching. If the number of trajectories is less than this value, batched trajectory simulation will be disabled. Default value is 4.
+
+.. note::
+    
+    Batched trajectory simulation is only available on the single-GPU execution mode of the :code:`nvidia` target. 
+    
+    If batched trajectory simulation is not activated, e.g., due to problem size, number of trajectories, or the nature of the circuit (dynamic circuits with mid-circuit measurements and conditional branching), the required number of trajectories will be executed sequentially.  
+
+
 .. _OpenMP CPU-only:
 
 OpenMP CPU-only
diff --git a/python/cudaq/runtime/observe.py b/python/cudaq/runtime/observe.py
index 1a3fd64059..befd1c0e29 100644
--- a/python/cudaq/runtime/observe.py
+++ b/python/cudaq/runtime/observe.py
@@ -43,6 +43,7 @@ def observe(kernel,
             *args,
             shots_count=0,
             noise_model=None,
+            num_trajectories=None,
             execution=None):
     """Compute the expected value of the `spin_operator` with respect to 
 the `kernel`. If the input `spin_operator` is a list of `SpinOperator` then compute 
@@ -67,6 +68,7 @@ def observe(kernel,
   noise_model (Optional[`NoiseModel`]): The optional :class:`NoiseModel` to add 
     noise to the kernel execution on the simulator. Defaults to an empty 
     noise model.
+  `num_trajectories` (Optional[int]): The optional number of trajectories for noisy simulation. Only valid if a noise model is provided. Key-word only.
 
 Returns:
   :class:`ObserveResult`: 
@@ -123,6 +125,11 @@ def observe(kernel,
     else:
         ctx = cudaq_runtime.ExecutionContext('observe', shots_count)
         ctx.setSpinOperator(localOp)
+        if num_trajectories is not None:
+            if noise_model is None:
+                raise RuntimeError(
+                    "num_trajectories is provided without a noise_model.")
+            ctx.numberTrajectories = num_trajectories
         cudaq_runtime.setExecutionContext(ctx)
         kernel(*args)
         res = ctx.result
diff --git a/python/runtime/common/py_ExecutionContext.cpp b/python/runtime/common/py_ExecutionContext.cpp
index 40215794fd..7d863480e5 100644
--- a/python/runtime/common/py_ExecutionContext.cpp
+++ b/python/runtime/common/py_ExecutionContext.cpp
@@ -28,6 +28,8 @@ void bindExecutionContext(py::module &mod) {
       .def_readwrite("totalIterations",
                      &cudaq::ExecutionContext::totalIterations)
       .def_readwrite("batchIteration", &cudaq::ExecutionContext::batchIteration)
+      .def_readwrite("numberTrajectories",
+                     &cudaq::ExecutionContext::numberTrajectories)
       .def("setSpinOperator", [](cudaq::ExecutionContext &ctx,
                                  cudaq::spin_op &spin) { ctx.spin = &spin; })
       .def("getExpectationValue",

From 742a31dee48f7fa6a9d274528f6f2875c6312f7b Mon Sep 17 00:00:00 2001
From: Thien Nguyen <58006629+1tnguyen@users.noreply.github.com>
Date: Wed, 15 Jan 2025 16:19:18 +1100
Subject: [PATCH 4/4] Make controlled rank a configurable setting (#2446)

For small controlled ops, i.e., singly controlled, expand the gate
matrix and use cutensornetStateApplyTensorOperator.

Add CUDAQ_TENSORNET_CONTROLLED_RANK threshold to determine when cutensornetStateApplyControlledTensorOperator is used.
For MPS, this is fixed at 1 as it cannot handle gate ops with more than
2 qubits.

Add doc for the setting and also remove some stale notes about random
seeds in the docs.

Signed-off-by: Thien Nguyen <thiennguyen@nvidia.com>
---
 docs/sphinx/using/backends/simulators.rst     |  9 +---
 .../cutensornet/simulator_cutensornet.cpp     | 46 +++++++++++++++----
 .../nvqir/cutensornet/simulator_cutensornet.h |  7 +++
 .../simulator_tensornet_register.cpp          | 18 ++++++++
 runtime/nvqir/cutensornet/tensornet_state.cpp |  3 +-
 5 files changed, 65 insertions(+), 18 deletions(-)

diff --git a/docs/sphinx/using/backends/simulators.rst b/docs/sphinx/using/backends/simulators.rst
index 59743c366b..225aa3c0c0 100644
--- a/docs/sphinx/using/backends/simulators.rst
+++ b/docs/sphinx/using/backends/simulators.rst
@@ -481,6 +481,7 @@ Specific aspects of the simulation can be configured by setting the following of
 * **`CUDA_VISIBLE_DEVICES=X`**: Makes the process only see GPU X on multi-GPU nodes. Each MPI process must only see its own dedicated GPU. For example, if you run 8 MPI processes on a DGX system with 8 GPUs, each MPI process should be assigned its own dedicated GPU via `CUDA_VISIBLE_DEVICES` when invoking `mpiexec` (or `mpirun`) commands. 
 * **`OMP_PLACES=cores`**: Set this environment variable to improve CPU parallelization.
 * **`OMP_NUM_THREADS=X`**: To enable CPU parallelization, set X to `NUMBER_OF_CORES_PER_NODE/NUMBER_OF_GPUS_PER_NODE`.
+* **`CUDAQ_TENSORNET_CONTROLLED_RANK=X`**: Specify the maximum number of control qubits for which the full tensor body of the controlled gate is expanded. If the number of control qubits is greater than this value, the gate is applied as a controlled tensor operator to the tensor network state. Must be a positive integer. Default value is 1.
 
 .. note:: 
 
@@ -488,10 +489,6 @@ Specific aspects of the simulation can be configured by setting the following of
   If you do not have these dependencies installed, you may encounter an error stating `Invalid simulator requested`. 
   See the section :ref:`dependencies-and-compatibility` for more information about how to install dependencies.
 
-.. note::
-
-  Setting random seed, via :code:`cudaq::set_random_seed`, is not supported for this backend due to a limitation of the :code:`cuTensorNet` library. This will be fixed in future release once this feature becomes available.
-
 
 Matrix product state 
 +++++++++++++++++++++++++++++++++++
@@ -535,10 +532,6 @@ Specific aspects of the simulation can be configured by defining the following e
   If you do not have these dependencies installed, you may encounter an error stating `Invalid simulator requested`. 
   See the section :ref:`dependencies-and-compatibility` for more information about how to install dependencies.
 
-.. note::
-
-  Setting random seed, via :code:`cudaq::set_random_seed`, is not supported for this backend due to a limitation of the :code:`cuTensorNet` library. This will be fixed in future release once this feature becomes available.
-
 .. note::
     The parallelism of Jacobi method (the default `CUDAQ_MPS_SVD_ALGO` setting) gives GPU better performance on small and medium size matrices.
     If you expect a large number of singular values (e.g., increasing the `CUDAQ_MPS_MAX_BOND` setting), please adjust the `CUDAQ_MPS_SVD_ALGO` setting accordingly.  
diff --git a/runtime/nvqir/cutensornet/simulator_cutensornet.cpp b/runtime/nvqir/cutensornet/simulator_cutensornet.cpp
index 983048f612..676c9a38c5 100644
--- a/runtime/nvqir/cutensornet/simulator_cutensornet.cpp
+++ b/runtime/nvqir/cutensornet/simulator_cutensornet.cpp
@@ -90,17 +90,45 @@ void SimulatorTensorNetBase::applyGate(const GateApplicationTask &task) {
     }
     return paramsSs.str() + "__" + std::to_string(vecComplexHash(task.matrix));
   }();
-  const auto iter = m_gateDeviceMemCache.find(gateKey);
 
-  // This is the first time we see this gate, allocate device mem and cache it.
-  if (iter == m_gateDeviceMemCache.end()) {
-    void *dMem = allocateGateMatrix(task.matrix);
-    m_gateDeviceMemCache[gateKey] = dMem;
+  if (controls.size() <= m_maxControlledRankForFullTensorExpansion) {
+    // If the number of control qubits does not exceed the threshold, expand
+    // the full matrix and apply it as a single tensor operation.
+    // Qubit operands are now both control and target qubits.
+    std::vector<std::int32_t> qubitOperands(controls.begin(), controls.end());
+    qubitOperands.insert(qubitOperands.end(), targets.begin(), targets.end());
+    // Use a different key for expanded gate matrix (reflecting the number of
+    // control qubits)
+    const auto expandedMatKey =
+        gateKey + "_c(" + std::to_string(controls.size()) + ")";
+    const auto iter = m_gateDeviceMemCache.find(expandedMatKey);
+    if (iter != m_gateDeviceMemCache.end()) {
+      m_state->applyGate(/*controlQubits=*/{}, qubitOperands, iter->second);
+    } else {
+      // If this is the first time seeing this (gate + number of control qubits)
+      // combination, compute the expanded matrix.
+      const auto expandedGateMat =
+          generateFullGateTensor(controls.size(), task.matrix);
+      void *dMem = allocateGateMatrix(expandedGateMat);
+      m_gateDeviceMemCache[expandedMatKey] = dMem;
+      m_state->applyGate(/*controlQubits=*/{}, qubitOperands, dMem);
+    }
+  } else {
+    // Propagates control qubits to cutensornet.
+    const auto iter = m_gateDeviceMemCache.find(gateKey);
+    // This is the first time we see this gate, allocate device mem and cache
+    // it.
+    if (iter == m_gateDeviceMemCache.end()) {
+      void *dMem = allocateGateMatrix(task.matrix);
+      m_gateDeviceMemCache[gateKey] = dMem;
+    }
+    // Type conversion
+    const std::vector<std::int32_t> ctrlQubits(controls.begin(),
+                                               controls.end());
+    const std::vector<std::int32_t> targetQubits(targets.begin(),
+                                                 targets.end());
+    m_state->applyGate(ctrlQubits, targetQubits, m_gateDeviceMemCache[gateKey]);
   }
-  // Type conversion
-  const std::vector<std::int32_t> ctrlQubits(controls.begin(), controls.end());
-  const std::vector<std::int32_t> targetQubits(targets.begin(), targets.end());
-  m_state->applyGate(ctrlQubits, targetQubits, m_gateDeviceMemCache[gateKey]);
 }
 
 /// @brief Reset the state of a given qubit to zero
diff --git a/runtime/nvqir/cutensornet/simulator_cutensornet.h b/runtime/nvqir/cutensornet/simulator_cutensornet.h
index 8165996d4a..666411ac11 100644
--- a/runtime/nvqir/cutensornet/simulator_cutensornet.h
+++ b/runtime/nvqir/cutensornet/simulator_cutensornet.h
@@ -96,6 +96,13 @@ class SimulatorTensorNetBase : public nvqir::CircuitSimulatorBase<double> {
   // Random number generator for generating 32-bit numbers with a state size of
   // 19937 bits for measurements.
   std::mt19937 m_randomEngine;
+  // Max number of control qubits (controlled rank) up to which the full
+  // matrix of the controlled gate is expanded and applied as one tensor op.
+  // Default is 1.
+  // MPS only supports 1 (higher number of controlled ranks must use
+  // cutensornetStateApplyControlledTensorOperator). Tensornet supports
+  // arbitrary values.
+  std::size_t m_maxControlledRankForFullTensorExpansion = 1;
 };
 
 } // end namespace nvqir
diff --git a/runtime/nvqir/cutensornet/simulator_tensornet_register.cpp b/runtime/nvqir/cutensornet/simulator_tensornet_register.cpp
index c6a2145e4e..132e80f3b4 100644
--- a/runtime/nvqir/cutensornet/simulator_tensornet_register.cpp
+++ b/runtime/nvqir/cutensornet/simulator_tensornet_register.cpp
@@ -28,7 +28,25 @@ class SimulatorTensorNet : public SimulatorTensorNetBase {
       initCuTensornetComm(m_cutnHandle);
       m_cutnMpiInitialized = true;
     }
+
+    // Retrieve user-defined controlled rank setting if provided.
+    if (auto *maxControlledRankEnvVar =
+            std::getenv("CUDAQ_TENSORNET_CONTROLLED_RANK")) {
+      auto maxControlledRank = std::atoi(maxControlledRankEnvVar);
+      if (maxControlledRank <= 0)
+        throw std::runtime_error(
+            fmt::format("Invalid CUDAQ_TENSORNET_CONTROLLED_RANK environment "
+                        "variable setting. Expecting a "
+                        "positive integer value, got '{}'.",
+                        maxControlledRank));
+
+      cudaq::info("Setting max controlled rank for full tensor expansion from "
+                  "{} to {}.",
+                  m_maxControlledRankForFullTensorExpansion, maxControlledRank);
+      m_maxControlledRankForFullTensorExpansion = maxControlledRank;
+    }
   }
+
   // Nothing to do for state preparation
   virtual void prepareQubitTensorState() override {}
   virtual std::string name() const override { return "tensornet"; }
diff --git a/runtime/nvqir/cutensornet/tensornet_state.cpp b/runtime/nvqir/cutensornet/tensornet_state.cpp
index 6df0649c74..94e662345c 100644
--- a/runtime/nvqir/cutensornet/tensornet_state.cpp
+++ b/runtime/nvqir/cutensornet/tensornet_state.cpp
@@ -53,7 +53,8 @@ std::unique_ptr<TensorNetState> TensorNetState::clone() const {
 void TensorNetState::applyGate(const std::vector<int32_t> &controlQubits,
                                const std::vector<int32_t> &targetQubits,
                                void *gateDeviceMem, bool adjoint) {
-  LOG_API_TIME();
+  ScopedTraceWithContext("TensorNetState::applyGate", controlQubits.size(),
+                         targetQubits.size());
   if (controlQubits.empty()) {
     HANDLE_CUTN_ERROR(cutensornetStateApplyTensorOperator(
         m_cutnHandle, m_quantumState, targetQubits.size(), targetQubits.data(),