diff --git a/.github/workflows/integration_tests.yml b/.github/workflows/integration_tests.yml
index 55d329b93f..5619f5baea 100644
--- a/.github/workflows/integration_tests.yml
+++ b/.github/workflows/integration_tests.yml
@@ -17,6 +17,7 @@ on:
         options:
           - nightly
           - anyon
+          - infleqtion
           - ionq
           - iqm
           - oqc
@@ -335,11 +336,7 @@ jobs:
           fi
 
       - name: Setup anyon account
-        # This step is currently bypassed during nightly runs due to
-        # maintenance. Restore the if check to the original value when
-        # maintenance is complete.
-        #if: github.event_name == 'schedule' || inputs.target == 'nightly' || inputs.target == 'anyon'
-        if: inputs.target == 'anyon'
+        if: github.event_name == 'schedule' || inputs.target == 'nightly' || inputs.target == 'anyon'
         run: |
           curl -X POST --user "${{ secrets.ANYON_USERNAME }}:${{ secrets.ANYON_PASSWORD }}"  -H "Content-Type: application/json" https://api.anyon.cloud:5000/login > credentials.json
           id_token=`cat credentials.json | jq -r '."id_token"'`
@@ -348,11 +345,7 @@ jobs:
           echo "refresh: $refresh_token" >> ~/.anyon_config
 
       - name: QIR syntax check (Anyon)
-        # This step is currently bypassed during nightly runs due to
-        # maintenance. Restore the if check to the original value when
-        # maintenance is complete.
-        #if: github.event_name == 'schedule' || inputs.target == 'nightly' || inputs.target == 'anyon'
-        if: inputs.target == 'anyon'
+        if: github.event_name == 'schedule' || inputs.target == 'nightly' || inputs.target == 'anyon'
         run: |
           echo "### QIR syntax check (Anyon)" >> $GITHUB_STEP_SUMMARY
           export CUDAQ_LOG_LEVEL="info"
@@ -650,6 +643,51 @@ jobs:
           fi
         shell: bash
 
+      - name: Submit to Infleqtion test server
+        if: (success() || failure()) && (inputs.target == 'infleqtion' || github.event_name == 'schedule' || inputs.target == 'nightly')
+        run: |
+          echo "### Submit to Infleqtion server" >> $GITHUB_STEP_SUMMARY
+          export SUPERSTAQ_API_KEY='${{ secrets.SUPERSTAQ_API_KEY }}'
+          set +e # Allow script to keep going through errors
+          test_err_sum=0 # Number of failed tests, accumulated over both loops below
+          cpp_tests="docs/sphinx/targets/cpp/infleqtion.cpp"
+          for filename in $cpp_tests; do # Intentionally unquoted: space-separated list of files
+            [ -e "$filename" ] || echo "::error::Couldn't find file ($filename)"
+            nvq++ --target infleqtion "$filename"
+            test_status=$? # Exit code of the nvq++ compilation
+            if [ $test_status -eq 0 ]; then
+              ./a.out
+              test_status=$? # Exit code of the compiled executable
+              if [ $test_status -eq 0 ]; then
+                echo ":white_check_mark: Successfully ran test: $filename" >> $GITHUB_STEP_SUMMARY
+              else
+                echo ":x: Test failed (failed to execute): $filename" >> $GITHUB_STEP_SUMMARY
+                test_err_sum=$((test_err_sum+1))
+              fi
+            else
+              echo ":x: Test failed (failed to compile): $filename" >> $GITHUB_STEP_SUMMARY
+              test_err_sum=$((test_err_sum+1))
+            fi
+          done
+          python_tests="docs/sphinx/targets/python/infleqtion.py"
+          for filename in $python_tests; do # Intentionally unquoted: space-separated list of files
+            [ -e "$filename" ] || echo "::error::Couldn't find file ($filename)"
+            python3 "$filename" 1> /dev/null # Discard stdout; only the exit code is checked
+            test_status=$?
+            if [ $test_status -eq 0 ]; then
+              echo ":white_check_mark: Successfully ran test: $filename" >> $GITHUB_STEP_SUMMARY
+            else
+              echo ":x: Test failed (failed to execute): $filename" >> $GITHUB_STEP_SUMMARY
+              test_err_sum=$((test_err_sum+1))
+            fi
+          done
+          set -e # Re-enable exit code error checking
+          if [ "$test_err_sum" -ne 0 ]; then # Fail the step if any test above failed
+            echo "::error::${test_err_sum} tests failed. See step summary for a list of failures"
+            exit 1
+          fi
+        shell: bash
+
       - name: Submit to ${{ inputs.target }}
         # The full set of tests used by this step is currently only supported on
         # Quantinuum.  The other supported tests are tested by the step above.
diff --git a/.github/workflows/publishing.yml b/.github/workflows/publishing.yml
index cfdb5068b2..675a0f96da 100644
--- a/.github/workflows/publishing.yml
+++ b/.github/workflows/publishing.yml
@@ -1090,6 +1090,11 @@ jobs:
         cuda_version: ['11.8', '12.4']
       fail-fast: false
 
+    # Must have environment to access environment secrets
+    environment:
+      name: ghcr-deployment
+      url: ${{ vars.deployment_url }}
+
     container:
       image: ubuntu:22.04
       options: --user root
@@ -1129,13 +1134,16 @@ jobs:
           rm -rf ${cudaq_metapackage} && readme=README.md
 
           # Setup files for validate_pycudaq.sh script
+          # Important: Notebooks are *not* validated by validate_pycudaq.sh.
           cp    $GITHUB_WORKSPACE/scripts/validate_pycudaq.sh .
           cp -r $GITHUB_WORKSPACE/docs/sphinx/examples/python /tmp/examples/
-          cp -r $GITHUB_WORKSPACE/docs/sphinx/applications/python /tmp/applications/
-          cp -r $GITHUB_WORKSPACE/docs/sphinx/targets/python /tmp/targets/
           cp -r $GITHUB_WORKSPACE/docs/sphinx/snippets/python /tmp/snippets/
           cp -r $GITHUB_WORKSPACE/python/tests /tmp/tests/
           cp    $GITHUB_WORKSPACE/$readme /tmp/README.md
+          # Target tests should not be run here either, since that requires credentials but doesn't require a GPU runner.
+
+          # The NVQC API key is needed to validate NVQC related snippets and examples.
+          export NVQC_API_KEY="${{ secrets.NVQC_PROD_SERVICE_KEY }}"
 
           # Run the script w/ -q to run a shortened test
           set +e # Allow script to keep going through errors (needed for skipped tests)
@@ -1158,7 +1166,7 @@ jobs:
   create_release:
     name: CUDA-Q Release
     needs: [assets, cudaq_images, cudaq_installers, cudaq_wheels, cudaq_metapackages]
-    if: needs.assets.outputs.release_title && inputs.github_commit == '' && inputs.assets_from_run == '' && inputs.nvidia_mgpu_commit == ''
+    if: needs.assets.outputs.release_title && inputs.github_commit == '' && inputs.nvidia_mgpu_commit == ''
     runs-on: ubuntu-latest
 
     environment:
diff --git a/.style.yapf b/.style.yapf
new file mode 100644
index 0000000000..0e9640c29a
--- /dev/null
+++ b/.style.yapf
@@ -0,0 +1,2 @@
+[style]
+based_on_style = google
diff --git a/docs/sphinx/applications/python/deutschs_algorithm.ipynb b/docs/sphinx/applications/python/deutschs_algorithm.ipynb
index feaacacd9d..b1e281a0af 100644
--- a/docs/sphinx/applications/python/deutschs_algorithm.ipynb
+++ b/docs/sphinx/applications/python/deutschs_algorithm.ipynb
@@ -13,17 +13,14 @@
    "cell_type": "markdown",
    "metadata": {},
    "source": [
-    "We have a function which takes in a bit and outputs a bit. This can be represented as $f: \\{0,1\\} \\longrightarrow \\{0,1\\}$. \n",
+    "Deutsch's Algorithm is a concise demonstration of the differences in computational complexity between classical and quantum algorithms for certain problems.  For Deutsch's algorithm, we begin with a function which takes in a bit and outputs a bit. This can be represented as $f: \\{0,1\\} \\longrightarrow \\{0,1\\}$. \n",
+    "The function $f$ has the property that it is either constant or balanced. The goal of Deutsch's Algorithm is to determine whether our given function is constant or whether it is balanced. \n",
     "\n",
-    "The function $f$ has a property; either it is constant or balanced. \n",
+    "A constant function is a function such that the outputs are the same regardless of the inputs, i.e., in the case of $f: \\{0,1\\} \\longrightarrow \\{0,1\\}$, there are two ways in which this can occur: $f(0) = f(1) = 0$ or $f(0) = f(1) = 1$.\n",
     "\n",
-    "If constant, the outputs are the same regardless of the inputs, i.e., $f(0) = f(1) = 0$ or $f(0) = f(1) = 1$.\n",
-    "\n",
-    "If balanced, the ouputs are balanced across their possibilities, i.e,  if $f(0) = 0$ then $f(1) = 1$ or if $f(0) = 1$ then $f(1) = 0$.\n",
-    "\n",
-    "The question we would like to answer is if the function is constant or balanced. \n",
+    "A balanced function is defined such that the outputs are balanced across their possibilities, i.e., if $f(0) = 0$ then $f(1) = 1$ or if $f(0) = 1$ then $f(1) = 0$.\n",
     " \n",
-    "Classically, if we are given a function $f$, we can solve to find its property via the code below: \n"
+    "Classically, if we are given a function $f: \\{0,1\\} \\longrightarrow \\{0,1\\}$, we can determine if it is constant or balanced by evaluating the function at $0$ and at $1$.  This is carried out in the code below: \n"
    ]
   },
   {
@@ -96,11 +93,11 @@
    "cell_type": "markdown",
    "metadata": {},
    "source": [
-    "If you step through the `if` statements above, one can see that we require 2 calls to the function to determine its property. That is, we have to query $f$ twice.\n",
+    "If you step through the `if` statements above, you may notice that we require 2 calls to the function to determine its property. That is, we have to query $f$ twice.\n",
     "\n",
-    "The claim is that Deutsch's algorithm can solve for this property with 1 function evalulation, demonstrating quantum advantage. \n",
+    "The claim is that Deutsch's Algorithm can determine if a given function is constant or balanced with just 1 function evaluation, demonstrating quantum advantage. \n",
     "\n",
-    "Below we first go through the math and then the implementation in CUDA Quantum. \n",
+    "Below we first outline Deutsch's Algorithm and work through the math to verify that it does as promised. Then, we provide the implementation in CUDA-Q. \n",
     "\n"
    ]
   },
@@ -130,7 +127,7 @@
     "\n",
     "<img src=\"images/oracle.png\"  width=\"300\" height=\"150\">\n",
     "\n",
-    "Suppose we have $f(x): \\{0,1\\} \\longrightarrow \\{0,1\\}$. We can compute this function on a quantum computer using oracles which we treat as black box functions that yield the output with an appropriate sequence of logic gates. \n",
+    "Suppose we have $f(x): \\{0,1\\} \\longrightarrow \\{0,1\\}$. We can compute this function on a quantum computer using oracles which we treat as black box functions that yield the output with an appropriate sequence of logical gates. \n",
     "\n",
     "Above you see an oracle represented as $U_f$ which allows us to transform the state $\\ket{x}\\ket{y}$ into: \n",
     "\n",
@@ -140,7 +137,7 @@
     "\\end{aligned}\n",
     "$$\n",
     "\n",
-    "If $y = 0$, then $U_f\\ket{x}\\ket{y} = U_f\\ket{x}\\ket{0}  =   \\ket{x}\\ket{0 \\oplus f(x)} =  \\ket{x}\\ket{f(x)}$ since $f(x)$ can either be $0/1$ and $0 \\oplus 0 = 0$ and $0 \\oplus 1 = 1$.\n",
+    "If $y = 0$, then $U_f\\ket{x}\\ket{y} = U_f\\ket{x}\\ket{0}  =   \\ket{x}\\ket{0 \\oplus f(x)} =  \\ket{x}\\ket{f(x)}$, since $f(x)$ can either be $0$ or $1$ and $0 \\oplus 0 = 0$ and $0 \\oplus 1 = 1$.\n",
     "\n",
     "This is remarkable because by setting $\\ket{y} = \\ket{0}$, we can extract the value of $f(x)$ by measuring the value of the second qubit. \n",
     " \n",
@@ -213,7 +210,7 @@
    "cell_type": "markdown",
    "metadata": {},
    "source": [
-    "## Deutschs' Algorithm: \n",
+    "## Deutsch's Algorithm: \n",
     "\n",
     "Our aim is to find out if $f: \\{0,1\\} \\longrightarrow \\{0,1\\}$ is a constant or a balanced function? If constant, $f(0) = f(1)$, and if balanced, $f(0) \\neq f(1)$.\n",
     "\n",
@@ -296,18 +293,9 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 3,
+   "execution_count": null,
    "metadata": {},
-   "outputs": [
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "/usr/local/lib/python3.10/dist-packages/qutip/__init__.py:66: UserWarning: The new version of Cython, (>= 3.0.0) is not supported.\n",
-      "  warnings.warn(\n"
-     ]
-    }
-   ],
+   "outputs": [],
    "source": [
     "# Import the CUDA-Q package and set the target to run on NVIDIA GPUs.\n",
     "\n",
@@ -391,6 +379,17 @@
     "elif np.array(result)[0] == '1':\n",
     "    print('f(x) is a balanced function')"
    ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "This algorithm can be generalized to determine if an $n$-bit function $f: \\{0,1\\}^n \\longrightarrow \\{0,1\\}$ is constant or balanced with only $\\frac{n}{2}$ function evaluations, for $n$ even.  A function is balanced if half of the inputs map to $0$ and half map to $1$.  \n",
+    "\n",
+    "Here we must assume that the function that we are given is either constant or balanced since there are $n$-bit functions that are neither constant, nor balanced.  For instance the $2$-bit function $f(b_0,b_1) = \\max(b_0,b_1)$ is neither balanced, nor constant.\n",
+    "\n",
+    "A hint on how you might approach this problem is to first solve the problem for $n=2$ and see if you can then use that approach to handle $n$-bit functions for larger values of $n$."
+   ]
   }
  ],
  "metadata": {
diff --git a/docs/sphinx/releases.rst b/docs/sphinx/releases.rst
index de3e12d78b..8e455ff9dd 100644
--- a/docs/sphinx/releases.rst
+++ b/docs/sphinx/releases.rst
@@ -12,6 +12,27 @@ and is also available as a Docker image. More information about installing the n
 - `Documentation <https://nvidia.github.io/cuda-quantum/latest>`__
 - `Examples <https://github.com/NVIDIA/cuda-quantum/tree/main/docs/sphinx/examples>`__
 
+**0.9.1**
+
+This release adds support for using 
+`Amazon Braket <https://nvidia.github.io/cuda-quantum/0.9.1/using/backends/hardware.html#amazon-braket>`__ and 
+`Infleqtion's Superstaq <https://nvidia.github.io/cuda-quantum/0.9.1/using/backends/hardware.html#infleqtion>`__ as backends.
+
+Starting with this release, all C++ quantum kernels will be processed by the `nvq++` compiler regardless of whether 
+they run on a simulator or on a quantum hardware backend. This change is largely non-breaking, but language constructs 
+that are not officially supported within quantum kernels will now lead to a compilation error whereas previously they 
+could be used when executing on a simulator only. The previous behavior can be forced by passing the `--library-mode` 
+flag to the compiler. Please note that if you do so, however, the code will never be executable outside of a simulator 
+and may not be supported even on simulators.
+
+- `Docker image <https://catalog.ngc.nvidia.com/orgs/nvidia/teams/quantum/containers/cuda-quantum>`__
+- `Python wheel <https://pypi.org/project/cudaq/0.9.1>`__
+- `C++ installer <https://github.com/NVIDIA/cuda-quantum/releases/0.9.1>`__
+- `Documentation <https://nvidia.github.io/cuda-quantum/0.9.1>`__
+- `Examples <https://github.com/NVIDIA/cuda-quantum/tree/releases/v0.9.1/docs/sphinx/examples>`__
+
+The full change log can be found `here <https://github.com/NVIDIA/cuda-quantum/releases/0.9.1>`__.
+
 **0.9.0**
 
 We are very excited to share a new toolset added for modeling and manipulating the dynamics of physical systems. 
@@ -21,12 +42,12 @@ The 0.9.0 release furthermore includes a range of contribution to add new backen
 from `Anyon Technologies <https://nvidia.github.io/cuda-quantum/0.9.0/using/backends/hardware.html#anyon-technologies-anyon-computing>`__, 
 `Ferimioniq <https://nvidia.github.io/cuda-quantum/0.9.0/using/backends/simulators.html#fermioniq>`__, and 
 `QuEra Computing <https://nvidia.github.io/cuda-quantum/0.9.0/using/backends/hardware.html#quera-computing>`__, 
-as well as updates to existing backends from `ORCA <https://nvidia.github.io/cuda-quantum/latest/using/backends/hardware.html#orca-computing>`__ 
+as well as updates to existing backends from `ORCA <https://nvidia.github.io/cuda-quantum/0.9.0/using/backends/hardware.html#orca-computing>`__ 
 and `OQC <https://nvidia.github.io/cuda-quantum/0.9.0/using/backends/hardware.html#oqc>`__.
 We hope you enjoy the new features - also check out our new notebooks and examples to dive into CUDA-Q.
 
-- `Docker image <https://catalog.ngc.nvidia.com/orgs/nvidia/teams/quantum/containers/cuda-quantum>`__
-- `Python wheel <https://pypi.org/project/cuda-quantum/0.9.0>`__
+- `Docker image <https://catalog.ngc.nvidia.com/orgs/nvidia/teams/quantum/containers/cuda-quantum/tags>`__
+- `Python wheel <https://pypi.org/project/cudaq/0.9.0>`__
 - `C++ installer <https://github.com/NVIDIA/cuda-quantum/releases/0.9.0>`__
 - `Documentation <https://nvidia.github.io/cuda-quantum/0.9.0>`__
 - `Examples <https://github.com/NVIDIA/cuda-quantum/tree/releases/v0.9.0/docs/sphinx/examples>`__
diff --git a/include/cudaq/Optimizer/CodeGen/CMakeLists.txt b/include/cudaq/Optimizer/CodeGen/CMakeLists.txt
index 5c2c15f8d7..c0140aefa4 100644
--- a/include/cudaq/Optimizer/CodeGen/CMakeLists.txt
+++ b/include/cudaq/Optimizer/CodeGen/CMakeLists.txt
@@ -12,7 +12,3 @@ add_cudaq_dialect_doc(CodeGenDialect codegen)
 set(LLVM_TARGET_DEFINITIONS Passes.td)
 mlir_tablegen(Passes.h.inc -gen-pass-decls -name OptCodeGen)
 add_public_tablegen_target(OptCodeGenPassIncGen)
-
-set(LLVM_TARGET_DEFINITIONS Peephole.td)
-mlir_tablegen(Peephole.inc -gen-rewriters)
-add_public_tablegen_target(OptPeepholeIncGen)
diff --git a/lib/Optimizer/CodeGen/CodeGenDialect.h b/include/cudaq/Optimizer/CodeGen/CodeGenDialect.h
similarity index 100%
rename from lib/Optimizer/CodeGen/CodeGenDialect.h
rename to include/cudaq/Optimizer/CodeGen/CodeGenDialect.h
diff --git a/include/cudaq/Optimizer/CodeGen/Peephole.h b/include/cudaq/Optimizer/CodeGen/Peephole.h
index 4fdca9bd02..f5eae54c4b 100644
--- a/include/cudaq/Optimizer/CodeGen/Peephole.h
+++ b/include/cudaq/Optimizer/CodeGen/Peephole.h
@@ -38,9 +38,8 @@ inline bool isIntToPtrOp(mlir::Value operand) {
 static constexpr char resultIndexName[] = "result.index";
 
 inline mlir::Value createMeasureCall(mlir::PatternRewriter &builder,
-                                     mlir::Location loc, mlir::OpResult result,
+                                     mlir::Location loc, mlir::LLVM::CallOp op,
                                      mlir::ValueRange args) {
-  auto op = cast<mlir::LLVM::CallOp>(result.getDefiningOp());
   auto ptrTy = cudaq::opt::getResultType(builder.getContext());
   if (auto intAttr =
           dyn_cast_or_null<mlir::IntegerAttr>(op->getAttr(resultIndexName))) {
@@ -57,7 +56,7 @@ inline mlir::Value createMeasureCall(mlir::PatternRewriter &builder,
 
 inline mlir::Value createReadResultCall(mlir::PatternRewriter &builder,
                                         mlir::Location loc,
-                                        mlir::OpResult result) {
+                                        mlir::Value result) {
   auto i1Ty = mlir::IntegerType::get(builder.getContext(), 1);
   return builder
       .create<mlir::LLVM::CallOp>(loc, mlir::TypeRange{i1Ty},
@@ -65,7 +64,3 @@ inline mlir::Value createReadResultCall(mlir::PatternRewriter &builder,
                                   mlir::ArrayRef<mlir::Value>{result})
       .getResult();
 }
-
-namespace {
-#include "cudaq/Optimizer/CodeGen/Peephole.inc"
-}
diff --git a/include/cudaq/Optimizer/CodeGen/Peephole.td b/include/cudaq/Optimizer/CodeGen/Peephole.td
deleted file mode 100644
index 32f32b5d21..0000000000
--- a/include/cudaq/Optimizer/CodeGen/Peephole.td
+++ /dev/null
@@ -1,177 +0,0 @@
-/********************************************************** -*- tablegen -*- ***
- * Copyright (c) 2022 - 2024 NVIDIA Corporation & Affiliates.                  *
- * All rights reserved.                                                        *
- *                                                                             *
- * This source code and the accompanying materials are made available under    *
- * the terms of the Apache License 2.0 which accompanies this distribution.    *
- ******************************************************************************/
-
-#ifndef NVQPP_OPTIMIZER_CODEGEN_PEEPHOLE
-#define NVQPP_OPTIMIZER_CODEGEN_PEEPHOLE
-
-include "cudaq/Optimizer/Dialect/Quake/QuakeOps.td"
-include "mlir/Dialect/LLVMIR/LLVMOps.td"
-include "mlir/IR/OpBase.td"
-include "mlir/IR/PatternBase.td"
-
-//===----------------------------------------------------------------------===//
-
-def InvokeOnXWithOneControl : Constraint<CPred<
-    "$0 && callToInvokeWithXCtrlOneTarget($0.getValue(), $1)">>;
-
-def CreateCallCnot : NativeCodeCall<
-    "[&]() -> std::size_t {"
-    "  $_builder.create<mlir::LLVM::CallOp>($_loc,"
-    "    mlir::TypeRange{}, cudaq::opt::QIRCnot, $0.drop_front(2));"
-    "  return 0; }()">;
-
-// %1 = address_of @__quantum__qis__x__ctl
-// %2 = call @invokewithControlBits %1, %ctrl, %targ
-// ─────────────────────────────────────────────────
-// %2 = call __quantum__qis__cnot %ctrl, %targ
-def XCtrlOneTargetToCNot : Pat<
-    (LLVM_CallOp $callee, $args, $_, $_), (CreateCallCnot $args),
-    [(InvokeOnXWithOneControl $callee, $args)]>;
-
-//===----------------------------------------------------------------------===//
-
-def NeedsRenaming : Constraint<CPred<"$0 && needsToBeRenamed($0.getValue())">>;
-
-def CreateAddressOf : NativeCodeCall<
-    "$_builder.create<mlir::LLVM::AddressOfOp>($_loc, $0.getType(),"
-    "  $1.getValue().str() + \"__body\")">;
-
-// %4 = address_of @__quantum__cis__*
-// ────────────────────────────────────────
-// %4 = address_of @__quantum__cis__*__body
-def AddrOfCisToBase : Pat<
-    (LLVM_AddressOfOp:$addr $global), (CreateAddressOf $addr, $global),
-    [(NeedsRenaming $global)]>;
-
-//===----------------------------------------------------------------------===//
-
-// Apply special rule for `mz`. See below.
-def FuncNotMeasure : Constraint<CPred<
-    "!($_self && $_self.getValue().startswith(cudaq::opt::QIRMeasure))">>;
-
-def CreateCallOp : NativeCodeCall<
-    "[&]() -> std::size_t {"
-    "  $_builder.create<mlir::LLVM::CallOp>($_loc, mlir::TypeRange{},"
-    "    mlir::FlatSymbolRefAttr::get($_builder.getContext(),"
-    "    $0.getValue().str() + \"__body\"), $1, $2, $3);"
-    "  return 0; }()">;
-
-// %4 = call @__quantum__cis__*
-// ──────────────────────────────────
-// %4 = call @__quantum__cis__*__body
-def CalleeConv : Pat<
-      (LLVM_CallOp $callee, $args, $fm, $bw), 
-      (CreateCallOp $callee, $args, $fm, $bw),
-      [(NeedsRenaming $callee), (FuncNotMeasure:$callee)]>;
-
-//===----------------------------------------------------------------------===//
-
-def IsArrayGetElementPtrId : Constraint<CPred<
-    "$0 && $0.getValue().str() == cudaq::opt::QIRArrayGetElementPtr1d">>;
-
-def EraseArrayGEPOp : NativeCodeCall<
-    "$_builder.create<mlir::LLVM::UndefOp>($_loc,"
-    "  cudaq::opt::getQubitType($_builder.getContext()))">;
-
-def EraseDeadArrayGEP : Pat<
-    (LLVM_CallOp:$call $callee, $_, $_, $_), (EraseArrayGEPOp),
-    [(IsArrayGetElementPtrId $callee), (HasNoUseOf:$call)]>;
-
-//===----------------------------------------------------------------------===//
-
-def IsaAllocateCall : Constraint<CPred<
-    "$0 && $0.getValue().str() == cudaq::opt::QIRArrayQubitAllocateArray">>;
-
-def EraseArrayAllocateOp : NativeCodeCall<
-    "$_builder.create<mlir::LLVM::UndefOp>($_loc,"
-    "  cudaq::opt::getArrayType($_builder.getContext()))">;
-
-// Replace the call with a dead op to DCE.
-//
-// %0 = call @allocate ... : ... -> T*
-// ───────────────────────────────────
-// %0 = undef : T*
-def EraseArrayAlloc : Pat<
-    (LLVM_CallOp $callee, $_, $_, $_), (EraseArrayAllocateOp),
-    [(IsaAllocateCall $callee)]>;
-
-//===----------------------------------------------------------------------===//
-
-def IsaReleaseCall : Constraint<CPred<
-    "$0 && ($0.getValue().str() == cudaq::opt::QIRArrayQubitReleaseArray || "
-    "$0.getValue().str() == cudaq::opt::QIRArrayQubitReleaseQubit)">>;
-
-def EraseArrayReleaseOp : NativeCodeCall<"static_cast<std::size_t>(0)">;
-
-// Remove the release calls. This removes both array allocations as well as
-// qubit singletons.
-//
-// call @release %5 : (!Qubit) -> ()
-// ─────────────────────────────────
-def EraseArrayRelease : Pat<
-    (LLVM_CallOp $callee, $_, $_, $_), (EraseArrayReleaseOp),
-    [(IsaReleaseCall $callee)]>;
-
-//===----------------------------------------------------------------------===//
-
-def IsaMeasureCall : Constraint<CPred<
-    "$_self && $_self.getValue() == cudaq::opt::QIRMeasure">>;
-    
-def IsaIntToPtrOperand : Constraint<CPred<"isIntToPtrOp($0[0])">>;
-
-def CreateMeasureCall : NativeCodeCall<
-    "createMeasureCall($_builder, $_loc, $0, $1)">;
-
-// %result = call @__quantum__qis__mz(%qbit) : (!Qubit) -> i1
-// ──────────────────────────────────────────────────────────────
-// call @__quantum__qis__mz_body(%qbit, %result) : (Q*, R*) -> ()
-def MeasureCallConv : Pat<
-    (LLVM_CallOp:$call $callee, $args, $_, $_),
-    (CreateMeasureCall $call, $args),
-    [(IsaMeasureCall:$callee), (IsaIntToPtrOperand $args)]>;
-
-//===----------------------------------------------------------------------===//
-
-def IsaMeasureToRegisterCall : Constraint<CPred<
-    "$_self && $_self.getValue() == cudaq::opt::QIRMeasureToRegister">>;
-
-// %result = call @__quantum__qis__mz__to__register(%qbit, i8) : (!Qubit) -> i1
-// ────────────────────────────────────────────────────────────────────────────
-// call @__quantum__qis__mz_body(%qbit, %result) : (Q*, R*) -> ()
-def MeasureToRegisterCallConv : Pat<
-    (LLVM_CallOp:$call $callee, $args, $_, $_),
-    (CreateMeasureCall $call, $args),
-    [(IsaMeasureToRegisterCall:$callee), (IsaIntToPtrOperand $args)]>;
-
-//===----------------------------------------------------------------------===//
-
-def HasI1PtrType : Constraint<CPred<
-    "$_self.getType() == cudaq::opt::factory::getPointerType("
-    "  mlir::IntegerType::get($_self.getContext(), 1))">>;
-
-def HasResultType : Constraint<CPred<
-    "$_self.getType() == cudaq::opt::getResultType($_self.getContext())">>;
-
-def IsaIntAttr : Constraint<CPred<"$_self.isa<mlir::IntegerAttr>()">>;
-
-def CreateReadResultCall : NativeCodeCall<
-    "createReadResultCall($_builder, $_loc, $0)">;
-
-// %1 = llvm.constant 1
-// %2 = llvm.inttoptr %1 : i64 -> Result*
-// %3 = llvm.bitcast %2 : Result* -> i1*
-// %4 = llvm.load %3
-// ─────────────────────────────────────
-// %4 = call @read_result %2
-def LoadMeasureResult : Pat<
-    (LLVM_LoadOp:$load (LLVM_BitcastOp:$bitcast (LLVM_IntToPtrOp:$cast
-                       (LLVM_ConstantOp $attr))), $_, $_, $_, $_, $_, $_),
-    (CreateReadResultCall $cast),
-    [(HasI1PtrType:$bitcast), (HasResultType:$cast), (IsaIntAttr:$attr)]>;
-
-#endif
diff --git a/include/cudaq/Optimizer/Dialect/Quake/CMakeLists.txt b/include/cudaq/Optimizer/Dialect/Quake/CMakeLists.txt
index d038abd040..6dca96dc83 100644
--- a/include/cudaq/Optimizer/Dialect/Quake/CMakeLists.txt
+++ b/include/cudaq/Optimizer/Dialect/Quake/CMakeLists.txt
@@ -9,7 +9,3 @@
 add_cudaq_dialect(Quake quake)
 add_cudaq_interface(QuakeInterfaces)
 add_cudaq_dialect_doc(QuakeDialect quake)
-
-set(LLVM_TARGET_DEFINITIONS Canonical.td)
-mlir_tablegen(Canonical.inc -gen-rewriters)
-add_public_tablegen_target(CanonicalIncGen)
diff --git a/include/cudaq/Optimizer/Dialect/Quake/Canonical.td b/include/cudaq/Optimizer/Dialect/Quake/Canonical.td
deleted file mode 100644
index d7aec89e6f..0000000000
--- a/include/cudaq/Optimizer/Dialect/Quake/Canonical.td
+++ /dev/null
@@ -1,67 +0,0 @@
-/********************************************************** -*- tablegen -*- ***
- * Copyright (c) 2022 - 2024 NVIDIA Corporation & Affiliates.                  *
- * All rights reserved.                                                        *
- *                                                                             *
- * This source code and the accompanying materials are made available under    *
- * the terms of the Apache License 2.0 which accompanies this distribution.    *
- ******************************************************************************/
-
-#ifndef NVQPP_OPTIMIZER_DIALECT_QUAKE_CANONICAL
-#define NVQPP_OPTIMIZER_DIALECT_QUAKE_CANONICAL
-
-include "mlir/IR/OpBase.td"
-include "mlir/IR/PatternBase.td"
-include "mlir/Dialect/Arith/IR/ArithOps.td"
-include "cudaq/Optimizer/Dialect/Quake/QuakeOps.td"
-
-def KnownSizePred : Constraint<
-      CPred<"$0.getType().isa<quake::VeqType>() && "
-            "$0.getType().cast<quake::VeqType>().hasSpecifiedSize()">>;
-
-def UnknownSizePred : Constraint<
-      CPred<"$0.getType().isa<quake::VeqType>() && "
-            "!$0.getType().cast<quake::VeqType>().hasSpecifiedSize()">>;
-
-def createConstantOp : NativeCodeCall<
-      "$_builder.create<mlir::arith::ConstantOp>($_loc, $0.getType(),"
-      "  $_builder.getIntegerAttr($0.getType(),"
-      "   $1.getType().cast<quake::VeqType>().getSize()))">;
-
-// %4 = quake.veq_size %3 : (!quake.veq<10>) -> 164
-// ────────────────────────────────────────────────
-// %4 = constant 10 : i64
-def ForwardConstantVeqSizePattern : Pat<
-      (quake_VeqSizeOp:$res $veq), (createConstantOp $res, $veq),
-      [(KnownSizePred $veq)]>;
-
-def SizeIsPresentPred : Constraint<CPred<
-      "$0.size() == 1 &&"
-      "isa<mlir::arith::ConstantIntOp, mlir::arith::ConstantIndexOp>("
-      "  $0[0].getDefiningOp())">>;
-
-def createAllocaOp : NativeCodeCall<
-      "quake::createConstantAlloca($_builder, $_loc, $0, $1)">;
-
-// %2 = constant 10 : i32
-// %3 = quake.alloca !quake.veq<?>[%2 : i32]
-// ───────────────────────────────────────────
-// %3 = quake.alloca !quake.veq<10>
-def FuseConstantToAllocaPattern : Pat<
-      (quake_AllocaOp:$alloca $optSize), (createAllocaOp $alloca, $optSize),
-      [(SizeIsPresentPred $optSize)]>;
-
-def createExtractRefOp : NativeCodeCall<
-      "$_builder.create<quake::ExtractRefOp>($_loc, $0,"
-      " cast<mlir::arith::ConstantOp>($1[0].getDefiningOp()).getValue()."
-      " cast<mlir::IntegerAttr>().getInt())">;
-
-// %2 = constant 10 : i32
-// %3 = quake.extract_ref %1[%2] : (!quake.veq<?>, i32) -> !quake.ref
-// ───────────────────────────────────────────
-// %3 = quake.extract_ref %1[10] : (!quake.veq<?>) -> !quake.ref
-def FuseConstantToExtractRefPattern : Pat<
-      (quake_ExtractRefOp $veq, $index, $rawIndex),
-      (createExtractRefOp $veq, $index),
-      [(SizeIsPresentPred $index)]>;
-
-#endif
diff --git a/include/cudaq/Optimizer/Dialect/Quake/QuakeTypes.td b/include/cudaq/Optimizer/Dialect/Quake/QuakeTypes.td
index 7dd31469fd..0aac6fa46c 100644
--- a/include/cudaq/Optimizer/Dialect/Quake/QuakeTypes.td
+++ b/include/cudaq/Optimizer/Dialect/Quake/QuakeTypes.td
@@ -153,9 +153,12 @@ def VeqType : QuakeType<"Veq", "veq"> {
   let hasCustomAssemblyFormat = 1;
   
   let extraClassDeclaration = [{
-    bool hasSpecifiedSize() const { return getSize(); }
+    static constexpr std::size_t kDynamicSize =
+      std::numeric_limits<std::size_t>::max();
+
+    bool hasSpecifiedSize() const { return getSize() != kDynamicSize; }
     static VeqType getUnsized(mlir::MLIRContext *ctx) {
-      return VeqType::get(ctx, 0);
+      return VeqType::get(ctx, kDynamicSize);
     }
   }];
 }
diff --git a/include/cudaq/Optimizer/InitAllDialects.h b/include/cudaq/Optimizer/InitAllDialects.h
index 0748b5866a..54b1ac29f7 100644
--- a/include/cudaq/Optimizer/InitAllDialects.h
+++ b/include/cudaq/Optimizer/InitAllDialects.h
@@ -16,7 +16,6 @@
 #include "mlir/Dialect/Func/IR/FuncOps.h"
 #include "mlir/Dialect/LLVMIR/LLVMDialect.h"
 #include "mlir/Dialect/Math/IR/Math.h"
-#include "mlir/Dialect/MemRef/IR/MemRef.h"
 
 namespace cudaq {
 
@@ -31,9 +30,8 @@ inline void registerAllDialects(mlir::DialectRegistry &registry) {
     mlir::func::FuncDialect,
     mlir::LLVM::LLVMDialect,
     mlir::math::MathDialect,
-    mlir::memref::MemRefDialect,
 
-    // NVQ++ dialects
+    // CUDA-Q dialects
     cudaq::cc::CCDialect,
     quake::QuakeDialect
   >();
diff --git a/lib/Optimizer/CodeGen/CMakeLists.txt b/lib/Optimizer/CodeGen/CMakeLists.txt
index 5c056e0e11..3739855b31 100644
--- a/lib/Optimizer/CodeGen/CMakeLists.txt
+++ b/lib/Optimizer/CodeGen/CMakeLists.txt
@@ -37,7 +37,6 @@ add_cudaq_library(OptCodeGen
     CodeGenOpsIncGen
     CodeGenTypesIncGen
     OptCodeGenPassIncGen
-    OptPeepholeIncGen
     OptTransformsPassIncGen
     QuakeDialect
     
diff --git a/lib/Optimizer/CodeGen/CodeGenDialect.cpp b/lib/Optimizer/CodeGen/CodeGenDialect.cpp
index 93204693a6..76665aea3c 100644
--- a/lib/Optimizer/CodeGen/CodeGenDialect.cpp
+++ b/lib/Optimizer/CodeGen/CodeGenDialect.cpp
@@ -6,7 +6,7 @@
  * the terms of the Apache License 2.0 which accompanies this distribution.    *
  ******************************************************************************/
 
-#include "CodeGenDialect.h"
+#include "cudaq/Optimizer/CodeGen/CodeGenDialect.h"
 #include "CodeGenOps.h"
 #include "mlir/IR/DialectImplementation.h"
 
diff --git a/lib/Optimizer/CodeGen/CodeGenTypes.cpp b/lib/Optimizer/CodeGen/CodeGenTypes.cpp
index fe2ab43058..bc871513bf 100644
--- a/lib/Optimizer/CodeGen/CodeGenTypes.cpp
+++ b/lib/Optimizer/CodeGen/CodeGenTypes.cpp
@@ -7,7 +7,7 @@
  ******************************************************************************/
 
 #include "CodeGenTypes.h"
-#include "CodeGenDialect.h"
+#include "cudaq/Optimizer/CodeGen/CodeGenDialect.h"
 #include "llvm/ADT/TypeSwitch.h"
 #include "mlir/IR/Builders.h"
 #include "mlir/IR/DialectImplementation.h"
diff --git a/lib/Optimizer/CodeGen/ConvertCCToLLVM.cpp b/lib/Optimizer/CodeGen/ConvertCCToLLVM.cpp
index 7ad2618a4f..1b8ed8264b 100644
--- a/lib/Optimizer/CodeGen/ConvertCCToLLVM.cpp
+++ b/lib/Optimizer/CodeGen/ConvertCCToLLVM.cpp
@@ -6,9 +6,9 @@
  * the terms of the Apache License 2.0 which accompanies this distribution.    *
  ******************************************************************************/
 
-#include "CodeGenDialect.h"
 #include "cudaq/Optimizer/Builder/Intrinsics.h"
 #include "cudaq/Optimizer/CodeGen/CCToLLVM.h"
+#include "cudaq/Optimizer/CodeGen/CodeGenDialect.h"
 #include "cudaq/Optimizer/CodeGen/Passes.h"
 #include "cudaq/Optimizer/Dialect/CC/CCOps.h"
 #include "cudaq/Optimizer/Dialect/CC/CCTypes.h"
diff --git a/lib/Optimizer/CodeGen/ConvertToQIR.cpp b/lib/Optimizer/CodeGen/ConvertToQIR.cpp
index 738ee66ea1..1eaba931b3 100644
--- a/lib/Optimizer/CodeGen/ConvertToQIR.cpp
+++ b/lib/Optimizer/CodeGen/ConvertToQIR.cpp
@@ -45,6 +45,8 @@ namespace cudaq::opt {
 
 using namespace mlir;
 
+#include "PeepholePatterns.inc"
+
 /// Greedy pass to match subgraphs in the IR and replace them with codegen ops.
 /// This step makes converting a DAG of nodes in the conversion step simpler.
 static LogicalResult fuseSubgraphPatterns(MLIRContext *ctx, ModuleOp module) {
diff --git a/lib/Optimizer/CodeGen/ConvertToQIRProfile.cpp b/lib/Optimizer/CodeGen/ConvertToQIRProfile.cpp
index 2ae90d302b..f3aad7c60c 100644
--- a/lib/Optimizer/CodeGen/ConvertToQIRProfile.cpp
+++ b/lib/Optimizer/CodeGen/ConvertToQIRProfile.cpp
@@ -32,6 +32,8 @@
 
 using namespace mlir;
 
+#include "PeepholePatterns.inc"
+
 /// For a call to `__quantum__rt__qubit_allocate_array`, get the number of
 /// qubits allocated.
 static std::size_t getNumQubits(LLVM::CallOp callOp) {
@@ -516,7 +518,7 @@ namespace {
 /// trivial pass only does this preparation work. It performs no analysis and
 /// does not rewrite function body's, etc.
 
-static const std::vector<std::string> measurementFunctionNames{
+static constexpr std::array<const char *, 3> measurementFunctionNames{
     cudaq::opt::QIRMeasureBody, cudaq::opt::QIRMeasure,
     cudaq::opt::QIRMeasureToRegister};
 
@@ -564,7 +566,7 @@ struct QIRProfilePreparationPass
               func.getFunctionType().getParams(), module);
 
     // Apply irreversible attribute to measurement functions
-    for (auto &funcName : measurementFunctionNames) {
+    for (auto *funcName : measurementFunctionNames) {
       Operation *op = SymbolTable::lookupSymbolIn(module, funcName);
       auto funcOp = llvm::dyn_cast_if_present<LLVM::LLVMFuncOp>(op);
       if (funcOp) {
diff --git a/lib/Optimizer/CodeGen/PassDetails.h b/lib/Optimizer/CodeGen/PassDetails.h
index 719c03b391..7cfe50b2d4 100644
--- a/lib/Optimizer/CodeGen/PassDetails.h
+++ b/lib/Optimizer/CodeGen/PassDetails.h
@@ -8,7 +8,7 @@
 
 #pragma once
 
-#include "CodeGenDialect.h"
+#include "cudaq/Optimizer/CodeGen/CodeGenDialect.h"
 #include "cudaq/Optimizer/Dialect/CC/CCDialect.h"
 #include "cudaq/Optimizer/Dialect/Quake/QuakeDialect.h"
 #include "mlir/Dialect/Func/IR/FuncOps.h"
diff --git a/lib/Optimizer/CodeGen/PeepholePatterns.inc b/lib/Optimizer/CodeGen/PeepholePatterns.inc
new file mode 100644
index 0000000000..ad6cc64fe8
--- /dev/null
+++ b/lib/Optimizer/CodeGen/PeepholePatterns.inc
@@ -0,0 +1,238 @@
+/****************************************************************-*- C++ -*-****
+ * Copyright (c) 2022 - 2024 NVIDIA Corporation & Affiliates.                  *
+ * All rights reserved.                                                        *
+ *                                                                             *
+ * This source code and the accompanying materials are made available under    *
+ * the terms of the Apache License 2.0 which accompanies this distribution.    *
+ ******************************************************************************/
+
+namespace {
+
+//===----------------------------------------------------------------------===//
+
+// %1 = address_of @__quantum__qis__x__ctl
+// %2 = call @invokewithControlBits %1, %ctrl, %targ
+// ─────────────────────────────────────────────────
+// %2 = call __quantum__qis__cnot %ctrl, %targ
+struct XCtrlOneTargetToCNot : public OpRewritePattern<LLVM::CallOp> {
+  using OpRewritePattern::OpRewritePattern;
+
+  LogicalResult matchAndRewrite(LLVM::CallOp call,
+                                PatternRewriter &rewriter) const override {
+    auto callee = call.getCallee();
+    if (!callee)
+      return failure();
+    auto args = call.getOperands();
+    if (!callToInvokeWithXCtrlOneTarget(*callee, args))
+      return failure();
+    auto *ctx = rewriter.getContext();
+    auto funcSymbol = FlatSymbolRefAttr::get(ctx, cudaq::opt::QIRCnot);
+    rewriter.replaceOpWithNewOp<LLVM::CallOp>(
+        call, TypeRange{}, funcSymbol, args.drop_front(2),
+        call.getFastmathFlagsAttr(), call.getBranchWeightsAttr());
+    return success();
+  }
+};
+
+//===----------------------------------------------------------------------===//
+
+// %4 = address_of @__quantum__qis__*
+// ────────────────────────────────────────
+// %4 = address_of @__quantum__qis__*__body
+struct AddrOfCisToBase : public OpRewritePattern<LLVM::AddressOfOp> {
+  using OpRewritePattern::OpRewritePattern;
+
+  LogicalResult matchAndRewrite(LLVM::AddressOfOp addr,
+                                PatternRewriter &rewriter) const override {
+    auto global = addr.getGlobalName();
+    if (!needsToBeRenamed(global))
+      return failure();
+    rewriter.replaceOpWithNewOp<LLVM::AddressOfOp>(addr, addr.getType(),
+                                                   global.str() + "__body");
+    return success();
+  }
+};
+
+//===----------------------------------------------------------------------===//
+
+// This rule does not apply to measurements.
+//
+// %4 = call @__quantum__qis__*
+// ──────────────────────────────────
+// %4 = call @__quantum__qis__*__body
+struct CalleeConv : public OpRewritePattern<LLVM::CallOp> {
+  using OpRewritePattern::OpRewritePattern;
+
+  LogicalResult matchAndRewrite(LLVM::CallOp call,
+                                PatternRewriter &rewriter) const override {
+    auto callee = call.getCallee();
+    if (!callee)
+      return failure();
+    if (!needsToBeRenamed(*callee) ||
+        callee->startswith(cudaq::opt::QIRMeasure))
+      return failure();
+    auto *ctx = rewriter.getContext();
+    auto symbol = FlatSymbolRefAttr::get(ctx, callee->str() + "__body");
+    rewriter.replaceOpWithNewOp<LLVM::CallOp>(
+        call, TypeRange{}, symbol, call.getOperands(),
+        call.getFastmathFlagsAttr(), call.getBranchWeightsAttr());
+    return success();
+  }
+};
+
+//===----------------------------------------------------------------------===//
+
+// Manually erase dead calls to QIRArrayGetElementPtr1d.
+struct EraseDeadArrayGEP : public OpRewritePattern<LLVM::CallOp> {
+  using OpRewritePattern::OpRewritePattern;
+
+  LogicalResult matchAndRewrite(LLVM::CallOp call,
+                                PatternRewriter &rewriter) const override {
+    auto callee = call.getCallee();
+    if (!callee)
+      return failure();
+    if (*callee != cudaq::opt::QIRArrayGetElementPtr1d)
+      return failure();
+    if (!call->use_empty())
+      return failure();
+    rewriter.eraseOp(call);
+    return success();
+  }
+};
+
+//===----------------------------------------------------------------------===//
+
+// Replace the call with a dead op to DCE.
+//
+// %0 = call @allocate ... : ... -> T*
+// ───────────────────────────────────
+// %0 = undef : T*
+struct EraseArrayAlloc : public OpRewritePattern<LLVM::CallOp> {
+  using OpRewritePattern::OpRewritePattern;
+
+  LogicalResult matchAndRewrite(LLVM::CallOp call,
+                                PatternRewriter &rewriter) const override {
+    auto callee = call.getCallee();
+    if (!callee)
+      return failure();
+    if (*callee != cudaq::opt::QIRArrayQubitAllocateArray)
+      return failure();
+    auto *ctx = rewriter.getContext();
+    rewriter.replaceOpWithNewOp<LLVM::UndefOp>(call,
+                                               cudaq::opt::getArrayType(ctx));
+    return success();
+  }
+};
+
+//===----------------------------------------------------------------------===//
+
+// Remove the release calls. This removes both array allocations as well as
+// qubit singletons.
+//
+// call @release %5 : (!Qubit) -> ()
+// ─────────────────────────────────
+//
+struct EraseArrayRelease : public OpRewritePattern<LLVM::CallOp> {
+  using OpRewritePattern::OpRewritePattern;
+
+  LogicalResult matchAndRewrite(LLVM::CallOp call,
+                                PatternRewriter &rewriter) const override {
+    auto callee = call.getCallee();
+    if (!callee)
+      return failure();
+    if (*callee != cudaq::opt::QIRArrayQubitReleaseArray &&
+        *callee != cudaq::opt::QIRArrayQubitReleaseQubit)
+      return failure();
+    rewriter.eraseOp(call);
+    return success();
+  }
+};
+
+//===----------------------------------------------------------------------===//
+
+// %result = call @__quantum__qis__mz(%qbit) : (!Qubit) -> i1
+// ──────────────────────────────────────────────────────────────
+// call @__quantum__qis__mz_body(%qbit, %result) : (Q*, R*) -> ()
+struct MeasureCallConv : public OpRewritePattern<LLVM::CallOp> {
+  using OpRewritePattern::OpRewritePattern;
+
+  LogicalResult matchAndRewrite(LLVM::CallOp call,
+                                PatternRewriter &rewriter) const override {
+    auto callee = call.getCallee();
+    if (!callee)
+      return failure();
+    auto args = call.getOperands();
+    if (*callee != cudaq::opt::QIRMeasure)
+      return failure();
+    auto inttoptr = args[0].getDefiningOp<LLVM::IntToPtrOp>();
+    if (!inttoptr)
+      return failure();
+    rewriter.replaceOp(call,
+                       createMeasureCall(rewriter, call.getLoc(), call, args));
+    return success();
+  }
+};
+
+//===----------------------------------------------------------------------===//
+
+// %result = call @__quantum__qis__mz__to__register(%qbit, i8) : (!Qubit) -> i1
+// ────────────────────────────────────────────────────────────────────────────
+// call @__quantum__qis__mz_body(%qbit, %result) : (Q*, R*) -> ()
+struct MeasureToRegisterCallConv : public OpRewritePattern<LLVM::CallOp> {
+  using OpRewritePattern::OpRewritePattern;
+
+  LogicalResult matchAndRewrite(LLVM::CallOp call,
+                                PatternRewriter &rewriter) const override {
+    auto callee = call.getCallee();
+    if (!callee)
+      return failure();
+    auto args = call.getOperands();
+    if (*callee != cudaq::opt::QIRMeasureToRegister)
+      return failure();
+    auto inttoptr = args[0].getDefiningOp<LLVM::IntToPtrOp>();
+    if (!inttoptr)
+      return failure();
+    rewriter.replaceOp(call,
+                       createMeasureCall(rewriter, call.getLoc(), call, args));
+    return success();
+  }
+};
+
+//===----------------------------------------------------------------------===//
+
+// %1 = llvm.constant 1
+// %2 = llvm.inttoptr %1 : i64 -> Result*
+// %3 = llvm.bitcast %2 : Result* -> i1*
+// %4 = llvm.load %3
+// ─────────────────────────────────────
+// %4 = call @read_result %2
+struct LoadMeasureResult : public OpRewritePattern<LLVM::LoadOp> {
+  using OpRewritePattern::OpRewritePattern;
+
+  LogicalResult matchAndRewrite(LLVM::LoadOp load,
+                                PatternRewriter &rewriter) const override {
+    auto *ctx = rewriter.getContext();
+    auto bitcast = load.getAddr().getDefiningOp<LLVM::BitcastOp>();
+    if (!bitcast)
+      return failure();
+    auto inttoptr = bitcast.getArg().getDefiningOp<LLVM::IntToPtrOp>();
+    if (!inttoptr)
+      return failure();
+    auto conint = inttoptr.getArg().getDefiningOp<LLVM::ConstantOp>();
+    if (!conint)
+      return failure();
+    if (bitcast.getType() !=
+        cudaq::opt::factory::getPointerType(IntegerType::get(ctx, 1)))
+      return failure();
+    if (inttoptr.getType() != cudaq::opt::getResultType(ctx))
+      return failure();
+    if (!isa<IntegerAttr>(conint.getValue()))
+      return failure();
+
+    rewriter.replaceOp(load, createReadResultCall(rewriter, load.getLoc(),
+                                                  inttoptr.getResult()));
+    return success();
+  }
+};
+
+} // namespace
diff --git a/lib/Optimizer/CodeGen/VerifyQIRProfile.cpp b/lib/Optimizer/CodeGen/VerifyQIRProfile.cpp
index 02ccc932fa..6adbe833d2 100644
--- a/lib/Optimizer/CodeGen/VerifyQIRProfile.cpp
+++ b/lib/Optimizer/CodeGen/VerifyQIRProfile.cpp
@@ -9,7 +9,7 @@
 #include "PassDetails.h"
 #include "cudaq/Optimizer/Builder/Intrinsics.h"
 #include "cudaq/Optimizer/CodeGen/Passes.h"
-#include "cudaq/Optimizer/CodeGen/Peephole.h"
+#include "cudaq/Optimizer/CodeGen/QIRFunctionNames.h"
 #include "cudaq/Optimizer/Dialect/Quake/QuakeOps.h"
 #include "cudaq/Todo.h"
 #include "nlohmann/json.hpp"
diff --git a/lib/Optimizer/Dialect/Quake/CMakeLists.txt b/lib/Optimizer/Dialect/Quake/CMakeLists.txt
index 87733716c4..bc55e40d52 100644
--- a/lib/Optimizer/Dialect/Quake/CMakeLists.txt
+++ b/lib/Optimizer/Dialect/Quake/CMakeLists.txt
@@ -16,7 +16,6 @@ add_cudaq_dialect_library(QuakeDialect
   QuakeDialectIncGen
   QuakeOpsIncGen
   QuakeTypesIncGen
-  CanonicalIncGen
 
   LINK_LIBS
   CCDialect
diff --git a/lib/Optimizer/Dialect/Quake/CanonicalPatterns.inc b/lib/Optimizer/Dialect/Quake/CanonicalPatterns.inc
new file mode 100644
index 0000000000..de1eada9be
--- /dev/null
+++ b/lib/Optimizer/Dialect/Quake/CanonicalPatterns.inc
@@ -0,0 +1,489 @@
+/****************************************************************-*- C++ -*-****
+ * Copyright (c) 2022 - 2024 NVIDIA Corporation & Affiliates.                  *
+ * All rights reserved.                                                        *
+ *                                                                             *
+ * This source code and the accompanying materials are made available under    *
+ * the terms of the Apache License 2.0 which accompanies this distribution.    *
+ ******************************************************************************/
+
+// These canonicalization patterns are used by the canonicalize pass and not
+// shared for other uses. Generally speaking, these patterns should be trivial
+// peephole optimizations that reduce the size and complexity of the input IR.
+
+// This file must be included after a `using namespace mlir;` as it uses bare
+// identifiers from that namespace.
+
+namespace {
+
+// %4 = quake.veq_size %3 : (!quake.veq<10>) -> i64
+// ────────────────────────────────────────────────
+// %4 = constant 10 : i64
+struct ForwardConstantVeqSizePattern
+    : public OpRewritePattern<quake::VeqSizeOp> {
+  using OpRewritePattern::OpRewritePattern;
+
+  LogicalResult matchAndRewrite(quake::VeqSizeOp veqSize,
+                                PatternRewriter &rewriter) const override {
+    auto veqTy = dyn_cast<quake::VeqType>(veqSize.getVeq().getType());
+    if (!veqTy)
+      return failure();
+    if (!veqTy.hasSpecifiedSize())
+      return failure();
+    auto resTy = veqSize.getType();
+    rewriter.replaceOpWithNewOp<arith::ConstantIntOp>(veqSize, veqTy.getSize(),
+                                                      resTy);
+    return success();
+  }
+};
+
+// %2 = constant 10 : i32
+// %3 = quake.alloca !quake.veq<?>[%2 : i32]
+// ─────────────────────────────────────────
+// %3 = quake.alloca !quake.veq<10>
+struct FuseConstantToAllocaPattern : public OpRewritePattern<quake::AllocaOp> {
+  using OpRewritePattern::OpRewritePattern;
+
+  LogicalResult matchAndRewrite(quake::AllocaOp alloc,
+                                PatternRewriter &rewriter) const override {
+    auto size = alloc.getSize();
+    if (!size)
+      return failure();
+    auto intCon = cudaq::opt::factory::getIntIfConstant(size);
+    if (!intCon)
+      return failure();
+    auto veqTy = dyn_cast<quake::VeqType>(alloc.getType());
+    if (!veqTy)
+      return failure();
+    if (veqTy.hasSpecifiedSize())
+      return failure();
+    auto loc = alloc.getLoc();
+    auto resTy = alloc.getType();
+    auto newAlloc = rewriter.create<quake::AllocaOp>(
+        loc, static_cast<std::size_t>(*intCon));
+    rewriter.replaceOpWithNewOp<quake::RelaxSizeOp>(alloc, resTy, newAlloc);
+    return success();
+  }
+};
+
+// %2 = constant 10 : i32
+// %3 = quake.extract_ref %1[%2] : (!quake.veq<?>, i32) -> !quake.ref
+// ──────────────────────────────────────────────────────────────────
+// %3 = quake.extract_ref %1[10] : (!quake.veq<?>) -> !quake.ref
+struct FuseConstantToExtractRefPattern
+    : public OpRewritePattern<quake::ExtractRefOp> {
+  using OpRewritePattern::OpRewritePattern;
+
+  LogicalResult matchAndRewrite(quake::ExtractRefOp extract,
+                                PatternRewriter &rewriter) const override {
+    auto index = extract.getIndex();
+    if (!index)
+      return failure();
+    auto intCon = cudaq::opt::factory::getIntIfConstant(index);
+    if (!intCon)
+      return failure();
+    rewriter.replaceOpWithNewOp<quake::ExtractRefOp>(
+        extract, extract.getVeq(), static_cast<std::size_t>(*intCon));
+    return success();
+  }
+};
+
+// %4 = quake.concat %2, %3 : (!quake.ref, !quake.ref) -> !quake.veq<2>
+// %7 = quake.extract_ref %4[0] : (!quake.veq<2>) -> !quake.ref
+// ───────────────────────────────────────────
+// replace all use with %2
+struct ForwardConcatExtractPattern
+    : public OpRewritePattern<quake::ExtractRefOp> {
+  using OpRewritePattern::OpRewritePattern;
+
+  LogicalResult matchAndRewrite(quake::ExtractRefOp extract,
+                                PatternRewriter &rewriter) const override {
+    auto veq = extract.getVeq();
+    auto concatOp = veq.getDefiningOp<quake::ConcatOp>();
+    if (concatOp && extract.hasConstantIndex()) {
+      // Don't run this canonicalization if any of the operands
+      // to concat are of type veq.
+      auto concatQubits = concatOp.getQbits();
+      for (auto qOp : concatQubits)
+        if (isa<quake::VeqType>(qOp.getType()))
+          return failure();
+
+      // concat only has ref type operands.
+      auto index = extract.getConstantIndex();
+      if (index < concatQubits.size()) {
+        auto qOpValue = concatQubits[index];
+        if (isa<quake::RefType>(qOpValue.getType())) {
+          rewriter.replaceOp(extract, {qOpValue});
+          return success();
+        }
+      }
+    }
+    return failure();
+  }
+};
+
+// %2 = quake.concat %1 : (!quake.ref) -> !quake.veq<1>
+// %3 = quake.extract_ref %2[0] : (!quake.veq<1>) -> !quake.ref
+// quake.* %3 ...
+// ───────────────────────────────────────────
+// quake.* %1 ...
+struct ForwardConcatExtractSingleton
+    : public OpRewritePattern<quake::ExtractRefOp> {
+  using OpRewritePattern::OpRewritePattern;
+
+  LogicalResult matchAndRewrite(quake::ExtractRefOp extract,
+                                PatternRewriter &rewriter) const override {
+    if (auto concat = extract.getVeq().getDefiningOp<quake::ConcatOp>())
+      if (concat.getType().getSize() == 1 && extract.hasConstantIndex() &&
+          extract.getConstantIndex() == 0) {
+        assert(concat.getQbits().size() == 1 && concat.getQbits()[0]);
+        extract.getResult().replaceUsesWithIf(
+            concat.getQbits()[0], [&](OpOperand &use) {
+              if (Operation *user = use.getOwner())
+                return isQuakeOperation(user);
+              return false;
+            });
+        return success();
+      }
+    return failure();
+  }
+};
+
+// %7 = quake.concat %4 : (!quake.veq<2>) -> !quake.veq<2>
+// ───────────────────────────────────────────
+// removed
+struct ConcatNoOpPattern : public OpRewritePattern<quake::ConcatOp> {
+  using OpRewritePattern::OpRewritePattern;
+
+  LogicalResult matchAndRewrite(quake::ConcatOp concat,
+                                PatternRewriter &rewriter) const override {
+    // Only a concat of exactly one operand can be a no-op. An operand list
+    // that is empty must be rejected as well, since `front()` is called on
+    // it below.
+    auto qubitsToConcat = concat.getQbits();
+    if (qubitsToConcat.size() != 1)
+      return failure();
+
+    // We only want to handle veq -> veq here.
+    if (isa<quake::RefType>(qubitsToConcat.front().getType())) {
+      return failure();
+    }
+
+    // Do not handle anything where we don't know the sizes.
+    auto retTy = concat.getResult().getType();
+    if (auto veqTy = dyn_cast<quake::VeqType>(retTy))
+      if (!veqTy.hasSpecifiedSize())
+        // This could be a folded quake.relax_size op.
+        return failure();
+
+    rewriter.replaceOp(concat, qubitsToConcat);
+    return success();
+  }
+};
+
+// %8 = quake.concat %4, %5, %6 : (!quake.ref, !quake.veq<4>,
+//        !quake.veq<2>) -> !quake.veq<?>
+// ───────────────────────────────────────────────────────────
+// %.8 = quake.concat %4, %5, %6 : (!quake.ref, !quake.veq<4>,
+//        !quake.veq<2>) -> !quake.veq<7>
+// %8 = quake.relax_size %.8 : (!quake.veq<7>) -> !quake.veq<?>
+struct ConcatSizePattern : public OpRewritePattern<quake::ConcatOp> {
+  using OpRewritePattern::OpRewritePattern;
+
+  LogicalResult matchAndRewrite(quake::ConcatOp concat,
+                                PatternRewriter &rewriter) const override {
+    if (concat.getType().hasSpecifiedSize())
+      return failure();
+
+    // Walk the arguments and sum them, if possible.
+    std::size_t sum = 0;
+    for (auto opnd : concat.getQbits()) {
+      if (auto veqTy = dyn_cast<quake::VeqType>(opnd.getType())) {
+        if (!veqTy.hasSpecifiedSize())
+          return failure();
+        sum += veqTy.getSize();
+        continue;
+      }
+      assert(isa<quake::RefType>(opnd.getType()));
+      sum++;
+    }
+
+    // Leans into the relax_size canonicalization pattern.
+    auto *ctx = rewriter.getContext();
+    auto loc = concat.getLoc();
+    auto newTy = quake::VeqType::get(ctx, sum);
+    Value newOp =
+        rewriter.create<quake::ConcatOp>(loc, newTy, concat.getQbits());
+    auto noSizeTy = quake::VeqType::getUnsized(ctx);
+    rewriter.replaceOpWithNewOp<quake::RelaxSizeOp>(concat, noSizeTy, newOp);
+    return success();
+  }
+};
+
+// %7 = quake.make_struq %5, %6 : (!quake.veq<A>, !quake.veq<N>) ->
+//                    !quake.struq<!quake.veq<A>, !quake.veq<N>>
+// %8 = quake.get_member %7[1] : (!quake.struq<!quake.veq<A>,
+//                                !quake.veq<N>>) -> !quake.veq<N>
+// ───────────────────────────────────────────────────────────
+// replace uses of %8 with %6
+struct BypassMakeStruq : public OpRewritePattern<quake::GetMemberOp> {
+  using OpRewritePattern::OpRewritePattern;
+
+  LogicalResult matchAndRewrite(quake::GetMemberOp getMem,
+                                PatternRewriter &rewriter) const override {
+    auto makeStruq = getMem.getStruq().getDefiningOp<quake::MakeStruqOp>();
+    if (!makeStruq)
+      return failure();
+    auto toStrTy = cast<quake::StruqType>(getMem.getStruq().getType());
+    std::uint32_t idx = getMem.getIndex();
+    Value from = makeStruq.getOperand(idx);
+    auto toTy = toStrTy.getMembers()[idx];
+    if (from.getType() != toTy)
+      rewriter.replaceOpWithNewOp<quake::RelaxSizeOp>(getMem, toTy, from);
+    else
+      rewriter.replaceOp(getMem, from);
+    return success();
+  }
+};
+
+// %22 = quake.init_state %1, %2 : (!quake.veq<k>, T) -> !quake.veq<?>
+// ────────────────────────────────────────────────────────────────────
+// %.22 = quake.init_state %1, %2 : (!quake.veq<k>, T) -> !quake.veq<k>
+// %22 = quake.relax_size %.22 : (!quake.veq<k>) -> !quake.veq<?>
+struct ForwardAllocaTypePattern
+    : public OpRewritePattern<quake::InitializeStateOp> {
+  using OpRewritePattern::OpRewritePattern;
+
+  LogicalResult matchAndRewrite(quake::InitializeStateOp initState,
+                                PatternRewriter &rewriter) const override {
+    if (auto isTy = dyn_cast<quake::VeqType>(initState.getType()))
+      if (!isTy.hasSpecifiedSize()) {
+        auto targ = initState.getTargets();
+        if (auto targTy = dyn_cast<quake::VeqType>(targ.getType()))
+          if (targTy.hasSpecifiedSize()) {
+            auto newInit = rewriter.create<quake::InitializeStateOp>(
+                initState.getLoc(), targTy, targ, initState.getState());
+            rewriter.replaceOpWithNewOp<quake::RelaxSizeOp>(initState, isTy,
+                                                            newInit);
+            return success();
+          }
+      }
+
+    // Remove any intervening cast to !cc.ptr<!cc.array<T x ?>> ops.
+    if (auto stateCast =
+            initState.getState().getDefiningOp<cudaq::cc::CastOp>())
+      if (auto ptrTy = dyn_cast<cudaq::cc::PointerType>(stateCast.getType())) {
+        auto eleTy = ptrTy.getElementType();
+        if (auto arrTy = dyn_cast<cudaq::cc::ArrayType>(eleTy))
+          if (arrTy.isUnknownSize()) {
+            rewriter.replaceOpWithNewOp<quake::InitializeStateOp>(
+                initState, initState.getTargets().getType(),
+                initState.getTargets(), stateCast.getValue());
+            return success();
+          }
+      }
+    return failure();
+  }
+};
+
+// %3 = quake.subveq %0, 4, 10 : (!quake.veq<12>, i64, i64) -> !quake.veq<?>
+// ──────────────────────────────────────────────────────────────────────────
+// %.3 = quake.subveq %0, 4, 10 : (!quake.veq<12>, i64, i64) -> !quake.veq<7>
+// %3 = quake.relax_size %.3 : (!quake.veq<7>) -> !quake.veq<?>
+struct FixUnspecifiedSubveqPattern : public OpRewritePattern<quake::SubVeqOp> {
+  using OpRewritePattern::OpRewritePattern;
+
+  LogicalResult matchAndRewrite(quake::SubVeqOp subveq,
+                                PatternRewriter &rewriter) const override {
+    auto veqTy = dyn_cast<quake::VeqType>(subveq.getType());
+    if (veqTy && veqTy.hasSpecifiedSize())
+      return failure();
+    if (!(subveq.hasConstantLowerBound() && subveq.hasConstantUpperBound()))
+      return failure();
+    auto *ctx = rewriter.getContext();
+    std::size_t size =
+        subveq.getConstantUpperBound() - subveq.getConstantLowerBound() + 1u;
+    auto szVecTy = quake::VeqType::get(ctx, size);
+    auto loc = subveq.getLoc();
+    auto subv = rewriter.create<quake::SubVeqOp>(
+        loc, szVecTy, subveq.getVeq(), subveq.getLower(), subveq.getUpper(),
+        subveq.getRawLower(), subveq.getRawUpper());
+    rewriter.replaceOpWithNewOp<quake::RelaxSizeOp>(subveq, veqTy, subv);
+    return success();
+  }
+};
+
+// %1 = constant 4 : i64
+// %2 = constant 10 : i64
+// %3 = quake.subveq %0, %1, %2 : (!quake.veq<12>, i64, i64) -> !quake.veq<?>
+// ──────────────────────────────────────────────────────────────────────────
+// %3 = quake.subveq %0, 4, 10 : (!quake.veq<12>, i64, i64) -> !quake.veq<7>
+struct FuseConstantToSubveqPattern : public OpRewritePattern<quake::SubVeqOp> {
+  using OpRewritePattern::OpRewritePattern;
+
+  LogicalResult matchAndRewrite(quake::SubVeqOp subveq,
+                                PatternRewriter &rewriter) const override {
+    if (subveq.hasConstantLowerBound() && subveq.hasConstantUpperBound())
+      return failure();
+    bool regen = false;
+    std::int64_t lo = subveq.getConstantLowerBound();
+    Value loVal = subveq.getLower();
+    if (!subveq.hasConstantLowerBound())
+      if (auto olo = cudaq::opt::factory::getIntIfConstant(subveq.getLower())) {
+        regen = true;
+        loVal = nullptr;
+        lo = *olo;
+      }
+
+    std::int64_t hi = subveq.getConstantUpperBound();
+    Value hiVal = subveq.getUpper();
+    if (!subveq.hasConstantUpperBound())
+      if (auto ohi = cudaq::opt::factory::getIntIfConstant(subveq.getUpper())) {
+        regen = true;
+        hiVal = nullptr;
+        hi = *ohi;
+      }
+
+    if (!regen)
+      return failure();
+    rewriter.replaceOpWithNewOp<quake::SubVeqOp>(
+        subveq, subveq.getType(), subveq.getVeq(), loVal, hiVal, lo, hi);
+    return success();
+  }
+};
+
+// Replace subveq operations that extract the entire original register with the
+// original register.
+struct RemoveSubVeqNoOpPattern : public OpRewritePattern<quake::SubVeqOp> {
+  using OpRewritePattern::OpRewritePattern;
+
+  LogicalResult matchAndRewrite(quake::SubVeqOp subVeqOp,
+                                PatternRewriter &rewriter) const override {
+    auto origVeq = subVeqOp.getVeq();
+    // The original value must be a veq and its size must be known.
+    auto veqType = dyn_cast<quake::VeqType>(origVeq.getType());
+    if (!veqType || !veqType.hasSpecifiedSize())
+      return failure();
+    if (!(subVeqOp.hasConstantLowerBound() && subVeqOp.hasConstantUpperBound()))
+      return failure();
+
+    // If the subveq is the whole register, then the start value must be 0.
+    if (subVeqOp.getConstantLowerBound() != 0)
+      return failure();
+
+    // The upper bound must also address the last element of the register.
+    if (veqType.getSize() != subVeqOp.getConstantUpperBound() + 1)
+      return failure();
+
+    // this subveq is the whole original register, hence a no-op
+    rewriter.replaceOp(subVeqOp, origVeq);
+    return success();
+  }
+};
+
+// %11 = quake.init_state %_, %_ : (!quake.veq<2>, T1) -> !quake.veq<?>
+// %12 = quake.veq_size %11 : (!quake.veq<?>) -> i64
+// ────────────────────────────────────────────────────────────────────
+// %11 = quake.init_state %_, %_ : (!quake.veq<2>, T1) -> !quake.veq<?>
+// %12 = constant 2 : i64
+struct FoldInitStateSizePattern : public OpRewritePattern<quake::VeqSizeOp> {
+  using OpRewritePattern::OpRewritePattern;
+
+  LogicalResult matchAndRewrite(quake::VeqSizeOp veqSize,
+                                PatternRewriter &rewriter) const override {
+    Value veq = veqSize.getVeq();
+    if (auto initState = veq.getDefiningOp<quake::InitializeStateOp>())
+      if (auto veqTy =
+              dyn_cast<quake::VeqType>(initState.getTargets().getType()))
+        if (veqTy.hasSpecifiedSize()) {
+          std::size_t numQubits = veqTy.getSize();
+          rewriter.replaceOpWithNewOp<arith::ConstantIntOp>(veqSize, numQubits,
+                                                            veqSize.getType());
+          return success();
+        }
+    return failure();
+  }
+};
+
+// A wrap whose wire value comes directly from an unwrap has had no operation
+// modify the wire in between, so the wrap is a nop and can be eliminated.
+struct KillDeadWrapPattern : public OpRewritePattern<quake::WrapOp> {
+  using OpRewritePattern::OpRewritePattern;
+
+  LogicalResult matchAndRewrite(quake::WrapOp wrap,
+                                PatternRewriter &rewriter) const override {
+    if (!wrap.getWireValue().getDefiningOp<quake::UnwrapOp>())
+      return failure(); // Nothing was rewritten; must not report success.
+    rewriter.eraseOp(wrap);
+    return success();
+  }
+};
+
+template <typename OP>
+struct MergeRotationPattern : public OpRewritePattern<OP> {
+  using Base = OpRewritePattern<OP>;
+  using Base::Base;
+
+  LogicalResult matchAndRewrite(OP rotate,
+                                PatternRewriter &rewriter) const override {
+    auto wireTy = quake::WireType::get(rewriter.getContext());
+    if (rotate.getTarget(0).getType() != wireTy ||
+        !rotate.getControls().empty())
+      return failure();
+    assert(!rotate.getNegatedQubitControls());
+    auto input = rotate.getTarget(0).template getDefiningOp<OP>();
+    if (!input || !input.getControls().empty())
+      return failure();
+    assert(!input.getNegatedQubitControls());
+
+    // At this point, we have
+    //   %input  = quake.rotate %angle1, %wire
+    //   %rotate = quake.rotate %angle2, %input
+    // Replace those ops with
+    //   %new    = quake.rotate (%angle1 + %angle2), %wire
+    auto loc = rotate.getLoc();
+    auto angle1 = input.getParameter(0);
+    auto angle2 = rotate.getParameter(0);
+    if (angle1.getType() != angle2.getType())
+      return failure();
+    auto adjAttr = rotate.getIsAdjAttr();
+    auto newAngle = [&]() -> Value {
+      if (input.isAdj() == rotate.isAdj())
+        return rewriter.create<arith::AddFOp>(loc, angle1, angle2);
+      // One is adjoint, so it should be subtracted from the other.
+      if (input.isAdj())
+        return rewriter.create<arith::SubFOp>(loc, angle2, angle1);
+      adjAttr = input.getIsAdjAttr();
+      return rewriter.create<arith::SubFOp>(loc, angle1, angle2);
+    }();
+    rewriter.replaceOpWithNewOp<OP>(rotate, rotate.getResultTypes(), adjAttr,
+                                    ValueRange{newAngle}, ValueRange{},
+                                    ValueRange{input.getTarget(0)},
+                                    rotate.getNegatedQubitControlsAttr());
+    return success();
+  }
+};
+
+// Forward the argument to a relax_size to the users for all users that are
+// quake operations. All quake ops that take a sized veq argument are
+// polymorphic on all veq types. If the op is not a quake op, then maintain
+// strong typing. Uses owned by a quake.apply are deliberately left alone.
+struct ForwardRelaxedSizePattern : public OpRewritePattern<quake::RelaxSizeOp> {
+  using OpRewritePattern::OpRewritePattern;
+
+  LogicalResult matchAndRewrite(quake::RelaxSizeOp relax,
+                                PatternRewriter &rewriter) const override {
+    auto inpVec = relax.getInputVec();
+    bool replaced = false;
+    rewriter.replaceOpWithIf(relax, inpVec, [&](OpOperand &use) {
+      bool res = false;
+      if (Operation *user = use.getOwner())
+        res = isQuakeOperation(user) && !isa<quake::ApplyOp>(user);
+      replaced = replaced || res;
+      return res;
+    });
+    // Return success if and only if at least one use was replaced.
+    return success(replaced);
+  }
+};
+
+} // namespace
diff --git a/lib/Optimizer/Dialect/Quake/QuakeOps.cpp b/lib/Optimizer/Dialect/Quake/QuakeOps.cpp
index 1f2c3bd06d..d2da75d99b 100644
--- a/lib/Optimizer/Dialect/Quake/QuakeOps.cpp
+++ b/lib/Optimizer/Dialect/Quake/QuakeOps.cpp
@@ -23,9 +23,7 @@
 
 using namespace mlir;
 
-namespace {
-#include "cudaq/Optimizer/Dialect/Quake/Canonical.inc"
-} // namespace
+#include "CanonicalPatterns.inc"
 
 static LogicalResult verifyWireResultsAreLinear(Operation *op) {
   for (Value v : op->getOpResults())
@@ -311,73 +309,6 @@ LogicalResult quake::BorrowWireOp::verify() {
 // Concat
 //===----------------------------------------------------------------------===//
 
-namespace {
-// %7 = quake.concat %4 : (!quake.veq<2>) -> !quake.veq<2>
-// ───────────────────────────────────────────
-// removed
-struct ConcatNoOpPattern : public OpRewritePattern<quake::ConcatOp> {
-  using OpRewritePattern::OpRewritePattern;
-
-  LogicalResult matchAndRewrite(quake::ConcatOp concat,
-                                PatternRewriter &rewriter) const override {
-    // Remove concat veq<N> -> veq<N>
-    // or
-    // concat ref -> ref
-    auto qubitsToConcat = concat.getQbits();
-    if (qubitsToConcat.size() > 1)
-      return failure();
-
-    // We only want to handle veq -> veq here.
-    if (isa<quake::RefType>(qubitsToConcat.front().getType())) {
-      return failure();
-    }
-
-    // Do not handle anything where we don't know the sizes.
-    auto retTy = concat.getResult().getType();
-    if (auto veqTy = dyn_cast<quake::VeqType>(retTy))
-      if (!veqTy.hasSpecifiedSize())
-        // This could be a folded quake.relax_size op.
-        return failure();
-
-    rewriter.replaceOp(concat, qubitsToConcat);
-    return success();
-  }
-};
-
-struct ConcatSizePattern : public OpRewritePattern<quake::ConcatOp> {
-  using OpRewritePattern::OpRewritePattern;
-
-  LogicalResult matchAndRewrite(quake::ConcatOp concat,
-                                PatternRewriter &rewriter) const override {
-    if (concat.getType().hasSpecifiedSize())
-      return failure();
-
-    // Walk the arguments and sum them, if possible.
-    std::size_t sum = 0;
-    for (auto opnd : concat.getQbits()) {
-      if (auto veqTy = dyn_cast<quake::VeqType>(opnd.getType())) {
-        if (!veqTy.hasSpecifiedSize())
-          return failure();
-        sum += veqTy.getSize();
-        continue;
-      }
-      assert(isa<quake::RefType>(opnd.getType()));
-      sum++;
-    }
-
-    // Leans into the relax_size canonicalization pattern.
-    auto *ctx = rewriter.getContext();
-    auto loc = concat.getLoc();
-    auto newTy = quake::VeqType::get(ctx, sum);
-    Value newOp =
-        rewriter.create<quake::ConcatOp>(loc, newTy, concat.getQbits());
-    auto noSizeTy = quake::VeqType::getUnsized(ctx);
-    rewriter.replaceOpWithNewOp<quake::RelaxSizeOp>(concat, noSizeTy, newOp);
-    return success();
-  };
-};
-} // namespace
-
 void quake::ConcatOp::getCanonicalizationPatterns(RewritePatternSet &patterns,
                                                   MLIRContext *context) {
   patterns.add<ConcatSizePattern, ConcatNoOpPattern>(context);
@@ -418,69 +349,6 @@ void printRawIndex(OpAsmPrinter &printer, OP refOp, Value index,
     printer << rawIndex.getValue();
 }
 
-namespace {
-// %4 = quake.concat %2, %3 : (!quake.ref, !quake.ref) -> !quake.veq<2>
-// %7 = quake.extract_ref %4[0] : (!quake.veq<2>) -> !quake.ref
-// ───────────────────────────────────────────
-// replace all use with %2
-struct ForwardConcatExtractPattern
-    : public OpRewritePattern<quake::ExtractRefOp> {
-  using OpRewritePattern::OpRewritePattern;
-
-  LogicalResult matchAndRewrite(quake::ExtractRefOp extract,
-                                PatternRewriter &rewriter) const override {
-    auto veq = extract.getVeq();
-    auto concatOp = veq.getDefiningOp<quake::ConcatOp>();
-    if (concatOp && extract.hasConstantIndex()) {
-      // Don't run this canonicalization if any of the operands
-      // to concat are of type veq.
-      auto concatQubits = concatOp.getQbits();
-      for (auto qOp : concatQubits)
-        if (isa<quake::VeqType>(qOp.getType()))
-          return failure();
-
-      // concat only has ref type operands.
-      auto index = extract.getConstantIndex();
-      if (index < concatQubits.size()) {
-        auto qOpValue = concatQubits[index];
-        if (isa<quake::RefType>(qOpValue.getType())) {
-          rewriter.replaceOp(extract, {qOpValue});
-          return success();
-        }
-      }
-    }
-    return failure();
-  }
-};
-
-// %2 = quake.concat %1 : (!quake.ref) -> !quake.veq<1>
-// %3 = quake.extract_ref %2[0] : (!quake.veq<1>) -> !quake.ref
-// quake.* %3 ...
-// ───────────────────────────────────────────
-// quake.* %1 ...
-struct ForwardConcatExtractSingleton
-    : public OpRewritePattern<quake::ExtractRefOp> {
-  using OpRewritePattern::OpRewritePattern;
-
-  LogicalResult matchAndRewrite(quake::ExtractRefOp extract,
-                                PatternRewriter &rewriter) const override {
-    if (auto concat = extract.getVeq().getDefiningOp<quake::ConcatOp>())
-      if (concat.getType().getSize() == 1 && extract.hasConstantIndex() &&
-          extract.getConstantIndex() == 0) {
-        assert(concat.getQbits().size() == 1 && concat.getQbits()[0]);
-        extract.getResult().replaceUsesWithIf(
-            concat.getQbits()[0], [&](OpOperand &use) {
-              if (Operation *user = use.getOwner())
-                return isQuakeOperation(user);
-              return false;
-            });
-        return success();
-      }
-    return failure();
-  }
-};
-} // namespace
-
 void quake::ExtractRefOp::getCanonicalizationPatterns(
     RewritePatternSet &patterns, MLIRContext *context) {
   patterns.add<FuseConstantToExtractRefPattern, ForwardConcatExtractSingleton,
@@ -523,30 +391,6 @@ LogicalResult quake::GetMemberOp::verify() {
   return success();
 }
 
-namespace {
-struct BypassMakeStruq : public OpRewritePattern<quake::GetMemberOp> {
-  using OpRewritePattern::OpRewritePattern;
-
-  LogicalResult matchAndRewrite(quake::GetMemberOp getMem,
-                                PatternRewriter &rewriter) const override {
-    if (auto makeStruq =
-            getMem.getStruq().getDefiningOp<quake::MakeStruqOp>()) {
-      auto toStrTy = cast<quake::StruqType>(getMem.getStruq().getType());
-      std::uint32_t idx = getMem.getIndex();
-      Value from = makeStruq.getOperand(idx);
-      auto toTy = toStrTy.getMembers()[idx];
-      if (from.getType() != toTy) {
-        rewriter.replaceOpWithNewOp<quake::RelaxSizeOp>(getMem, toTy, from);
-      } else {
-        rewriter.replaceOp(getMem, from);
-      }
-      return success();
-    }
-    return failure();
-  }
-};
-} // namespace
-
 void quake::GetMemberOp::getCanonicalizationPatterns(
     RewritePatternSet &patterns, MLIRContext *context) {
   patterns.add<BypassMakeStruq>(context);
@@ -575,48 +419,6 @@ LogicalResult quake::InitializeStateOp::verify() {
   return success();
 }
 
-namespace {
-// %22 = quake.init_state %1, %2 : (!quake.veq<k>, T) -> !quake.veq<?>
-// ────────────────────────────────────────────────────────────────────
-// %22' = quake.init_state %1, %2 : (!quake.veq<k>, T) -> !quake.veq<k>
-// %22 = quake.relax_size %22' : (!quake.veq<k>) -> !quake.veq<?>
-struct ForwardAllocaTypePattern
-    : public OpRewritePattern<quake::InitializeStateOp> {
-  using OpRewritePattern::OpRewritePattern;
-
-  LogicalResult matchAndRewrite(quake::InitializeStateOp initState,
-                                PatternRewriter &rewriter) const override {
-    if (auto isTy = dyn_cast<quake::VeqType>(initState.getType()))
-      if (!isTy.hasSpecifiedSize()) {
-        auto targ = initState.getTargets();
-        if (auto targTy = dyn_cast<quake::VeqType>(targ.getType()))
-          if (targTy.hasSpecifiedSize()) {
-            auto newInit = rewriter.create<quake::InitializeStateOp>(
-                initState.getLoc(), targTy, targ, initState.getState());
-            rewriter.replaceOpWithNewOp<quake::RelaxSizeOp>(initState, isTy,
-                                                            newInit);
-            return success();
-          }
-      }
-
-    // Remove any intervening cast to !cc.ptr<!cc.array<T x ?>> ops.
-    if (auto stateCast =
-            initState.getState().getDefiningOp<cudaq::cc::CastOp>())
-      if (auto ptrTy = dyn_cast<cudaq::cc::PointerType>(stateCast.getType())) {
-        auto eleTy = ptrTy.getElementType();
-        if (auto arrTy = dyn_cast<cudaq::cc::ArrayType>(eleTy))
-          if (arrTy.isUnknownSize()) {
-            rewriter.replaceOpWithNewOp<quake::InitializeStateOp>(
-                initState, initState.getTargets().getType(),
-                initState.getTargets(), stateCast.getValue());
-            return success();
-          }
-      }
-    return failure();
-  }
-};
-} // namespace
-
 void quake::InitializeStateOp::getCanonicalizationPatterns(
     RewritePatternSet &patterns, MLIRContext *context) {
   patterns.add<ForwardAllocaTypePattern>(context);
@@ -652,30 +454,6 @@ LogicalResult quake::RelaxSizeOp::verify() {
   return success();
 }
 
-namespace {
-// Forward the argument to a relax_size to the users for all users that are
-// quake operations. All quake ops that take a sized veq argument are
-// polymorphic on all veq types. If the op is not a quake op, then maintain
-// strong typing.
-struct ForwardRelaxedSizePattern : public RewritePattern {
-  ForwardRelaxedSizePattern(MLIRContext *context)
-      : RewritePattern("quake.relax_size", 1, context, {}) {}
-
-  LogicalResult matchAndRewrite(Operation *op,
-                                PatternRewriter &rewriter) const override {
-    auto relax = cast<quake::RelaxSizeOp>(op);
-    auto inpVec = relax.getInputVec();
-    Value result = relax.getResult();
-    result.replaceUsesWithIf(inpVec, [&](OpOperand &use) {
-      if (Operation *user = use.getOwner())
-        return isQuakeOperation(user) && !isa<quake::ApplyOp>(user);
-      return false;
-    });
-    return success();
-  };
-};
-} // namespace
-
 void quake::RelaxSizeOp::getCanonicalizationPatterns(
     RewritePatternSet &patterns, MLIRContext *context) {
   patterns.add<ForwardRelaxedSizePattern>(context);
@@ -709,103 +487,6 @@ LogicalResult quake::SubVeqOp::verify() {
   return success();
 }
 
-namespace {
-// %3 = quake.subveq %0, 4, 10 : (!quake.veq<12>, i64, i64) -> !quake.veq<?>
-// ─────────────────────────────────────────────────────────────────────────────
-// %new3 = quake.subveq %0, 4, 10 : (!quake.veq<12>, i64, i64) -> !quake.veq<7>
-// %3 = quake.relax_size %new3 : (!quake.veq<7>) -> !quake.veq<?>
-struct FixUnspecifiedSubveqPattern : public OpRewritePattern<quake::SubVeqOp> {
-  using OpRewritePattern::OpRewritePattern;
-
-  LogicalResult matchAndRewrite(quake::SubVeqOp subveq,
-                                PatternRewriter &rewriter) const override {
-    auto veqTy = dyn_cast<quake::VeqType>(subveq.getType());
-    if (veqTy && veqTy.hasSpecifiedSize())
-      return failure();
-    if (!(subveq.hasConstantLowerBound() && subveq.hasConstantUpperBound()))
-      return failure();
-    auto *ctx = rewriter.getContext();
-    std::size_t size =
-        subveq.getConstantUpperBound() - subveq.getConstantLowerBound() + 1u;
-    auto szVecTy = quake::VeqType::get(ctx, size);
-    auto loc = subveq.getLoc();
-    auto subv = rewriter.create<quake::SubVeqOp>(
-        loc, szVecTy, subveq.getVeq(), subveq.getLower(), subveq.getUpper(),
-        subveq.getRawLower(), subveq.getRawUpper());
-    rewriter.replaceOpWithNewOp<quake::RelaxSizeOp>(subveq, veqTy, subv);
-    return success();
-  }
-};
-
-// %1 = constant 4 : i64
-// %2 = constant 10 : i64
-// %3 = quake.subveq %0, %1, %2 : (!quake.veq<12>, i64, i64) -> !quake.veq<?>
-// ─────────────────────────────────────────────────────────────────────────────
-// %3 = quake.subveq %0, 4, 10 : (!quake.veq<12>, i64, i64) -> !quake.veq<7>
-struct FuseConstantToSubveqPattern : public OpRewritePattern<quake::SubVeqOp> {
-  using OpRewritePattern::OpRewritePattern;
-
-  LogicalResult matchAndRewrite(quake::SubVeqOp subveq,
-                                PatternRewriter &rewriter) const override {
-    if (subveq.hasConstantLowerBound() && subveq.hasConstantUpperBound())
-      return failure();
-    bool regen = false;
-    std::int64_t lo = subveq.getConstantLowerBound();
-    Value loVal = subveq.getLower();
-    if (!subveq.hasConstantLowerBound())
-      if (auto olo = cudaq::opt::factory::getIntIfConstant(subveq.getLower())) {
-        regen = true;
-        loVal = nullptr;
-        lo = *olo;
-      }
-
-    std::int64_t hi = subveq.getConstantUpperBound();
-    Value hiVal = subveq.getUpper();
-    if (!subveq.hasConstantUpperBound())
-      if (auto ohi = cudaq::opt::factory::getIntIfConstant(subveq.getUpper())) {
-        regen = true;
-        hiVal = nullptr;
-        hi = *ohi;
-      }
-
-    if (!regen)
-      return failure();
-    rewriter.replaceOpWithNewOp<quake::SubVeqOp>(
-        subveq, subveq.getType(), subveq.getVeq(), loVal, hiVal, lo, hi);
-    return success();
-  }
-};
-
-// Replace subveq operations that extract the entire original register with the
-// original register.
-struct RemoveSubVeqNoOpPattern : public OpRewritePattern<quake::SubVeqOp> {
-  using OpRewritePattern::OpRewritePattern;
-
-  LogicalResult matchAndRewrite(quake::SubVeqOp subVeqOp,
-                                PatternRewriter &rewriter) const override {
-    auto origVeq = subVeqOp.getVeq();
-    // The original veq size must be known
-    auto veqType = dyn_cast<quake::VeqType>(origVeq.getType());
-    if (!veqType.hasSpecifiedSize())
-      return failure();
-    if (!(subVeqOp.hasConstantLowerBound() && subVeqOp.hasConstantUpperBound()))
-      return failure();
-
-    // If the subveq is the whole register, than the start value must be 0.
-    if (subVeqOp.getConstantLowerBound() != 0)
-      return failure();
-
-    // If the sizes are equal, then replace
-    if (veqType.getSize() != subVeqOp.getConstantUpperBound() + 1)
-      return failure();
-
-    // this subveq is the whole original register, hence a no-op
-    rewriter.replaceOp(subVeqOp, origVeq);
-    return success();
-  }
-};
-} // namespace
-
 void quake::SubVeqOp::getCanonicalizationPatterns(RewritePatternSet &patterns,
                                                   MLIRContext *context) {
   patterns.add<FixUnspecifiedSubveqPattern, FuseConstantToSubveqPattern,
@@ -816,32 +497,6 @@ void quake::SubVeqOp::getCanonicalizationPatterns(RewritePatternSet &patterns,
 // VeqSizeOp
 //===----------------------------------------------------------------------===//
 
-namespace {
-struct FoldInitStateSizePattern : public OpRewritePattern<quake::VeqSizeOp> {
-  using OpRewritePattern::OpRewritePattern;
-
-  // %11 = quake.init_state %_, %_ : (!quake.veq<2>, T1) -> !quake.veq<?>
-  // %12 = quake.veq_size %11 : (!quake.veq<?>) -> i64
-  // ────────────────────────────────────────────────────────────────────
-  // %11 = quake.init_state %_, %_ : (!quake.veq<2>, T1) -> !quake.veq<?>
-  // %12 = constant 2 : i64
-  LogicalResult matchAndRewrite(quake::VeqSizeOp veqSize,
-                                PatternRewriter &rewriter) const override {
-    Value veq = veqSize.getVeq();
-    if (auto initState = veq.getDefiningOp<quake::InitializeStateOp>())
-      if (auto veqTy =
-              dyn_cast<quake::VeqType>(initState.getTargets().getType()))
-        if (veqTy.hasSpecifiedSize()) {
-          std::size_t numQubits = veqTy.getSize();
-          rewriter.replaceOpWithNewOp<arith::ConstantIntOp>(veqSize, numQubits,
-                                                            veqSize.getType());
-          return success();
-        }
-    return failure();
-  }
-};
-} // namespace
-
 void quake::VeqSizeOp::getCanonicalizationPatterns(RewritePatternSet &patterns,
                                                    MLIRContext *context) {
   patterns.add<FoldInitStateSizePattern, ForwardConstantVeqSizePattern>(
@@ -852,22 +507,6 @@ void quake::VeqSizeOp::getCanonicalizationPatterns(RewritePatternSet &patterns,
 // WrapOp
 //===----------------------------------------------------------------------===//
 
-namespace {
-// If there is no operation that modifies the wire after it gets unwrapped and
-// before it is wrapped, then the wrap operation is a nop and can be
-// eliminated.
-struct KillDeadWrapPattern : public OpRewritePattern<quake::WrapOp> {
-  using OpRewritePattern::OpRewritePattern;
-
-  LogicalResult matchAndRewrite(quake::WrapOp wrap,
-                                PatternRewriter &rewriter) const override {
-    if (auto unwrap = wrap.getWireValue().getDefiningOp<quake::UnwrapOp>())
-      rewriter.eraseOp(wrap);
-    return success();
-  }
-};
-} // namespace
-
 void quake::WrapOp::getCanonicalizationPatterns(RewritePatternSet &patterns,
                                                 MLIRContext *context) {
   patterns.add<KillDeadWrapPattern>(context);
@@ -1040,53 +679,6 @@ void quake::RxOp::getOperatorMatrix(Matrix &matrix) {
                  -1i * std::sin(theta / 2.), std::cos(theta / 2.)});
 }
 
-namespace {
-template <typename OP>
-struct MergeRotationPattern : public OpRewritePattern<OP> {
-  using Base = OpRewritePattern<OP>;
-  using Base::Base;
-
-  LogicalResult matchAndRewrite(OP rotate,
-                                PatternRewriter &rewriter) const override {
-    auto wireTy = quake::WireType::get(rewriter.getContext());
-    if (rotate.getTarget(0).getType() != wireTy ||
-        !rotate.getControls().empty())
-      return failure();
-    assert(!rotate.getNegatedQubitControls());
-    auto input = rotate.getTarget(0).template getDefiningOp<OP>();
-    if (!input || !input.getControls().empty())
-      return failure();
-    assert(!input.getNegatedQubitControls());
-
-    // At this point, we have
-    //   %input  = quake.rotate %angle1, %wire
-    //   %rotate = quake.rotate %angle2, %input
-    // Replace those ops with
-    //   %new    = quake.rotate (%angle1 + %angle2), %wire
-    auto loc = rotate.getLoc();
-    auto angle1 = input.getParameter(0);
-    auto angle2 = rotate.getParameter(0);
-    if (angle1.getType() != angle2.getType())
-      return failure();
-    auto adjAttr = rotate.getIsAdjAttr();
-    auto newAngle = [&]() -> Value {
-      if (input.isAdj() == rotate.isAdj())
-        return rewriter.create<arith::AddFOp>(loc, angle1, angle2);
-      // One is adjoint, so it should be subtracted from the other.
-      if (input.isAdj())
-        return rewriter.create<arith::SubFOp>(loc, angle2, angle1);
-      adjAttr = input.getIsAdjAttr();
-      return rewriter.create<arith::SubFOp>(loc, angle1, angle2);
-    }();
-    rewriter.replaceOpWithNewOp<OP>(rotate, rotate.getResultTypes(), adjAttr,
-                                    ValueRange{newAngle}, ValueRange{},
-                                    ValueRange{input.getTarget(0)},
-                                    rotate.getNegatedQubitControlsAttr());
-    return success();
-  }
-};
-} // namespace
-
 void quake::RxOp::getCanonicalizationPatterns(RewritePatternSet &patterns,
                                               MLIRContext *context) {
   patterns.add<MergeRotationPattern<quake::RxOp>>(context);
diff --git a/lib/Optimizer/Dialect/Quake/QuakeTypes.cpp b/lib/Optimizer/Dialect/Quake/QuakeTypes.cpp
index b536a59710..d124e32c3c 100644
--- a/lib/Optimizer/Dialect/Quake/QuakeTypes.cpp
+++ b/lib/Optimizer/Dialect/Quake/QuakeTypes.cpp
@@ -39,9 +39,9 @@ void quake::VeqType::print(AsmPrinter &os) const {
 Type quake::VeqType::parse(AsmParser &parser) {
   if (parser.parseLess())
     return {};
-  std::size_t size = 0;
+  std::size_t size = kDynamicSize;
   if (succeeded(parser.parseOptionalQuestion()))
-    size = 0;
+    size = kDynamicSize;
   else if (parser.parseInteger(size))
     return {};
   if (parser.parseGreater())
diff --git a/python/cudaq/kernel/ast_bridge.py b/python/cudaq/kernel/ast_bridge.py
index d76a802d72..7783822bd3 100644
--- a/python/cudaq/kernel/ast_bridge.py
+++ b/python/cudaq/kernel/ast_bridge.py
@@ -3168,7 +3168,7 @@ def bodyBuilder(iterVar):
             # we currently handle `veq` and `stdvec` types
             if quake.VeqType.isinstance(iterable.type):
                 size = quake.VeqType.getSize(iterable.type)
-                if size:
+                if quake.VeqType.hasSpecifiedSize(iterable.type):
                     totalSize = self.getConstantInt(size)
                 else:
                     totalSize = quake.VeqSizeOp(self.getIntegerType(64),
diff --git a/python/cudaq/kernel/kernel_builder.py b/python/cudaq/kernel/kernel_builder.py
index 192aad8929..59c091cba2 100644
--- a/python/cudaq/kernel/kernel_builder.py
+++ b/python/cudaq/kernel/kernel_builder.py
@@ -675,8 +675,8 @@ def __applyControlOrAdjoint(self, target, isAdjoint, controls, *args):
 
                 if (quake.VeqType.isinstance(inTy) and
                         quake.VeqType.isinstance(argTy)):
-                    if quake.VeqType.getSize(
-                            inTy) and not quake.VeqType.getSize(argTy):
+                    if quake.VeqType.hasSpecifiedSize(
+                            inTy) and not quake.VeqType.hasSpecifiedSize(argTy):
                         value = quake.RelaxSizeOp(argTy, value).result
 
                 mlirValues.append(value)
@@ -1029,8 +1029,8 @@ def reset(self, target):
                 return
 
             # target is a VeqType
-            size = quake.VeqType.getSize(target.mlirValue.type)
-            if size:
+            if quake.VeqType.hasSpecifiedSize(target.mlirValue.type):
+                size = quake.VeqType.getSize(target.mlirValue.type)
                 for i in range(size):
                     extracted = quake.ExtractRefOp(quake.RefType.get(self.ctx),
                                                    target.mlirValue, i).result
diff --git a/python/cudaq/kernel/quake_value.py b/python/cudaq/kernel/quake_value.py
index 3c55a9170c..41689cbfad 100644
--- a/python/cudaq/kernel/quake_value.py
+++ b/python/cudaq/kernel/quake_value.py
@@ -67,7 +67,7 @@ def size(self):
 
             if quake.VeqType.isinstance(type):
                 size = quake.VeqType.getSize(type)
-                if size:
+                if quake.VeqType.hasSpecifiedSize(type):
                     return size
                 return QuakeValue(
                     quake.VeqSizeOp(self.intType, self.mlirValue).result,
diff --git a/python/cudaq/operator/expressions.py b/python/cudaq/operator/expressions.py
index 8077e9f80a..4ee2756297 100644
--- a/python/cudaq/operator/expressions.py
+++ b/python/cudaq/operator/expressions.py
@@ -768,42 +768,49 @@ def generator(
     """
 
     @classmethod
-    def define(cls, op_id: str, expected_dimensions: Sequence[int],
-               create: Callable[..., NDArray[numpy.complexfloating]]) -> None:
+    def define(
+        cls,
+        op_id: str,
+        expected_dimensions: Sequence[int],
+        create: Callable[..., NDArray[numpy.complexfloating]],
+        override: bool = False,
+    ) -> None:
         """
         Adds the definition of an elementary operator with the given id to the class.
         After definition, an the defined elementary operator can be instantiated by
         providing the operator id as well as the degree(s) of freedom that it acts on.
-        
-        An elementary operator is a parameterized object acting on certain degrees of 
-        freedom. To evaluate an operator, for example to compute its matrix, the level, 
-        that is the dimension, for each degree of freedom it acts on must be provided, 
-        as well as all additional parameters. Additional parameters must be provided in 
+
+        An elementary operator is a parameterized object acting on certain degrees of
+        freedom. To evaluate an operator, for example to compute its matrix, the level,
+        that is the dimension, for each degree of freedom it acts on must be provided,
+        as well as all additional parameters. Additional parameters must be provided in
         the form of keyword arguments.
 
         Note:
-        The dimensions passed during operator evaluation are automatically validated 
-        against the expected dimensions specified during definition - the `create` 
+        The dimensions passed during operator evaluation are automatically validated
+        against the expected dimensions specified during definition - the `create`
         function does not need to do this.
 
         Arguments:
             op_id: A string that uniquely identifies the defined operator.
             expected_dimensions: defines the number of levels, that is the dimension,
-                for each degree of freedom in canonical (that is sorted) order. A 
-                negative or zero value for one (or more) of the expected dimensions 
-                indicates that the operator is defined for any dimension of the 
+                for each degree of freedom in canonical (that is sorted) order. A
+                negative or zero value for one (or more) of the expected dimensions
+                indicates that the operator is defined for any dimension of the
                 corresponding degree of freedom.
-            create: Takes any number of complex-valued arguments and returns the 
+            create: Takes any number of complex-valued arguments and returns the
                 matrix representing the operator in canonical order. If the matrix can
-                be defined for any number of levels for one or more degree of freedom, 
+                be defined for any number of levels for one or more degree of freedom,
                 the `create` function must take an argument called `dimension` (or `dim`
                 for short), if the operator acts on a single degree of freedom, and an
-                argument called `dimensions` (or `dims` for short), if the operator acts 
+                argument called `dimensions` (or `dims` for short), if the operator acts
                 on multiple degrees of freedom.
+            override: if True, replace an existing definition with the same id (default: False).
         """
-        if op_id in cls._ops:
+        if not override and op_id in cls._ops:
             raise ValueError(
                 f"an {cls.__name__} with id {op_id} already exists")
+
         cls._ops[op_id] = cls.Definition(op_id, expected_dimensions, create,
                                          cls._create_key)
 
diff --git a/python/runtime/mlir/py_register_dialects.cpp b/python/runtime/mlir/py_register_dialects.cpp
index 9c0c4f2985..cda4f2a30a 100644
--- a/python/runtime/mlir/py_register_dialects.cpp
+++ b/python/runtime/mlir/py_register_dialects.cpp
@@ -74,7 +74,8 @@ void registerQuakeDialectAndTypes(py::module &m) {
           [](py::object cls, MlirContext ctx, std::size_t size) {
             return wrap(quake::VeqType::get(unwrap(ctx), size));
           },
-          py::arg("cls"), py::arg("context"), py::arg("size") = 0)
+          py::arg("cls"), py::arg("context"),
+          py::arg("size") = std::numeric_limits<std::size_t>::max())
       .def_staticmethod(
           "hasSpecifiedSize",
           [](MlirType type) {
diff --git a/runtime/cudaq/builder/QuakeValue.cpp b/runtime/cudaq/builder/QuakeValue.cpp
index ea2261651e..9a714d1ac8 100644
--- a/runtime/cudaq/builder/QuakeValue.cpp
+++ b/runtime/cudaq/builder/QuakeValue.cpp
@@ -190,7 +190,8 @@ QuakeValue QuakeValue::size() {
 
 std::optional<std::size_t> QuakeValue::constantSize() {
   if (auto qvecTy = dyn_cast<quake::VeqType>(getValue().getType()))
-    return qvecTy.getSize();
+    if (qvecTy.hasSpecifiedSize())
+      return qvecTy.getSize();
 
   return std::nullopt;
 }
diff --git a/runtime/cudaq/platform/default/DefaultQuantumPlatform.cpp b/runtime/cudaq/platform/default/DefaultQuantumPlatform.cpp
index df8a89e6f4..b24637c2ce 100644
--- a/runtime/cudaq/platform/default/DefaultQuantumPlatform.cpp
+++ b/runtime/cudaq/platform/default/DefaultQuantumPlatform.cpp
@@ -82,6 +82,7 @@ class DefaultQuantumPlatform : public cudaq::quantum_platform {
   /// specified by that variable.
   void setTargetBackend(const std::string &backend) override {
     platformQPUs.clear();
+    threadToQpuId.clear();
     platformQPUs.emplace_back(std::make_unique<DefaultQPU>());
 
     cudaq::info("Backend string is {}", backend);
@@ -121,6 +122,7 @@ class DefaultQuantumPlatform : public cudaq::quantum_platform {
       auto qpuName = config.BackendConfig->PlatformQpu;
       cudaq::info("Default platform QPU subtype name: {}", qpuName);
       platformQPUs.clear();
+      threadToQpuId.clear();
       platformQPUs.emplace_back(cudaq::registry::get<cudaq::QPU>(qpuName));
       if (platformQPUs.front() == nullptr)
         throw std::runtime_error(
diff --git a/runtime/cudaq/platform/mqpu/MultiQPUPlatform.cpp b/runtime/cudaq/platform/mqpu/MultiQPUPlatform.cpp
index bdde97a464..198a214c16 100644
--- a/runtime/cudaq/platform/mqpu/MultiQPUPlatform.cpp
+++ b/runtime/cudaq/platform/mqpu/MultiQPUPlatform.cpp
@@ -31,6 +31,7 @@ class MultiQPUQuantumPlatform : public cudaq::quantum_platform {
     // Make sure that we clean up the client QPUs first before cleaning up the
     // remote servers.
     platformQPUs.clear();
+    threadToQpuId.clear();
     platformNumQPUs = 0;
     m_remoteServers.clear();
   }
@@ -154,6 +155,7 @@ class MultiQPUQuantumPlatform : public cudaq::quantum_platform {
                         qpuSubType));
       if (qpuSubType == "NvcfSimulatorQPU") {
         platformQPUs.clear();
+        threadToQpuId.clear();
         auto simName = getOpt(description, "backend");
         if (simName.empty())
           simName = "custatevec-fp32";
@@ -199,6 +201,7 @@ class MultiQPUQuantumPlatform : public cudaq::quantum_platform {
       } else if (qpuSubType == "orca") {
         auto urls = cudaq::split(getOpt(description, "url"), ',');
         platformQPUs.clear();
+        threadToQpuId.clear();
         for (std::size_t qId = 0; qId < urls.size(); ++qId) {
           // Populate the information and add the QPUs
           platformQPUs.emplace_back(cudaq::registry::get<cudaq::QPU>("orca"));
@@ -244,6 +247,7 @@ class MultiQPUQuantumPlatform : public cudaq::quantum_platform {
               "receiving {}, expecting {}.",
               sims.size(), urls.size()));
         platformQPUs.clear();
+        threadToQpuId.clear();
         for (std::size_t qId = 0; qId < urls.size(); ++qId) {
           const auto simName = sims.size() == 1 ? sims.front() : sims[qId];
           // Populate the information and add the QPUs
diff --git a/runtime/cudaq/platform/mqpu/helpers/MQPUUtils.cpp b/runtime/cudaq/platform/mqpu/helpers/MQPUUtils.cpp
index bbf32ddd22..fe84926d4e 100644
--- a/runtime/cudaq/platform/mqpu/helpers/MQPUUtils.cpp
+++ b/runtime/cudaq/platform/mqpu/helpers/MQPUUtils.cpp
@@ -116,7 +116,9 @@ cudaq::AutoLaunchRestServerProcess::AutoLaunchRestServerProcess(
       if (!std::string(*env).starts_with("LD_LIBRARY_PATH="))
         Env->push_back(*env);
     }
-    Env->push_back("LD_LIBRARY_PATH=" + dynLibs);
+    // Cache the string as a member var to keep the pointer alive.
+    m_ldLibPathEnv = "LD_LIBRARY_PATH=" + dynLibs;
+    Env->push_back(m_ldLibPathEnv);
   }
 
   constexpr std::size_t PORT_MAX_RETRIES = 10;
diff --git a/runtime/cudaq/platform/mqpu/helpers/MQPUUtils.h b/runtime/cudaq/platform/mqpu/helpers/MQPUUtils.h
index 974107fd4c..f759923d7e 100644
--- a/runtime/cudaq/platform/mqpu/helpers/MQPUUtils.h
+++ b/runtime/cudaq/platform/mqpu/helpers/MQPUUtils.h
@@ -25,6 +25,7 @@ struct AutoLaunchRestServerProcess {
 private:
   int m_pid;
   std::string m_url;
+  std::string m_ldLibPathEnv;
 };
 
 // Helper to retrieve the number of GPU.
diff --git a/scripts/validate_pycudaq.sh b/scripts/validate_pycudaq.sh
index 63bf95d05f..f6eb70b00f 100644
--- a/scripts/validate_pycudaq.sh
+++ b/scripts/validate_pycudaq.sh
@@ -25,13 +25,16 @@
 # COPY ${package_folder} ${package_folder}
 # COPY scripts/validate_pycudaq.sh validate_pycudaq.sh
 # COPY docs/sphinx/examples/python /tmp/examples/
-# COPY docs/sphinx/applications/python /tmp/applications/
-# COPY docs/sphinx/targets/python /tmp/targets/
 # COPY docs/sphinx/snippets/python /tmp/snippets/
 # COPY python/tests /tmp/tests/
 # COPY python/README.md.in /tmp/README.md
 # RUN apt-get update && apt-get install -y --no-install-recommends ca-certificates vim wget openssh-client
 
+# Note: To run the target tests, make sure to set all necessary API keys:
+# COPY docs/sphinx/targets/python /tmp/targets/
+# ENV NVQC_API_KEY=...
+# ENV ...
+
 __optind__=$OPTIND
 OPTIND=1
 python_version=3.11
@@ -123,18 +126,6 @@ if [ ! $? -eq 0 ]; then
     status_sum=$((status_sum+1))
 fi
 
-# Run torch integrator tests.
-# This is an optional integrator, which requires torch and torchdiffeq.
-# Install torch separately to match the cuda version.
-# Torch if installed as part of torchdiffeq's dependencies, may default to the latest cuda version. 
-python3 -m pip install torch --index-url https://download.pytorch.org/whl/cu$(echo ${cuda_version//.})
-python3 -m pip install torchdiffeq
-python3 -m pytest -v "$root_folder/tests/operator/integrators"
-if [ ! $? -eq 0 ]; then
-    echo -e "\e[01;31mPython tests failed.\e[0m" >&2
-    status_sum=$((status_sum+1))
-fi
-
 # If this is a quick test, we return here.
 if $quick_test; then
     if [ ! $status_sum -eq 0 ]; then
@@ -169,19 +160,42 @@ for parallelTest in "$root_folder/tests/parallel"/*.py; do
     fi
 done
 
+# Run torch integrator tests.
+# This is an optional integrator, which requires torch and torchdiffeq.
+# Install torch separately to match the cuda version.
+# If torch is installed as part of torchdiffeq's dependencies, it may default to the latest CUDA version.
+python3 -m pip install torch --index-url https://download.pytorch.org/whl/cu$(echo $cuda_version | cut -d '.' -f-2 | tr -d .)
+python3 -m pip install torchdiffeq
+python3 -m pytest -v "$root_folder/tests/operator/integrators"
+if [ ! $? -eq 0 ]; then
+    echo -e "\e[01;31mPython tests failed.\e[0m" >&2
+    status_sum=$((status_sum+1))
+fi
+
 # Run snippets in docs
-for ex in `find "$root_folder/snippets" -name '*.py'`; do
+# Some snippets generate plots
+python3 -m pip install --user matplotlib
+for ex in `find "$root_folder/snippets" -name '*.py' -not -path '*/nvqc/*'`; do
     python3 "$ex"
     if [ ! $? -eq 0 ]; then
         echo -e "\e[01;31mFailed to execute $ex.\e[0m" >&2
         status_sum=$((status_sum+1))
     fi
 done
+if [ -n "${NVQC_API_KEY}" ]; then
+    for ex in `find "$root_folder/snippets" -name '*.py' -path '*/nvqc/*'`; do
+        python3 "$ex"
+        if [ ! $? -eq 0 ]; then
+            echo -e "\e[01;31mFailed to execute $ex.\e[0m" >&2
+            status_sum=$((status_sum+1))
+        fi
+    done
+fi
 
 # Run examples
 # Some examples generate plots
 python3 -m pip install --user matplotlib
-for ex in `find "$root_folder/examples" "$root_folder/applications" "$root_folder/targets" -name '*.py'`; do
+for ex in `find "$root_folder/examples" -name '*.py'`; do
     skip_example=false
     explicit_targets=`cat $ex | grep -Po '^\s*cudaq.set_target\("\K.*(?=")'`
     for t in $explicit_targets; do
@@ -190,6 +204,9 @@ for ex in `find "$root_folder/examples" "$root_folder/applications" "$root_folde
             # to submit a (paid) job to Amazon Braket (includes QuEra).
             echo -e "\e[01;31mWarning: Explicitly set target braket or quera in $ex; skipping validation due to paid submission.\e[0m" >&2
             skip_example=true
+        elif [ "$t" == "nvqc" ] && [ -z "${NVQC_API_KEY}" ]; then 
+            echo -e "\e[01;31mWarning: Explicitly set target nvqc in $ex; skipping validation due to missing API key.\e[0m" >&2
+            skip_example=true
         fi
     done
     if ! $skip_example; then 
@@ -201,6 +218,29 @@ for ex in `find "$root_folder/examples" "$root_folder/applications" "$root_folde
     fi
 done
 
+# Run target tests if the targets folder exists.
+if [ -d "$root_folder/targets" ]; then
+    for ex in `find "$root_folder/targets" -name '*.py'`; do
+        skip_example=false
+        explicit_targets=`cat $ex | grep -Po '^\s*cudaq.set_target\("\K.*(?=")'`
+        for t in $explicit_targets; do
+            if [ "$t" == "quera" ] || [ "$t" == "braket" ] ; then 
+                # Skipped because GitHub does not have the necessary authentication token 
+                # to submit a (paid) job to Amazon Braket (includes QuEra).
+                echo -e "\e[01;31mWarning: Explicitly set target braket or quera in $ex; skipping validation due to paid submission.\e[0m" >&2
+                skip_example=true
+            fi
+        done
+        if ! $skip_example; then 
+            python3 "$ex"
+            if [ ! $? -eq 0 ]; then
+                echo -e "\e[01;31mFailed to execute $ex.\e[0m" >&2
+                status_sum=$((status_sum+1))
+            fi
+        fi
+    done
+fi
+
 # Run remote-mqpu platform test
 # Use cudaq-qpud.py wrapper script to automatically find dependencies for the Python wheel configuration.
 # Note that a derivative of this code is in
diff --git a/targettests/execution/angled_gate.cpp b/targettests/execution/angled_gate.cpp
index d3ac5c7e92..90d97d04e3 100644
--- a/targettests/execution/angled_gate.cpp
+++ b/targettests/execution/angled_gate.cpp
@@ -6,7 +6,7 @@
  * the terms of the Apache License 2.0 which accompanies this distribution.    *
  ******************************************************************************/
 
-// RUN: if $braket_avail; then nvq++ %cpp_std --target braket --emulate %s -o %t && %t | FileCheck %s; fi
+// RUN: if %braket_avail; then nvq++ %cpp_std --target braket --emulate %s -o %t && %t | FileCheck %s; fi
 
 #include <cudaq.h>
 
diff --git a/targettests/execution/bug_qubit.cpp b/targettests/execution/bug_qubit.cpp
index d33409b9c8..c987e0025c 100644
--- a/targettests/execution/bug_qubit.cpp
+++ b/targettests/execution/bug_qubit.cpp
@@ -15,7 +15,7 @@
 // RUN: nvq++ %cpp_std --target iqm --iqm-machine Adonis --emulate %s -o %t && %t | FileCheck %s
 // RUN: nvq++ %cpp_std --target oqc                      --emulate %s -o %t && %t | FileCheck %s
 // RUN: nvq++ %cpp_std --target quantinuum               --emulate %s -o %t && %t | FileCheck %s
-// RUN: if $braket_avail; then nvq++ %cpp_std --target braket --emulate %s -o %t && %t | FileCheck %s; fi
+// RUN: if %braket_avail; then nvq++ %cpp_std --target braket --emulate %s -o %t && %t | FileCheck %s; fi
 // RUN: nvq++ -std=c++17 --enable-mlir %s -o %t
 // RUN: cudaq-quake %cpp_std %s | cudaq-opt --promote-qubit-allocation | FileCheck --check-prefixes=MLIR %s
 
diff --git a/targettests/execution/callable_kernel_arg.cpp b/targettests/execution/callable_kernel_arg.cpp
index 7ffa47ebc9..7e142f7f98 100644
--- a/targettests/execution/callable_kernel_arg.cpp
+++ b/targettests/execution/callable_kernel_arg.cpp
@@ -13,7 +13,7 @@
 // RUN: nvq++ %cpp_std --target iqm --iqm-machine Adonis --emulate %s -o %t && %t | FileCheck %s
 // RUN: nvq++ %cpp_std --target oqc                      --emulate %s -o %t && %t | FileCheck %s
 // RUN: nvq++ %cpp_std --target quantinuum               --emulate %s -o %t && %t | FileCheck %s
-// RUN: if $braket_avail; then  nvq++ %cpp_std --target braket --emulate %s -o %t && %t | FileCheck %s; fi
+// RUN: if %braket_avail; then  nvq++ %cpp_std --target braket --emulate %s -o %t && %t | FileCheck %s; fi
 // RUN: nvq++ -std=c++17 --enable-mlir %s -o %t
 // clang-format on
 
diff --git a/targettests/execution/cudaq_observe.cpp b/targettests/execution/cudaq_observe.cpp
index 4db6b8c644..24642b5c6d 100644
--- a/targettests/execution/cudaq_observe.cpp
+++ b/targettests/execution/cudaq_observe.cpp
@@ -15,7 +15,7 @@
 // RUN: nvq++ --target iqm --iqm-machine Apollo --emulate %s -o %t && %t | FileCheck %s
 // RUN: nvq++ --target oqc                      --emulate %s -o %t && %t | FileCheck %s
 // RUN: nvq++ --target quantinuum               --emulate %s -o %t && %t | FileCheck %s
-// RUN: if $braket_avail; then nvq++ --target braket --emulate %s -o %t && %t | FileCheck %s; fi
+// RUN: if %braket_avail; then nvq++ --target braket --emulate %s -o %t && %t | FileCheck %s; fi
 // clang-format on
 
 #include <cudaq.h>
diff --git a/targettests/execution/custom_operation_adj.cpp b/targettests/execution/custom_operation_adj.cpp
index 16dfefc218..73bf181e86 100644
--- a/targettests/execution/custom_operation_adj.cpp
+++ b/targettests/execution/custom_operation_adj.cpp
@@ -14,7 +14,7 @@
 // RUN: nvq++ %cpp_std --target iqm --iqm-machine Apollo --emulate %s -o %t && %t | FileCheck %s
 // RUN: nvq++ %cpp_std --target oqc                      --emulate %s -o %t && %t | FileCheck %s
 // RUN: nvq++ %cpp_std --target quantinuum               --emulate %s -o %t && %t | FileCheck %s
-// RUN: if $braket_avail; then nvq++ %cpp_std --target braket --emulate %s -o %t && %t | FileCheck %s; fi
+// RUN: if %braket_avail; then nvq++ %cpp_std --target braket --emulate %s -o %t && %t | FileCheck %s; fi
 // clang-format on
 
 #include <cudaq.h>
diff --git a/targettests/execution/custom_operation_basic.cpp b/targettests/execution/custom_operation_basic.cpp
index 9ba007c8b7..a660fa6985 100644
--- a/targettests/execution/custom_operation_basic.cpp
+++ b/targettests/execution/custom_operation_basic.cpp
@@ -14,7 +14,7 @@
 // RUN: nvq++ %cpp_std --target iqm --iqm-machine Apollo --emulate %s -o %t && %t | FileCheck %s
 // RUN: nvq++ %cpp_std --target oqc                      --emulate %s -o %t && %t | FileCheck %s
 // RUN: nvq++ %cpp_std --target quantinuum               --emulate %s -o %t && %t | FileCheck %s
-// RUN: if $braket_avail; then nvq++ %cpp_std --target braket --emulate %s -o %t && %t | FileCheck %s; fi
+// RUN: if %braket_avail; then nvq++ %cpp_std --target braket --emulate %s -o %t && %t | FileCheck %s; fi
 // clang-format on
 
 #include <cudaq.h>
diff --git a/targettests/execution/graph_coloring-1.cpp b/targettests/execution/graph_coloring-1.cpp
index 3257c9a9d3..34db685e32 100644
--- a/targettests/execution/graph_coloring-1.cpp
+++ b/targettests/execution/graph_coloring-1.cpp
@@ -10,7 +10,7 @@
 // clang-format off
 // RUN: nvq++ %s -o %t --target infleqtion --emulate && %t | FileCheck %s
 // RUN: nvq++ %s -o %t --target quantinuum --emulate && %t | FileCheck %s
-// RUN: if $braket_avail; then nvq++ %s -o %t --target braket --emulate && %t | FileCheck %s; fi
+// RUN: if %braket_avail; then nvq++ %s -o %t --target braket --emulate && %t | FileCheck %s; fi
 // clang-format on
 
 #include <cudaq.h>
diff --git a/targettests/execution/graph_coloring.cpp b/targettests/execution/graph_coloring.cpp
index fd11d40f42..ae40f54f3f 100644
--- a/targettests/execution/graph_coloring.cpp
+++ b/targettests/execution/graph_coloring.cpp
@@ -10,7 +10,7 @@
 // clang-format off
 // RUN: nvq++ %s -o %t --target infleqtion --emulate && %t | FileCheck %s
 // RUN: nvq++ %s -o %t --target quantinuum --emulate && %t | FileCheck %s
-// RUN: if $braket_avail; then nvq++ %s -o %t --target braket --emulate && %t | FileCheck %s; fi
+// RUN: if %braket_avail; then nvq++ %s -o %t --target braket --emulate && %t | FileCheck %s; fi
 // clang-format on
 
 #include <cudaq.h>
diff --git a/targettests/execution/if_jit.cpp b/targettests/execution/if_jit.cpp
index ee820f3cdb..464d35c8d7 100644
--- a/targettests/execution/if_jit.cpp
+++ b/targettests/execution/if_jit.cpp
@@ -15,7 +15,7 @@
 // RUN: nvq++ %cpp_std --target iqm --iqm-machine Adonis --emulate %s -o %t && %t | FileCheck %s
 // RUN: nvq++ %cpp_std --target oqc                      --emulate %s -o %t && %t | FileCheck %s
 // RUN: nvq++ %cpp_std --target quantinuum               --emulate %s -o %t && %t | FileCheck %s
-// RUN: if $braket_avail; then nvq++ %cpp_std --target braket --emulate %s -o %t && %t | FileCheck %s; fi
+// RUN: if %braket_avail; then nvq++ %cpp_std --target braket --emulate %s -o %t && %t | FileCheck %s; fi
 // RUN: nvq++ -std=c++17 --enable-mlir %s -o %t
 // clang-format on
 
diff --git a/targettests/execution/int8_t.cpp b/targettests/execution/int8_t.cpp
index 2ff858d5eb..90303b539a 100644
--- a/targettests/execution/int8_t.cpp
+++ b/targettests/execution/int8_t.cpp
@@ -13,7 +13,7 @@
 // RUN: nvq++ %cpp_std --target iqm --iqm-machine Adonis --emulate %s -o %t && %t | FileCheck %s
 // RUN: nvq++ %cpp_std --target oqc                      --emulate %s -o %t && %t | FileCheck %s
 // RUN: nvq++ %cpp_std --target quantinuum               --emulate %s -o %t && %t | FileCheck %s
-// RUN: if $braket_avail; then nvq++ %cpp_std --target braket --emulate %s -o %t && %t | FileCheck %s; fi
+// RUN: if %braket_avail; then nvq++ %cpp_std --target braket --emulate %s -o %t && %t | FileCheck %s; fi
 // RUN: nvq++ -std=c++17 --enable-mlir %s -o %t
 // clang-format on
 
diff --git a/targettests/execution/int8_t_free_func.cpp b/targettests/execution/int8_t_free_func.cpp
index 5df3f81249..d6c12515ac 100644
--- a/targettests/execution/int8_t_free_func.cpp
+++ b/targettests/execution/int8_t_free_func.cpp
@@ -13,7 +13,7 @@
 // RUN: nvq++ %cpp_std --target iqm --iqm-machine Adonis --emulate %s -o %t && %t | FileCheck %s
 // RUN: nvq++ %cpp_std --target oqc                      --emulate %s -o %t && %t | FileCheck %s
 // RUN: nvq++ %cpp_std --target quantinuum               --emulate %s -o %t && %t | FileCheck %s
-// RUN: if $braket_avail; then nvq++ %cpp_std --target braket --emulate %s -o %t && %t | FileCheck %s; fi
+// RUN: if %braket_avail; then nvq++ %cpp_std --target braket --emulate %s -o %t && %t | FileCheck %s; fi
 // RUN: nvq++ -std=c++17 --enable-mlir %s -o %t
 // clang-format on
 
diff --git a/targettests/execution/load_value.cpp b/targettests/execution/load_value.cpp
index f4296931d4..cd31d04b62 100644
--- a/targettests/execution/load_value.cpp
+++ b/targettests/execution/load_value.cpp
@@ -14,7 +14,7 @@
 // RUN: nvq++ %cpp_std --target iqm --iqm-machine Apollo --emulate %s -o %t && %t | FileCheck %s
 // RUN: nvq++ %cpp_std --target oqc                      --emulate %s -o %t && %t | FileCheck %s
 // RUN: nvq++ %cpp_std --target quantinuum               --emulate %s -o %t && %t | FileCheck %s
-// RUN: if $braket_avail; then nvq++ %cpp_std --target braket --emulate %s -o %t && %t | FileCheck %s; fi
+// RUN: if %braket_avail; then nvq++ %cpp_std --target braket --emulate %s -o %t && %t | FileCheck %s; fi
 // RUN: nvq++ -std=c++17 --enable-mlir %s -o %t
 // clang-format on
 
diff --git a/targettests/execution/state_preparation.cpp b/targettests/execution/state_preparation.cpp
index c1e74889f1..adb4e9c002 100644
--- a/targettests/execution/state_preparation.cpp
+++ b/targettests/execution/state_preparation.cpp
@@ -17,7 +17,7 @@
 // RUN: nvq++ %cpp_std --target iqm --iqm-machine Adonis --emulate %s -o %t && %t | FileCheck %s
 // RUN: nvq++ %cpp_std --target iqm --iqm-machine Apollo --emulate %s -o %t && %t | FileCheck %s
 // RUN: nvq++ %cpp_std --target oqc                      --emulate %s -o %t && %t | FileCheck %s
-// RUN: if $braket_avail; then nvq++ %cpp_std --target braket --emulate %s -o %t && %t | FileCheck %s; fi
+// RUN: if %braket_avail; then nvq++ %cpp_std --target braket --emulate %s -o %t && %t | FileCheck %s; fi
 
 #include <bitset> 
 #include <cudaq.h>
diff --git a/targettests/execution/state_preparation_vector.cpp b/targettests/execution/state_preparation_vector.cpp
index d7e896f2a2..138ac1ae50 100644
--- a/targettests/execution/state_preparation_vector.cpp
+++ b/targettests/execution/state_preparation_vector.cpp
@@ -11,7 +11,7 @@
 // RUN: nvq++ %cpp_std %s -o %t && %t | FileCheck %s
 
 // Quantum emulators
-// RUN: nvq++ %cpp_std -target braket     -emulate %s -o %t && %t | FileCheck %s
+// RUN: if %braket_avail; then nvq++ %cpp_std -target braket -emulate %s -o %t && %t | FileCheck %s ; fi
 // RUN: nvq++ %cpp_std -target quantinuum -emulate %s -o %t && %t | FileCheck %s
 // RUN: nvq++ %cpp_std -target ionq       -emulate %s -o %t && %t | FileCheck %s
 // RUN: nvq++ %cpp_std -target oqc        -emulate %s -o %t && %t | FileCheck %s
@@ -35,17 +35,10 @@ __qpu__ void test_complex_constant_array() {
   cudaq::qvector v(std::vector<cudaq::complex>({M_SQRT1_2, M_SQRT1_2, 0., 0.}));
 }
 
-#ifdef CUDAQ_SIMULATION_SCALAR_FP32
 __qpu__ void test_complex_constant_array_floating_point() {
   cudaq::qvector v(
-      std::vector<std::complex<float>>({M_SQRT1_2, M_SQRT1_2, 0., 0.}));
+      std::vector<std::complex<cudaq::real>>({M_SQRT1_2, M_SQRT1_2, 0., 0.}));
 }
-#else
-__qpu__ void test_complex_constant_array_floating_point() {
-  cudaq::qvector v(
-      std::vector<std::complex<double>>({M_SQRT1_2, M_SQRT1_2, 0., 0.}));
-}
-#endif
 
 __qpu__ void test_complex_constant_array2() {
   cudaq::qvector v1(
@@ -63,45 +56,27 @@ __qpu__ void test_complex_array_param(std::vector<cudaq::complex> inState) {
   cudaq::qvector q1 = inState;
 }
 
-#ifdef CUDAQ_SIMULATION_SCALAR_FP32
-__qpu__ void test_complex_array_param_floating_point(
-    std::vector<std::complex<float>> inState) {
-  cudaq::qvector q1 = inState;
-}
-#else
 __qpu__ void test_complex_array_param_floating_point(
-    std::vector<std::complex<double>> inState) {
+    std::vector<std::complex<cudaq::real>> inState) {
   cudaq::qvector q1 = inState;
 }
-#endif
 
 __qpu__ void test_real_constant_array() {
   cudaq::qvector v({M_SQRT1_2, M_SQRT1_2, 0., 0.});
 }
 
-#ifdef CUDAQ_SIMULATION_SCALAR_FP32
-__qpu__ void test_real_constant_array_floating_point() {
-  cudaq::qvector v(std::vector<float>({M_SQRT1_2, M_SQRT1_2, 0., 0.}));
-}
-#else
 __qpu__ void test_real_constant_array_floating_point() {
-  cudaq::qvector v(std::vector<double>({M_SQRT1_2, M_SQRT1_2, 0., 0.}));
+  cudaq::qvector v(std::vector<cudaq::real>({M_SQRT1_2, M_SQRT1_2, 0., 0.}));
 }
-#endif
 
 __qpu__ void test_real_array_param(std::vector<cudaq::real> inState) {
   cudaq::qvector q1 = inState;
 }
 
-#ifdef CUDAQ_SIMULATION_SCALAR_FP32
-__qpu__ void test_real_array_param_floating_point(std::vector<float> inState) {
-  cudaq::qvector q1 = inState;
-}
-#else
-__qpu__ void test_real_array_param_floating_point(std::vector<double> inState) {
+__qpu__ void
+test_real_array_param_floating_point(std::vector<cudaq::real> inState) {
   cudaq::qvector q1 = inState;
 }
-#endif
 
 void printCounts(cudaq::sample_result &result) {
   std::vector<std::string> values{};
diff --git a/targettests/execution/state_preparation_vector_sizes.cpp b/targettests/execution/state_preparation_vector_sizes.cpp
index 7fbc721415..54ff3276c5 100644
--- a/targettests/execution/state_preparation_vector_sizes.cpp
+++ b/targettests/execution/state_preparation_vector_sizes.cpp
@@ -16,7 +16,7 @@
 // RUN: nvq++ %cpp_std --target iqm --iqm-machine Adonis --emulate %s -o %t && %t | FileCheck %s
 // RUN: nvq++ %cpp_std --target iqm --iqm-machine Apollo --emulate %s -o %t && %t | FileCheck %s
 // RUN: nvq++ %cpp_std --target oqc                      --emulate %s -o %t && %t | FileCheck %s
-// RUN: if $braket_avail; then nvq++ %cpp_std --target braket --emulate %s -o %t && %t | FileCheck %s; fi
+// RUN: if %braket_avail; then nvq++ %cpp_std --target braket --emulate %s -o %t && %t | FileCheck %s; fi
 
 #include <cudaq.h>
 #include <iostream>
diff --git a/targettests/execution/swap_gate.cpp b/targettests/execution/swap_gate.cpp
index 4cbab8facf..615dbcd364 100644
--- a/targettests/execution/swap_gate.cpp
+++ b/targettests/execution/swap_gate.cpp
@@ -13,7 +13,7 @@
 // RUN: nvq++ %cpp_std --target iqm --iqm-machine Adonis --emulate %s -o %t && %t | FileCheck %s
 // RUN: nvq++ %cpp_std --target oqc                      --emulate %s -o %t && %t | FileCheck %s
 // RUN: nvq++ %cpp_std --target quantinuum               --emulate %s -o %t && %t | FileCheck %s
-// RUN: if $braket_avail; then nvq++ %cpp_std --target braket --emulate %s -o %t && %t | FileCheck %s; fi
+// RUN: if %braket_avail; then nvq++ %cpp_std --target braket --emulate %s -o %t && %t | FileCheck %s; fi
 // RUN: nvq++ -std=c++17 --enable-mlir %s -o %t && %t | FileCheck %s
 
 #include "cudaq.h"
diff --git a/targettests/execution/variable_size_qreg.cpp b/targettests/execution/variable_size_qreg.cpp
index dfabe16498..538da46b82 100644
--- a/targettests/execution/variable_size_qreg.cpp
+++ b/targettests/execution/variable_size_qreg.cpp
@@ -13,7 +13,7 @@
 // RUN: nvq++ %cpp_std --target iqm --iqm-machine Adonis --emulate %s -o %t && %t | FileCheck %s
 // RUN: nvq++ %cpp_std --target oqc                      --emulate %s -o %t && %t | FileCheck %s
 // RUN: nvq++ %cpp_std --target quantinuum               --emulate %s -o %t && %t | FileCheck %s
-// RUN: if $braket_avail; then nvq++ %cpp_std --target braket --emulate %s -o %t && %t | FileCheck %s; fi
+// RUN: if %braket_avail; then nvq++ %cpp_std --target braket --emulate %s -o %t && %t | FileCheck %s; fi
 // RUN: nvq++ -std=c++17 --enable-mlir %s -o %t
 // clang-format on
 
diff --git a/test/Quake/lambda_kernel_exec.qke b/test/Quake/lambda_kernel_exec.qke
index aedb9564b5..f4076f57cb 100644
--- a/test/Quake/lambda_kernel_exec.qke
+++ b/test/Quake/lambda_kernel_exec.qke
@@ -27,10 +27,10 @@ module attributes {quake.mangled_name_map = {__nvqpp__mlirgen__lambda.main.canHa
     %4 = arith.extsi %c0_i32_0 : i32 to i64
     %5 = quake.extract_ref %1[%4] : (!quake.veq<?>, i64) -> !quake.ref
     %16 = quake.mz %5 name "b" : (!quake.ref) -> !quake.measure
-    %alloca = memref.alloca() : memref<i1>
+    %alloca = cc.alloca i1
     %6 = quake.discriminate %16 : (!quake.measure) -> i1
-    memref.store %6, %alloca[] : memref<i1>
-    %7 = memref.load %alloca[] : memref<i1>
+    cc.store %6, %alloca : !cc.ptr<i1>
+    %7 = cc.load %alloca : !cc.ptr<i1>
     cc.if(%7) {
       cc.scope {
         %c1_i32_1 = arith.constant 1 : i32
@@ -67,9 +67,9 @@ module attributes {quake.mangled_name_map = {__nvqpp__mlirgen__lambda.main.canHa
     %5 = quake.extract_ref %1[%4] : (!quake.veq<?>,i64) -> !quake.ref
     %16 = quake.mz %5 name "b" : (!quake.ref) -> !quake.measure
     %6 = quake.discriminate %16 : (!quake.measure) -> i1
-    %alloca = memref.alloca() : memref<i1>
-    memref.store %6, %alloca[] : memref<i1>
-    %7 = memref.load %alloca[] : memref<i1>
+    %alloca = cc.alloca i1
+    cc.store %6, %alloca : !cc.ptr<i1>
+    %7 = cc.load %alloca : !cc.ptr<i1>
     cc.if(%7) {
       cc.scope {
         %c1_i32_1 = arith.constant 1 : i32
diff --git a/test/Quake/loop.qke b/test/Quake/loop.qke
index 3c144eb9ea..2982c517c7 100644
--- a/test/Quake/loop.qke
+++ b/test/Quake/loop.qke
@@ -9,17 +9,17 @@
 // RUN: cudaq-opt %s | cudaq-opt | FileCheck %s
 
 func.func @test_old_for() {
-  %1 = memref.alloc() : memref<i32>
+  %1 = cc.alloca i32
   %zero = arith.constant 0 : i32
-  memref.store %zero, %1[] : memref<i32>
+  cc.store %zero, %1 : !cc.ptr<i32>
   cc.loop while {
-    %3 = memref.load %1[] : memref<i32>
+    %3 = cc.load %1 : !cc.ptr<i32>
     %ten = arith.constant 10 : i32
     %8 = arith.cmpi slt, %3, %ten : i32
     cc.condition %8
   } do {
     ^bb0:
-      %13 = memref.load %1[] : memref<i32>
+      %13 = cc.load %1 : !cc.ptr<i32>
       %five = arith.constant 5 : i32
       %18 = arith.cmpi slt, %13, %five : i32
       cf.cond_br %18, ^bb1, ^bb2
@@ -29,25 +29,25 @@ func.func @test_old_for() {
       cc.continue
   } step {
     %4 = arith.constant 12 : i32
-    %5 = memref.load %1[] : memref<i32>
+    %5 = cc.load %1 : !cc.ptr<i32>
     %6 = arith.addi %4, %5 : i32
-    memref.store %6, %1[] : memref<i32>
+    cc.store %6, %1 : !cc.ptr<i32>
     cc.continue
   }
   func.return
 }
 
 // CHECK-LABEL:   func.func @test_old_for() {
-// CHECK:           %[[VAL_0:.*]] = memref.alloc() : memref<i32>
+// CHECK:           %[[VAL_0:.*]] = cc.alloca i32
 // CHECK:           %[[VAL_1:.*]] = arith.constant 0 : i32
-// CHECK:           memref.store %[[VAL_1]], %[[VAL_0]][] : memref<i32>
+// CHECK:           cc.store %[[VAL_1]], %[[VAL_0]] : !cc.ptr<i32>
 // CHECK:           cc.loop while {
-// CHECK:             %[[VAL_2:.*]] = memref.load %[[VAL_0]][] : memref<i32>
+// CHECK:             %[[VAL_2:.*]] = cc.load %[[VAL_0]] : !cc.ptr<i32>
 // CHECK:             %[[VAL_3:.*]] = arith.constant 10 : i32
 // CHECK:             %[[VAL_4:.*]] = arith.cmpi slt, %[[VAL_2]], %[[VAL_3]] : i32
 // CHECK:             cc.condition %[[VAL_4]]
 // CHECK:           } do {
-// CHECK:             %[[VAL_5:.*]] = memref.load %[[VAL_0]][] : memref<i32>
+// CHECK:             %[[VAL_5:.*]] = cc.load %[[VAL_0]] : !cc.ptr<i32>
 // CHECK:             %[[VAL_6:.*]] = arith.constant 5 : i32
 // CHECK:             %[[VAL_7:.*]] = arith.cmpi slt, %[[VAL_5]], %[[VAL_6]] : i32
 // CHECK:             cf.cond_br %[[VAL_7]], ^bb1, ^bb2
@@ -57,26 +57,26 @@ func.func @test_old_for() {
 // CHECK:             cc.continue
 // CHECK:           } step {
 // CHECK:             %[[VAL_8:.*]] = arith.constant 12 : i32
-// CHECK:             %[[VAL_9:.*]] = memref.load %[[VAL_0]][] : memref<i32>
+// CHECK:             %[[VAL_9:.*]] = cc.load %[[VAL_0]] : !cc.ptr<i32>
 // CHECK:             %[[VAL_10:.*]] = arith.addi %[[VAL_8]], %[[VAL_9]] : i32
-// CHECK:             memref.store %[[VAL_10]], %[[VAL_0]][] : memref<i32>
+// CHECK:             cc.store %[[VAL_10]], %[[VAL_0]] : !cc.ptr<i32>
 // CHECK:           }
 // CHECK:           return
 // CHECK:         }
 
 func.func @test_scoped_for() {
   cc.scope {
-    %1 = memref.alloc() : memref<i32>
+    %1 = cc.alloca i32
     %zero = arith.constant 0 : i32
-    memref.store %zero, %1[] : memref<i32>
+    cc.store %zero, %1 : !cc.ptr<i32>
     cc.loop while {
-      %3 = memref.load %1[] : memref<i32>
+      %3 = cc.load %1 : !cc.ptr<i32>
       %ten = arith.constant 10 : i32
       %8 = arith.cmpi slt, %3, %ten : i32
       cc.condition %8
     } do {
       ^bb0:
-        %13 = memref.load %1[] : memref<i32>
+        %13 = cc.load %1 : !cc.ptr<i32>
         %five = arith.constant 5 : i32
         %18 = arith.cmpi slt, %13, %five : i32
         cf.cond_br %18, ^bb1, ^bb2
@@ -86,9 +86,9 @@ func.func @test_scoped_for() {
         cc.continue
     } step {
       %4 = arith.constant 12 : i32
-      %5 = memref.load %1[] : memref<i32>
+      %5 = cc.load %1 : !cc.ptr<i32>
       %6 = arith.addi %4, %5 : i32
-      memref.store %6, %1[] : memref<i32>
+      cc.store %6, %1 : !cc.ptr<i32>
       cc.continue
     }
     cc.continue
@@ -98,16 +98,16 @@ func.func @test_scoped_for() {
 
 // CHECK-LABEL:   func.func @test_scoped_for() {
 // CHECK:           cc.scope {
-// CHECK:             %[[VAL_0:.*]] = memref.alloc() : memref<i32>
+// CHECK:             %[[VAL_0:.*]] = cc.alloca i32
 // CHECK:             %[[VAL_1:.*]] = arith.constant 0 : i32
-// CHECK:             memref.store %[[VAL_1]], %[[VAL_0]][] : memref<i32>
+// CHECK:             cc.store %[[VAL_1]], %[[VAL_0]] : !cc.ptr<i32>
 // CHECK:             cc.loop while {
-// CHECK:               %[[VAL_2:.*]] = memref.load %[[VAL_0]][] : memref<i32>
+// CHECK:               %[[VAL_2:.*]] = cc.load %[[VAL_0]] : !cc.ptr<i32>
 // CHECK:               %[[VAL_3:.*]] = arith.constant 10 : i32
 // CHECK:               %[[VAL_4:.*]] = arith.cmpi slt, %[[VAL_2]], %[[VAL_3]] : i32
 // CHECK:               cc.condition %[[VAL_4]]
 // CHECK:             } do {
-// CHECK:               %[[VAL_5:.*]] = memref.load %[[VAL_0]][] : memref<i32>
+// CHECK:               %[[VAL_5:.*]] = cc.load %[[VAL_0]] : !cc.ptr<i32>
 // CHECK:               %[[VAL_6:.*]] = arith.constant 5 : i32
 // CHECK:               %[[VAL_7:.*]] = arith.cmpi slt, %[[VAL_5]], %[[VAL_6]] : i32
 // CHECK:               cf.cond_br %[[VAL_7]], ^bb1, ^bb2
@@ -117,9 +117,9 @@ func.func @test_scoped_for() {
 // CHECK:               cc.continue
 // CHECK:             } step {
 // CHECK:               %[[VAL_8:.*]] = arith.constant 12 : i32
-// CHECK:               %[[VAL_9:.*]] = memref.load %[[VAL_0]][] : memref<i32>
+// CHECK:               %[[VAL_9:.*]] = cc.load %[[VAL_0]] : !cc.ptr<i32>
 // CHECK:               %[[VAL_10:.*]] = arith.addi %[[VAL_8]], %[[VAL_9]] : i32
-// CHECK:               memref.store %[[VAL_10]], %[[VAL_0]][] : memref<i32>
+// CHECK:               cc.store %[[VAL_10]], %[[VAL_0]] : !cc.ptr<i32>
 // CHECK:             }
 // CHECK:           }
 // CHECK:           return
@@ -127,17 +127,17 @@ func.func @test_scoped_for() {
 
 func.func @test_scoped_for_with_args() {
   cc.scope {
-    %1 = memref.alloc() : memref<i32>
+    %1 = cc.alloca i32
     %zero = arith.constant 0 : i32
-    memref.store %zero, %1[] : memref<i32>
+    cc.store %zero, %1 : !cc.ptr<i32>
     %z2 = cc.loop while ((%xtra = %zero) -> i32) {
-      %3 = memref.load %1[] : memref<i32>
+      %3 = cc.load %1 : !cc.ptr<i32>
       %ten = arith.constant 10 : i32
       %8 = arith.cmpi slt, %3, %ten : i32
       cc.condition %8 (%xtra : i32)
     } do {
       ^bb0(%x2 : i32):
-        %13 = memref.load %1[] : memref<i32>
+        %13 = cc.load %1 : !cc.ptr<i32>
         %five = arith.constant 5 : i32
         %18 = arith.cmpi slt, %13, %five : i32
         cf.cond_br %18, ^bb1, ^bb2
@@ -149,9 +149,9 @@ func.func @test_scoped_for_with_args() {
       ^bb4 (%x3 : i32):
         %4 = arith.constant 12 : i32
         %16 = arith.addi %x3, %4 : i32
-        %5 = memref.load %1[] : memref<i32>
+        %5 = cc.load %1 : !cc.ptr<i32>
         %6 = arith.addi %16, %5 : i32
-        memref.store %6, %1[] : memref<i32>
+        cc.store %6, %1 : !cc.ptr<i32>
         cc.continue %x3 : i32
     }
   }
@@ -162,17 +162,17 @@ func.func private @getI32() -> i32
 
 // CHECK-LABEL:   func.func @test_scoped_for_with_args() {
 // CHECK:           cc.scope {
-// CHECK:             %[[VAL_0:.*]] = memref.alloc() : memref<i32>
+// CHECK:             %[[VAL_0:.*]] = cc.alloca i32
 // CHECK:             %[[VAL_1:.*]] = arith.constant 0 : i32
-// CHECK:             memref.store %[[VAL_1]], %[[VAL_0]][] : memref<i32>
+// CHECK:             cc.store %[[VAL_1]], %[[VAL_0]] : !cc.ptr<i32>
 // CHECK:             %[[VAL_2:.*]] = cc.loop while ((%[[VAL_3:.*]] = %[[VAL_1]]) -> (i32)) {
-// CHECK:               %[[VAL_4:.*]] = memref.load %[[VAL_0]][] : memref<i32>
+// CHECK:               %[[VAL_4:.*]] = cc.load %[[VAL_0]] : !cc.ptr<i32>
 // CHECK:               %[[VAL_5:.*]] = arith.constant 10 : i32
 // CHECK:               %[[VAL_6:.*]] = arith.cmpi slt, %[[VAL_4]], %[[VAL_5]] : i32
 // CHECK:               cc.condition %[[VAL_6]](%[[VAL_3]] : i32)
 // CHECK:             } do {
 // CHECK:             ^bb0(%[[VAL_7:.*]]: i32):
-// CHECK:               %[[VAL_8:.*]] = memref.load %[[VAL_0]][] : memref<i32>
+// CHECK:               %[[VAL_8:.*]] = cc.load %[[VAL_0]] : !cc.ptr<i32>
 // CHECK:               %[[VAL_9:.*]] = arith.constant 5 : i32
 // CHECK:               %[[VAL_10:.*]] = arith.cmpi slt, %[[VAL_8]], %[[VAL_9]] : i32
 // CHECK:               cf.cond_br %[[VAL_10]], ^bb1, ^bb2
@@ -184,9 +184,9 @@ func.func private @getI32() -> i32
 // CHECK:             ^bb0(%[[VAL_11:.*]]: i32):
 // CHECK:               %[[VAL_12:.*]] = arith.constant 12 : i32
 // CHECK:               %[[VAL_13:.*]] = arith.addi %[[VAL_11]], %[[VAL_12]] : i32
-// CHECK:               %[[VAL_14:.*]] = memref.load %[[VAL_0]][] : memref<i32>
+// CHECK:               %[[VAL_14:.*]] = cc.load %[[VAL_0]] : !cc.ptr<i32>
 // CHECK:               %[[VAL_15:.*]] = arith.addi %[[VAL_13]], %[[VAL_14]] : i32
-// CHECK:               memref.store %[[VAL_15]], %[[VAL_0]][] : memref<i32>
+// CHECK:               cc.store %[[VAL_15]], %[[VAL_0]] : !cc.ptr<i32>
 // CHECK:               cc.continue %[[VAL_11]] : i32
 // CHECK:             }
 // CHECK:           }
@@ -196,13 +196,13 @@ func.func private @getI32() -> i32
 // CHECK-LABEL:   func.func private @getI32() -> i32
 
 func.func @test_do_while() {
-  %1 = memref.alloc() : memref<i32>
+  %1 = cc.alloca i32
   %zero = arith.constant 0 : i32
-  memref.store %zero, %1[] : memref<i32>
+  cc.store %zero, %1 : !cc.ptr<i32>
   cc.loop do {
     ^bb0:
       %8 = func.call @getI32() : () -> i32
-      memref.store %8, %1[] : memref<i32>
+      cc.store %8, %1 : !cc.ptr<i32>
       cc.continue
   } while {
     %3 = arith.constant 1 : i1
@@ -212,12 +212,12 @@ func.func @test_do_while() {
 }
 
 // CHECK-LABEL:   func.func @test_do_while() {
-// CHECK:           %[[VAL_0:.*]] = memref.alloc() : memref<i32>
+// CHECK:           %[[VAL_0:.*]] = cc.alloca i32
 // CHECK:           %[[VAL_1:.*]] = arith.constant 0 : i32
-// CHECK:           memref.store %[[VAL_1]], %[[VAL_0]][] : memref<i32>
+// CHECK:           cc.store %[[VAL_1]], %[[VAL_0]] : !cc.ptr<i32>
 // CHECK:           cc.loop do {
 // CHECK:             %[[VAL_2:.*]] = func.call @getI32() : () -> i32
-// CHECK:             memref.store %[[VAL_2]], %[[VAL_0]][] : memref<i32>
+// CHECK:             cc.store %[[VAL_2]], %[[VAL_0]] : !cc.ptr<i32>
 // CHECK:           } while {
 // CHECK:             %[[VAL_3:.*]] = arith.constant true
 // CHECK:             cc.condition %[[VAL_3]]
@@ -226,12 +226,12 @@ func.func @test_do_while() {
 // CHECK:         }
 
 func.func @test_do_while_with_args() {
-  %1 = memref.alloc() : memref<i32>
+  %1 = cc.alloca i32
   %zero = arith.constant 0 : i32
-  memref.store %zero, %1[] : memref<i32>
+  cc.store %zero, %1 : !cc.ptr<i32>
   cc.loop do ((%i = %zero) -> i32) {
       %8 = func.call @getI32() : () -> i32
-      memref.store %i, %1[] : memref<i32>
+      cc.store %i, %1 : !cc.ptr<i32>
       cc.continue %i : i32
   } while {
     ^bb9(%arg0 : i32):
@@ -245,12 +245,12 @@ func.func @test_do_while_with_args() {
 }
 
 // CHECK-LABEL:   func.func @test_do_while_with_args() {
-// CHECK:           %[[VAL_0:.*]] = memref.alloc() : memref<i32>
+// CHECK:           %[[VAL_0:.*]] = cc.alloca i32
 // CHECK:           %[[VAL_1:.*]] = arith.constant 0 : i32
-// CHECK:           memref.store %[[VAL_1]], %[[VAL_0]][] : memref<i32>
+// CHECK:           cc.store %[[VAL_1]], %[[VAL_0]] : !cc.ptr<i32>
 // CHECK:           %[[VAL_2:.*]] = cc.loop do ((%[[VAL_3:.*]] = %[[VAL_1]]) -> (i32)) {
 // CHECK:             %[[VAL_4:.*]] = func.call @getI32() : () -> i32
-// CHECK:             memref.store %[[VAL_3]], %[[VAL_0]][] : memref<i32>
+// CHECK:             cc.store %[[VAL_3]], %[[VAL_0]] : !cc.ptr<i32>
 // CHECK:             cc.continue %[[VAL_3]] : i32
 // CHECK:           } while {
 // CHECK:           ^bb0(%[[VAL_5:.*]]: i32):
@@ -264,40 +264,40 @@ func.func @test_do_while_with_args() {
 // CHECK:         }
 
 func.func @test_while() {
-  %1 = memref.alloc() : memref<i32>
+  %1 = cc.alloca i32  // i32 storage; accessed below through !cc.ptr<i32>
   %zero = arith.constant 0 : i32
-  memref.store %zero, %1[] : memref<i32>
+  cc.store %zero, %1 : !cc.ptr<i32>  // initialize the storage to 0
   cc.loop while {
     %3 = arith.constant 1 : i1
     cc.condition %3
   } do {
     ^bb0:
       %8 = func.call @getI32() : () -> i32
-      memref.store %8, %1[] : memref<i32>
+      cc.store %8, %1 : !cc.ptr<i32>  // write the @getI32() result through the same pointer
       cc.continue
   }
   func.return
 }
 
 // CHECK-LABEL:   func.func @test_while() {
-// CHECK:           %[[VAL_0:.*]] = memref.alloc() : memref<i32>
+// CHECK:           %[[VAL_0:.*]] = cc.alloca i32
 // CHECK:           %[[VAL_1:.*]] = arith.constant 0 : i32
-// CHECK:           memref.store %[[VAL_1]], %[[VAL_0]][] : memref<i32>
+// CHECK:           cc.store %[[VAL_1]], %[[VAL_0]] : !cc.ptr<i32>
 // CHECK:           cc.loop while {
 // CHECK:             %[[VAL_2:.*]] = arith.constant true
 // CHECK:             cc.condition %[[VAL_2]]
 // CHECK:           } do {
 // CHECK:             %[[VAL_3:.*]] = func.call @getI32() : () -> i32
-// CHECK:             memref.store %[[VAL_3]], %[[VAL_0]][] : memref<i32>
+// CHECK:             cc.store %[[VAL_3]], %[[VAL_0]] : !cc.ptr<i32>
 // CHECK:             cc.continue
 // CHECK:           }
 // CHECK:           return
 // CHECK:         }
 
 func.func @test_if_else(%c : i1) {
-  %1 = memref.alloc() : memref<i32>
+  %1 = cc.alloca i32
   %zero = arith.constant 0 : i32
-  memref.store %zero, %1[] : memref<i32>
+  cc.store %zero, %1 : !cc.ptr<i32>
   cc.if (%c) {
     ^bb0:
       %3 = arith.constant 1 : i1
@@ -314,9 +314,9 @@ func.func @test_if_else(%c : i1) {
 
 // CHECK-LABEL:   func.func @test_if_else(
 // CHECK-SAME:                            %[[VAL_0:.*]]: i1) {
-// CHECK:           %[[VAL_1:.*]] = memref.alloc() : memref<i32>
+// CHECK:           %[[VAL_1:.*]] = cc.alloca i32
 // CHECK:           %[[VAL_2:.*]] = arith.constant 0 : i32
-// CHECK:           memref.store %[[VAL_2]], %[[VAL_1]][] : memref<i32>
+// CHECK:           cc.store %[[VAL_2]], %[[VAL_1]] : !cc.ptr<i32>
 // CHECK:           cc.if(%[[VAL_0]]) {
 // CHECK:             %[[VAL_3:.*]] = arith.constant true
 // CHECK:           } else {
@@ -329,9 +329,9 @@ func.func @test_if_else(%c : i1) {
 // CHECK:         }
 
 func.func @test_if(%c : i1) {
-  %1 = memref.alloc() : memref<i32>
+  %1 = cc.alloca i32
   %zero = arith.constant 0 : i32
-  memref.store %zero, %1[] : memref<i32>
+  cc.store %zero, %1 : !cc.ptr<i32>
   cc.if (%c) {
     ^bb1:
       %8 = func.call @getI32() : () -> i32
@@ -344,9 +344,9 @@ func.func @test_if(%c : i1) {
 
 // CHECK-LABEL:   func.func @test_if(
 // CHECK-SAME:                       %[[VAL_0:.*]]: i1) {
-// CHECK:           %[[VAL_1:.*]] = memref.alloc() : memref<i32>
+// CHECK:           %[[VAL_1:.*]] = cc.alloca i32
 // CHECK:           %[[VAL_2:.*]] = arith.constant 0 : i32
-// CHECK:           memref.store %[[VAL_2]], %[[VAL_1]][] : memref<i32>
+// CHECK:           cc.store %[[VAL_2]], %[[VAL_1]] : !cc.ptr<i32>
 // CHECK:           cc.if(%[[VAL_0]]) {
 // CHECK:             %[[VAL_3:.*]] = func.call @getI32() : () -> i32
 // CHECK:             cf.br ^bb1
@@ -357,9 +357,9 @@ func.func @test_if(%c : i1) {
 // CHECK:         }
 
 func.func @test_if_else_thread(%c : i1) -> i32 {
-  %1 = memref.alloc() : memref<i32>
+  %1 = cc.alloca i32
   %zero = arith.constant 0 : i32
-  memref.store %zero, %1[] : memref<i32>
+  cc.store %zero, %1 : !cc.ptr<i32>
   %2 = cc.if (%c) -> i32 {
       %3 = arith.constant 1 : i32
       cc.continue %3 : i32
@@ -372,9 +372,9 @@ func.func @test_if_else_thread(%c : i1) -> i32 {
 
 // CHECK-LABEL:   func.func @test_if_else_thread(
 // CHECK-SAME:                                   %[[VAL_0:.*]]: i1) -> i32 {
-// CHECK:           %[[VAL_1:.*]] = memref.alloc() : memref<i32>
+// CHECK:           %[[VAL_1:.*]] = cc.alloca i32
 // CHECK:           %[[VAL_2:.*]] = arith.constant 0 : i32
-// CHECK:           memref.store %[[VAL_2]], %[[VAL_1]][] : memref<i32>
+// CHECK:           cc.store %[[VAL_2]], %[[VAL_1]] : !cc.ptr<i32>
 // CHECK:           %[[VAL_3:.*]] = cc.if(%[[VAL_0]]) -> i32 {
 // CHECK:             %[[VAL_4:.*]] = arith.constant 1 : i32
 // CHECK:             cc.continue %[[VAL_4]] : i32
diff --git a/test/Quake/memtoreg-2.qke b/test/Quake/memtoreg-2.qke
index 2878c43c6d..c1e0bd1c50 100644
--- a/test/Quake/memtoreg-2.qke
+++ b/test/Quake/memtoreg-2.qke
@@ -754,20 +754,19 @@ func.func @simple_loop() {
   %c1_i64 = arith.constant 1 : i64
   %c42_i64 = arith.constant 42 : i64
   %q0 = quake.alloca !quake.ref
-  // memtoreg does not promote memref types.
-  %alloca = memref.alloca() : memref<i64>
-  memref.store %c0_i64, %alloca[] : memref<i64>
+  %alloca = cc.alloca i64
+  cc.store %c0_i64, %alloca : !cc.ptr<i64>
   cc.loop while {
-    %1 = memref.load %alloca[] : memref<i64>
+    %1 = cc.load %alloca : !cc.ptr<i64>
     %2 = arith.cmpi ult, %1, %c42_i64 : i64
     cc.condition %2
   } do {
     quake.x %q0 : (!quake.ref) -> ()
     cc.continue
   } step {
-    %1 = memref.load %alloca[] : memref<i64>
+    %1 = cc.load %alloca : !cc.ptr<i64>
     %2 = arith.addi %1, %c1_i64 : i64
-    memref.store %2, %alloca[] : memref<i64>
+    cc.store %2, %alloca : !cc.ptr<i64>
   }
   quake.z %q0 : (!quake.ref) -> ()
   quake.dealloc %q0 : !quake.ref
@@ -779,24 +778,21 @@ func.func @simple_loop() {
 // CHECK:           %[[VAL_1:.*]] = arith.constant 1 : i64
 // CHECK:           %[[VAL_2:.*]] = arith.constant 42 : i64
 // CHECK:           %[[VAL_3:.*]] = quake.null_wire
-// CHECK:           %[[VAL_4:.*]] = memref.alloca() : memref<i64>
-// CHECK:           memref.store %[[VAL_0]], %[[VAL_4]][] : memref<i64>
-// CHECK:           %[[VAL_5:.*]] = cc.loop while ((%[[VAL_6:.*]] = %[[VAL_3]]) -> (!quake.wire)) {
-// CHECK:             %[[VAL_7:.*]] = memref.load %[[VAL_4]][] : memref<i64>
+// CHECK:           %[[VAL_4:.*]] = cc.undef i64
+// CHECK:           %[[VAL_5:.*]]:2 = cc.loop while ((%[[VAL_6:.*]] = %[[VAL_3]], %[[VAL_7:.*]] = %[[VAL_0]]) -> (!quake.wire, i64)) {
 // CHECK:             %[[VAL_8:.*]] = arith.cmpi ult, %[[VAL_7]], %[[VAL_2]] : i64
-// CHECK:             cc.condition %[[VAL_8]](%[[VAL_6]] : !quake.wire)
+// CHECK:             cc.condition %[[VAL_8]](%[[VAL_6]], %[[VAL_7]] : !quake.wire, i64)
 // CHECK:           } do {
-// CHECK:           ^bb0(%[[VAL_9:.*]]: !quake.wire):
-// CHECK:             %[[VAL_10:.*]] = quake.x %[[VAL_9]] : (!quake.wire) -> !quake.wire
-// CHECK:             cc.continue %[[VAL_10]] : !quake.wire
+// CHECK:           ^bb0(%[[VAL_9:.*]]: !quake.wire, %[[VAL_10:.*]]: i64):
+// CHECK:             %[[VAL_11:.*]] = quake.x %[[VAL_9]] : (!quake.wire) -> !quake.wire
+// CHECK:             cc.continue %[[VAL_11]], %[[VAL_10]] : !quake.wire, i64
 // CHECK:           } step {
-// CHECK:           ^bb0(%[[VAL_11:.*]]: !quake.wire):
-// CHECK:             %[[VAL_12:.*]] = memref.load %[[VAL_4]][] : memref<i64>
-// CHECK:             %[[VAL_13:.*]] = arith.addi %[[VAL_12]], %[[VAL_1]] : i64
-// CHECK:             memref.store %[[VAL_13]], %[[VAL_4]][] : memref<i64>
-// CHECK:             cc.continue %[[VAL_11]] : !quake.wire
+// CHECK:           ^bb0(%[[VAL_12:.*]]: !quake.wire, %[[VAL_13:.*]]: i64):
+// CHECK:             %[[VAL_14:.*]] = arith.addi %[[VAL_13]], %[[VAL_1]] : i64
+// CHECK:             cc.continue %[[VAL_12]], %[[VAL_14]] : !quake.wire, i64
 // CHECK:           }
-// CHECK:           %[[VAL_14:.*]] = quake.z %[[VAL_15:.*]] : (!quake.wire) -> !quake.wire
+// CHECK:           %[[VAL_15:.*]] = quake.z %[[VAL_16:.*]]#0 : (!quake.wire) -> !quake.wire
+// CHECK:           quake.sink %[[VAL_15]] : !quake.wire
 // CHECK:           return
 // CHECK:         }
 
@@ -806,51 +802,49 @@ func.func @floop_with_vector_and_qextract() {
   %c2_i64 = arith.constant 2 : i64
   %veq = quake.alloca  !quake.veq<2>
   %q0 = quake.extract_ref %veq[%c0_i64] : (!quake.veq<2> ,i64) -> !quake.ref
-  %alloca = memref.alloca() : memref<i64>
-  memref.store %c0_i64, %alloca[] : memref<i64>
+  %alloca = cc.alloca i64
+  cc.store %c0_i64, %alloca : !cc.ptr<i64>
   cc.loop while {
-    %3 = memref.load %alloca[] : memref<i64>
+    %3 = cc.load %alloca : !cc.ptr<i64>
     %4 = arith.cmpi ult, %3, %c2_i64 : i64
     cc.condition %4
   } do {
-    %3 = memref.load %alloca[] : memref<i64>
+    %3 = cc.load %alloca : !cc.ptr<i64>
     %4 = quake.extract_ref %veq[%3] : (!quake.veq<2> ,i64) -> !quake.ref
     cc.continue
   } step {
-    %3 = memref.load %alloca[] : memref<i64>
+    %3 = cc.load %alloca : !cc.ptr<i64>
     %4 = arith.addi %3, %c1_i64 : i64
-    memref.store %4, %alloca[] : memref<i64>
+    cc.store %4, %alloca : !cc.ptr<i64>
   }
   %2 = quake.mz %veq : (!quake.veq<2>) -> !cc.stdvec<!quake.measure>
   quake.dealloc %veq : !quake.veq<2>
   return
 }
 
-
 // CHECK-LABEL:   func.func @floop_with_vector_and_qextract() {
 // CHECK:           %[[VAL_0:.*]] = arith.constant 0 : i64
 // CHECK:           %[[VAL_1:.*]] = arith.constant 1 : i64
 // CHECK:           %[[VAL_2:.*]] = arith.constant 2 : i64
 // CHECK:           %[[VAL_3:.*]] = quake.alloca !quake.veq<2>
-// CHECK:           %[[VAL_4:.*]] = quake.extract_ref %[[VAL_3]][%[[VAL_0]]] : (!quake.veq<2>, i64) -> !quake.ref
+// CHECK:           %[[VAL_4:.*]] = quake.extract_ref %[[VAL_3]]{{\[}}%[[VAL_0]]] : (!quake.veq<2>, i64) -> !quake.ref
 // CHECK:           %[[VAL_5:.*]] = quake.unwrap %[[VAL_4]] : (!quake.ref) -> !quake.wire
-// CHECK:           %[[VAL_6:.*]] = memref.alloca() : memref<i64>
-// CHECK:           memref.store %[[VAL_0]], %[[VAL_6]][] : memref<i64>
-// CHECK:           cc.loop while {
-// CHECK:             %[[VAL_7:.*]] = memref.load %[[VAL_6]][] : memref<i64>
-// CHECK:             %[[VAL_8:.*]] = arith.cmpi ult, %[[VAL_7]], %[[VAL_2]] : i64
-// CHECK:             cc.condition %[[VAL_8]]
+// CHECK:           %[[VAL_6:.*]] = cc.undef i64
+// CHECK:           %[[VAL_7:.*]] = cc.loop while ((%[[VAL_8:.*]] = %[[VAL_0]]) -> (i64)) {
+// CHECK:             %[[VAL_9:.*]] = arith.cmpi ult, %[[VAL_8]], %[[VAL_2]] : i64
+// CHECK:             cc.condition %[[VAL_9]](%[[VAL_8]] : i64)
 // CHECK:           } do {
-// CHECK:             %[[VAL_9:.*]] = memref.load %[[VAL_6]][] : memref<i64>
-// CHECK:             %[[VAL_10:.*]] = quake.extract_ref %[[VAL_3]][%[[VAL_9]]] : (!quake.veq<2>, i64) -> !quake.ref
-// CHECK:             %[[VAL_11:.*]] = quake.unwrap %[[VAL_10]] : (!quake.ref) -> !quake.wire
-// CHECK:             cc.continue
+// CHECK:           ^bb0(%[[VAL_10:.*]]: i64):
+// CHECK:             %[[VAL_11:.*]] = quake.extract_ref %[[VAL_3]]{{\[}}%[[VAL_10]]] : (!quake.veq<2>, i64) -> !quake.ref
+// CHECK:             %[[VAL_12:.*]] = quake.unwrap %[[VAL_11]] : (!quake.ref) -> !quake.wire
+// CHECK:             cc.continue %[[VAL_10]] : i64
 // CHECK:           } step {
-// CHECK:             %[[VAL_12:.*]] = memref.load %[[VAL_6]][] : memref<i64>
-// CHECK:             %[[VAL_13:.*]] = arith.addi %[[VAL_12]], %[[VAL_1]] : i64
-// CHECK:             memref.store %[[VAL_13]], %[[VAL_6]][] : memref<i64>
+// CHECK:           ^bb0(%[[VAL_13:.*]]: i64):
+// CHECK:             %[[VAL_14:.*]] = arith.addi %[[VAL_13]], %[[VAL_1]] : i64
+// CHECK:             cc.continue %[[VAL_14]] : i64
 // CHECK:           }
-// CHECK:           quake.mz %[[VAL_3]] : (!quake.veq<2>) -> !cc.stdvec<!quake.measure>
+// CHECK:           %[[VAL_15:.*]] = quake.mz %[[VAL_3]] : (!quake.veq<2>) -> !cc.stdvec<!quake.measure>
+// CHECK:           quake.dealloc %[[VAL_3]] : !quake.veq<2>
 // CHECK:           return
 // CHECK:         }
 
diff --git a/test/Quake/observeAnsatz.qke b/test/Quake/observeAnsatz.qke
index ea7eefa157..3c08197d80 100644
--- a/test/Quake/observeAnsatz.qke
+++ b/test/Quake/observeAnsatz.qke
@@ -11,12 +11,12 @@
   func.func @__nvqpp__mlirgen__ansatz(%arg0: f64) {
     %c0_i64 = arith.constant 0 : i64
     %c1_i64 = arith.constant 1 : i64
-    %0 = memref.alloca() : memref<f64>
-    memref.store %arg0, %0[] : memref<f64>
+    %0 = cc.alloca f64
+    cc.store %arg0, %0 : !cc.ptr<f64>
     %1 = quake.alloca  !quake.veq<2>
     %2 = quake.extract_ref %1[%c0_i64] : (!quake.veq<2>,i64) -> !quake.ref
     quake.x %2 : (!quake.ref) -> ()
-    %3 = memref.load %0[] : memref<f64>
+    %3 = cc.load %0 : !cc.ptr<f64>
     %4 = quake.extract_ref %1[%c1_i64] : (!quake.veq<2>,i64) -> !quake.ref
     quake.ry (%3) %4 : (f64, !quake.ref) -> ()
     %5 = quake.extract_ref %1[%c1_i64] : (!quake.veq<2>,i64) -> !quake.ref
diff --git a/test/Translate/alloca_no_operand.qke b/test/Translate/alloca_no_operand.qke
index a321371a41..af2feccaf5 100644
--- a/test/Translate/alloca_no_operand.qke
+++ b/test/Translate/alloca_no_operand.qke
@@ -10,7 +10,7 @@
 
 func.func @adder_n4() {
   %0 = quake.alloca !quake.veq<4>
-  %1 = memref.alloc() : memref<4xi1>
+  %1 = cc.alloca !cc.array<i1 x 4>
   %c0 = arith.constant 0 : index
   %2 = quake.extract_ref %0[%c0] : (!quake.veq<4>, index) -> !quake.ref
   quake.x %2 : (!quake.ref) -> ()
@@ -44,20 +44,20 @@ func.func @adder_n4() {
   quake.h %4 : (!quake.ref) -> ()
   %6 = quake.mz %2 : (!quake.ref) -> !quake.measure
   %61 = quake.discriminate %6 : (!quake.measure) -> i1
-  %c0_0 = arith.constant 0 : index
-  memref.store %61, %1[%c0_0] : memref<4xi1>
+  %a = cc.compute_ptr %1[0] : (!cc.ptr<!cc.array<i1 x 4>>) -> !cc.ptr<i1>
+  cc.store %61, %a : !cc.ptr<i1>
   %7 = quake.mz %3 : (!quake.ref) -> !quake.measure
   %71 = quake.discriminate %7 : (!quake.measure) -> i1
-  %c1_1 = arith.constant 1 : index
-  memref.store %71, %1[%c1_1] : memref<4xi1>
+  %b = cc.compute_ptr %1[1] : (!cc.ptr<!cc.array<i1 x 4>>) -> !cc.ptr<i1>
+  cc.store %71, %b : !cc.ptr<i1>
   %8 = quake.mz %5 : (!quake.ref) -> !quake.measure
   %81 = quake.discriminate %8 : (!quake.measure) -> i1
-  %c2_2 = arith.constant 2 : index
-  memref.store %81, %1[%c2_2] : memref<4xi1>
+  %c = cc.compute_ptr %1[2] : (!cc.ptr<!cc.array<i1 x 4>>) -> !cc.ptr<i1>
+  cc.store %81, %c : !cc.ptr<i1>
   %9 = quake.mz %4 : (!quake.ref) -> !quake.measure
   %91 = quake.discriminate %9 : (!quake.measure) -> i1
-  %c3_3 = arith.constant 3 : index
-  memref.store %91, %1[%c3_3] : memref<4xi1>
+  %d = cc.compute_ptr %1[3] : (!cc.ptr<!cc.array<i1 x 4>>) -> !cc.ptr<i1>
+  cc.store %91, %d : !cc.ptr<i1>
   return
 }
 
diff --git a/tools/cudaq-opt/cudaq-opt.cpp b/tools/cudaq-opt/cudaq-opt.cpp
index 3fb349cf2f..0ebd2a1146 100644
--- a/tools/cudaq-opt/cudaq-opt.cpp
+++ b/tools/cudaq-opt/cudaq-opt.cpp
@@ -6,6 +6,7 @@
  * the terms of the Apache License 2.0 which accompanies this distribution.    *
  ******************************************************************************/
 
+#include "cudaq/Optimizer/CodeGen/CodeGenDialect.h"
 #include "cudaq/Optimizer/CodeGen/Passes.h"
 #include "cudaq/Optimizer/Dialect/CC/CCDialect.h"
 #include "cudaq/Optimizer/Dialect/Common/InlinerInterface.h"
@@ -75,6 +76,7 @@ int main(int argc, char **argv) {
 
   mlir::DialectRegistry registry;
   cudaq::registerAllDialects(registry);
+  registry.insert<cudaq::codegen::CodeGenDialect>();
   registerInlinerExtension(registry);
   return mlir::asMainReturnCode(
       mlir::MlirOptMain(argc, argv, "nvq++ optimizer\n", registry));
diff --git a/tools/cudaq-quake/CMakeLists.txt b/tools/cudaq-quake/CMakeLists.txt
index 277df39c22..52e3d92750 100644
--- a/tools/cudaq-quake/CMakeLists.txt
+++ b/tools/cudaq-quake/CMakeLists.txt
@@ -25,8 +25,6 @@ target_link_libraries(cudaq-quake
   MLIRLLVMCommonConversion
   MLIRLLVMToLLVMIRTranslation
 
-  MLIRMemRefDialect
-  
   clangCodeGen
   clangFrontendTool
   clangFrontend
diff --git a/unittests/Optimizer/QuakeSynthTester.cpp b/unittests/Optimizer/QuakeSynthTester.cpp
index 5c2e324cab..ceb3184183 100644
--- a/unittests/Optimizer/QuakeSynthTester.cpp
+++ b/unittests/Optimizer/QuakeSynthTester.cpp
@@ -283,7 +283,10 @@ TEST(QuakeSynthTests, checkVectorOfInt) {
   kernel.h(aq);
   kernel.z(aq);
   kernel.h(q);
-  for (std::size_t i = 0; i < *q.constantSize(); ++i) {
+  // FIXME: This test has never actually exercised the c_if in this loop: the
+  // call to constantSize() produced 0, so the loop body was never entered.
+  std::size_t unrollBy = q.constantSize().has_value() ? *q.constantSize() : 0;
+  for (std::size_t i = 0; i < unrollBy; ++i) {
     kernel.c_if(hiddenBits[i], [&]() { kernel.x<cudaq::ctrl>(aq, q[i]); });
   }
   kernel.h(q);