From 74c90bf1157e335de1e608d1c40de5123887a42d Mon Sep 17 00:00:00 2001 From: Colin Unger Date: Sat, 16 Mar 2024 10:13:36 -0700 Subject: [PATCH] Remove unnecessary dependencies and allow using external installs (#1321) * Remove unnecessary dependencies and allow using external installs * Cleanup before PR and format * Add FindNCCL from torch, remove unnecessary cmake files * Cleanup nccl cmake * Fix remaining optional bugs in kernels * Try nix-based CI * Fix nix-develop version * Fix flake devshell name * Move ci build code to script * Satisfy shellcheck * Try removing ccache * Attempt to skip ccache install * Move to using in-flake caches * Format --- .flake/pkgs/legion.nix | 53 ++++ .flake/pkgs/tokenizers-cpp.nix | 43 ++++ .github/workflows/helpers/build_cuda.sh | 29 +++ .github/workflows/per-lib-check.yml | 45 ++-- .gitmodules | 18 -- CMakeLists.txt | 14 +- cmake/Modules/FindNCCL.cmake | 175 +++++++++++++ cmake/any.cmake | 16 -- cmake/fmt.cmake | 8 +- cmake/invoke.cmake | 5 - cmake/json.cmake | 12 +- cmake/nccl.cmake | 121 ++------- cmake/optional.cmake | 4 - cmake/spdlog.cmake | 8 +- cmake/variant.cmake | 5 - config/config.linux | 2 +- deps/any | 1 - deps/googletest | 1 - deps/invoke | 1 - deps/optional | 1 - deps/pybind11 | 1 - deps/variant | 1 - flake.lock | 60 +++++ flake.nix | 99 ++++++++ lib/kernels/CMakeLists.txt | 12 +- lib/kernels/include/kernels/array_shape.h | 7 +- lib/kernels/include/kernels/device.h | 1 + lib/kernels/include/kernels/perf_metrics.h | 30 +-- lib/kernels/src/cuda/batch_norm_kernels.cu | 30 +-- lib/kernels/src/cuda/cast_kernels.cu | 6 +- lib/kernels/src/device.h | 1 + lib/kernels/src/perf_metrics.cc | 16 +- lib/op-attrs/include/op-attrs/datatype.h | 14 +- lib/op-attrs/include/op-attrs/get_op_type.h | 2 +- .../include/op-attrs/get_output_shapes.h | 8 +- .../include/op-attrs/operator_attrs.h | 54 ++-- lib/op-attrs/include/op-attrs/ops/conv_2d.h | 4 +- lib/op-attrs/include/op-attrs/ops/linear.h | 12 +- .../include/op-attrs/ops/loss_functions.h | 4 +- lib/op-attrs/src/operator_attrs.cc | 4 +- .../src/parallel_dim_mapping_record.cc | 4 +- .../src/parallel_dim_mapping_record.h | 7 +- .../src/parallel_dim_mapping_record_solver.cc | 8 +- .../src/parallel_dim_mapping_record_solver.h | 8 +- .../include/pcg/computation_graph_builder.h | 237 ++++++++++-------- lib/pcg/include/pcg/device_id.h | 3 +- .../include/pcg/file_format/v1/data_type.h | 4 +- lib/pcg/include/pcg/file_format/v1/graphs.h | 2 +- .../include/pcg/file_format/v1/initializer.h | 18 +- .../pcg/file_format/v1/operator_attrs.h | 6 +- .../pcg/file_format/v1/parallel_tensor.h | 10 +- lib/pcg/include/pcg/file_format/v1/tensor.h | 10 +- lib/pcg/include/pcg/initializer.h | 18 +- lib/pcg/include/pcg/layer.h | 7 +- lib/pcg/include/pcg/operator.h | 3 +- lib/pcg/include/pcg/parallel_tensor.h | 12 +- lib/pcg/include/pcg/tensor.h | 6 +- lib/pcg/src/computation_graph_builder.cc | 136 +++++----- lib/pcg/src/device_id.cc | 5 +- lib/pcg/src/layer.cc | 2 +- lib/pcg/src/operator.cc | 2 +- lib/pcg/src/parallel_tensor.cc | 4 +- lib/runtime/src/serialization.h | 10 +- lib/substitutions/src/substitution.cc | 2 +- lib/utils/CMakeLists.txt | 4 - lib/utils/include/utils/containers.decl.h | 23 +- lib/utils/include/utils/containers.h | 26 +- lib/utils/include/utils/disjoint_set.h | 33 +-- lib/utils/include/utils/dot_file.h | 13 +- lib/utils/include/utils/fmt.h | 18 +- lib/utils/include/utils/graph/algorithms.h | 30 +-- .../graph/labelled/output_labelled_open.h | 4 +- lib/utils/include/utils/graph/open_edge.h | 6 +- 
lib/utils/include/utils/graph/query_set.h | 8 +- .../include/utils/graph/serialparallel.h | 10 +- lib/utils/include/utils/graph/traversal.h | 4 +- lib/utils/include/utils/graph/views.h | 5 +- lib/utils/include/utils/invoke.h | 12 - lib/utils/include/utils/json.h | 27 +- lib/utils/include/utils/optional.decl | 8 +- lib/utils/include/utils/optional.h | 8 +- lib/utils/include/utils/sequence.h | 4 +- lib/utils/include/utils/stack_map.h | 9 +- lib/utils/include/utils/stack_vector.h | 11 +- lib/utils/include/utils/tuple.h | 11 +- lib/utils/include/utils/type_traits.h | 1 - lib/utils/include/utils/variant.h | 125 +++++---- lib/utils/include/utils/visitable.h | 6 +- lib/utils/src/graph/algorithms.cc | 25 +- lib/utils/src/graph/serialparallel.cc | 24 +- lib/utils/src/graph/serialparallel_internal.h | 6 +- lib/utils/src/graph/traversal.cc | 4 +- lib/utils/src/stack_vector.cc | 1 + lib/utils/src/tuple.cc | 1 + 94 files changed, 1130 insertions(+), 789 deletions(-) create mode 100644 .flake/pkgs/legion.nix create mode 100644 .flake/pkgs/tokenizers-cpp.nix create mode 100755 .github/workflows/helpers/build_cuda.sh create mode 100644 cmake/Modules/FindNCCL.cmake delete mode 100644 cmake/any.cmake delete mode 100644 cmake/invoke.cmake delete mode 100644 cmake/optional.cmake delete mode 100644 cmake/variant.cmake delete mode 160000 deps/any delete mode 160000 deps/googletest delete mode 160000 deps/invoke delete mode 160000 deps/optional delete mode 160000 deps/pybind11 delete mode 160000 deps/variant create mode 100644 flake.lock create mode 100644 flake.nix delete mode 100644 lib/utils/include/utils/invoke.h create mode 100644 lib/utils/src/stack_vector.cc create mode 100644 lib/utils/src/tuple.cc diff --git a/.flake/pkgs/legion.nix b/.flake/pkgs/legion.nix new file mode 100644 index 0000000000..814ef85e00 --- /dev/null +++ b/.flake/pkgs/legion.nix @@ -0,0 +1,53 @@ +{ lib +, stdenv +, fetchFromGitLab +, cmake +, python3 +, cudaPackages ? { } +, cudaCapabilities ? [ "60" "70" "80" "86" ] +, maxDim ? 
5 +}: + +# from https://codeberg.org/Uli/nix-things/src/commit/776519e382c81b136c1d0b10d8c7b52b4acb9192/overlays/cq/python/libclang-python.nix + +let + cmakeFlag = x: if x then "1" else "0"; + + inherit (cudaPackages) cudatoolkit; +in + +stdenv.mkDerivation rec { + pname = "legion_flexflow"; + version = "2024-03-13"; + + src = fetchFromGitLab { + owner = "StanfordLegion"; + repo = "legion"; + rev = "24e8c452341dea41427e0ce61e154d61715e6835"; + sha256 = "sha256-NjCSjphOIew/V24i74I6DModSGcWKLeiSIjts3cFtx4="; + fetchSubmodules = true; + }; + + nativeBuildInputs = [ + cmake + ]; + + cmakeFlags = [ + "-DLegion_USE_Python=1" + "-DLegion_BUILD_BINDINGS=1" + "-DLegion_USE_CUDA=1" + "-DLegion_CUDA_ARCH=${lib.concatStringsSep "," cudaCapabilities}" + "-DLegion_MAX_DIM=${toString maxDim}" + ]; + + buildInputs = [ + python3 + cudatoolkit + ]; + + meta = with lib; { + description = "Legion is a parallel programming model for distributed, heterogeneous machines"; + homepage = "https://github.com/StanfordLegion/legion"; + license = licenses.asl20; + }; +} diff --git a/.flake/pkgs/tokenizers-cpp.nix b/.flake/pkgs/tokenizers-cpp.nix new file mode 100644 index 0000000000..a705667ae6 --- /dev/null +++ b/.flake/pkgs/tokenizers-cpp.nix @@ -0,0 +1,43 @@ +{ lib +, stdenv +, fetchFromGitHub +, cmake +, rustc +, cargo +}: + +stdenv.mkDerivation rec { + pname = "tokenizers-cpp"; + version = "2024-03-13"; + + src = fetchFromGitHub { + owner = "mlc-ai"; + repo = "tokenizers-cpp"; + rev = "4f42c9fa74946d70af86671a3804b6f2433e5dac"; + sha256 = "sha256-p7OYx9RVnKUAuMexy3WjW2zyfMJ/Q9ss4xFLsbQK7wA="; + fetchSubmodules = true; + }; + + nativeBuildInputs = [ + cmake + rustc + ]; + + # cmakeFlags = [ + # "-DLegion_USE_Python=1" + # "-DLegion_BUILD_BINDINGS=1" + # "-DLegion_USE_CUDA=1" + # "-DLegion_CUDA_ARCH=${lib.concatStringsSep "," cudaCapabilities}" + # ]; + + buildInputs = [ ]; + # python3 + # cudatoolkit + # ]; + + meta = with lib; { + description = "Universal cross-platform tokenizers binding to HF and sentencepiece"; + homepage = "https://github.com/mlc-ai/tokenizers-cpp"; + license = licenses.asl20; + }; +} diff --git a/.github/workflows/helpers/build_cuda.sh b/.github/workflows/helpers/build_cuda.sh new file mode 100755 index 0000000000..3524f885a7 --- /dev/null +++ b/.github/workflows/helpers/build_cuda.sh @@ -0,0 +1,29 @@ +#! 
/usr/bin/env bash + +set -euo pipefail +set -x + +DIR="$(realpath -- "$(dirname "${BASH_SOURCE[0]}")")" +REPO="$(realpath -- "$DIR/../../../")" + +export FF_GPU_BACKEND="cuda" +export FF_CUDA_ARCH=70 +cd "$REPO" +mkdir build +cd build +#if [[ "${FF_GPU_BACKEND}" == "cuda" ]]; then +# export FF_BUILD_ALL_EXAMPLES=ON +# export FF_BUILD_UNIT_TESTS=ON +#fi +../config/config.linux \ + -DCMAKE_CXX_COMPILER="clang++" \ + -DCMAKE_C_COMPILER="clang" \ + -DCMAKE_C_COMPILER_LAUNCHER=ccache \ + -DCMAKE_CXX_COMPILER_LAUNCHER=ccache \ + -DCMAKE_CUDA_COMPILER_LAUNCHER=ccache \ + -DFF_USE_EXTERNAL_LEGION=ON \ + -DFF_USE_EXTERNAL_JSON=ON \ + -DFF_USE_EXTERNAL_FMT=ON \ + -DFF_USE_EXTERNAL_SPDLOG=ON + +# vim: set tabstop=2 shiftwidth=2 expandtab: diff --git a/.github/workflows/per-lib-check.yml b/.github/workflows/per-lib-check.yml index f21621b265..fa8252bc20 100644 --- a/.github/workflows/per-lib-check.yml +++ b/.github/workflows/per-lib-check.yml @@ -8,9 +8,7 @@ jobs: cmake-build: name: Library CMake Build runs-on: ubuntu-20.04 - defaults: - run: - shell: bash -l {0} # required to use an activated conda environment + strategy: max-parallel: 1 matrix: @@ -22,23 +20,27 @@ jobs: with: submodules: recursive - - name: Free additional space on runner - run: .github/workflows/helpers/free_space_on_runner.sh + - name: Install nix + uses: cachix/install-nix-action@v25 + with: + github_access_token: '${{ secrets.GITHUB_TOKEN }}' - - name: Install CUDA - uses: Jimver/cuda-toolkit@v0.2.11 - id: cuda-toolkit + - uses: cachix/cachix-action@v14 with: - cuda: "12.1.0" - # Disable caching of the CUDA binaries, since it does not give us any significant performance improvement - use-github-cache: "false" - linux-local-args: '["--toolkit"]' + name: ff + skipPush: true + # authToken: '${{ secrets.CACHIX_AUTH_TOKEN }}' + + - name: setup nix develop shell + uses: nicknovitski/nix-develop@v1.1.0 + with: + arguments: "--accept-flake-config .#ci" - name: ccache uses: hendrikmuhs/ccache-action@v1.2 - - name: Install system dependencies - run: FF_GPU_BACKEND=${{ matrix.gpu_backend }} .github/workflows/helpers/install_dependencies.sh + # - name: Install system dependencies + # run: FF_GPU_BACKEND=${{ matrix.gpu_backend }} .github/workflows/helpers/install_dependencies.sh # - name: Install conda and FlexFlow dependencies # uses: conda-incubator/setup-miniconda@v2 @@ -49,20 +51,7 @@ jobs: - name: Run cmake run: | - export CUDNN_DIR=/usr/local/cuda - export CUDA_DIR=/usr/local/cuda - export FF_HOME=$(pwd) - export FF_GPU_BACKEND=${{ matrix.gpu_backend }} - export FF_CUDA_ARCH=70 - n_build_cores=$(( $(nproc) cores_available -1 )) - if (( $n_build_cores < 1 )) ; then n_build_cores=1 ; fi - mkdir build - cd build - #if [[ "${FF_GPU_BACKEND}" == "cuda" ]]; then - # export FF_BUILD_ALL_EXAMPLES=ON - # export FF_BUILD_UNIT_TESTS=ON - #fi - ../config/config.linux -DCMAKE_C_COMPILER_LAUNCHER=ccache -DCMAKE_CXX_COMPILER_LAUNCHER=ccache -DCMAKE_CUDA_COMPILER_LAUNCHER=ccache + .github/workflows/helpers/build_${{ matrix.gpu_backend }}.sh - name: Build utils run: | diff --git a/.gitmodules b/.gitmodules index e6068aa368..7ee487bb6a 100644 --- a/.gitmodules +++ b/.gitmodules @@ -4,18 +4,6 @@ [submodule "deps/nccl"] path = deps/nccl url = https://github.com/NVIDIA/nccl.git -[submodule "deps/pybind11"] - path = deps/pybind11 - url = https://github.com/pybind/pybind11.git -[submodule "deps/googletest"] - path = deps/googletest - url = https://github.com/google/googletest.git -[submodule "deps/variant"] - path = deps/variant - url = 
https://github.com/mpark/variant -[submodule "deps/optional"] - path = deps/optional - url = https://github.com/TartanLlama/optional.git [submodule "deps/json"] path = deps/json url = https://github.com/nlohmann/json.git @@ -37,9 +25,3 @@ [submodule "deps/fmt"] path = deps/fmt url = https://github.com/fmtlib/fmt.git -[submodule "deps/invoke"] - path = deps/invoke - url = https://github.com/BlackMATov/invoke.hpp.git -[submodule "deps/any"] - path = deps/any - url = https://github.com/thelink2012/any.git diff --git a/CMakeLists.txt b/CMakeLists.txt index 418a2a7538..e04aa622c2 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -1,7 +1,12 @@ cmake_minimum_required(VERSION 3.10) project(FlexFlow) -set(CMAKE_MODULE_PATH ${CMAKE_MODULE_PATH} ${CMAKE_CURRENT_LIST_DIR}/cmake) +set( + CMAKE_MODULE_PATH + ${CMAKE_MODULE_PATH} + ${CMAKE_CURRENT_LIST_DIR}/cmake + ${CMAKE_CURRENT_LIST_DIR}/cmake/Modules +) # Detect OS type and Linux version (if it applies) set(LINUX_VERSION "") @@ -28,7 +33,7 @@ set(FF_MAX_NUM_TASK_REGIONS "20" CACHE STRING set(FF_MAX_NUM_TASK_ARGUMENTS "5" CACHE STRING "Maximum number of arguments that can be declared in a TaskSignature") option(FF_USE_NCCL "Run FlexFlow with NCCL" OFF) -option(FF_USE_PREBUILT_NCCL "Enable use of NCCL pre-compiled library, if available" ON) +option(FF_USE_EXTERNAL_NCCL "Enable use of NCCL pre-compiled library, if available" ON) option(FF_USE_PREBUILT_LEGION "Enable use of Legion pre-compiled library, if available" ON) option(FF_USE_ALL_PREBUILT_LIBRARIES "Enable use of all pre-compiled libraries, if available" OFF) option(FF_USE_PYTHON "Enable Python" ON) @@ -77,20 +82,15 @@ include(nccl) # set_property(CACHE FF_GPU_BACKEND PROPERTY STRINGS ${FF_GPU_BACKENDS}) include(json) -include(optional) include(expected) include(spdlog) -include(variant) include(doctest) include(visit_struct) include(CTest) include(fmt) include(legion) include(rapidcheck) -include(invoke) -include(any) #include(gtest) -#include(fmt) include(flexflow-utils) diff --git a/cmake/Modules/FindNCCL.cmake b/cmake/Modules/FindNCCL.cmake new file mode 100644 index 0000000000..796818c0cf --- /dev/null +++ b/cmake/Modules/FindNCCL.cmake @@ -0,0 +1,175 @@ +# from https://github.com/pytorch/pytorch/blob/818b14025a1d70872b52d28a1e83e7797f6e271a/cmake/Modules/FindNCCL.cmake + +################################################################################ +# +# From PyTorch: +# +# Copyright (c) 2016- Facebook, Inc (Adam Paszke) +# Copyright (c) 2014- Facebook, Inc (Soumith Chintala) +# Copyright (c) 2011-2014 Idiap Research Institute (Ronan Collobert) +# Copyright (c) 2012-2014 Deepmind Technologies (Koray Kavukcuoglu) +# Copyright (c) 2011-2012 NEC Laboratories America (Koray Kavukcuoglu) +# Copyright (c) 2011-2013 NYU (Clement Farabet) +# Copyright (c) 2006-2010 NEC Laboratories America (Ronan Collobert, Leon Bottou, Iain Melvin, Jason Weston) +# Copyright (c) 2006 Idiap Research Institute (Samy Bengio) +# Copyright (c) 2001-2004 Idiap Research Institute (Ronan Collobert, Samy Bengio, Johnny Mariethoz) +# +# From Caffe2: +# +# Copyright (c) 2016-present, Facebook Inc. All rights reserved. +# +# All contributions by Facebook: +# Copyright (c) 2016 Facebook Inc. +# +# All contributions by Google: +# Copyright (c) 2015 Google Inc. +# All rights reserved. +# +# All contributions by Yangqing Jia: +# Copyright (c) 2015 Yangqing Jia +# All rights reserved. 
+# +# All contributions by Kakao Brain: +# Copyright 2019-2020 Kakao Brain +# +# All contributions by Cruise LLC: +# Copyright (c) 2022 Cruise LLC. +# All rights reserved. +# +# All contributions from Caffe: +# Copyright(c) 2013, 2014, 2015, the respective contributors +# All rights reserved. +# +# All other contributions: +# Copyright(c) 2015, 2016 the respective contributors +# All rights reserved. +# +# Caffe2 uses a copyright model similar to Caffe: each contributor holds +# copyright over their contributions to Caffe2. The project versioning records +# all such contribution and copyright details. If a contributor wants to further +# mark their specific copyright on a particular contribution, they should +# indicate their copyright solely in the commit message of the change when it is +# committed. +# +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# 1. Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# +# 2. Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# +# 3. Neither the names of Facebook, Deepmind Technologies, NYU, NEC Laboratories America +# and IDIAP Research Institute nor the names of its contributors may be +# used to endorse or promote products derived from this software without +# specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +# POSSIBILITY OF SUCH DAMAGE. +# +################################################################################ + + +# Find the nccl libraries +# +# The following variables are optionally searched for defaults +# NCCL_ROOT: Base directory where all NCCL components are found +# NCCL_INCLUDE_DIR: Directory where NCCL header is found +# NCCL_LIB_DIR: Directory where NCCL library is found +# +# The following are set after configuration is done: +# NCCL_FOUND +# NCCL_INCLUDE_DIRS +# NCCL_LIBRARIES +# +# The path hints include CUDA_TOOLKIT_ROOT_DIR seeing as some folks +# install NCCL in the same location as the CUDA toolkit. +# See https://github.com/caffe2/caffe2/issues/1601 + +set(NCCL_INCLUDE_DIR $ENV{NCCL_INCLUDE_DIR} CACHE PATH "Folder contains NVIDIA NCCL headers") +set(NCCL_LIB_DIR $ENV{NCCL_LIB_DIR} CACHE PATH "Folder contains NVIDIA NCCL libraries") +set(NCCL_VERSION $ENV{NCCL_VERSION} CACHE STRING "Version of NCCL to build with") + +if ($ENV{NCCL_ROOT_DIR}) + message(WARNING "NCCL_ROOT_DIR is deprecated. 
Please set NCCL_ROOT instead.") +endif() +list(APPEND NCCL_ROOT $ENV{NCCL_ROOT_DIR} ${CUDA_TOOLKIT_ROOT_DIR}) +# Compatible layer for CMake <3.12. NCCL_ROOT will be accounted in for searching paths and libraries for CMake >=3.12. +list(APPEND CMAKE_PREFIX_PATH ${NCCL_ROOT}) + +find_path(NCCL_INCLUDE_DIRS + NAMES nccl.h + HINTS ${NCCL_INCLUDE_DIR}) + +if (USE_STATIC_NCCL) + MESSAGE(STATUS "USE_STATIC_NCCL is set. Linking with static NCCL library.") + SET(NCCL_LIBNAME "nccl_static") + if (NCCL_VERSION) # Prefer the versioned library if a specific NCCL version is specified + set(CMAKE_FIND_LIBRARY_SUFFIXES ".a.${NCCL_VERSION}" ${CMAKE_FIND_LIBRARY_SUFFIXES}) + endif() +else() + SET(NCCL_LIBNAME "nccl") + if (NCCL_VERSION) # Prefer the versioned library if a specific NCCL version is specified + set(CMAKE_FIND_LIBRARY_SUFFIXES ".so.${NCCL_VERSION}" ${CMAKE_FIND_LIBRARY_SUFFIXES}) + endif() +endif() + +find_library(NCCL_LIBRARIES + NAMES ${NCCL_LIBNAME} + HINTS ${NCCL_LIB_DIR}) + +include(FindPackageHandleStandardArgs) +find_package_handle_standard_args(NCCL DEFAULT_MSG NCCL_INCLUDE_DIRS NCCL_LIBRARIES) + +if(NCCL_FOUND) # obtaining NCCL version and some sanity checks + set (NCCL_HEADER_FILE "${NCCL_INCLUDE_DIRS}/nccl.h") + message (STATUS "Determining NCCL version from ${NCCL_HEADER_FILE}...") + set (OLD_CMAKE_REQUIRED_INCLUDES ${CMAKE_REQUIRED_INCLUDES}) + list (APPEND CMAKE_REQUIRED_INCLUDES ${NCCL_INCLUDE_DIRS}) + include(CheckCXXSymbolExists) + check_cxx_symbol_exists(NCCL_VERSION_CODE nccl.h NCCL_VERSION_DEFINED) + + if (NCCL_VERSION_DEFINED) + set(file "${PROJECT_BINARY_DIR}/detect_nccl_version.cc") + file(WRITE ${file} " + #include + #include + int main() + { + std::cout << NCCL_MAJOR << '.' << NCCL_MINOR << '.' << NCCL_PATCH << std::endl; + + int x; + ncclGetVersion(&x); + return x == NCCL_VERSION_CODE; + } +") + try_run(NCCL_VERSION_MATCHED compile_result ${PROJECT_BINARY_DIR} ${file} + RUN_OUTPUT_VARIABLE NCCL_VERSION_FROM_HEADER + CMAKE_FLAGS "-DINCLUDE_DIRECTORIES=${NCCL_INCLUDE_DIRS}" + LINK_LIBRARIES ${NCCL_LIBRARIES}) + if (NOT NCCL_VERSION_MATCHED) + message(FATAL_ERROR "Found NCCL header version and library version do not match! 
\ +(include: ${NCCL_INCLUDE_DIRS}, library: ${NCCL_LIBRARIES}) Please set NCCL_INCLUDE_DIR and NCCL_LIB_DIR manually.") + endif() + message(STATUS "NCCL version: ${NCCL_VERSION_FROM_HEADER}") + else() + message(STATUS "NCCL version < 2.3.5-5") + endif () + set (CMAKE_REQUIRED_INCLUDES ${OLD_CMAKE_REQUIRED_INCLUDES}) + + message(STATUS "Found NCCL (include: ${NCCL_INCLUDE_DIRS}, library: ${NCCL_LIBRARIES})") + mark_as_advanced(NCCL_ROOT_DIR NCCL_INCLUDE_DIRS NCCL_LIBRARIES) +endif() diff --git a/cmake/any.cmake b/cmake/any.cmake deleted file mode 100644 index 9a6164da4f..0000000000 --- a/cmake/any.cmake +++ /dev/null @@ -1,16 +0,0 @@ -add_library( - any - INTERFACE -) -target_include_directories( - any - INTERFACE - ${CMAKE_CURRENT_SOURCE_DIR}/deps/any/ -) -set_target_properties( - any - PROPERTIES - CXX_STANDARD 11 - CXX_STANDARD_REQUIRED YES - CXX_EXTENSIONS NO -) diff --git a/cmake/fmt.cmake b/cmake/fmt.cmake index 9eeb85611c..283caad69d 100644 --- a/cmake/fmt.cmake +++ b/cmake/fmt.cmake @@ -1,5 +1,9 @@ include(aliasing) -add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/deps/fmt) +if (FF_USE_EXTERNAL_FMT) + find_package(fmt REQUIRED) +else() + add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/deps/fmt) -alias_library(fmt fmt::fmt) + alias_library(fmt fmt::fmt) +endif() diff --git a/cmake/invoke.cmake b/cmake/invoke.cmake deleted file mode 100644 index 3ec406ed05..0000000000 --- a/cmake/invoke.cmake +++ /dev/null @@ -1,5 +0,0 @@ -include(aliasing) - -add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/deps/invoke) - -alias_library(invoke invoke.hpp::invoke.hpp) diff --git a/cmake/json.cmake b/cmake/json.cmake index 97d4e5f9f7..093ec51cdc 100644 --- a/cmake/json.cmake +++ b/cmake/json.cmake @@ -1,6 +1,12 @@ include(aliasing) -set(JSON_BuildTests OFF CACHE INTERNAL "") -add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/deps/json) +if (FF_USE_EXTERNAL_JSON) + find_package(nlohmann_json REQUIRED) -alias_library(json nlohmann_json::nlohmann_json) + alias_library(json nlohmann_json) +else() + set(JSON_BuildTests OFF CACHE INTERNAL "") + add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/deps/json) + + alias_library(json nlohmann_json::nlohmann_json) +endif() diff --git a/cmake/nccl.cmake b/cmake/nccl.cmake index 12062958cd..e89bee04c6 100644 --- a/cmake/nccl.cmake +++ b/cmake/nccl.cmake @@ -1,109 +1,36 @@ -set(NCCL_NAME nccl_internal) -# set(NCCL_CUDA_ARCH "-gencode=arch=compute_${CUDA_ARCH},code=sm_${CUDA_ARCH}") -# message("NCCL_CUDA_ARCH: ${NCCL_CUDA_ARCH}") +include(aliasing) -set(NCCL_URL "") -if((FF_USE_PREBUILT_NCCL OR FF_USE_ALL_PREBUILT_LIBRARIES) AND CMAKE_HOST_SYSTEM_PROCESSOR MATCHES "x86_64") - if(LINUX_VERSION MATCHES "20.04") - if (CUDA_VERSION VERSION_EQUAL "11.0") - set(NCCL_URL "https://github.com/flexflow/flexflow-third-party/releases/latest/download/nccl_ubuntu-20.04_11.0.3.tar.gz") - elseif(CUDA_VERSION VERSION_EQUAL "11.1") - set(NCCL_URL "https://github.com/flexflow/flexflow-third-party/releases/latest/download/nccl_ubuntu-20.04_11.1.1.tar.gz") - elseif(CUDA_VERSION VERSION_EQUAL "11.2") - set(NCCL_URL "https://github.com/flexflow/flexflow-third-party/releases/latest/download/nccl_ubuntu-20.04_11.2.2.tar.gz") - elseif(CUDA_VERSION VERSION_EQUAL "11.3") - set(NCCL_URL "https://github.com/flexflow/flexflow-third-party/releases/latest/download/nccl_ubuntu-20.04_11.3.1.tar.gz") - elseif(CUDA_VERSION VERSION_EQUAL "11.4") - set(NCCL_URL "https://github.com/flexflow/flexflow-third-party/releases/latest/download/nccl_ubuntu-20.04_11.4.3.tar.gz") - elseif(CUDA_VERSION VERSION_EQUAL "11.5") - set(NCCL_URL 
"https://github.com/flexflow/flexflow-third-party/releases/latest/download/nccl_ubuntu-20.04_11.5.2.tar.gz") - elseif(CUDA_VERSION VERSION_EQUAL "11.6") - set(NCCL_URL "https://github.com/flexflow/flexflow-third-party/releases/latest/download/nccl_ubuntu-20.04_11.6.2.tar.gz") - elseif(CUDA_VERSION VERSION_EQUAL "11.7") - set(NCCL_URL "https://github.com/flexflow/flexflow-third-party/releases/latest/download/nccl_ubuntu-20.04_11.7.0.tar.gz") - endif() - elseif(LINUX_VERSION MATCHES "18.04") - if (CUDA_VERSION VERSION_EQUAL "10.1") - set(NCCL_URL "https://github.com/flexflow/flexflow-third-party/releases/latest/download/nccl_ubuntu-18.04_10.1.243.tar.gz") - elseif (CUDA_VERSION VERSION_EQUAL "10.2") - set(NCCL_URL "https://github.com/flexflow/flexflow-third-party/releases/latest/download/nccl_ubuntu-18.04_10.2.89.tar.gz") - elseif (CUDA_VERSION VERSION_EQUAL "11.0") - set(NCCL_URL "https://github.com/flexflow/flexflow-third-party/releases/latest/download/nccl_ubuntu-18.04_11.0.3.tar.gz") - elseif(CUDA_VERSION VERSION_EQUAL "11.1") - set(NCCL_URL "https://github.com/flexflow/flexflow-third-party/releases/latest/download/nccl_ubuntu-18.04_11.1.1.tar.gz") - elseif(CUDA_VERSION VERSION_EQUAL "11.2") - set(NCCL_URL "https://github.com/flexflow/flexflow-third-party/releases/latest/download/nccl_ubuntu-18.04_11.2.2.tar.gz") - elseif(CUDA_VERSION VERSION_EQUAL "11.3") - set(NCCL_URL "https://github.com/flexflow/flexflow-third-party/releases/latest/download/nccl_ubuntu-18.04_11.3.1.tar.gz") - elseif(CUDA_VERSION VERSION_EQUAL "11.4") - set(NCCL_URL "https://github.com/flexflow/flexflow-third-party/releases/latest/download/nccl_ubuntu-18.04_11.4.3.tar.gz") - elseif(CUDA_VERSION VERSION_EQUAL "11.5") - set(NCCL_URL "https://github.com/flexflow/flexflow-third-party/releases/latest/download/nccl_ubuntu-18.04_11.5.2.tar.gz") - elseif(CUDA_VERSION VERSION_EQUAL "11.6") - set(NCCL_URL "https://github.com/flexflow/flexflow-third-party/releases/latest/download/nccl_ubuntu-18.04_11.6.2.tar.gz") - elseif(CUDA_VERSION VERSION_EQUAL "11.7") - set(NCCL_URL "https://github.com/flexflow/flexflow-third-party/releases/latest/download/nccl_ubuntu-18.04_11.7.0.tar.gz") - endif() - endif() -endif() - -if(NCCL_URL) - # Download and import pre-compiled NCCL library - message(STATUS "Using pre-compiled NCCL library") - message(STATUS "NCCL_URL: ${NCCL_URL}") - - include(FetchContent) - FetchContent_Declare(${NCCL_NAME} - URL ${NCCL_URL} - CONFIGURE_COMMAND "" - BUILD_COMMAND "" - ) - FetchContent_GetProperties(${NCCL_NAME}) - if(NOT ${NCCL_NAME}_POPULATED) - FetchContent_Populate(${NCCL_NAME}) - endif() - - set(NCCL_FOLDER_PATH ${${NCCL_NAME}_SOURCE_DIR}/deps/nccl) - set(NCCL_INCLUDE_DIR ${NCCL_FOLDER_PATH}/include) - set(NCCL_LIB_DIR ${NCCL_FOLDER_PATH}/lib) - message(STATUS "NCCL library path: ${NCCL_FOLDER_PATH}") - add_library(nccl SHARED IMPORTED) - set_target_properties(nccl PROPERTIES IMPORTED_LOCATION ${NCCL_FOLDER_PATH}) +add_library(nccl INTERFACE) - list(APPEND FLEXFLOW_INCLUDE_DIRS ${NCCL_INCLUDE_DIR}) - list(APPEND FLEXFLOW_EXT_LIBRARIES ${NCCL_LIB_DIR}/libnccl${LIBEXT}) - install(DIRECTORY ${NCCL_INCLUDE_DIR}/ DESTINATION include) - install(DIRECTORY ${NCCL_LIB_DIR}/ DESTINATION lib PATTERN "pkgconfig" EXCLUDE) - - set(NCCL_LIB "${INSTALL_DIR}/lib/libnccl${LIBEXT}") +if (FF_USE_EXTERNAL_NCCL) + find_package(NCCL REQUIRED) else() - # Build NCCL from source message(STATUS "Building NCCL from source") list(TRANSFORM CUDA_GENCODE PREPEND "NVCC_GENCODE=" OUTPUT_VARIABLE NCCL_BUILD_NVCC_GENCODE) - 
include(ExternalProject) - ExternalProject_Add(${NCCL_NAME} - SOURCE_DIR ${PROJECT_SOURCE_DIR}/deps/nccl - PREFIX ${CMAKE_BINARY_DIR}/deps/nccl - INSTALL_DIR ${CMAKE_BINARY_DIR}/deps/nccl - BUILD_BYPRODUCTS ${CMAKE_BINARY_DIR}/deps/nccl/lib/libnccl${LIBEXT} - INSTALL_COMMAND "" - CONFIGURE_COMMAND "" - BUILD_COMMAND make src.build "${NCCL_BUILD_NVCC_GENCODE}" "CUDA_HOME=${CUDA_TOOLKIT_ROOT_DIR}" "BUILDDIR=${CMAKE_BINARY_DIR}/deps/nccl" "CXX=${CMAKE_CXX_COMPILER} -w" CC="${CMAKE_CC_COMPILER}" - BUILD_IN_SOURCE 1 + ExternalProject_Add(nccl_source_build + SOURCE_DIR ${PROJECT_SOURCE_DIR}/deps/${NCCL_NAME} + PREFIX ${CMAKE_BINARY_DIR}/deps/${NCCL_NAME} + INSTALL_DIR ${CMAKE_BINARY_DIR}/deps/${NCCL_NAME} + BUILD_BYPRODUCTS ${CMAKE_BINARY_DIR}/deps/${NCCL_NAME}/lib/libnccl${LIBEXT} + INSTALL_COMMAND "" + CONFIGURE_COMMAND "" + BUILD_COMMAND make src.build "${NCCL_BUILD_NVCC_GENCODE}" "CUDA_HOME=${CUDA_TOOLKIT_ROOT_DIR}" "BUILDDIR=${CMAKE_BINARY_DIR}/deps/${NCCL_NAME}" + BUILD_IN_SOURCE 1 ) - ExternalProject_Get_Property(${NCCL_NAME} INSTALL_DIR) - message(STATUS "NCCL install dir: ${INSTALL_DIR}") - set_directory_properties(PROPERTIES ADDITIONAL_CLEAN_FILES "${CMAKE_BINARY_DIR}/deps/nccl/lib/") + ExternalProject_Get_Property(nccl_source_build INSTALL_DIR) + set_directory_properties(PROPERTIES ADDITIONAL_CLEAN_FILES "${CMAKE_BINARY_DIR}/deps/nccl_source_build/lib/") + + install(DIRECTORY ${CMAKE_BINARY_DIR}/deps/nccl_source_build/include/ DESTINATION include) + install(DIRECTORY ${CMAKE_BINARY_DIR}/deps/nccl_source_build/lib/ DESTINATION lib PATTERN "pkgconfig" EXCLUDE) set(NCCL_INCLUDE_DIR "${INSTALL_DIR}/include") - set(NCCL_LIB "${INSTALL_DIR}/lib/libnccl${LIBEXT}") + set(NCCL_LIBRARIES "${INSTALL_DIR}/lib/libnccl${LIBEXT}") + + add_dependencies(nccl nccl_source_build) endif() -message("NCCL_LIB = ${NCCL_LIB}") -message("INSTALL_DIR = ${INSTALL_DIR}") -add_library(nccl INTERFACE) -target_include_directories(nccl SYSTEM INTERFACE ${NCCL_INCLUDE_DIR}) -add_dependencies(nccl ${NCCL_NAME}) -target_link_libraries(nccl INTERFACE ${NCCL_LIB}) +message(STATUS "NCCL_LIBRARIES = ${NCCL_LIBRARIES}") +target_include_directories(nccl SYSTEM INTERFACE ${NCCL_INCLUDE_DIRS}) +target_link_libraries(nccl INTERFACE ${NCCL_LIBRARIES}) diff --git a/cmake/optional.cmake b/cmake/optional.cmake deleted file mode 100644 index afaa6330c0..0000000000 --- a/cmake/optional.cmake +++ /dev/null @@ -1,4 +0,0 @@ -set(OPTIONAL_BUILD_TESTS OFF) -set(OPTIONAL_BUILD_PACKAGE OFF) - -add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/deps/optional) diff --git a/cmake/spdlog.cmake b/cmake/spdlog.cmake index a0d36fc3b2..cd18944460 100644 --- a/cmake/spdlog.cmake +++ b/cmake/spdlog.cmake @@ -1,5 +1,9 @@ include(aliasing) -add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/deps/spdlog) +if (FF_USE_EXTERNAL_SPDLOG) + find_package(spdlog REQUIRED) +else() + add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/deps/spdlog) -alias_library(spdlog spdlog::spdlog) + alias_library(spdlog spdlog::spdlog) +endif() diff --git a/cmake/variant.cmake b/cmake/variant.cmake deleted file mode 100644 index ddf5781281..0000000000 --- a/cmake/variant.cmake +++ /dev/null @@ -1,5 +0,0 @@ -include(aliasing) - -add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/deps/variant) - -alias_library(variant mpark_variant) diff --git a/config/config.linux b/config/config.linux index 2b87ec0eb5..94cb348a5a 100755 --- a/config/config.linux +++ b/config/config.linux @@ -1,4 +1,4 @@ -#!/bin/bash +#! 
/usr/bin/env bash # set the CC and CXX, usually it is not needed as cmake can detect it # set CC and CXX to mpicc and mpic++ when enable gasnet diff --git a/deps/any b/deps/any deleted file mode 160000 index e88b1bfc16..0000000000 --- a/deps/any +++ /dev/null @@ -1 +0,0 @@ -Subproject commit e88b1bfc160fa9b01e6174dd29c812eeeece3be9 diff --git a/deps/googletest b/deps/googletest deleted file mode 160000 index 2fe3bd994b..0000000000 --- a/deps/googletest +++ /dev/null @@ -1 +0,0 @@ -Subproject commit 2fe3bd994b3189899d93f1d5a881e725e046fdc2 diff --git a/deps/invoke b/deps/invoke deleted file mode 160000 index 2c1eabc2e2..0000000000 --- a/deps/invoke +++ /dev/null @@ -1 +0,0 @@ -Subproject commit 2c1eabc2e20ab02961f95c704ff0c0818671ddd1 diff --git a/deps/optional b/deps/optional deleted file mode 160000 index c28fcf74d2..0000000000 --- a/deps/optional +++ /dev/null @@ -1 +0,0 @@ -Subproject commit c28fcf74d207fc667c4ed3dbae4c251ea551c8c1 diff --git a/deps/pybind11 b/deps/pybind11 deleted file mode 160000 index 8de7772cc7..0000000000 --- a/deps/pybind11 +++ /dev/null @@ -1 +0,0 @@ -Subproject commit 8de7772cc72daca8e947b79b83fea46214931604 diff --git a/deps/variant b/deps/variant deleted file mode 160000 index 23cb94f027..0000000000 --- a/deps/variant +++ /dev/null @@ -1 +0,0 @@ -Subproject commit 23cb94f027d4ef33bf48133acc2695c7e5c6f1e7 diff --git a/flake.lock b/flake.lock new file mode 100644 index 0000000000..205d2b2290 --- /dev/null +++ b/flake.lock @@ -0,0 +1,60 @@ +{ + "nodes": { + "flake-utils": { + "inputs": { + "systems": "systems" + }, + "locked": { + "lastModified": 1689068808, + "narHash": "sha256-6ixXo3wt24N/melDWjq70UuHQLxGV8jZvooRanIHXw0=", + "owner": "numtide", + "repo": "flake-utils", + "rev": "919d646de7be200f3bf08cb76ae1f09402b6f9b4", + "type": "github" + }, + "original": { + "owner": "numtide", + "repo": "flake-utils", + "type": "github" + } + }, + "nixpkgs": { + "locked": { + "lastModified": 1710162809, + "narHash": "sha256-i2R2bcnQp+85de67yjgZVvJhd6rRnJbSYNpGmB6Leb8=", + "owner": "NixOS", + "repo": "nixpkgs", + "rev": "ddcd7598b2184008c97e6c9c6a21c5f37590b8d2", + "type": "github" + }, + "original": { + "id": "nixpkgs", + "ref": "nixos-23.11", + "type": "indirect" + } + }, + "root": { + "inputs": { + "flake-utils": "flake-utils", + "nixpkgs": "nixpkgs" + } + }, + "systems": { + "locked": { + "lastModified": 1681028828, + "narHash": "sha256-Vy1rq5AaRuLzOxct8nz4T6wlgyUR7zLU309k9mBC768=", + "owner": "nix-systems", + "repo": "default", + "rev": "da67096a3b9bf56a91d16901293e51ba5b49a27e", + "type": "github" + }, + "original": { + "owner": "nix-systems", + "repo": "default", + "type": "github" + } + } + }, + "root": "root", + "version": 7 +} diff --git a/flake.nix b/flake.nix new file mode 100644 index 0000000000..3d357ca86c --- /dev/null +++ b/flake.nix @@ -0,0 +1,99 @@ +{ + description = "A framework for automatic performance optimization of DNN training and inference"; + + nixConfig = { + bash-prompt-prefix = "(ff) "; + extra-substituters = [ + "https://ff.cachix.org" + "https://cuda-maintainers.cachix.org/" + ]; + extra-trusted-public-keys = [ + "cuda-maintainers.cachix.org-1:0dq3bujKpuEPMCX6U4WylrUDZ9JyUG0VpVZa7CNfq5E=" + "ff.cachix.org-1:/kyZ0w35ToSJBjpiNfPLrL3zTjuPkUiqf2WH0GIShXM=" + ]; + }; + + # Nixpkgs / NixOS version to use. + inputs = { + nixpkgs.url = "nixpkgs/nixos-23.11"; + flake-utils.url = "github:numtide/flake-utils"; + }; + + outputs = { self, nixpkgs, flake-utils, ... 
}: flake-utils.lib.eachSystem [ "x86_64-linux" ] (system: + let + pkgs = import nixpkgs { + inherit system; + config.allowUnfree = true; + }; + + mkShell = pkgs.mkShell.override { + stdenv = pkgs.llvmPackages.libcxxStdenv; + }; + in + { + packages = { + legion = pkgs.callPackage ./.flake/pkgs/legion.nix { }; + }; + + devShells = rec { + ci = mkShell { + buildInputs = (with pkgs; [ + llvmPackages_17.clang + cmakeCurses + gcc10Stdenv + gcc10 + ccache + cudatoolkit + zlib + pkg-config + python3 + self.packages.${system}.legion + nlohmann_json + spdlog + range-v3 + rapidcheck + doctest + fmt + cudaPackages.cuda_nvcc + cudaPackages.cudnn + cudaPackages.nccl + cudaPackages.libcublas + cudaPackages.cuda_cudart + ]) ++ (with pkgs.python3Packages; [ + ]); + }; + + default = mkShell { + inputsFrom = [ ci ]; + + buildInputs = builtins.concatLists [ + (with pkgs; [ + clang-tools_17 + gh-markdown-preview + plantuml + gdb + ruff + compdb + jq + gh + ]) + (with pkgs.python3Packages; [ + gitpython + ipython + mypy + python-lsp-server + pylsp-mypy + python-lsp-ruff + pygithub + sqlitedict + frozendict + black + toml + ]) + ]; + }; + }; + } + ); +} +# vim: set tabstop=2 shiftwidth=2 expandtab: diff --git a/lib/kernels/CMakeLists.txt b/lib/kernels/CMakeLists.txt index 59c7d44b60..a963c7b49b 100644 --- a/lib/kernels/CMakeLists.txt +++ b/lib/kernels/CMakeLists.txt @@ -1,12 +1,14 @@ set(project_target kernels) project(${project_target} - LANGUAGES CUDA) + LANGUAGES CXX CUDA) file(GLOB_RECURSE SRC CONFIGURE_DEPENDS LIST_DIRECTORIES False - src/*.cc) + src/*.cc + # src/*.cu + ) add_library( ${project_target} @@ -16,7 +18,7 @@ add_library( target_include_directories( ${project_target} PRIVATE - src/cuda/ + src/ PUBLIC include/ ) @@ -28,10 +30,10 @@ target_link_libraries( nccl ) -define_ff_vars(kernels) +define_ff_vars(${project_target}) set_target_properties( ${project_target} PROPERTIES - CUDA_STANDARD 11 + CUDA_STANDARD 17 ) diff --git a/lib/kernels/include/kernels/array_shape.h b/lib/kernels/include/kernels/array_shape.h index 41c8275b1c..0a0124e13c 100644 --- a/lib/kernels/include/kernels/array_shape.h +++ b/lib/kernels/include/kernels/array_shape.h @@ -2,7 +2,6 @@ #define _FLEXFLOW_KERNELS_ARRAY_SHAPE_H #include "legion_dim.h" -#include "utils/optional.h" #include "utils/stack_vector.h" #include "utils/visitable.h" #include @@ -36,11 +35,11 @@ struct ArrayShape { legion_dim_t last_idx() const; legion_dim_t neg_idx(int) const; - optional at_maybe(std::size_t) const; + std::optional at_maybe(std::size_t) const; ArrayShape reversed_dim_order() const; - ArrayShape sub_shape(optional start, - optional end); + ArrayShape sub_shape(std::optional start, + std::optional end); public: LegionTensorDims dims; diff --git a/lib/kernels/include/kernels/device.h b/lib/kernels/include/kernels/device.h index b1571da1b3..652c59e976 100644 --- a/lib/kernels/include/kernels/device.h +++ b/lib/kernels/include/kernels/device.h @@ -22,6 +22,7 @@ #endif #include +#include #include #include diff --git a/lib/kernels/include/kernels/perf_metrics.h b/lib/kernels/include/kernels/perf_metrics.h index 1a3d2509d0..c4a34e4f79 100644 --- a/lib/kernels/include/kernels/perf_metrics.h +++ b/lib/kernels/include/kernels/perf_metrics.h @@ -2,7 +2,6 @@ #define _FLEXFLOW_KERNELS_INCLUDE_KERNELS_PERF_METRICS_H #include "utils/fmt.h" -#include "utils/optional.h" #include "utils/visitable.h" namespace FlexFlow { @@ -11,23 +10,24 @@ struct PerfMetrics : public use_visitable_cmp { PerfMetrics() = delete; PerfMetrics(double start_time); PerfMetrics(int 
train_all, - optional train_correct, - optional cce_loss, - optional sparse_cce_loss, - optional mse_loss, - optional rmse_loss, - optional mae_loss, + std::optional train_correct, + std::optional cce_loss, + std::optional sparse_cce_loss, + std::optional mse_loss, + std::optional rmse_loss, + std::optional mae_loss, double start_time_micro, double current_time_micro); - int train_all = 0; // measure_accuracy_denominator - optional train_correct = 0; // measure_accuracy numerator - optional cce_loss = nullopt; // measure_categorical_crossentropy - optional sparse_cce_loss = - 0.0f; // measure_sparse_categorical_crossentropy - optional mse_loss = 0.0f; // measure_mean_squared_error - optional rmse_loss = 0.0f; // measure_root_mean_squared_error - optional mae_loss = 0.0f; // measure_mean_absolute_error + int train_all = 0; // measure_accuracy_denominator + std::optional train_correct = 0; // measure_accuracy numerator + std::optional cce_loss = + std::nullopt; // measure_categorical_crossentropy + std::optional sparse_cce_loss = + 0.0f; // measure_sparse_categorical_crossentropy + std::optional mse_loss = 0.0f; // measure_mean_squared_error + std::optional rmse_loss = 0.0f; // measure_root_mean_squared_error + std::optional mae_loss = 0.0f; // measure_mean_absolute_error double start_time; double current_time; }; diff --git a/lib/kernels/src/cuda/batch_norm_kernels.cu b/lib/kernels/src/cuda/batch_norm_kernels.cu index 8c5ea76f73..6529351a77 100644 --- a/lib/kernels/src/cuda/batch_norm_kernels.cu +++ b/lib/kernels/src/cuda/batch_norm_kernels.cu @@ -13,8 +13,11 @@ * limitations under the License. */ +#include "device.h" +#include "kernels/allocation.h" #include "kernels/batch_norm_kernels.h" -#include "kernels/cuda_helper.h" +#include "kernels/device.h" +#include "kernels/ff_handle.h" namespace FlexFlow { namespace Kernels { @@ -86,7 +89,7 @@ void backward_kernel(cudaStream_t stream, m->saveVar)); } -BatchNormPerDeviceState init_kernel(PerDeviceFFHandle handler, +BatchNormPerDeviceState init_kernel(PerDeviceFFHandle handle, Allocator allocator, float *runningMean, int output_n, @@ -125,19 +128,18 @@ BatchNormPerDeviceState init_kernel(PerDeviceFFHandle handler, checkCUDNN(cudnnSetTensor4dDescriptor( biasTensor, CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT, 1, output_c, 1, 1)); // allocate memory for runningMean, runningVar, saveMean, saveVar - { - size_t totalSize = sizeof(float) * output_c * 4; - runningMean = (float *)allocator.allocate(totalSize); - float *runningVar = (float *)runningMean + output_c; - float *saveMean = (float *)runningVar + output_c; - float *saveVar = (float *)saveMean + output_c; - cudaStream_t stream; + size_t totalSize = sizeof(float) * output_c * 4; + runningMean = (float *)allocator.allocate(totalSize); + float *runningVar = (float *)runningMean + output_c; + float *saveMean = (float *)runningVar + output_c; + float *saveVar = (float *)saveMean + output_c; + cudaStream_t stream; + + assign_kernel<<>>( + runningMean, output_c, 0.0f); + assign_kernel<<>>( + runningVar, output_c, 0.0f); - assign_kernel<<>>( - runningMean, output_c, 0.0f); - assign_kernel<<>>( - runningVar, output_c, 0.0f); - } if (relu) { checkCUDNN(cudnnCreateActivationDescriptor(&actiDesc)); checkCUDNN(cudnnSetActivationDescriptor( diff --git a/lib/kernels/src/cuda/cast_kernels.cu b/lib/kernels/src/cuda/cast_kernels.cu index 3d8804862d..e7716ee06b 100644 --- a/lib/kernels/src/cuda/cast_kernels.cu +++ b/lib/kernels/src/cuda/cast_kernels.cu @@ -59,7 +59,8 @@ struct BackwardKernel { } }; -void 
forward_kernel(ffStream_t stream, +void forward_kernel(PerDeviceFFHandle handle, + ffStream_t stream, GenericTensorAccessorR const &input, GenericTensorAccessorW const &output, DataType input_type, @@ -68,7 +69,8 @@ void forward_kernel(ffStream_t stream, input_type, output_type, stream, handle, input, output); } -void backward_kernel(ffStream_t stream, +void backward_kernel(PerDeviceFFHandle handle, + ffStream_t stream, GenericTensorAccessorR const &input, GenericTensorAccessorW const &output, DataType input_type, diff --git a/lib/kernels/src/device.h b/lib/kernels/src/device.h index 5c4239a5cf..00f2888f45 100644 --- a/lib/kernels/src/device.h +++ b/lib/kernels/src/device.h @@ -5,6 +5,7 @@ #include "kernels/device.h" #include "op-attrs/datatype.h" #include "op-attrs/op.h" +#include #if defined(FF_USE_CUDA) #include diff --git a/lib/kernels/src/perf_metrics.cc b/lib/kernels/src/perf_metrics.cc index 07bb8de815..2036ddd35a 100644 --- a/lib/kernels/src/perf_metrics.cc +++ b/lib/kernels/src/perf_metrics.cc @@ -6,12 +6,12 @@ PerfMetrics::PerfMetrics(double _start_time) : start_time(_start_time), current_time(_start_time) {} PerfMetrics::PerfMetrics(int _train_all, - optional _train_correct, - optional _cce_loss, - optional _sparse_cce_loss, - optional _mse_loss, - optional _rmse_loss, - optional _mae_loss, + std::optional _train_correct, + std::optional _cce_loss, + std::optional _sparse_cce_loss, + std::optional _mse_loss, + std::optional _rmse_loss, + std::optional _mae_loss, double _start_time_micro, double _current_time_micro) : train_all(_train_all), train_correct(_train_correct), cce_loss(_cce_loss), @@ -29,7 +29,7 @@ float get_accuracy(PerfMetrics const &m) { PerfMetrics update(PerfMetrics const &lhs, PerfMetrics const &rhs) { PerfMetrics out(lhs); - auto update_val = [](optional &l, optional const &r) { + auto update_val = [](std::optional &l, std::optional const &r) { if (l.has_value()) { l.value() += r.value(); } @@ -52,7 +52,7 @@ PerfMetrics update(PerfMetrics const &lhs, PerfMetrics const &rhs) { PerfMetrics apply_scale(PerfMetrics const &pm, float scale) { PerfMetrics out(pm); - auto scale_val = [&](optional &l) { + auto scale_val = [&](std::optional &l) { if (l.has_value()) { l.value() *= scale; } diff --git a/lib/op-attrs/include/op-attrs/datatype.h b/lib/op-attrs/include/op-attrs/datatype.h index 2417f37fdb..4a8de665b4 100644 --- a/lib/op-attrs/include/op-attrs/datatype.h +++ b/lib/op-attrs/include/op-attrs/datatype.h @@ -3,7 +3,7 @@ #include "utils/fmt.h" #include "utils/fp16.h" -#include "utils/variant.h" +#include namespace FlexFlow { @@ -50,12 +50,12 @@ typename data_type_enum_to_class
::type cast_to(T t) { template using real_type = typename data_type_enum_to_class
::type; -using DataTypeValue = variant, - real_type, - real_type, - real_type, - real_type, - real_type>; +using DataTypeValue = std::variant, + real_type, + real_type, + real_type, + real_type, + real_type>; size_t size_of(DataType); diff --git a/lib/op-attrs/include/op-attrs/get_op_type.h b/lib/op-attrs/include/op-attrs/get_op_type.h index 421c464843..a2db4ab5f0 100644 --- a/lib/op-attrs/include/op-attrs/get_op_type.h +++ b/lib/op-attrs/include/op-attrs/get_op_type.h @@ -45,7 +45,7 @@ struct GetOpTypeFunctor { }; template -OperatorType get_op_type(variant const &attrs) { +OperatorType get_op_type(std::variant const &attrs) { return visit(GetOpTypeFunctor{}, attrs); } diff --git a/lib/op-attrs/include/op-attrs/get_output_shapes.h b/lib/op-attrs/include/op-attrs/get_output_shapes.h index 5f78ec2d3f..6fb93aac91 100644 --- a/lib/op-attrs/include/op-attrs/get_output_shapes.h +++ b/lib/op-attrs/include/op-attrs/get_output_shapes.h @@ -177,19 +177,19 @@ struct GetOutputShapesFunctor { template std::vector - get_output_shapes(variant const &t, + get_output_shapes(std::variant const &t, std::vector const &s) { return get_output_shape(GetOutputShapesFunctor{s}, t); } template -typename std::enable_if::value, optional>::type +typename std::enable_if::value, std::optional>::type get_num_outputs(T const &) { - return nullopt; + return std::nullopt; } template -typename std::enable_if::value, optional>::type +typename std::enable_if::value, std::optional>::type get_num_outputs(T const &) { return 1; } diff --git a/lib/op-attrs/include/op-attrs/operator_attrs.h b/lib/op-attrs/include/op-attrs/operator_attrs.h index a7ba84624c..9da787cbf8 100644 --- a/lib/op-attrs/include/op-attrs/operator_attrs.h +++ b/lib/op-attrs/include/op-attrs/operator_attrs.h @@ -35,31 +35,31 @@ namespace FlexFlow { -using SharedOperatorAttrs = variant; +using SharedOperatorAttrs = std::variant; static_assert(is_valid_opattr::value, ""); static_assert(is_valid_opattr::value, ""); @@ -85,11 +85,11 @@ static_assert(is_valid_opattr::value, ""); static_assert(is_valid_opattr::value, ""); static_assert(is_valid_opattr::value, ""); -using ParallelOperatorAttrs = +using ParallelOperatorAttrs = std:: variant; using ComputationGraphAttrs = - variant_join>; + variant_join>; using CompGraphOperatorAttrs = ComputationGraphAttrs; using PCGOperatorAttrs = diff --git a/lib/op-attrs/include/op-attrs/ops/conv_2d.h b/lib/op-attrs/include/op-attrs/ops/conv_2d.h index 3034dc8c62..79980d545d 100644 --- a/lib/op-attrs/include/op-attrs/ops/conv_2d.h +++ b/lib/op-attrs/include/op-attrs/ops/conv_2d.h @@ -10,9 +10,9 @@ namespace FlexFlow { struct Conv2DAttrs { - req out_channels, kernel_h, kernel_w, stride_h, stride_w, padding_h, + int out_channels, kernel_h, kernel_w, stride_h, stride_w, padding_h, padding_w, groups; - req> activation; + std::optional activation; req use_bias; }; diff --git a/lib/op-attrs/include/op-attrs/ops/linear.h b/lib/op-attrs/include/op-attrs/ops/linear.h index 3be8be2040..2c27b09f7c 100644 --- a/lib/op-attrs/include/op-attrs/ops/linear.h +++ b/lib/op-attrs/include/op-attrs/ops/linear.h @@ -21,14 +21,14 @@ struct L2RegularizerAttrs { FF_VISITABLE_STRUCT(L2RegularizerAttrs, lambda); CHECK_VALID_OP_ATTR(L2RegularizerAttrs); -using RegularizerAttrs = variant; +using RegularizerAttrs = std::variant; struct LinearAttrs { - req out_channels; - req use_bias; - req data_type; - req activation; - req> regularizer; + int out_channels; + bool use_bias; + DataType data_type; + Activation activation; + req> regularizer; }; FF_VISITABLE_STRUCT( 
LinearAttrs, out_channels, use_bias, data_type, activation, regularizer); diff --git a/lib/op-attrs/include/op-attrs/ops/loss_functions.h b/lib/op-attrs/include/op-attrs/ops/loss_functions.h index 7a3db05329..58d372d9e5 100644 --- a/lib/op-attrs/include/op-attrs/ops/loss_functions.h +++ b/lib/op-attrs/include/op-attrs/ops/loss_functions.h @@ -3,8 +3,8 @@ #include "core.h" #include "utils/exception.h" -#include "utils/variant.h" #include "utils/visitable.h" +#include namespace FlexFlow { @@ -31,7 +31,7 @@ FF_VISITABLE_STRUCT(OtherLossAttrs, loss_type); CHECK_VALID_OP_ATTR(OtherLossAttrs); using LossAttrs = - variant; + std::variant; LossFunction get_loss_function(OtherLossAttrs const &); LossFunction get_loss_function(SparseCategoricalCrossEntropyLossAttrs const &); diff --git a/lib/op-attrs/src/operator_attrs.cc b/lib/op-attrs/src/operator_attrs.cc index 16f410f870..a524ab3d14 100644 --- a/lib/op-attrs/src/operator_attrs.cc +++ b/lib/op-attrs/src/operator_attrs.cc @@ -166,8 +166,8 @@ struct AsDot { }; template -RecordFormatter as_dot(variant const &o) { - return mpark::visit(AsDot{}, o); +RecordFormatter as_dot(std::variant const &o) { + return std::visit(AsDot{}, o); } struct IsValidFunctor { diff --git a/lib/op-attrs/src/parallel_dim_mapping_record.cc b/lib/op-attrs/src/parallel_dim_mapping_record.cc index a5fa6823e9..5e734e88cd 100644 --- a/lib/op-attrs/src/parallel_dim_mapping_record.cc +++ b/lib/op-attrs/src/parallel_dim_mapping_record.cc @@ -13,7 +13,7 @@ ParallelDimMappingRecord ParallelDimMappingRecord::input_output_record( int input_dim, int output_idx, int output_dim, - tl::optional operation) { + std::optional operation) { ParallelDimMappingRecord r(MappingRecordType::INPUT_OUTPUT); r.operation = operation; @@ -36,7 +36,7 @@ ParallelDimMappingRecord ParallelDimMappingRecord::input_weight_record( int input_dim, int weight_idx, int weight_dim, - tl::optional operation) { + std::optional operation) { ParallelDimMappingRecord r(MappingRecordType::INPUT_WEIGHT); r.operation = operation; diff --git a/lib/op-attrs/src/parallel_dim_mapping_record.h b/lib/op-attrs/src/parallel_dim_mapping_record.h index c0f325ab7e..c37ac79b40 100644 --- a/lib/op-attrs/src/parallel_dim_mapping_record.h +++ b/lib/op-attrs/src/parallel_dim_mapping_record.h @@ -1,7 +1,6 @@ #ifndef _FLEXFLOW_OP_META_SRC_PARELLEL_DIM_MAPPING_RECORD_H #define _FLEXFLOW_OP_META_SRC_PARELLEL_DIM_MAPPING_RECORD_H -#include "tl/optional.hpp" #include "utils/visitable.h" #include @@ -23,18 +22,18 @@ class ParallelDimMappingRecord { int input_dim, int output_idx, int output_dim, - tl::optional operation = tl::nullopt); + std::optional operation = std::nullopt); static ParallelDimMappingRecord input_weight_record( int input_idx, int input_dim, int weight_idx, int weight_dim, - tl::optional operation = tl::nullopt); + std::optional operation = std::nullopt); MappingRecordType get_type() const; public: MappingRecordType type; - tl::optional operation; + std::optional operation; int output_dim, input_dim, weight_dim; int output_idx, input_idx, weight_idx; diff --git a/lib/op-attrs/src/parallel_dim_mapping_record_solver.cc b/lib/op-attrs/src/parallel_dim_mapping_record_solver.cc index 68686393f5..11cfbc125c 100644 --- a/lib/op-attrs/src/parallel_dim_mapping_record_solver.cc +++ b/lib/op-attrs/src/parallel_dim_mapping_record_solver.cc @@ -44,7 +44,7 @@ ParallelDimMappingRecord int output_dim, int input_idx, int output_idx, - tl::optional operation) { + std::optional operation) { NOT_IMPLEMENTED(); } @@ -53,7 +53,7 @@ 
ParallelDimMappingRecord int weight_dim, int input_idx, int weight_idx, - tl::optional operation) { + std::optional operation) { NOT_IMPLEMENTED(); } /* int get_output_to_input_dim_mapping(ParallelTensorShape const &output, */ @@ -209,7 +209,7 @@ void construct_weight_parallel_dims( int weight_dim, int input_idx, int weight_idx, - tl::optional operation) { + std::optional operation) { records.push_back(ParallelDimMappingRecord::input_weight_record( input_idx, input_dim, weight_idx, weight_dim, operation)); } @@ -274,7 +274,7 @@ void construct_output_parallel_dims( int output_dim, int input_idx, int output_idx, - tl::optional operation) { + std::optional operation) { records.push_back(ParallelDimMappingRecord::input_output_record( input_idx, input_dim, output_idx, output_dim, operation)); } diff --git a/lib/op-attrs/src/parallel_dim_mapping_record_solver.h b/lib/op-attrs/src/parallel_dim_mapping_record_solver.h index d28cd419ca..a46192edeb 100644 --- a/lib/op-attrs/src/parallel_dim_mapping_record_solver.h +++ b/lib/op-attrs/src/parallel_dim_mapping_record_solver.h @@ -29,7 +29,7 @@ ParallelDimMappingRecord construct_weight_parallel_dims( int weight_dim, int input_idx = 0, int weight_idx = 0, - tl::optional operation = tl::nullopt); + std::optional operation = std::nullopt); std::vector construct_output_parallel_dims(std::vector> mappings, @@ -44,7 +44,7 @@ ParallelDimMappingRecord construct_output_parallel_dims( int output_dim, int input_idx = 0, int output_idx = 0, - tl::optional operation = tl::nullopt); + std::optional operation = std::nullopt); struct ParallelDimMappingSolution { std::vector weight_shapes; @@ -85,7 +85,7 @@ ParallelDimMappingSolution solve_parallel_dim_mappings( /* int weight_dim, */ /* int input_idx = 0, */ /* int weight_idx = 0, */ -/* tl::optional operation = tl::nullopt); */ +/* std::optional operation = std::nullopt); */ /* void register_output_parallel_dims( */ /* std::vector> mappings, */ /* int input_idx = 0, */ @@ -95,7 +95,7 @@ ParallelDimMappingSolution solve_parallel_dim_mappings( /* int output_dim, */ /* int input_idx = 0, */ /* int output_idx = 0, */ -/* tl::optional operation = tl::nullopt); */ +/* std::optional operation = std::nullopt); */ /* private: */ /* std::vector *parallel_dims_mapping; */ diff --git a/lib/pcg/include/pcg/computation_graph_builder.h b/lib/pcg/include/pcg/computation_graph_builder.h index ae937c590d..035f0cad0b 100644 --- a/lib/pcg/include/pcg/computation_graph_builder.h +++ b/lib/pcg/include/pcg/computation_graph_builder.h @@ -12,101 +12,114 @@ struct ComputationGraphBuilder // C++ APIs for constructing models // Add an exp layer - Tensor exp(Tensor const &, optional const &name = nullopt); + Tensor exp(Tensor const &, + std::optional const &name = std::nullopt); // Add an add layer Tensor add(Tensor const &x, Tensor const &y, - optional const &name = nullopt); + std::optional const &name = std::nullopt); // Add a subtract layer Tensor subtract(Tensor const &x, Tensor const &y, - optional const &name = nullopt); + std::optional const &name = std::nullopt); // Add a multiply layer Tensor multiply(Tensor const &x, Tensor const &y, - optional const &name = nullopt); + std::optional const &name = std::nullopt); // Add a divide layer Tensor divide(Tensor const &x, Tensor const &y, - optional const &name = nullopt); + std::optional const &name = std::nullopt); // Add a max layer Tensor max(Tensor const &x, Tensor const &y, - optional const &name = nullopt); + std::optional const &name = std::nullopt); // Add a min layer Tensor 
min(Tensor const &x, Tensor const &y, - optional const &name = nullopt); + std::optional const &name = std::nullopt); // Add a rsqrt layer - Tensor rsqrt(Tensor const &x, optional const &name = nullopt); + Tensor rsqrt(Tensor const &x, + std::optional const &name = std::nullopt); // Add a pow layer Tensor pow(Tensor const &x, float exponent, - optional const &name = nullopt); + std::optional const &name = std::nullopt); // Add a scalar multiply layer Tensor scalar_multiply(Tensor const &x, float scalar, - optional const &name = nullopt); + std::optional const &name = std::nullopt); Tensor scalar_add(Tensor const &x, float scalar, - optional const &name = nullopt); + std::optional const &name = std::nullopt); Tensor scalar_sub(Tensor const &lhs, float rhs, - optional const &name = nullopt); + std::optional const &name = std::nullopt); Tensor scalar_truediv(Tensor const &numerator, float denominator, - optional const &name = nullopt); + std::optional const &name = std::nullopt); // Add a sin layer - Tensor sin(Tensor const &x, optional const &name = nullopt); + Tensor sin(Tensor const &x, + std::optional const &name = std::nullopt); // Add a cos layer - Tensor cos(Tensor const &x, optional const &name = nullopt); + Tensor cos(Tensor const &x, + std::optional const &name = std::nullopt); // Add an activation layer - Tensor relu(Tensor const &x, optional const &name = nullopt); - Tensor identity(Tensor const &x, optional const &name = nullopt); - Tensor gelu(Tensor const &x, optional const &name = nullopt); - Tensor sigmoid(Tensor const &x, optional const &name = nullopt); - Tensor tanh(Tensor const &x, optional const &name = nullopt); - Tensor elu(Tensor const &x, optional const &name = nullopt); + Tensor relu(Tensor const &x, + std::optional const &name = std::nullopt); + Tensor identity(Tensor const &x, + std::optional const &name = std::nullopt); + Tensor gelu(Tensor const &x, + std::optional const &name = std::nullopt); + Tensor sigmoid(Tensor const &x, + std::optional const &name = std::nullopt); + Tensor tanh(Tensor const &x, + std::optional const &name = std::nullopt); + Tensor elu(Tensor const &x, + std::optional const &name = std::nullopt); // Add a 2D convolutional layer - Tensor conv2d(Tensor const &input, - int outChannels, - int kernelH, - int kernelW, - int strideH, - int strideW, - int paddingH, - int paddingW, - optional const &activation = nullopt, - int groups = 1, - bool use_bias = true, - optional kernel_initializer = nullopt, - optional bias_initializer = nullopt, - optional kernel_regularizer = nullopt, - optional const &name = nullopt); + Tensor conv2d( + Tensor const &input, + int outChannels, + int kernelH, + int kernelW, + int strideH, + int strideW, + int paddingH, + int paddingW, + std::optional const &activation = std::nullopt, + int groups = 1, + bool use_bias = true, + std::optional const &kernel_initializer = std::nullopt, + std::optional const &bias_initializer = std::nullopt, + std::optional const &kernel_regularizer = std::nullopt, + std::optional const &name = std::nullopt); // Add a dropout layer Tensor dropout(Tensor const &input, float rate, unsigned long long seed = 0, - optional const &name = nullopt); + std::optional const &name = std::nullopt); // Add an embedding layer - Tensor embedding(Tensor const &input, - int num_entries, - int outDim, - AggregateOp aggr, - DataType dtype = DataType::FLOAT, - optional kernel_initializer = nullopt, - optional const &name = nullopt); + Tensor embedding( + Tensor const &input, + int num_entries, + int outDim, + 
AggregateOp aggr, + DataType dtype = DataType::FLOAT, + std::optional const &kernel_initializer = std::nullopt, + std::optional const &name = std::nullopt); // Add a gather layer - std::vector gather(Tensor const &input, - Tensor const &index, - ff_dim_t dim, - optional const &name = nullopt); + std::vector + gather(Tensor const &input, + Tensor const &index, + ff_dim_t dim, + std::optional const &name = std::nullopt); // Add a cache layer Tensor cache(Tensor const &input, int num_batches, std::function score_f = {}, - optional const &name = nullopt); + std::optional const &name = std::nullopt); // Add a 2D pooling layer Tensor pool2d(Tensor const &input, int kernelH, @@ -116,38 +129,39 @@ struct ComputationGraphBuilder int paddingH, int paddingW, PoolOp type = PoolOp::MAX, - optional const &activation = nullopt, - optional const &name = nullopt); + std::optional const &activation = std::nullopt, + std::optional const &name = std::nullopt); Tensor layer_norm(Tensor const &input, std::vector const &axes, bool elementwise_affine, float eps, - optional const &name = nullopt); + std::optional const &name = std::nullopt); Tensor batch_norm(Tensor const &input, bool relu = true, - optional const &name = nullopt); + std::optional const &name = std::nullopt); Tensor batch_matmul(Tensor const &A, Tensor const &B, int a_seq_length_dim = -1, int b_seq_length_dim = -1, - optional const &name = nullopt); - Tensor dense(Tensor const &input, - int outDim, - optional activation = nullopt, - bool use_bias = true, - DataType data_type = DataType::FLOAT, - optional kernel_initializer = nullopt, - optional bias_initializer = nullopt, - optional const &name = nullopt); + std::optional const &name = std::nullopt); + Tensor + dense(Tensor const &input, + int outDim, + std::optional activation = std::nullopt, + bool use_bias = true, + DataType data_type = DataType::FLOAT, + std::optional const &kernel_initializer = std::nullopt, + std::optional const &bias_initializer = std::nullopt, + std::optional const &name = std::nullopt); // Add a cast layer Tensor cast(Tensor const &input, DataType dtype, - optional const &name = nullopt); + std::optional const &name = std::nullopt); // Add a concat layer Tensor concat(int n, std::vector const &tensors, int axis, - optional const &name = nullopt); + std::optional const &name = std::nullopt); // Add a mean layer Tensor mean(Tensor const &input, std::vector const &dims, @@ -158,51 +172,53 @@ struct ComputationGraphBuilder Tensor *outputs, std::vector const &split, int axis, - optional const &name = nullopt); + std::optional const &name = std::nullopt); // Add a flat layer - Tensor flat(Tensor const &input, optional const &name = nullopt); + Tensor flat(Tensor const &input, + std::optional const &name = std::nullopt); // Add a softmax layer Tensor softmax(Tensor const &input, int dim = -1, - optional const &name = nullopt); + std::optional const &name = std::nullopt); // Create input tensors and constants Tensor transpose(Tensor const &input, std::vector const &perm, - optional const &name = nullopt); + std::optional const &name = std::nullopt); Tensor reduce_sum(Tensor const &input, std::vector const &axes, bool keepdims = false, - optional const &name = nullopt); + std::optional const &name = std::nullopt); Tensor reshape(Tensor const &input, std::vector const &shape, - optional const &name = nullopt); + std::optional const &name = std::nullopt); Tensor reverse(Tensor const &input, int axis, - optional const &name = nullopt); + std::optional const &name = std::nullopt); 
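[Editor's illustration, not part of the patch: a minimal standalone sketch of the optional-name-parameter pattern that the ComputationGraphBuilder signatures above adopt after the tl::optional -> std::optional switch. The helper name resolve_name and the layer names are hypothetical, not the FlexFlow API.]

// Sketch only: how a defaulted std::optional<std::string> name parameter
// behaves, mirroring the maybe_name.value_or(get_default_name(...)) idiom
// used in computation_graph_builder.cc.
#include <iostream>
#include <optional>
#include <string>

// Hypothetical helper: pick the caller-supplied name if present,
// otherwise fall back to an auto-generated default.
std::string resolve_name(std::optional<std::string> const &maybe_name,
                         std::string const &default_name) {
  return maybe_name.value_or(default_name);
}

int main() {
  // Caller omits the name: the std::nullopt default kicks in.
  std::cout << resolve_name(std::nullopt, "relu_0") << '\n'; // prints "relu_0"
  // Caller provides an explicit name.
  std::cout << resolve_name("my_relu", "relu_0") << '\n';    // prints "my_relu"
}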
void top_k(Tensor const &input, Tensor *outputs, int k, bool sorted, - optional const &name = nullopt); - Tensor - multihead_attention(Tensor const &query, - Tensor const &key, - Tensor const &value, - int embed_dim, - int num_heads, - int kdim = 0, - int vdim = 0, - float dropout = 0.0f, - bool bias = true, - bool add_bias_kv = false, - bool add_zero_attn = false, - optional initializer = nullopt, - optional const &name = nullopt); + std::optional const &name = std::nullopt); + Tensor multihead_attention( + Tensor const &query, + Tensor const &key, + Tensor const &value, + int embed_dim, + int num_heads, + int kdim = 0, + int vdim = 0, + float dropout = 0.0f, + bool bias = true, + bool add_bias_kv = false, + bool add_zero_attn = false, + std::optional initializer = std::nullopt, + std::optional const &name = std::nullopt); Tensor create_tensor(TensorShape const &, bool create_grad = true); - Parameter create_weight(TensorShape const &, - bool create_grad = true, - optional initializer = nullopt, - optional sync_type = nullopt); + Parameter create_weight( + TensorShape const &, + bool create_grad = true, + std::optional const &initializer = std::nullopt, + std::optional sync_type = std::nullopt); std::vector get_outputs(Layer const &) const; Tensor get_output(Layer const &, int idx) const; @@ -217,18 +233,18 @@ struct ComputationGraphBuilder std::vector const &inputs, std::vector const &weights, std::vector const &outputs); - Tensor - add_layer(Layer const &layer, - std::vector const &inputs, - std::vector>> const - &weight_shapes, - TensorShape const &output_shape); - std::vector - add_layer(Layer const &layer, - std::vector const &inputs, - std::vector>> const - &weight_shapes, - std::vector const &output_shapes); + Tensor add_layer( + Layer const &layer, + std::vector const &inputs, + std::vector>> const + &weight_shapes, + TensorShape const &output_shape); + std::vector add_layer( + Layer const &layer, + std::vector const &inputs, + std::vector>> const + &weight_shapes, + std::vector const &output_shapes); Tensor as_type(Tensor const &, DataType, std::string const &); @@ -237,21 +253,22 @@ struct ComputationGraphBuilder Tensor element_binary(OperatorType, Tensor const &lhs, Tensor const &rhs, - optional const &name = nullopt); + std::optional const &name = std::nullopt); Tensor element_unary(OperatorType, Tensor const &input, - optional const &name = nullopt); - Tensor element_scalar_unary(OperatorType, - Tensor const &input, - float scalar, - optional const &name = nullopt); + std::optional const &name = std::nullopt); + Tensor element_scalar_unary( + OperatorType, + Tensor const &input, + float scalar, + std::optional const &name = std::nullopt); Tensor element_unary(ElementUnaryAttrs const &, Tensor const &input, - optional const &name = nullopt); + std::optional const &name = std::nullopt); Tensor element_scalar_unary(ElementScalarUnaryAttrs const &attrs, Tensor const &x, - optional const &maybe_name); + std::optional const &maybe_name); public: ComputationGraph computation_graph; diff --git a/lib/pcg/include/pcg/device_id.h b/lib/pcg/include/pcg/device_id.h index 3ace2fd84e..50c2558e39 100644 --- a/lib/pcg/include/pcg/device_id.h +++ b/lib/pcg/include/pcg/device_id.h @@ -3,7 +3,6 @@ #include "device_type.h" #include "utils/strong_typedef.h" -#include "utils/variant.h" namespace FlexFlow { @@ -15,7 +14,7 @@ struct cpu_id_t : strong_typedef { using strong_typedef::strong_typedef; }; -using device_id_t = variant; +using device_id_t = std::variant; device_id_t operator+(device_id_t, 
size_t); DeviceType get_device_type(device_id_t); diff --git a/lib/pcg/include/pcg/file_format/v1/data_type.h b/lib/pcg/include/pcg/file_format/v1/data_type.h index dad98e462d..eab188155f 100644 --- a/lib/pcg/include/pcg/file_format/v1/data_type.h +++ b/lib/pcg/include/pcg/file_format/v1/data_type.h @@ -3,11 +3,11 @@ #include "utils/fp16.h" #include "utils/json.h" -#include "utils/variant.h" namespace FlexFlow { -using V1DataTypeValue = variant; +using V1DataTypeValue = + std::variant; enum class V1DataType { BOOL, diff --git a/lib/pcg/include/pcg/file_format/v1/graphs.h b/lib/pcg/include/pcg/file_format/v1/graphs.h index 71a8adb344..6bc852b0f1 100644 --- a/lib/pcg/include/pcg/file_format/v1/graphs.h +++ b/lib/pcg/include/pcg/file_format/v1/graphs.h @@ -53,7 +53,7 @@ struct V1JsonableGraph { struct V1Layer { V1CompGraphOperatorAttrs attrs; - req> name; + req> name; }; FF_VISITABLE_STRUCT(V1Layer, attrs, name); V1Layer to_v1(Layer const &); diff --git a/lib/pcg/include/pcg/file_format/v1/initializer.h b/lib/pcg/include/pcg/file_format/v1/initializer.h index 24f0320bd9..21af7d55e0 100644 --- a/lib/pcg/include/pcg/file_format/v1/initializer.h +++ b/lib/pcg/include/pcg/file_format/v1/initializer.h @@ -19,15 +19,15 @@ struct V1ZeroInitializer {}; FF_VISITABLE_STRUCT(V1ZeroInitializer); struct V1UniformInitializer { - req seed; - req min_val; + int seed; + float min_val; req max_val; }; FF_VISITABLE_STRUCT(V1UniformInitializer, seed, min_val, max_val); struct V1NormInitializer { - req seed; - req mean; + int seed; + float mean; req stddev; }; FF_VISITABLE_STRUCT(V1NormInitializer, seed, mean, stddev); @@ -37,11 +37,11 @@ struct V1ConstantInitializer { }; FF_VISITABLE_STRUCT(V1ConstantInitializer, value); -using V1Initializer = variant; +using V1Initializer = std::variant; } // namespace FlexFlow diff --git a/lib/pcg/include/pcg/file_format/v1/operator_attrs.h b/lib/pcg/include/pcg/file_format/v1/operator_attrs.h index 2ea87cbf56..2830fbd301 100644 --- a/lib/pcg/include/pcg/file_format/v1/operator_attrs.h +++ b/lib/pcg/include/pcg/file_format/v1/operator_attrs.h @@ -2,7 +2,7 @@ #define _FLEXFLOW_PCG_INCLUDE_PCG_FILE_FORMAT_V1_OPERATOR_ATTRS_H #include "utils/json.h" -#include "utils/variant.h" +#include namespace FlexFlow { @@ -12,8 +12,8 @@ FF_VISITABLE_STRUCT(V1Conv2DAttrs); static_assert( std::is_same, std::tuple<>>::value, ""); -using V1CompGraphOperatorAttrs = variant; -using V1PCGOperatorAttrs = variant; +using V1CompGraphOperatorAttrs = std::variant; +using V1PCGOperatorAttrs = std::variant; } // namespace FlexFlow diff --git a/lib/pcg/include/pcg/file_format/v1/parallel_tensor.h b/lib/pcg/include/pcg/file_format/v1/parallel_tensor.h index 1ea4cd04de..c215569b21 100644 --- a/lib/pcg/include/pcg/file_format/v1/parallel_tensor.h +++ b/lib/pcg/include/pcg/file_format/v1/parallel_tensor.h @@ -11,22 +11,22 @@ namespace FlexFlow { struct V1ParallelDim { - req size; - req degree; + size_t size; + int degree; req is_replica_dim; }; FF_VISITABLE_STRUCT(V1ParallelDim, size, degree, is_replica_dim); struct V1ParallelTensorShape { - req> dims; + std::vector dims; req data_type; }; FF_VISITABLE_STRUCT(V1ParallelTensorShape, dims, data_type); struct V1ParallelTensor { V1ParallelTensorShape shape; - req> sync_type; - req> initializer; + std::optional sync_type; + std::optional initializer; req create_grad; }; FF_VISITABLE_STRUCT( diff --git a/lib/pcg/include/pcg/file_format/v1/tensor.h b/lib/pcg/include/pcg/file_format/v1/tensor.h index e1f6828186..c304a41401 100644 --- 
a/lib/pcg/include/pcg/file_format/v1/tensor.h +++ b/lib/pcg/include/pcg/file_format/v1/tensor.h @@ -12,7 +12,7 @@ namespace FlexFlow { struct V1TensorShape { - req> dims; + std::vector dims; req data_type; }; FF_VISITABLE_STRUCT(V1TensorShape, dims, data_type); @@ -21,10 +21,10 @@ V1TensorShape to_v1(TensorShape const &); struct V1Tensor { V1TensorShape shape; - req> initializer; - req create_gradients; - req> sync_type; - req> name; + std::optional initializer; + bool create_gradients; + std::optional sync_type; + req> name; }; FF_VISITABLE_STRUCT( V1Tensor, shape, initializer, create_gradients, sync_type, name); diff --git a/lib/pcg/include/pcg/initializer.h b/lib/pcg/include/pcg/initializer.h index 58e4fcc242..6913289653 100644 --- a/lib/pcg/include/pcg/initializer.h +++ b/lib/pcg/include/pcg/initializer.h @@ -20,15 +20,15 @@ struct ZeroInitializer { FF_VISITABLE_STRUCT(ZeroInitializer); struct UniformInitializer { - req seed; - req min_val; + int seed; + float min_val; req max_val; }; FF_VISITABLE_STRUCT(UniformInitializer, seed, min_val, max_val); struct NormInitializer { - req seed; - req mean; + int seed; + float mean; req stddev; }; FF_VISITABLE_STRUCT(NormInitializer, seed, mean, stddev); @@ -38,11 +38,11 @@ struct ConstantInitializer { }; FF_VISITABLE_STRUCT(ConstantInitializer, value); -using Initializer = variant; +using Initializer = std::variant; CHECK_WELL_BEHAVED_VALUE_TYPE(Initializer); } // namespace FlexFlow diff --git a/lib/pcg/include/pcg/layer.h b/lib/pcg/include/pcg/layer.h index abf1fe6a91..9749cb9d06 100644 --- a/lib/pcg/include/pcg/layer.h +++ b/lib/pcg/include/pcg/layer.h @@ -7,13 +7,14 @@ namespace FlexFlow { -struct Layer : public use_visitable_cmp { +struct Layer { public: Layer() = delete; - Layer(CompGraphOperatorAttrs const &attrs, optional const &name); + Layer(CompGraphOperatorAttrs const &attrs, + std::optional const &name); public: - optional> name; + std::optional> name; CompGraphOperatorAttrs attrs; }; diff --git a/lib/pcg/include/pcg/operator.h b/lib/pcg/include/pcg/operator.h index c7a49bb57e..5804e38f95 100644 --- a/lib/pcg/include/pcg/operator.h +++ b/lib/pcg/include/pcg/operator.h @@ -11,7 +11,8 @@ namespace FlexFlow { struct Operator : public use_visitable_cmp { public: Operator() = delete; - Operator(PCGOperatorAttrs const &attrs, optional const &name); + Operator(PCGOperatorAttrs const &attrs, + std::optional const &name); operator PCGOperatorAttrs() const; diff --git a/lib/pcg/include/pcg/parallel_tensor.h b/lib/pcg/include/pcg/parallel_tensor.h index eadc83d9fd..c3f7ebdfed 100644 --- a/lib/pcg/include/pcg/parallel_tensor.h +++ b/lib/pcg/include/pcg/parallel_tensor.h @@ -39,19 +39,19 @@ struct ParallelTensor : public use_visitable_cmp { ParallelTensor(ParallelTensorShape const &, CreateGrad create_gradients, - optional sync_type = nullopt, - optional initializer = nullopt); + std::optional sync_type = std::nullopt, + std::optional initializer = std::nullopt); ParallelTensor(ParallelTensorDims const &, DataType, CreateGrad create_gradients, - optional sync_type = nullopt, - optional initializer = nullopt); + std::optional sync_type = std::nullopt, + std::optional initializer = std::nullopt); public: ParallelTensorDims dims; DataType data_type; - optional sync_type = nullopt; - optional initializer = nullopt; + std::optional sync_type = std::nullopt; + std::optional initializer = std::nullopt; CreateGrad create_gradients; }; diff --git a/lib/pcg/include/pcg/tensor.h b/lib/pcg/include/pcg/tensor.h index cb79be245a..975a69809d 100644 --- 
a/lib/pcg/include/pcg/tensor.h +++ b/lib/pcg/include/pcg/tensor.h @@ -24,9 +24,9 @@ struct Tensor { public: TensorDims dims; DataType data_type; - req> initializer; - req create_gradients; - req> sync_type; + std::optional initializer; + bool create_gradients; + req> sync_type; }; FF_VISITABLE_STRUCT( Tensor, dims, data_type, initializer, create_gradients, sync_type); diff --git a/lib/pcg/src/computation_graph_builder.cc b/lib/pcg/src/computation_graph_builder.cc index 9f8e930919..c2e008231e 100644 --- a/lib/pcg/src/computation_graph_builder.cc +++ b/lib/pcg/src/computation_graph_builder.cc @@ -15,7 +15,7 @@ void ComputationGraphBuilder::add_layer(Layer const &layer, Tensor ComputationGraphBuilder::add_layer( Layer const &layer, std::vector const &inputs, - std::vector>> const + std::vector>> const &weight_shapes, TensorShape const &output_shape) { NOT_IMPLEMENTED(); @@ -23,7 +23,7 @@ Tensor ComputationGraphBuilder::add_layer( std::vector ComputationGraphBuilder::add_layer( Layer const &layer, std::vector const &inputs, - std::vector>> const + std::vector>> const &weight_shapes, std::vector const &output_shapes) { NOT_IMPLEMENTED(); @@ -34,7 +34,7 @@ Tensor ComputationGraphBuilder::broadcast(Tensor const &, TensorShape const &) { } Tensor ComputationGraphBuilder::cast(Tensor const &input, DataType dtype, - optional const &name){ + std::optional const &name){ NOT_IMPLEMENTED()} Tensor ComputationGraphBuilder::as_type(Tensor const &x, @@ -60,14 +60,14 @@ static std::string get_default_name(ComputationGraphAttrs const &attrs) { } template -static std::string get_default_name(variant const &attrs) { +static std::string get_default_name(std::variant const &attrs) { return get_default_name(widen(attrs)); } Tensor ComputationGraphBuilder::element_unary( ElementUnaryAttrs const &attrs, Tensor const &x, - optional const &maybe_name) { + std::optional const &maybe_name) { std::string name = maybe_name.value_or(get_default_name(attrs)); Tensor input = this->as_type(x, DataType::FLOAT, name + "input_pre_cast"); @@ -81,7 +81,7 @@ Tensor ComputationGraphBuilder::element_unary( Tensor ComputationGraphBuilder::element_scalar_unary( ElementScalarUnaryAttrs const &attrs, Tensor const &x, - optional const &maybe_name) { + std::optional const &maybe_name) { std::string name = maybe_name.value_or(get_default_name(attrs)); Tensor input = this->as_type(x, DataType::FLOAT, name + "input_pre_cast"); @@ -92,10 +92,10 @@ Tensor ComputationGraphBuilder::element_scalar_unary( return this->add_layer(layer, {input}, {}, output_shape); } -Tensor - ComputationGraphBuilder::element_unary(OperatorType op_type, - Tensor const &input, - optional const &name) { +Tensor ComputationGraphBuilder::element_unary( + OperatorType op_type, + Tensor const &input, + std::optional const &name) { ElementUnaryAttrs attrs = {op_type}; return this->element_unary(attrs, input, name); } @@ -104,7 +104,7 @@ Tensor ComputationGraphBuilder::element_scalar_unary( OperatorType op_type, Tensor const &input, float scalar, - optional const &name) { + std::optional const &name) { ElementScalarUnaryAttrs attrs = {op_type, scalar}; return this->element_scalar_unary(attrs, input, name); } @@ -113,7 +113,7 @@ Tensor ComputationGraphBuilder::element_binary( OperatorType op_type, Tensor const &lhs, Tensor const &rhs, - optional const &maybe_name) { + std::optional const &maybe_name) { std::string name = maybe_name.value_or(get_default_name(op_type)); TensorShape compute_shape = this->get_broadcast_target_shape({lhs, rhs}); @@ -135,119 +135,121 @@ Tensor 
ComputationGraphBuilder::element_binary( } Tensor ComputationGraphBuilder::exp(Tensor const &input, - optional const &name) { + std::optional const &name) { return this->element_unary(Op::EXP, input, name); } Tensor ComputationGraphBuilder::add(Tensor const &lhs, Tensor const &rhs, - optional const &name) { + std::optional const &name) { return this->element_binary(Op::EW_ADD, lhs, rhs, name); } -Tensor ComputationGraphBuilder::subtract(Tensor const &lhs, - Tensor const &rhs, - optional const &name) { +Tensor + ComputationGraphBuilder::subtract(Tensor const &lhs, + Tensor const &rhs, + std::optional const &name) { return this->element_binary(Op::EW_SUB, lhs, rhs, name); } -Tensor ComputationGraphBuilder::multiply(Tensor const &lhs, - Tensor const &rhs, - optional const &name) { +Tensor + ComputationGraphBuilder::multiply(Tensor const &lhs, + Tensor const &rhs, + std::optional const &name) { return this->element_binary(Op::EW_MUL, lhs, rhs, name); } Tensor ComputationGraphBuilder::divide(Tensor const &lhs, Tensor const &rhs, - optional const &name) { + std::optional const &name) { return this->element_binary(Op::EW_DIV, lhs, rhs, name); } Tensor ComputationGraphBuilder::max(Tensor const &lhs, Tensor const &rhs, - optional const &name) { + std::optional const &name) { return this->element_binary(Op::EW_MAX, lhs, rhs, name); } Tensor ComputationGraphBuilder::min(Tensor const &lhs, Tensor const &rhs, - optional const &name) { + std::optional const &name) { return this->element_binary(Op::EW_MIN, lhs, rhs, name); } Tensor ComputationGraphBuilder::rsqrt(Tensor const &input, - optional const &name) { + std::optional const &name) { return this->element_unary(Op::RSQRT, input, name); } Tensor ComputationGraphBuilder::pow(Tensor const &input, float exponent, - optional const &name) { + std::optional const &name) { return this->element_scalar_unary(Op::POW, input, exponent, name); } Tensor ComputationGraphBuilder::scalar_multiply( - Tensor const &input, float scalar, optional const &name) { + Tensor const &input, float scalar, std::optional const &name) { return this->element_scalar_unary(Op::SCALAR_MULTIPLY, input, scalar, name); } -Tensor ComputationGraphBuilder::scalar_add(Tensor const &input, - float scalar, - optional const &name) { +Tensor ComputationGraphBuilder::scalar_add( + Tensor const &input, float scalar, std::optional const &name) { return this->element_scalar_unary(Op::SCALAR_ADD, input, scalar, name); } -Tensor ComputationGraphBuilder::scalar_sub(Tensor const &lhs, - float rhs, - optional const &name) { +Tensor ComputationGraphBuilder::scalar_sub( + Tensor const &lhs, float rhs, std::optional const &name) { return this->element_scalar_unary(Op::SCALAR_SUB, lhs, rhs, name); } -Tensor - ComputationGraphBuilder::scalar_truediv(Tensor const &numerator, - float denominator, - optional const &name) { +Tensor ComputationGraphBuilder::scalar_truediv( + Tensor const &numerator, + float denominator, + std::optional const &name) { return this->element_scalar_unary( Op::SCALAR_TRUE_DIV, numerator, denominator, name); } Tensor ComputationGraphBuilder::sin(Tensor const &input, - optional const &name) { + std::optional const &name) { return this->element_unary(Op::SIN, input, name); } Tensor ComputationGraphBuilder::cos(Tensor const &input, - optional const &name) { + std::optional const &name) { return this->element_unary(Op::COS, input, name); } Tensor ComputationGraphBuilder::relu(Tensor const &input, - optional const &name) { + std::optional const &name) { return this->element_unary(Op::RELU, 
input, name); } -Tensor ComputationGraphBuilder::identity(Tensor const &input, - optional const &name) { +Tensor + ComputationGraphBuilder::identity(Tensor const &input, + std::optional const &name) { return this->element_unary(Op::IDENTITY, input, name); } Tensor ComputationGraphBuilder::gelu(Tensor const &input, - optional const &name) { + std::optional const &name) { return this->element_unary(Op::GELU, input, name); } -Tensor ComputationGraphBuilder::sigmoid(Tensor const &input, - optional const &name) { +Tensor + ComputationGraphBuilder::sigmoid(Tensor const &input, + std::optional const &name) { return this->element_unary(Op::SIGMOID, input, name); } Tensor ComputationGraphBuilder::tanh(Tensor const &input, - optional const &name) { + std::optional const &name) { return this->element_unary(Op::TANH, input, name); } Tensor ComputationGraphBuilder::elu(Tensor const &input, - optional const &name) { + std::optional const &name) { return this->element_unary(Op::ELU, input, name); } @@ -260,13 +262,13 @@ Tensor ComputationGraphBuilder::conv2d( int strideW, int paddingH, int paddingW, - optional const &activation, + std::optional const &activation, int groups, bool use_bias, - optional kernel_initializer, - optional bias_initializer, - optional kernel_regularizer, - optional const &maybe_name) { + std::optional const &kernel_initializer, + std::optional const &bias_initializer, + std::optional const &kernel_regularizer, + std::optional const &maybe_name) { Conv2DAttrs attrs = {outChannels, kernelH, kernelW, @@ -284,7 +286,7 @@ Tensor ComputationGraphBuilder::conv2d( Layer layer = {attrs, name}; TensorShape output_shape = get_output_shape(attrs, input); - std::vector>> weights; + std::vector>> weights; weights.push_back({get_kernel_shape(attrs, input), kernel_initializer}); @@ -295,11 +297,11 @@ Tensor ComputationGraphBuilder::conv2d( return this->add_layer(layer, {input}, weights, output_shape); } -Tensor - ComputationGraphBuilder::dropout(Tensor const &x, - float rate, - unsigned long long seed, - optional const &maybe_name) { +Tensor ComputationGraphBuilder::dropout( + Tensor const &x, + float rate, + unsigned long long seed, + std::optional const &maybe_name) { DropoutAttrs attrs = {rate, seed}; std::string name = maybe_name.value_or(get_default_name(attrs)); @@ -317,8 +319,8 @@ Tensor ComputationGraphBuilder::embedding( int outDim, AggregateOp aggr, DataType dtype, - optional kernel_initializer, - optional const &maybe_name) { + std::optional const &kernel_initializer, + std::optional const &maybe_name) { EmbeddingAttrs attrs = {num_entries, outDim, aggr, dtype}; std::string name = maybe_name.value_or(get_default_name(attrs)); @@ -332,11 +334,11 @@ Tensor ComputationGraphBuilder::embedding( layer, {input}, {{weights_shape, kernel_initializer}}, output_shape); } -std::vector - ComputationGraphBuilder::gather(Tensor const &input, - Tensor const &index, - ff_dim_t dim, - optional const &maybe_name) { +std::vector ComputationGraphBuilder::gather( + Tensor const &input, + Tensor const &index, + ff_dim_t dim, + std::optional const &maybe_name) { GatherAttrs attrs = {dim}; std::string name = maybe_name.value_or(get_default_name(attrs)); @@ -370,7 +372,7 @@ std::vector get_shape(std::vector const &) { // std::vector const &exp_preds, // int n, // float lambda_bal, -// optional const &maybe_name) { +// std::optional const &maybe_name) { // AggregateAttrs attrs = {n, lambda_bal}; // std::string name = maybe_name.value_or(get_default_name(attrs)); @@ -389,7 +391,9 @@ std::vector 
get_shape(std::vector const &) { // } Tensor ComputationGraphBuilder::batch_norm( - Tensor const &input, bool relu, optional const &maybe_name) { + Tensor const &input, + bool relu, + std::optional const &maybe_name) { BatchNormAttrs attrs = BatchNormAttrs{relu}; std::string name = maybe_name.value_or(get_default_name(attrs)); diff --git a/lib/pcg/src/device_id.cc b/lib/pcg/src/device_id.cc index 64be75667a..2849df7c3c 100644 --- a/lib/pcg/src/device_id.cc +++ b/lib/pcg/src/device_id.cc @@ -1,13 +1,14 @@ #include "pcg/device_id.h" +#include "utils/exception.h" #include namespace FlexFlow { DeviceType get_device_type(device_id_t const &id) { - if (holds_alternative(id)) { + if (std::holds_alternative(id)) { return DeviceType::GPU; } else { - assert(holds_alternative(id)); + assert(std::holds_alternative(id)); return DeviceType::CPU; } } diff --git a/lib/pcg/src/layer.cc b/lib/pcg/src/layer.cc index 27d5b31003..00fb07a8c5 100644 --- a/lib/pcg/src/layer.cc +++ b/lib/pcg/src/layer.cc @@ -3,7 +3,7 @@ namespace FlexFlow { Layer::Layer(CompGraphOperatorAttrs const &_attrs, - optional const &_name) + std::optional const &_name) : attrs(_attrs), name(_name) {} } // namespace FlexFlow diff --git a/lib/pcg/src/operator.cc b/lib/pcg/src/operator.cc index 8c79c67464..92ece9a2bf 100644 --- a/lib/pcg/src/operator.cc +++ b/lib/pcg/src/operator.cc @@ -3,7 +3,7 @@ namespace FlexFlow { Operator::Operator(PCGOperatorAttrs const &attrs, - optional const &name) + std::optional const &name) : attrs(attrs) {} Operator::operator PCGOperatorAttrs() const { diff --git a/lib/pcg/src/parallel_tensor.cc b/lib/pcg/src/parallel_tensor.cc index a8d7b15ea9..19dc1e96d3 100644 --- a/lib/pcg/src/parallel_tensor.cc +++ b/lib/pcg/src/parallel_tensor.cc @@ -5,8 +5,8 @@ namespace FlexFlow { ParallelTensor::ParallelTensor(ParallelTensorDims const &dims, DataType data_type, CreateGrad create_gradients, - optional sync_type, - optional initializer) + std::optional sync_type, + std::optional initializer) : dims(dims), data_type(data_type), sync_type(sync_type), initializer(initializer), create_gradients(create_gradients) {} diff --git a/lib/runtime/src/serialization.h b/lib/runtime/src/serialization.h index 53edb09075..65601990b0 100644 --- a/lib/runtime/src/serialization.h +++ b/lib/runtime/src/serialization.h @@ -106,11 +106,12 @@ struct is_trivially_serializable> : is_trivially_serializable {}; template -struct is_trivially_serializable> - : elements_satisfy> {}; +struct is_trivially_serializable> + : elements_satisfy> {}; template -struct is_trivially_serializable> : is_trivially_serializable {}; +struct is_trivially_serializable> + : is_trivially_serializable {}; template struct std_array_size_helper; @@ -146,7 +147,8 @@ static_assert(is_trivially_serializable::value, ""); static_assert(is_trivially_serializable::value, ""); static_assert(is_trivially_serializable::value, ""); static_assert(is_trivially_serializable::value, ""); -static_assert(is_trivially_serializable>::value, ""); +static_assert(is_trivially_serializable>::value, + ""); static_assert(std::is_same, std::tuple>::value, ""); diff --git a/lib/substitutions/src/substitution.cc b/lib/substitutions/src/substitution.cc index 72c9248e6c..dd28a9aa5d 100644 --- a/lib/substitutions/src/substitution.cc +++ b/lib/substitutions/src/substitution.cc @@ -226,7 +226,7 @@ Operator get_operator_attrs(SubParallelComputationGraph const &graph, get(assignments.at(OperatorAttributeKey::USE_BIAS)), get(assignments.at(OperatorAttributeKey::DATA_TYPE)), 
get(assignments.at(OperatorAttributeKey::ACTIVATION)), - get>( + get>( assignments.at(OperatorAttributeKey::REGULARIZER))}, nullopt); case Op::MULTIHEAD_ATTENTION: diff --git a/lib/utils/CMakeLists.txt b/lib/utils/CMakeLists.txt index ac23248db6..a0d77b9f76 100644 --- a/lib/utils/CMakeLists.txt +++ b/lib/utils/CMakeLists.txt @@ -8,14 +8,10 @@ ff_add_library( PRIVATE_INCLUDE src/ DEPS - optional expected - variant visit_struct fmt - invoke json - any cuda ) diff --git a/lib/utils/include/utils/containers.decl.h b/lib/utils/include/utils/containers.decl.h index 8ad65a4488..84fd4a5acc 100644 --- a/lib/utils/include/utils/containers.decl.h +++ b/lib/utils/include/utils/containers.decl.h @@ -2,7 +2,6 @@ #define _FLEXFLOW_UTILS_INCLUDE_UTILS_CONTAINERS_DECL_H #include "utils/bidict.h" -#include "utils/invoke.h" #include "utils/optional.decl" #include "utils/required_core.h" #include "utils/type_traits_core.h" @@ -119,14 +118,14 @@ template std::unordered_set without_order(C const &c); template -optional index_of(Container const &c, Element const &e); +std::optional index_of(Container const &c, Element const &e); template std::unordered_set intersection(std::unordered_set const &l, std::unordered_set const &r); template -optional intersection(C const &c); +std::optional intersection(C const &c); template bool are_disjoint(std::unordered_set const &l, @@ -146,13 +145,13 @@ bidict merge_maps(bidict const &lhs, bidict const &rhs); template , - typename V = invoke_result_t> + typename V = std::invoke_result_t> std::unordered_map generate_map(C const &c, F const &f); template , - typename V = invoke_result_t> + typename V = std::invoke_result_t> bidict generate_bidict(C const &c, F const &f); template @@ -189,7 +188,7 @@ std::unordered_set std::unordered_set const &input); template -optional maybe_get_only(C const &c); +std::optional maybe_get_only(C const &c); template typename C::value_type get_only(C const &c); @@ -204,7 +203,7 @@ template void extend(std::unordered_set &lhs, C const &rhs); template -void extend(C &lhs, optional const &e); +void extend(C &lhs, std::optional const &e); template bool all_of(C const &c, F const &f); @@ -240,7 +239,7 @@ std::unordered_set transform(std::unordered_set const &v, F const &f); template std::string transform(std::string const &s, F const &f); -template > +template > std::vector repeat(int n, F const &f); template @@ -256,7 +255,7 @@ std::vector flatmap(std::vector const &v, F const &f); template >> + typename Out = get_element_type_t>> std::unordered_set flatmap(std::unordered_set const &v, F const &f); template @@ -292,12 +291,12 @@ template T reversed(T const &t); template -std::vector value_all(std::vector> const &v); +std::vector value_all(std::vector> const &v); template std::vector subvec(std::vector const &v, - optional const &maybe_start, - optional const &maybe_end); + std::optional const &maybe_start, + std::optional const &maybe_end); template struct reversed_container_t; diff --git a/lib/utils/include/utils/containers.h b/lib/utils/include/utils/containers.h index 679586ba69..cdf4591cdb 100644 --- a/lib/utils/include/utils/containers.h +++ b/lib/utils/include/utils/containers.h @@ -3,11 +3,9 @@ #include "bidict.h" #include "containers.decl.h" -#include "invoke.h" #include "required_core.h" #include "type_traits_core.h" #include "utils/exception.h" -#include "utils/optional.h" #include "utils/type_traits.h" #include #include @@ -244,10 +242,10 @@ std::unordered_set without_order(C const &c) { } template -tl::optional index_of(Container const &c, 
Element const &e) { +std::optional index_of(Container const &c, Element const &e) { auto it = std::find(c.cbegin(), c.cend(), e); if (it == c.cend()) { - return tl::nullopt; + return std::nullopt; } else { return std::distance(c.cbegin(), it); } @@ -266,8 +264,8 @@ std::unordered_set intersection(std::unordered_set const &l, } template -optional intersection(C const &c) { - optional result; +std::optional intersection(C const &c) { + std::optional result; for (T const &t : c) { result = intersection(result.value_or(t), t); } @@ -420,11 +418,11 @@ std::unordered_set } template -optional maybe_get_only(C const &c) { +std::optional maybe_get_only(C const &c) { if (c.size() == 1) { return *c.cbegin(); } else { - return nullopt; + return std::nullopt; } } @@ -454,7 +452,7 @@ void extend(std::unordered_set &lhs, C const &rhs) { } template -void extend(C &lhs, optional const &e) { +void extend(C &lhs, std::optional const &e) { if (e.has_value()) { return extend(lhs, e.value()); } @@ -570,7 +568,7 @@ struct get_element_type { }; template -struct get_element_type> { +struct get_element_type> { using type = T; }; @@ -666,8 +664,8 @@ T reversed(T const &t) { } template -std::vector value_all(std::vector> const &v) { - return transform(v, [](optional const &element) { +std::vector value_all(std::vector> const &v) { + return transform(v, [](std::optional const &element) { return unwrap(element, [] { throw mk_runtime_error( "Encountered element without value in call to value_all"); @@ -677,8 +675,8 @@ std::vector value_all(std::vector> const &v) { template std::vector subvec(std::vector const &v, - optional const &maybe_start, - optional const &maybe_end) { + std::optional const &maybe_start, + std::optional const &maybe_end) { auto begin_iter = v.cbegin(); auto end_iter = v.cend(); diff --git a/lib/utils/include/utils/disjoint_set.h b/lib/utils/include/utils/disjoint_set.h index e0a3aaa5ee..4810e5b29e 100644 --- a/lib/utils/include/utils/disjoint_set.h +++ b/lib/utils/include/utils/disjoint_set.h @@ -12,19 +12,19 @@ namespace FlexFlow { template class m_disjoint_set { public: - void m_union(optional const &l, optional const &r) { + void m_union(std::optional const &l, std::optional const &r) { this->add_node_if_missing(l); this->add_node_if_missing(r); - optional const ll = this->find(l); - optional const rr = this->find(r); + std::optional const ll = this->find(l); + std::optional const rr = this->find(r); if (ll != rr) { this->mapping[ll] = rr; } } - optional const find(optional const &t) const { + std::optional const find(std::optional const &t) const { this->add_node_if_missing(t); - optional const parent = this->mapping.at(t); + std::optional const parent = this->mapping.at(t); if (!parent.has_value()) { return t; } else { @@ -33,18 +33,19 @@ class m_disjoint_set { } private: - void add_node_if_missing(optional const &t) const { + void add_node_if_missing(std::optional const &t) const { if (mapping.find(t) == mapping.end()) { - mapping[t] = nullopt; + mapping[t] = std::nullopt; } } - mutable std::unordered_map, optional> mapping; + mutable std::unordered_map, std::optional> mapping; }; // Custom comparator for optional template struct OptionalComparator { - bool operator()(optional const &lhs, optional const &rhs) const { + bool operator()(std::optional const &lhs, + std::optional const &rhs) const { if (!lhs.has_value() || !rhs.has_value()) { return false; } @@ -55,34 +56,34 @@ struct OptionalComparator { template > class disjoint_set { public: - void m_union(optional const &l, optional const &r) 
const { + void m_union(std::optional const &l, std::optional const &r) const { this->nodes.insert(l); this->nodes.insert(r); this->ds.m_union(this->get_node(l), this->get_node(r)); } - optional const find(optional const &t) const { + std::optional const find(std::optional const &t) const { this->nodes.insert(t); // Make sure the node is in the set return this->ds.find(this->get_node(t)); } - std::map, optional, Compare> get_mapping() const { - std::map, optional, Compare> mapping; - for (optional const &t : this->nodes) { + std::map, std::optional, Compare> get_mapping() const { + std::map, std::optional, Compare> mapping; + for (std::optional const &t : this->nodes) { mapping[t] = this->ds.find(t); } return mapping; } private: - optional const get_node(optional const &t) const { + std::optional const get_node(std::optional const &t) const { auto it = this->nodes.find(t); assert(it != this->nodes.end()); return *it; } mutable m_disjoint_set ds; - mutable std::set, Compare> + mutable std::set, Compare> nodes; // Note(lambda): make mutable to allow using ds->find() to be const // because while the result is invariant to path compression, etc. }; diff --git a/lib/utils/include/utils/dot_file.h b/lib/utils/include/utils/dot_file.h index 9529c659e2..6cdc78f6d4 100644 --- a/lib/utils/include/utils/dot_file.h +++ b/lib/utils/include/utils/dot_file.h @@ -2,7 +2,6 @@ #define _DOT_FILE_H #include "record_formatter.h" -#include "tl/optional.hpp" #include #include #include @@ -20,9 +19,9 @@ class DotFile { std::map node_ids; std::unordered_map> subgraphs; std::unordered_map> subgraph_children; - std::unordered_map> subgraph_parents; - tl::optional owned_fstream = tl::nullopt; - tl::optional out = tl::nullopt; + std::unordered_map> subgraph_parents; + std::optional owned_fstream = std::nullopt; + std::ostream *out = nullptr; std::string get_node_name(size_t node_id) const { std::ostringstream s; s << "node" << node_id; @@ -52,7 +51,7 @@ class DotFile { DotFile(std::string const &filename) : owned_fstream(filename) { this->start_output(); } - DotFile(std::ostream &s) : node_id(0), out(s) { + DotFile(std::ostream &s) : node_id(0), out(&s) { this->start_output(); } @@ -113,7 +112,7 @@ class DotFile { this->get_ostream().flush(); } - size_t add_subgraph(tl::optional parent_id = tl::nullopt) { + size_t add_subgraph(std::optional parent_id = std::nullopt) { size_t subgraph = this->subgraph_id; subgraph_id++; this->subgraph_children[subgraph]; @@ -134,7 +133,7 @@ class DotFile { throw std::runtime_error(oss.str()); } this->subgraphs[subgraph].insert(this->node_ids.at(node)); - tl::optional parent = this->subgraph_parents.at(subgraph); + std::optional parent = this->subgraph_parents.at(subgraph); if (parent.has_value()) { this->add_node_to_subgraph(node, parent.value()); } diff --git a/lib/utils/include/utils/fmt.h b/lib/utils/include/utils/fmt.h index ddf5b00355..58982d6f36 100644 --- a/lib/utils/include/utils/fmt.h +++ b/lib/utils/include/utils/fmt.h @@ -40,15 +40,15 @@ operator<<(std::ostream &s, T const &t) { #__VA_ARGS__ " must be fmtable"); // This will not -template -typename std::enable_if::value, - std::ostream &>::type - operator<<(std::ostream &s, T const &t) { - // CHECK_FMTABLE(T); - - std::string result = fmt::to_string(t); - return s << result; -} +/* template */ +/* typename std::enable_if::value, */ +/* std::ostream &>::type */ +/* operator<<(std::ostream &s, T const &t) { */ +/* // CHECK_FMTABLE(T); */ + +/* std::string result = fmt::to_string(t); */ +/* return s << result; */ +/* } */ // 
template // typename std::enable_if::value, std::ostream &>::type diff --git a/lib/utils/include/utils/graph/algorithms.h b/lib/utils/include/utils/graph/algorithms.h index 4b08fd5e4a..bb70a9093c 100644 --- a/lib/utils/include/utils/graph/algorithms.h +++ b/lib/utils/include/utils/graph/algorithms.h @@ -174,12 +174,12 @@ struct GetDstNodeFunctor { }; template -Node get_src_node(variant const &t) { +Node get_src_node(std::variant const &t) { return visit(GetSrcNodeFunctor{}, t); } template -Node get_dst_node(variant const &t) { +Node get_dst_node(std::variant const &t) { return visit(GetDstNodeFunctor{}, t); } @@ -203,12 +203,12 @@ struct GetDstIdxFunctor { }; template -NodePort get_src_idx(variant const &t) { +NodePort get_src_idx(std::variant const &t) { return visit(GetSrcIdxFunctor{}, t); } template -NodePort get_dst_idx(variant const &t) { +NodePort get_dst_idx(std::variant const &t) { return visit(GetDstIdxFunctor{}, t); } @@ -229,8 +229,8 @@ std::unordered_set get_open_sources(OpenMultiDiGraphView const &g); std::unordered_set get_open_sinks(OpenMultiDiGraphView const &g); bool is_acyclic(MultiDiGraphView const &, std::unordered_set const &); -tl::optional is_acyclic(DiGraphView const &); -tl::optional is_acyclic(MultiDiGraphView const &); +std::optional is_acyclic(DiGraphView const &); +std::optional is_acyclic(MultiDiGraphView const &); std::unordered_map> get_dominators(DiGraphView const &); @@ -240,15 +240,15 @@ std::unordered_set get_dominators(DiGraphView const &, std::unordered_map> get_post_dominators(DiGraphView const &); -std::unordered_map> +std::unordered_map> get_imm_dominators(DiGraphView const &); -std::unordered_map> +std::unordered_map> get_imm_post_dominators(DiGraphView const &); -tl::optional get_imm_post_dominator(DiGraphView const &, Node const &); -tl::optional get_imm_post_dominator(MultiDiGraphView const &, - Node const &); -tl::optional get_imm_post_dominator(DiGraphView const &, - std::unordered_set const &); +std::optional get_imm_post_dominator(DiGraphView const &, Node const &); +std::optional get_imm_post_dominator(MultiDiGraphView const &, + Node const &); +std::optional get_imm_post_dominator(DiGraphView const &, + std::unordered_set const &); std::vector get_dfs_ordering(DiGraphView const &, @@ -328,8 +328,8 @@ void export_as_dot( DotFile &, DiGraphView const &, std::function const &, - tl::optional const &> = - tl::nullopt); + std::optional> = + std::nullopt); } // namespace FlexFlow diff --git a/lib/utils/include/utils/graph/labelled/output_labelled_open.h b/lib/utils/include/utils/graph/labelled/output_labelled_open.h index cb41a7158a..eb406d1804 100644 --- a/lib/utils/include/utils/graph/labelled/output_labelled_open.h +++ b/lib/utils/include/utils/graph/labelled/output_labelled_open.h @@ -129,12 +129,12 @@ struct OutputLabelledOpenMultiDiGraph } template - EdgeLabel const &at(variant const &e) const { + EdgeLabel const &at(std::variant const &e) const { return visit([&](auto const &e) -> auto const & { return this->at(e); }, e); } template - EdgeLabel &at(variant const &e) { + EdgeLabel &at(std::variant const &e) { return visit([&](auto const &e) -> auto & { return this->at(e); }, e); } diff --git a/lib/utils/include/utils/graph/open_edge.h b/lib/utils/include/utils/graph/open_edge.h index e83c58196b..37e98a419d 100644 --- a/lib/utils/include/utils/graph/open_edge.h +++ b/lib/utils/include/utils/graph/open_edge.h @@ -6,11 +6,11 @@ namespace FlexFlow { using OpenMultiDiEdge = - variant; + std::variant; -using DownwardOpenMultiDiEdge = variant; 
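[Editor's illustration, not part of the patch: the open_edge.h and variant.h hunks replace the mpark/FlexFlow variant aliases with std::variant, and the is<...>() helper with std::holds_alternative. The sketch below shows the underlying pattern under assumed stand-in edge types; holds_any_of is a hypothetical name, not the FlexFlow function.]

// C++17 sketch: check whether a std::variant currently holds any of a set of
// requested alternatives, the idea behind utils/variant.h's is<...>() helper.
#include <cassert>
#include <variant>

struct InputEdge {};
struct OutputEdge {};
struct InternalEdge {};

// Stand-in for the OpenMultiDiEdge-style variants in open_edge.h.
using OpenEdge = std::variant<InputEdge, OutputEdge, InternalEdge>;

// Needles... is supplied explicitly; Ts... is deduced from the argument.
template <typename... Needles, typename... Ts>
bool holds_any_of(std::variant<Ts...> const &v) {
  // Fold over ||: true if the variant holds any one of the Needles.
  return (std::holds_alternative<Needles>(v) || ...);
}

int main() {
  OpenEdge e = InputEdge{};
  assert((holds_any_of<InputEdge, OutputEdge>(e))); // holds InputEdge -> true
  assert((!holds_any_of<InternalEdge>(e)));         // not held -> false
}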
+using DownwardOpenMultiDiEdge = std::variant; -using UpwardOpenMultiDiEdge = variant; +using UpwardOpenMultiDiEdge = std::variant; bool is_input_edge(OpenMultiDiEdge const &); bool is_output_edge(OpenMultiDiEdge const &); diff --git a/lib/utils/include/utils/graph/query_set.h b/lib/utils/include/utils/graph/query_set.h index 8192949cb0..c835afa6a6 100644 --- a/lib/utils/include/utils/graph/query_set.h +++ b/lib/utils/include/utils/graph/query_set.h @@ -4,7 +4,7 @@ #include "utils/bidict.h" #include "utils/containers.decl.h" #include "utils/exception.h" -#include "utils/optional.h" +#include #include namespace FlexFlow { @@ -16,7 +16,7 @@ struct query_set { query_set(std::unordered_set const &query) : query(query) {} - query_set(optional> const &query) : query(query) {} + query_set(std::optional> const &query) : query(query) {} query_set(std::initializer_list const &l) : query_set(std::unordered_set{l}) {} @@ -43,11 +43,11 @@ struct query_set { } static query_set matchall() { - return {nullopt}; + return {std::nullopt}; } private: - optional> query; + std::optional> query; }; template diff --git a/lib/utils/include/utils/graph/serialparallel.h b/lib/utils/include/utils/graph/serialparallel.h index b58281de7d..47bcb4031e 100644 --- a/lib/utils/include/utils/graph/serialparallel.h +++ b/lib/utils/include/utils/graph/serialparallel.h @@ -4,7 +4,7 @@ #include "digraph.h" #include "multidigraph.h" #include "utils/optional.h" -#include "utils/variant.h" +#include #include namespace FlexFlow { @@ -12,22 +12,22 @@ namespace FlexFlow { Node find_source_node(DiGraphView const &); Node find_sink_node(DiGraphView const &); -optional find_bottleneck_node(DiGraphView const &); +std::optional find_bottleneck_node(DiGraphView const &); struct Parallel; struct Serial { - std::vector> children; + std::vector> children; }; struct Parallel { - std::vector> children; + std::vector> children; }; FF_VISITABLE_STRUCT_NONSTANDARD_CONSTRUCTION(Parallel, children); FF_VISITABLE_STRUCT_NONSTANDARD_CONSTRUCTION(Serial, children); -using SerialParallelDecomposition = variant; +using SerialParallelDecomposition = std::variant; SerialParallelDecomposition get_serial_parallel_decomposition(DiGraphView const &); diff --git a/lib/utils/include/utils/graph/traversal.h b/lib/utils/include/utils/graph/traversal.h index a4101de64d..3c3992cd53 100644 --- a/lib/utils/include/utils/graph/traversal.h +++ b/lib/utils/include/utils/graph/traversal.h @@ -76,7 +76,7 @@ struct bfs_iterator { bfs_iterator(DiGraphView const &, std::queue const &, - optional> const &); + std::optional> const &); bfs_iterator(DiGraphView const &, std::unordered_set const &starting_points); @@ -91,7 +91,7 @@ struct bfs_iterator { private: DiGraphView graph; std::queue q; - optional> seen; + std::optional> seen; }; struct CheckedDFSView { diff --git a/lib/utils/include/utils/graph/views.h b/lib/utils/include/utils/graph/views.h index 776a72e6d5..e891a948f0 100644 --- a/lib/utils/include/utils/graph/views.h +++ b/lib/utils/include/utils/graph/views.h @@ -6,7 +6,6 @@ #include "labelled_graphs.h" #include "multidigraph.h" #include "open_graphs.h" -#include "tl/optional.hpp" #include "undirected.h" #include "utils/bidict.h" #include "utils/graph/digraph_interfaces.h" @@ -217,8 +216,8 @@ struct SingleSourceNodeView : public IDiGraphView { private: DiGraphView g; - optional singleton_src; - optional joined_view; + std::optional singleton_src; + std::optional joined_view; std::unique_ptr added_edges_view; }; diff --git a/lib/utils/include/utils/invoke.h 
b/lib/utils/include/utils/invoke.h deleted file mode 100644 index cee1eaee0e..0000000000 --- a/lib/utils/include/utils/invoke.h +++ /dev/null @@ -1,12 +0,0 @@ -#ifndef _FLEXFLOW_UTILS_INCLUDE_UTILS_INVOKE_H -#define _FLEXFLOW_UTILS_INCLUDE_UTILS_INVOKE_H - -#include "invoke.hpp/invoke.hpp" - -namespace FlexFlow { - -using namespace ::invoke_hpp; - -} - -#endif diff --git a/lib/utils/include/utils/json.h b/lib/utils/include/utils/json.h index a753c52daa..010943a9f9 100644 --- a/lib/utils/include/utils/json.h +++ b/lib/utils/include/utils/json.h @@ -149,7 +149,7 @@ struct VariantToJsonFunctor { }; template -void variant_to_json(json &j, variant const &v) { +void variant_to_json(json &j, std::variant const &v) { visit(::FlexFlow::VariantToJsonFunctor{j}, v.value); } @@ -160,8 +160,9 @@ struct VariantFromJsonFunctor { json const &j; template - optional operator()(std::integral_constant const &) const { - using Type = typename variant_alternative::type; + std::optional + operator()(std::integral_constant const &) const { + using Type = typename std::variant_alternative::type; if (visit_struct::get_name()) { return j.at("value").get(); @@ -170,8 +171,8 @@ struct VariantFromJsonFunctor { }; template -variant variant_from_json(json const &j) { - ::FlexFlow::VariantFromJsonFunctor<::FlexFlow::variant> func(j); +std::variant variant_from_json(json const &j) { + ::FlexFlow::VariantFromJsonFunctor> func(j); auto result = seq_map(func, seq_enumerate_args_t{}); if (!result.has_value()) { throw ::FlexFlow::mk_runtime_error("Invalid type {} found in json", @@ -219,9 +220,9 @@ struct adl_serializer< template struct adl_serializer< - ::FlexFlow::optional, + std::optional, typename std::enable_if<::FlexFlow::is_jsonable::value>::type> { - static void to_json(json &j, ::FlexFlow::optional const &t) { + static void to_json(json &j, std::optional const &t) { if (t.has_value()) { to_json(j, t.value()); } else { @@ -229,9 +230,9 @@ struct adl_serializer< } } - static void from_json(json const &j, ::FlexFlow::optional &t) { + static void from_json(json const &j, std::optional &t) { if (j == nullptr) { - t = ::FlexFlow::nullopt; + t = std::nullopt; } else { t = j.get(); } @@ -239,15 +240,15 @@ struct adl_serializer< }; template -struct adl_serializer<::FlexFlow::variant, +struct adl_serializer, typename std::enable_if<::FlexFlow::elements_satisfy< ::FlexFlow::is_json_serializable, - ::FlexFlow::variant>::value>::type> { - static void to_json(json &j, ::FlexFlow::variant const &v) { + std::variant>::value>::type> { + static void to_json(json &j, std::variant const &v) { return ::FlexFlow::variant_to_json(j, v); } - static ::FlexFlow::variant from_json(json const &j) { + static std::variant from_json(json const &j) { return ::FlexFlow::variant_from_json(j); } }; diff --git a/lib/utils/include/utils/optional.decl b/lib/utils/include/utils/optional.decl index 370026fcc0..82f4bd984d 100644 --- a/lib/utils/include/utils/optional.decl +++ b/lib/utils/include/utils/optional.decl @@ -1,17 +1,15 @@ #ifndef _FLEXFLOW_UTILS_OPTIONAL_H #define _FLEXFLOW_UTILS_OPTIONAL_H -#include "tl/optional.hpp" +#include namespace FlexFlow { -using namespace tl; - template -T const &unwrap(optional const &o, F const &f); +T const &unwrap(std::optional const &o, F const &f); template -T const &assert_unwrap(optional const &o); +T const &assert_unwrap(std::optional const &o); } // namespace FlexFlow diff --git a/lib/utils/include/utils/optional.h b/lib/utils/include/utils/optional.h index 43d2ef4104..71b6d9d975 100644 --- 
a/lib/utils/include/utils/optional.h +++ b/lib/utils/include/utils/optional.h @@ -8,7 +8,7 @@ namespace FlexFlow { template -T const &unwrap(optional const &o, F const &f) { +T const &unwrap(std::optional const &o, F const &f) { if (o.has_value()) { return o.value(); } else { @@ -18,7 +18,7 @@ T const &unwrap(optional const &o, F const &f) { } template -T const &assert_unwrap(optional const &o) { +T const &assert_unwrap(std::optional const &o) { assert(o.has_value()); return o.value(); } @@ -28,9 +28,9 @@ T const &assert_unwrap(optional const &o) { namespace fmt { template -struct formatter<::FlexFlow::optional> : formatter { +struct formatter<::std::optional> : formatter { template - auto format(::FlexFlow::optional const &q, FormatContext &ctx) + auto format(::std::optional const &q, FormatContext &ctx) -> decltype(ctx.out()) { std::string result; if (q.has_value()) { diff --git a/lib/utils/include/utils/sequence.h b/lib/utils/include/utils/sequence.h index 67c2e72ac1..6c66949fd8 100644 --- a/lib/utils/include/utils/sequence.h +++ b/lib/utils/include/utils/sequence.h @@ -1,9 +1,9 @@ #ifndef _FLEXFLOW_UTILS_INCLUDE_UTILS_SEQUENCE_H #define _FLEXFLOW_UTILS_INCLUDE_UTILS_SEQUENCE_H -#include "optional.h" #include "utils/tuple.h" #include "utils/visitable_core.h" +#include #include namespace FlexFlow { @@ -119,7 +119,7 @@ auto seq_select(F const &f, int i, seq const &s) template auto seq_select(F const &f, int i, seq<> const &) -> decltype(f(std::declval>())) { - return nullopt; + return std::nullopt; } template diff --git a/lib/utils/include/utils/stack_map.h b/lib/utils/include/utils/stack_map.h index f2cdf0d88b..76e6e951df 100644 --- a/lib/utils/include/utils/stack_map.h +++ b/lib/utils/include/utils/stack_map.h @@ -2,7 +2,6 @@ #define _FLEXFLOW_UTILS_STACK_MAP_H #include "containers.h" -#include "optional.h" #include "stack_vector.h" namespace std { @@ -22,7 +21,7 @@ struct stack_map { stack_map() = default; V &operator[](K const &k) { - optional idx = get_idx(k); + std::optional idx = get_idx(k); if (!idx.has_value()) { this->contents.push_back({k, {}}); idx = this->contents.size() - 1; @@ -35,7 +34,7 @@ struct stack_map { } void insert(K const &k, V const &v) { - optional idx = get_idx(k); + std::optional idx = get_idx(k); if (!idx.has_value()) { this->contents.push_back({k, v}); } else { @@ -116,14 +115,14 @@ struct stack_map { return sorted_by(this->contents, comparator); } - optional get_idx(K const &k) const { + std::optional get_idx(K const &k) const { for (std::size_t idx = 0; idx < contents.size(); idx++) { if (contents.at(idx).first == k) { return idx; } } - return nullopt; + return std::nullopt; } stack_vector, MAXSIZE> contents; diff --git a/lib/utils/include/utils/stack_vector.h b/lib/utils/include/utils/stack_vector.h index 3d5a433725..fe665ed749 100644 --- a/lib/utils/include/utils/stack_vector.h +++ b/lib/utils/include/utils/stack_vector.h @@ -3,12 +3,12 @@ #include "containers.h" #include "hash-utils.h" -#include "optional.h" #include "utils/fmt.h" #include "utils/test_types.h" #include "utils/type_traits.h" #include #include +#include #include namespace FlexFlow { @@ -16,19 +16,20 @@ namespace FlexFlow { template struct stack_vector { private: - using element_type = - conditional_t::value, T, optional>; + using element_type = conditional_t::value, + T, + std::optional>; static T const &get_value(T const &t) { return t; } - static T const &get_value(optional const &t) { + static T const &get_value(std::optional const &t) { return t.value(); } static T &get_value(T &t) { 
return t; } - static T &get_value(optional &t) { + static T &get_value(std::optional &t) { return t.value(); } diff --git a/lib/utils/include/utils/tuple.h b/lib/utils/include/utils/tuple.h index 202e62b5ad..71c369df6a 100644 --- a/lib/utils/include/utils/tuple.h +++ b/lib/utils/include/utils/tuple.h @@ -1,8 +1,9 @@ #ifndef _FLEXFLOW_UTILS_TUPLE_H #define _FLEXFLOW_UTILS_TUPLE_H -#include "utils/any.h" #include "utils/exception.decl.h" +#include "utils/type_traits_core.h" +#include #include #include #include @@ -48,11 +49,11 @@ void visit_tuple(Visitor &v, std::tuple const &tup) { struct tuple_get_visitor { tuple_get_visitor() = delete; - tuple_get_visitor(int requested_idx, any &result) + tuple_get_visitor(int requested_idx, std::any &result) : requested_idx(requested_idx), result(result) {} int requested_idx; - any &result; + std::any &result; template void operator()(int idx, T const &t) { @@ -63,13 +64,13 @@ struct tuple_get_visitor { }; template -any get(std::tuple const &t, int idx) { +std::any get(std::tuple const &t, int idx) { size_t tuple_size = std::tuple_size::value; if (idx < 0 || idx >= tuple_size) { throw mk_runtime_error( "Error: idx {} out of bounds for tuple of size {}", idx, tuple_size); } - any result; + std::any result; visit_tuple(t, tuple_get_visitor{idx, result}); return result; } diff --git a/lib/utils/include/utils/type_traits.h b/lib/utils/include/utils/type_traits.h index ee45e8dc2e..0c0408723d 100644 --- a/lib/utils/include/utils/type_traits.h +++ b/lib/utils/include/utils/type_traits.h @@ -1,7 +1,6 @@ #ifndef _FLEXFLOW_UTILS_INCLUDE_TYPE_TRAITS_H #define _FLEXFLOW_UTILS_INCLUDE_TYPE_TRAITS_H -#include "utils/invoke.h" #include "utils/metafunction.h" #include "utils/type_traits_core.h" #include "utils/visitable_core.h" diff --git a/lib/utils/include/utils/variant.h b/lib/utils/include/utils/variant.h index b1a1dc1081..b3ae3de115 100644 --- a/lib/utils/include/utils/variant.h +++ b/lib/utils/include/utils/variant.h @@ -1,28 +1,12 @@ #ifndef _FLEXFLOW_UTILS_VARIANT_H #define _FLEXFLOW_UTILS_VARIANT_H -#include "mpark/variant.hpp" -#include "utils/optional.h" #include "utils/type_traits.h" +#include +#include namespace FlexFlow { -/* using mp = mpark; */ - -/* template */ -/* using variant = ::mpark::variant; */ - -using namespace ::mpark; - -/* template */ -/* using optional = ::tl::optional; */ - -/* template */ -/* using get = ::mpark::get; */ - -/* template */ -/* using holds_alternative = ::mpark::holds_alternative; */ - template struct pack_contains_all_of; @@ -35,14 +19,14 @@ template struct pack_contains_all_of> : std::false_type {}; template -struct pack_contains_all_of, Needles...> +struct pack_contains_all_of, Needles...> : pack_contains_all_of, Needles...> {}; template -bool is(variant const &v) { +bool is(std::variant const &v) { static_assert(pack_contains_all_of, T, TRest...>::value, ""); - return holds_alternative(v) || is(v); + return std::holds_alternative(v) || is(v); } /* template */ @@ -54,89 +38,90 @@ bool is(variant const &v) { /* using type = mpark::variant; */ /* }; */ template
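[Editor's illustration, not part of the patch: the utils/tuple.h hunk above switches the runtime-indexed tuple accessor from the vendored any to std::any. The sketch below shows one way such an accessor can work; runtime_get and its implementation are assumptions for illustration, not the FlexFlow visitor-based implementation.]

// C++17 sketch: fetch the idx-th tuple element at runtime as a std::any by
// expanding over all compile-time indices and copying out the matching one.
#include <any>
#include <cassert>
#include <cstddef>
#include <stdexcept>
#include <string>
#include <tuple>
#include <utility>

template <typename... Ts, std::size_t... Is>
std::any runtime_get_impl(std::tuple<Ts...> const &t,
                          std::size_t idx,
                          std::index_sequence<Is...>) {
  std::any result;
  // Visit every index; only the one equal to idx assigns into result.
  (void)((idx == Is ? (result = std::get<Is>(t), 0) : 0), ...);
  return result;
}

template <typename... Ts>
std::any runtime_get(std::tuple<Ts...> const &t, std::size_t idx) {
  if (idx >= sizeof...(Ts)) {
    throw std::out_of_range("tuple index out of range");
  }
  return runtime_get_impl(t, idx, std::index_sequence_for<Ts...>{});
}

int main() {
  std::tuple<int, std::string> t{42, "hello"};
  assert(std::any_cast<int>(runtime_get(t, 0)) == 42);
  assert(std::any_cast<std::string>(runtime_get(t, 1)) == "hello");
}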