diff --git a/.dockerignore b/.dockerignore
index 22ec965249..b9f228c009 100644
--- a/.dockerignore
+++ b/.dockerignore
@@ -11,3 +11,11 @@ python/flexflow/core/legion_cffi_header.py
*.pb.h
*.o
*.a
+
+# Ignore inference assets
+/inference/weights/*
+/inference/tokenizer/*
+/inference/prompt/*
+/inference/output/*
+
+/tests/inference/python_test_configs/*.json
diff --git a/.github/PULL_REQUEST_TEMPLATE.md b/.github/PULL_REQUEST_TEMPLATE.md
index 183028b022..e8177cd9b7 100644
--- a/.github/PULL_REQUEST_TEMPLATE.md
+++ b/.github/PULL_REQUEST_TEMPLATE.md
@@ -10,6 +10,3 @@ Linked Issues:
Issues closed by this PR:
- Closes #
-**Before merging:**
-
-- [ ] Did you update the [flexflow-third-party](https://github.com/flexflow/flexflow-third-party) repo, if modifying any of the Cmake files, the build configs, or the submodules?
diff --git a/.github/README.md b/.github/README.md
new file mode 100644
index 0000000000..5aba2295d5
--- /dev/null
+++ b/.github/README.md
@@ -0,0 +1,255 @@
+# FlexFlow Serve: Low-Latency, High-Performance LLM Serving
+![build](https://github.com/flexflow/flexflow/workflows/build/badge.svg?branch=inference) ![gpu tests](https://github.com/flexflow/flexflow/workflows/gpu-ci/badge.svg?branch=inference) ![multinode gpu tests](https://github.com/flexflow/flexflow/workflows/multinode-test/badge.svg?branch=master) ![docker](https://github.com/flexflow/flexflow/workflows/docker-build/badge.svg?branch=inference) ![pip](https://github.com/flexflow/flexflow/workflows/pip-install/badge.svg?branch=inference) ![shell-check](https://github.com/flexflow/flexflow/workflows/Shell%20Check/badge.svg?branch=inference) ![clang-format](https://github.com/flexflow/flexflow/workflows/clang-format%20Check/badge.svg?branch=inference) [![Documentation Status](https://readthedocs.org/projects/flexflow/badge/?version=latest)](https://flexflow.readthedocs.io/en/latest/?badge=latest)
+
+
+---
+
+## What is FlexFlow Serve
+
+The high computational and memory requirements of generative large language
+models (LLMs) make it challenging to serve them quickly and cheaply.
+FlexFlow Serve is an open-source compiler and distributed system for
+__low-latency__, __high-performance__ LLM serving. FlexFlow Serve outperforms
+existing systems by 1.3-2.0x for single-node, multi-GPU inference and by
+1.4-2.4x for multi-node, multi-GPU inference.
+
+
+
+
+
+
+## Install FlexFlow Serve
+
+
+### Requirements
+* OS: Linux
+* GPU backend: HIP/ROCm or CUDA
+ * CUDA version: 10.2 – 12.0
+ * NVIDIA compute capability: 6.0 or higher
+* Python: 3.6 or higher
+* Package dependencies: [see here](https://github.com/flexflow/FlexFlow/blob/inference/requirements.txt)
+
+### Install with pip
+You can install FlexFlow Serve using pip:
+
+```bash
+pip install flexflow
+```
+
+### Try it in Docker
+If you run into any issues during installation, or if you would like to use the C++ API without needing to install from source, you can also use our pre-built Docker package for different CUDA versions (NVIDIA backend) and multiple ROCm versions (AMD backend). To download and run our pre-built Docker container:
+
+```bash
+docker run --gpus all -it --rm --shm-size=8g ghcr.io/flexflow/flexflow-cuda-12.0:latest
+```
+
+To download a Docker container for a backend other than CUDA v12.0, you can replace the `cuda-12.0` suffix with any of the following backends: `cuda-11.1`, `cuda-11.2`, `cuda-11.3`, `cuda-11.4`, `cuda-11.5`, `cuda-11.6`, `cuda-11.7`, `cuda-11.8`, `hip_rocm-5.3`, `hip_rocm-5.4`, `hip_rocm-5.5`, or `hip_rocm-5.6`. More information on the Docker images, including instructions to build a new image from source or run with additional configurations, can be found [here](../docker/README.md).
+
+### Build from source
+
+You can install FlexFlow Serve from source code by building the inference branch of FlexFlow. Please follow these [instructions](https://flexflow.readthedocs.io/en/latest/installation.html).
+
+## Quickstart
+The following example shows how to deploy an LLM using FlexFlow Serve and accelerate its serving using [speculative inference](#speculative-inference). First, we import `flexflow.serve` and initialize the FlexFlow Serve runtime. Note that `memory_per_gpu` and `zero_copy_memory_per_node` specify the size of device memory on each GPU (in MB) and zero-copy memory on each node (in MB), respectively.
+We need to make sure the aggregate GPU memory and zero-copy memory are **both** sufficient to store the LLM parameters when serving without offloading. FlexFlow Serve combines tensor and pipeline model parallelism for LLM serving.
+```python
+import flexflow.serve as ff
+
+ff.init(
+ num_gpus=4,
+ memory_per_gpu=14000,
+ zero_copy_memory_per_node=30000,
+ tensor_parallelism_degree=4,
+ pipeline_parallelism_degree=1
+ )
+```
+Second, we specify the LLM to serve and the SSM(s) used to accelerate LLM serving. The list of supported LLMs and SSMs is available at [supported models](#supported-llms-and-ssms).
+```python
+# Specify the LLM
+llm = ff.LLM("meta-llama/Llama-2-7b-hf")
+
+# Specify a list of SSMs (just one in this case)
+ssms=[]
+ssm = ff.SSM("JackFram/llama-68m")
+ssms.append(ssm)
+```
+Next, we declare the generation configuration and compile both the LLM and SSMs. Note that all SSMs should run in the **beam search** mode, and the LLM should run in the **tree verification** mode to verify the speculated tokens from SSMs. You can also use the following arguments to specify serving configuration when compiling LLMs and SSMs:
+
+* max\_requests\_per\_batch: the maximum number of requests to serve in a batch (default: 16)
+* max\_seq\_length: the maximum number of tokens in a request (default: 256)
+* max\_tokens\_per\_batch: the maximum number of tokens to process in a batch (default: 128)
+
+```python
+# Create the sampling configs
+generation_config = ff.GenerationConfig(
+ do_sample=False, temperature=0.9, topp=0.8, topk=1
+)
+
+# Compile the SSMs for inference and load the weights into memory
+for ssm in ssms:
+ ssm.compile(generation_config)
+
+# Compile the LLM for inference and load the weights into memory
+llm.compile(generation_config,
+ max_requests_per_batch = 16,
+ max_seq_length = 256,
+ max_tokens_per_batch = 128,
+ ssms=ssms)
+```
+Next, we call `llm.start_server()` to start an LLM server running on a separate background thread, which allows users to perform computations in parallel with LLM serving. Finally, we call `llm.generate` to generate the output, which is organized as a list of `GenerationResult` objects that include the output tokens and text. After all serving requests are processed, you can either call `llm.stop_server()` to terminate the background thread or directly exit the Python program, which will automatically terminate the background server thread.
+```python
+llm.start_server()
+result = llm.generate("Here are some travel tips for Tokyo:\n")
+llm.stop_server() # This invocation is optional
+```
+
+### Incremental decoding
+
+
+
+```python
+import flexflow.serve as ff
+
+# Initialize the FlexFlow runtime. ff.init() takes a dictionary or the path to a JSON file with the configs
+ff.init(
+ num_gpus=4,
+ memory_per_gpu=14000,
+ zero_copy_memory_per_node=30000,
+ tensor_parallelism_degree=4,
+ pipeline_parallelism_degree=1
+ )
+
+# Create the FlexFlow LLM
+llm = ff.LLM("meta-llama/Llama-2-7b-hf")
+
+# Create the sampling configs
+generation_config = ff.GenerationConfig(
+ do_sample=True, temperature=0.9, topp=0.8, topk=1
+)
+
+# Compile the LLM for inference and load the weights into memory
+llm.compile(generation_config,
+ max_requests_per_batch = 16,
+ max_seq_length = 256,
+ max_tokens_per_batch = 128)
+
+# Generation begins!
+llm.start_server()
+result = llm.generate("Here are some travel tips for Tokyo:\n")
+llm.stop_server() # This invocation is optional
+```
+
+
+
+### C++ interface
+If you'd like to use the C++ interface (mostly used for development and benchmarking purposes), you should install from source and follow the instructions below.
+
+
+
+
+#### Downloading models
+Before running FlexFlow Serve, you should manually download the LLM and SSM(s) of interest using the [inference/utils/download_hf_model.py](https://github.com/flexflow/FlexFlow/blob/inference/inference/utils/download_hf_model.py) script (see example below). By default, the script downloads all of a model's assets (weights, configs, tokenizer files, etc.) into the cache folder `~/.cache/flexflow`. If you would like to use a different folder, you can specify it via the `--cache-folder` parameter.
+
+```bash
+python3 ./inference/utils/download_hf_model.py ...
+```
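+
+For illustration, a typical invocation could look like the following; the exact argument format is an assumption (we assume the script accepts one or more HuggingFace model IDs as positional arguments), while `--cache-folder` is the optional parameter mentioned above:
+
+```bash
+# Hypothetical example: download the LLaMA-2-7B LLM and LLaMA-68M SSM assets
+# (positional model-ID arguments are an assumption; --cache-folder is optional)
+python3 ./inference/utils/download_hf_model.py meta-llama/Llama-2-7b-hf JackFram/llama-68m --cache-folder ~/.cache/flexflow
+```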
+
+#### Running the C++ examples
+A C++ example is available in [this folder](../inference/spec_infer/). After building FlexFlow Serve, the executable will be available at `/build_dir/inference/spec_infer/spec_infer`. You can use the following command-line arguments to run FlexFlow Serve:
+
+* `-ll:gpu`: number of GPU processors to use on each node for serving an LLM (default: 0)
+* `-ll:fsize`: size of device memory on each GPU in MB
+* `-ll:zsize`: size of zero-copy memory (pinned DRAM with direct GPU access) in MB. FlexFlow Serve keeps a replica of the LLM parameters on zero-copy memory, and therefore requires that the zero-copy memory is sufficient for storing the LLM parameters.
+* `-llm-model`: the LLM model ID from HuggingFace (e.g. "meta-llama/Llama-2-7b-hf")
+* `-ssm-model`: the SSM model ID from HuggingFace (e.g. "JackFram/llama-160m"). You can use multiple `-ssm-model`s in the command line to launch multiple SSMs.
+* `-cache-folder`: the folder containing the model assets downloaded with the script above (by default, `~/.cache/flexflow`)
+* `-data-parallelism-degree`, `-tensor-parallelism-degree` and `-pipeline-parallelism-degree`: parallelization degrees in the data, tensor, and pipeline dimensions. Their product must equal the number of GPUs available on the machine. When any of the three parallelism degree arguments is omitted, a default value of 1 will be used.
+* `-prompt`: (optional) path to the prompt file. FlexFlow Serve expects prompts in a JSON file (a sketch of the format is shown after this list)
+* `-output-file`: (optional) filepath to use to save the output of the model, together with the generation latency
+
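+As a sketch of the prompt file format (assuming it is simply a JSON array of prompt strings), a minimal prompt file could look like:
+
+```json
+[
+  "Here are some travel tips for Tokyo:\n",
+  "Three tips for staying healthy are"
+]
+```
+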
+For example, you can use the following command line to serve a LLaMA-7B or LLaMA-13B model on 4 GPUs, using a collectively boost-tuned LLaMA-68M model for speculative inference.
+
+```bash
+./inference/spec_infer/spec_infer -ll:gpu 4 -ll:cpu 4 -ll:fsize 14000 -ll:zsize 30000 -llm-model meta-llama/Llama-2-7b-hf -ssm-model JackFram/llama-68m -prompt /path/to/prompt.json -tensor-parallelism-degree 4 --fusion
+```
+
+
+## Speculative Inference
+A key technique that enables FlexFlow Serve to accelerate LLM serving is speculative
+inference, which combines various collectively boost-tuned small speculative
+models (SSMs) to jointly predict the LLM’s outputs; the predictions are organized as a
+token tree, whose nodes each represent a candidate token sequence. The correctness
+of all candidate token sequences represented by a token tree is verified against the
+LLM’s output in parallel using a novel tree-based parallel decoding mechanism.
+FlexFlow Serve uses an LLM as a token tree verifier instead of an incremental decoder,
+which largely reduces the end-to-end inference latency and computational requirement
+for serving generative LLMs while provably preserving model quality.
+
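+The toy Python sketch below (not FlexFlow's implementation or API) illustrates the idea: a speculated token tree is checked against the token the LLM itself would produce at each position, and the longest verified branch is accepted in a single pass. Here `llm_next_token` is a stand-in for one decoding step of the LLM, and verification is written sequentially for clarity rather than in parallel on the GPU.
+
+```python
+from dataclasses import dataclass, field
+from typing import List
+
+@dataclass
+class TreeNode:
+    token: str
+    children: List["TreeNode"] = field(default_factory=list)
+
+def llm_next_token(prefix: List[str]) -> str:
+    # Stand-in for one incremental-decoding step of the large model.
+    vocab = ["tips", "for", "Tokyo", ":", "visit", "early", "."]
+    return vocab[len(prefix) % len(vocab)]
+
+def verify(prefix: List[str], node: TreeNode) -> List[str]:
+    # Depth-first check of every candidate branch against the LLM's own
+    # next-token choices; keeps the longest verified branch.
+    best: List[str] = []
+    for child in node.children:
+        if child.token == llm_next_token(prefix):
+            candidate = [child.token] + verify(prefix + [child.token], child)
+            if len(candidate) > len(best):
+                best = candidate
+    return best
+
+# A small speculation tree, as it might be proposed by the SSMs.
+root = TreeNode("<root>", [
+    TreeNode("tips", [TreeNode("for", [TreeNode("Tokyo")]), TreeNode("about")]),
+    TreeNode("ideas"),
+])
+print(verify([], root))  # ['tips', 'for', 'Tokyo'] accepted in one verification pass
+```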
+
+
+
+
+### Supported LLMs and SSMs
+
+FlexFlow Serve currently supports all HuggingFace models with the following architectures:
+* `LlamaForCausalLM` / `LLaMAForCausalLM` (e.g. LLaMA/LLaMA-2, Guanaco, Vicuna, Alpaca, ...)
+* `OPTForCausalLM` (models from the OPT family)
+* `RWForCausalLM` (models from the Falcon family)
+* `GPTBigCodeForCausalLM` (models from the Starcoder family)
+
+Below is a list of models that we have explicitly tested and for which an SSM may be available:
+
+| Model | Model id on HuggingFace | Boost-tuned SSMs |
+| :---- | :---- | :---- |
+| LLaMA-7B | meta-llama/Llama-2-7b-hf | [LLaMA-68M](https://huggingface.co/JackFram/llama-68m) , [LLaMA-160M](https://huggingface.co/JackFram/llama-160m) |
+| LLaMA-13B | decapoda-research/llama-13b-hf | [LLaMA-68M](https://huggingface.co/JackFram/llama-68m) , [LLaMA-160M](https://huggingface.co/JackFram/llama-160m) |
+| LLaMA-30B | decapoda-research/llama-30b-hf | [LLaMA-68M](https://huggingface.co/JackFram/llama-68m) , [LLaMA-160M](https://huggingface.co/JackFram/llama-160m) |
+| LLaMA-65B | decapoda-research/llama-65b-hf | [LLaMA-68M](https://huggingface.co/JackFram/llama-68m) , [LLaMA-160M](https://huggingface.co/JackFram/llama-160m) |
+| LLaMA-2-7B | meta-llama/Llama-2-7b-hf | [LLaMA-68M](https://huggingface.co/JackFram/llama-68m) , [LLaMA-160M](https://huggingface.co/JackFram/llama-160m) |
+| LLaMA-2-13B | meta-llama/Llama-2-13b-hf | [LLaMA-68M](https://huggingface.co/JackFram/llama-68m) , [LLaMA-160M](https://huggingface.co/JackFram/llama-160m) |
+| LLaMA-2-70B | meta-llama/Llama-2-70b-hf | [LLaMA-68M](https://huggingface.co/JackFram/llama-68m) , [LLaMA-160M](https://huggingface.co/JackFram/llama-160m) |
+| OPT-6.7B | facebook/opt-6.7b | [OPT-125M](https://huggingface.co/facebook/opt-125m) |
+| OPT-13B | facebook/opt-13b | [OPT-125M](https://huggingface.co/facebook/opt-125m) |
+| OPT-30B | facebook/opt-30b | [OPT-125M](https://huggingface.co/facebook/opt-125m) |
+| OPT-66B | facebook/opt-66b | [OPT-125M](https://huggingface.co/facebook/opt-125m) |
+| Falcon-7B | tiiuae/falcon-7b | |
+| Falcon-40B | tiiuae/falcon-40b | |
+| StarCoder-7B | bigcode/starcoderbase-7b | |
+| StarCoder-15.5B | bigcode/starcoder | |
+
+### CPU Offloading
+FlexFlow Serve also offers offloading-based inference for running large models (e.g., LLaMA-7B) on a single GPU. With CPU offloading, tensors are kept in CPU memory and copied to the GPU only when needed for computation. Note that we currently offload only the largest weight tensors (the weight tensors of the Linear and Attention layers). Moreover, since the small models occupy considerably less space and do not pose a bottleneck for GPU memory, while offloading adds extra runtime and computational cost, we only offload the weights of the large model. [TODO: update instructions] You can run the offloading example by enabling the `-offload` and `-offload-reserve-space-size` flags. A hypothetical command is sketched below.
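+
+As a rough sketch, an offloading run could look like the command below; the single-GPU settings and the reserve-space value are illustrative assumptions, while `-offload` and `-offload-reserve-space-size` are the flags mentioned above:
+
+```bash
+# Hypothetical example: serve LLaMA-2-7B on a single GPU with CPU offloading enabled
+# (the reserve-space value of 8000 MB is an assumption, not a verified setting)
+./inference/spec_infer/spec_infer -ll:gpu 1 -ll:cpu 4 -ll:fsize 14000 -ll:zsize 30000 \
+    -llm-model meta-llama/Llama-2-7b-hf -ssm-model JackFram/llama-68m \
+    -prompt /path/to/prompt.json -offload -offload-reserve-space-size 8000 --fusion
+```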
+
+### Quantization
+FlexFlow Serve supports int4 and int8 quantization. The compressed tensors are stored on the CPU side; once copied to the GPU, they are decompressed and converted back to their original precision. Please find the compressed weight files in our s3 bucket, or use [this script](../inference/utils/compress_llama_weights.py) from the [FlexGen](https://github.com/FMInference/FlexGen) project to do the compression manually.
+
+### Prompt Datasets
+We provide five prompt datasets for evaluating FlexFlow Serve: [Chatbot instruction prompts](https://specinfer.s3.us-east-2.amazonaws.com/prompts/chatbot.json), [ChatGPT Prompts](https://specinfer.s3.us-east-2.amazonaws.com/prompts/chatgpt.json), [WebQA](https://specinfer.s3.us-east-2.amazonaws.com/prompts/webqa.json), [Alpaca](https://specinfer.s3.us-east-2.amazonaws.com/prompts/alpaca.json), and [PIQA](https://specinfer.s3.us-east-2.amazonaws.com/prompts/piqa.json).
+
+## TODOs
+
+FlexFlow Serve is under active development. We currently focus on the following tasks and strongly welcome all contributions, from bug fixes to new features and extensions.
+
+* AMD benchmarking. We are actively working on benchmarking FlexFlow Serve on AMD GPUs and comparing it with the performance on NVIDIA GPUs.
+* Chatbot prompt templates and Multi-round conversations
+* Support for FastAPI server
+* Integration with LangChain for document question answering
+
+## Acknowledgements
+This project was initiated by members from CMU, Stanford, and UCSD. We will continue to develop and support FlexFlow Serve. Please cite FlexFlow Serve as:
+
+``` bibtex
+@misc{miao2023specinfer,
+ title={SpecInfer: Accelerating Generative Large Language Model Serving with Speculative Inference and Token Tree Verification},
+ author={Xupeng Miao and Gabriele Oliaro and Zhihao Zhang and Xinhao Cheng and Zeyu Wang and Rae Ying Yee Wong and Alan Zhu and Lijie Yang and Xiaoxiang Shi and Chunan Shi and Zhuoming Chen and Daiyaan Arfeen and Reyna Abhyankar and Zhihao Jia},
+ year={2023},
+ eprint={2305.09781},
+ archivePrefix={arXiv},
+ primaryClass={cs.CL}
+}
+```
+
+## License
+FlexFlow is licensed under the Apache License 2.0.
diff --git a/.github/workflows/build-skip.yml b/.github/workflows/build-skip.yml
index b3ab69e9c1..8635c0d137 100644
--- a/.github/workflows/build-skip.yml
+++ b/.github/workflows/build-skip.yml
@@ -3,6 +3,7 @@ on:
pull_request:
paths-ignore:
- "include/**"
+ - "inference/**"
- "cmake/**"
- "config/**"
- "deps/**"
diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml
index ada29c5798..ef5961bc87 100644
--- a/.github/workflows/build.yml
+++ b/.github/workflows/build.yml
@@ -3,6 +3,7 @@ on:
pull_request:
paths:
- "include/**"
+ - "inference/**"
- "cmake/**"
- "config/**"
- "deps/**"
@@ -15,6 +16,7 @@ on:
- "master"
paths:
- "include/**"
+ - "inference/**"
- "cmake/**"
- "config/**"
- "deps/**"
@@ -38,6 +40,8 @@ jobs:
matrix:
gpu_backend: ["cuda", "hip_rocm"]
fail-fast: false
+ env:
+ FF_GPU_BACKEND: ${{ matrix.gpu_backend }}
steps:
- name: Checkout Git Repository
uses: actions/checkout@v3
@@ -48,21 +52,23 @@ jobs:
run: .github/workflows/helpers/free_space_on_runner.sh
- name: Install CUDA
- uses: Jimver/cuda-toolkit@v0.2.11
+ uses: Jimver/cuda-toolkit@v0.2.16
+ if: ${{ matrix.gpu_backend == 'cuda' }}
id: cuda-toolkit
with:
- cuda: "11.8.0"
+ cuda: "12.1.1"
# Disable caching of the CUDA binaries, since it does not give us any significant performance improvement
use-github-cache: "false"
+ log-file-suffix: 'cmake_${{matrix.gpu_backend}}.txt'
- name: Install system dependencies
- run: FF_GPU_BACKEND=${{ matrix.gpu_backend }} .github/workflows/helpers/install_dependencies.sh
+ run: .github/workflows/helpers/install_dependencies.sh
- name: Install conda and FlexFlow dependencies
uses: conda-incubator/setup-miniconda@v2
with:
activate-environment: flexflow
- environment-file: conda/environment.yml
+ environment-file: conda/flexflow.yml
auto-activate-base: false
- name: Build FlexFlow
@@ -70,17 +76,25 @@ jobs:
export CUDNN_DIR="$CUDA_PATH"
export CUDA_DIR="$CUDA_PATH"
export FF_HOME=$(pwd)
- export FF_GPU_BACKEND=${{ matrix.gpu_backend }}
export FF_CUDA_ARCH=70
+ export FF_HIP_ARCH=gfx1100,gfx1036
+ export hip_version=5.6
+ export FF_BUILD_ALL_INFERENCE_EXAMPLES=ON
+
+ if [[ "${FF_GPU_BACKEND}" == "cuda" ]]; then
+ export FF_BUILD_ALL_EXAMPLES=ON
+ export FF_BUILD_UNIT_TESTS=ON
+ else
+ export FF_BUILD_ALL_EXAMPLES=OFF
+ export FF_BUILD_UNIT_TESTS=OFF
+ fi
+
cores_available=$(nproc --all)
n_build_cores=$(( cores_available -1 ))
if (( $n_build_cores < 1 )) ; then n_build_cores=1 ; fi
mkdir build
cd build
- if [[ "${FF_GPU_BACKEND}" == "cuda" ]]; then
- export FF_BUILD_ALL_EXAMPLES=ON
- export FF_BUILD_UNIT_TESTS=ON
- fi
+
../config/config.linux
make -j $n_build_cores
@@ -89,25 +103,24 @@ jobs:
export CUDNN_DIR="$CUDA_PATH"
export CUDA_DIR="$CUDA_PATH"
export FF_HOME=$(pwd)
- export FF_GPU_BACKEND=${{ matrix.gpu_backend }}
export FF_CUDA_ARCH=70
- cd build
+ export FF_HIP_ARCH=gfx1100,gfx1036
+ export hip_version=5.6
+ export FF_BUILD_ALL_INFERENCE_EXAMPLES=ON
+
if [[ "${FF_GPU_BACKEND}" == "cuda" ]]; then
- export FF_BUILD_ALL_EXAMPLES=ON
+ export FF_BUILD_ALL_EXAMPLES=ON
export FF_BUILD_UNIT_TESTS=ON
+ else
+ export FF_BUILD_ALL_EXAMPLES=OFF
+ export FF_BUILD_UNIT_TESTS=OFF
fi
+
+ cd build
../config/config.linux
sudo make install
sudo ldconfig
- - name: Check availability of Python flexflow.core module
- if: ${{ matrix.gpu_backend == 'cuda' }}
- run: |
- export LD_LIBRARY_PATH="$CUDA_PATH/lib64/stubs:$LD_LIBRARY_PATH"
- sudo ln -s "$CUDA_PATH/lib64/stubs/libcuda.so" "$CUDA_PATH/lib64/stubs/libcuda.so.1"
- export CPU_ONLY_TEST=1
- python -c "import flexflow.core; exit()"
-
- name: Run C++ unit tests
if: ${{ matrix.gpu_backend == 'cuda' }}
run: |
@@ -115,9 +128,19 @@ jobs:
export CUDA_DIR="$CUDA_PATH"
export LD_LIBRARY_PATH="$CUDA_PATH/lib64/stubs:$LD_LIBRARY_PATH"
export FF_HOME=$(pwd)
+ sudo ln -s "$CUDA_PATH/lib64/stubs/libcuda.so" "$CUDA_PATH/lib64/stubs/libcuda.so.1"
cd build
./tests/unit/unit-test
+ - name: Check availability of flexflow modules in Python
+ run: |
+ if [[ "${FF_GPU_BACKEND}" == "cuda" ]]; then
+ export LD_LIBRARY_PATH="$CUDA_PATH/lib64/stubs:$LD_LIBRARY_PATH"
+ fi
+ # Remove build folder to check that the installed version can run independently of the build files
+ rm -rf build
+ python -c "import flexflow.core; import flexflow.serve as ff; exit()"
+
makefile-build:
name: Build FlexFlow with the Makefile
runs-on: ubuntu-20.04
@@ -134,11 +157,12 @@ jobs:
run: .github/workflows/helpers/free_space_on_runner.sh
- name: Install CUDA
- uses: Jimver/cuda-toolkit@v0.2.11
+ uses: Jimver/cuda-toolkit@v0.2.16
id: cuda-toolkit
with:
- cuda: "11.8.0"
+ cuda: "12.1.1"
use-github-cache: "false"
+ log-file-suffix: 'makefile_${{matrix.gpu_backend}}.txt'
- name: Install system dependencies
run: .github/workflows/helpers/install_dependencies.sh
@@ -147,7 +171,7 @@ jobs:
uses: conda-incubator/setup-miniconda@v2
with:
activate-environment: flexflow
- environment-file: conda/environment.yml
+ environment-file: conda/flexflow.yml
auto-activate-base: false
- name: Build FlexFlow
@@ -163,5 +187,4 @@ jobs:
cd python
make -j $n_build_cores
- export CPU_ONLY_TEST=1
python -c 'import flexflow.core'
diff --git a/.github/workflows/clang-format-check.yml b/.github/workflows/clang-format-check.yml
index 46c9bf3be2..fdf53e8254 100644
--- a/.github/workflows/clang-format-check.yml
+++ b/.github/workflows/clang-format-check.yml
@@ -10,7 +10,7 @@ jobs:
- check: "src"
exclude: '\.proto$'
- check: "include"
- - check: "nmt"
+ - check: "inference"
- check: "python"
- check: "scripts"
- check: "tests"
diff --git a/.github/workflows/docker-build-skip.yml b/.github/workflows/docker-build-skip.yml
index 59b584c6c4..e5d7de858f 100644
--- a/.github/workflows/docker-build-skip.yml
+++ b/.github/workflows/docker-build-skip.yml
@@ -13,27 +13,22 @@ concurrency:
cancel-in-progress: true
jobs:
- docker-build:
- name: Build and Install FlexFlow in a Docker Container
- runs-on: ubuntu-20.04
+ docker-build-rocm:
+ name: Build and Install FlexFlow in a Docker Container (ROCm backend)
+ runs-on: ubuntu-latest
strategy:
matrix:
- gpu_backend: ["cuda", "hip_rocm"]
- cuda_version: ["11.1", "11.2", "11.3", "11.5", "11.6", "11.7", "11.8"]
- # The CUDA version doesn't matter when building for hip_rocm, so we just pick one arbitrarily (11.8) to avoid building for hip_rocm once per number of CUDA version supported
- exclude:
- - gpu_backend: "hip_rocm"
- cuda_version: "11.1"
- - gpu_backend: "hip_rocm"
- cuda_version: "11.2"
- - gpu_backend: "hip_rocm"
- cuda_version: "11.3"
- - gpu_backend: "hip_rocm"
- cuda_version: "11.5"
- - gpu_backend: "hip_rocm"
- cuda_version: "11.6"
- - gpu_backend: "hip_rocm"
- cuda_version: "11.7"
+ hip_version: ["5.3", "5.4", "5.5", "5.6"]
+ fail-fast: false
+ steps:
+ - run: 'echo "No docker-build required"'
+
+ docker-build-cuda:
+ name: Build and Install FlexFlow in a Docker Container (CUDA backend)
+ runs-on: ubuntu-latest
+ strategy:
+ matrix:
+ cuda_version: ["11.1", "11.6", "11.7", "11.8", "12.0", "12.1", "12.2"]
fail-fast: false
steps:
- run: 'echo "No docker-build required"'
diff --git a/.github/workflows/docker-build.yml b/.github/workflows/docker-build.yml
index d059a0605f..eeaab0e0af 100644
--- a/.github/workflows/docker-build.yml
+++ b/.github/workflows/docker-build.yml
@@ -7,10 +7,11 @@ on:
- ".github/workflows/docker-build.yml"
push:
branches:
+ - "inference"
- "master"
schedule:
- # Run every week on Sunday at midnight PT (3am ET / 8am UTC) to keep the docker images updated
- - cron: "0 8 * * 0"
+ # At 00:00 on day-of-month 1, 14, and 28.
+ - cron: "0 0 1,14,28 * *"
workflow_dispatch:
# Cancel outdated workflows if they are still running
@@ -19,53 +20,121 @@ concurrency:
cancel-in-progress: true
jobs:
- docker-build:
- name: Build and Install FlexFlow in a Docker Container
+ rocm-builder-start:
+ name: Start an AWS instance to build the ROCM Docker images
+ runs-on: ubuntu-latest
+ if: ${{ ( github.event_name == 'push' || github.event_name == 'schedule' || github.event_name == 'workflow_dispatch' ) && github.ref_name == 'inference' }}
+ env:
+ ROCM_BUILDER_INSTANCE_ID: ${{ secrets.ROCM_BUILDER_INSTANCE_ID }}
+ steps:
+ - name: Configure AWS credentials
+ uses: aws-actions/configure-aws-credentials@v1
+ with:
+ aws-access-key-id: ${{ secrets.AWS_ACCESS_KEY_ID }}
+ aws-secret-access-key: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
+ aws-region: us-east-2
+
+ - name: Start EC2 instance
+ run: aws ec2 start-instances --instance-ids $ROCM_BUILDER_INSTANCE_ID
+
+ docker-build-rocm:
+ name: Build and Install FlexFlow in a Docker Container (ROCm backend)
runs-on: ubuntu-20.04
+ if: ${{ ( github.event_name != 'push' && github.event_name != 'schedule' && github.event_name != 'workflow_dispatch' ) || github.ref_name != 'inference' }}
+ env:
+ FF_GPU_BACKEND: "hip_rocm"
+ hip_version: 5.6
+ steps:
+ - name: Checkout Git Repository
+ uses: actions/checkout@v3
+ with:
+ submodules: recursive
+
+ - name: Free additional space on runner
+ run: .github/workflows/helpers/free_space_on_runner.sh
+
+ - name: Build Docker container
+ run: FF_HIP_ARCH="gfx1100,gfx1036" ./docker/build.sh flexflow
+
+ - name: Check availability of flexflow modules in Python
+ run: docker run --entrypoint /bin/bash flexflow-${FF_GPU_BACKEND}-${hip_version}:latest -c "python -c 'import flexflow.core; import flexflow.serve as ff; exit()'"
+
+ keep-runner-registered:
+ name: Keep runner alive
+ if: ${{ github.event_name == 'schedule' }}
+ runs-on: [self-hosted, rocm_builder]
+ defaults:
+ run:
+ shell: bash -l {0} # required to use an activated conda environment
+ env:
+ CONDA: "3"
+ needs: rocm-builder-start
+ steps:
+ - name: Keep alive
+ run: |
+ echo "Keep self-hosted runner registered with Github"
+ sleep 10m
+
+ docker-build-and-publish-rocm:
+ name: Build and Deploy FlexFlow Docker Containers (ROCm backend)
+ needs: rocm-builder-start
+ runs-on: [self-hosted, rocm_builder]
+ if: ${{ ( github.event_name == 'push' || github.event_name == 'workflow_dispatch' ) && github.ref_name == 'inference' }}
strategy:
matrix:
- gpu_backend: ["cuda", "hip_rocm"]
- cuda_version: ["11.1", "11.2", "11.3", "11.5", "11.6", "11.7", "11.8"]
- # The CUDA version doesn't matter when building for hip_rocm, so we just pick one arbitrarily (11.8) to avoid building for hip_rocm once per number of CUDA version supported
- exclude:
- - gpu_backend: "hip_rocm"
- cuda_version: "11.1"
- - gpu_backend: "hip_rocm"
- cuda_version: "11.2"
- - gpu_backend: "hip_rocm"
- cuda_version: "11.3"
- - gpu_backend: "hip_rocm"
- cuda_version: "11.5"
- - gpu_backend: "hip_rocm"
- cuda_version: "11.6"
- - gpu_backend: "hip_rocm"
- cuda_version: "11.7"
+ hip_version: ["5.3", "5.4", "5.5", "5.6"]
fail-fast: false
env:
- FF_GPU_BACKEND: ${{ matrix.gpu_backend }}
- cuda_version: ${{ matrix.cuda_version }}
- branch_name: ${{ github.head_ref || github.ref_name }}
+ FF_GPU_BACKEND: "hip_rocm"
+ hip_version: ${{ matrix.hip_version }}
steps:
- name: Checkout Git Repository
uses: actions/checkout@v3
with:
submodules: recursive
- - name: Free additional space on runner
+ - name: Build Docker container
+ # On push to inference, build for all compatible architectures, so that we can publish
+      # a pre-built general-purpose image. In all other cases, only build for one architecture
+ # to save time.
+ run: FF_HIP_ARCH=all ./docker/build.sh flexflow
+
+ - name: Check availability of flexflow modules in Python
+ run: docker run --entrypoint /bin/bash flexflow-${FF_GPU_BACKEND}-${hip_version}:latest -c "python -c 'import flexflow.core; import flexflow.serve as ff; exit()'"
+
+ - name: Publish Docker environment image (on push to inference)
env:
- deploy_needed: ${{ ( github.event_name == 'push' || github.event_name == 'schedule' ) && env.branch_name == 'inference' }}
- build_needed: ${{ matrix.gpu_backend == 'hip_rocm' || ( matrix.gpu_backend == 'cuda' && matrix.cuda_version == '11.8' ) }}
+ FLEXFLOW_CONTAINER_TOKEN: ${{ secrets.FLEXFLOW_CONTAINER_TOKEN }}
run: |
- if [[ $deploy_needed == "true" || $build_needed == "true" ]]; then
- .github/workflows/helpers/free_space_on_runner.sh
- else
- echo "Skipping this step to save time"
- fi
+ ./docker/publish.sh flexflow-environment
+ ./docker/publish.sh flexflow
+
+ docker-build-cuda:
+ name: Build and Install FlexFlow in a Docker Container (CUDA backend)
+ runs-on: ubuntu-20.04
+ strategy:
+ matrix:
+ cuda_version: ["11.1", "11.6", "11.7", "11.8", "12.0", "12.1", "12.2"]
+ fail-fast: false
+ env:
+ FF_GPU_BACKEND: "cuda"
+ cuda_version: ${{ matrix.cuda_version }}
+ steps:
+ - name: Checkout Git Repository
+ if: ${{ ( ( github.event_name == 'push' || github.event_name == 'workflow_dispatch' ) && github.ref_name == 'inference' ) || matrix.cuda_version == '12.0' }}
+ uses: actions/checkout@v3
+ with:
+ submodules: recursive
+
+ - name: Free additional space on runner
+ if: ${{ ( ( github.event_name == 'push' || github.event_name == 'workflow_dispatch' ) && github.ref_name == 'inference' ) || matrix.cuda_version == '12.0' }}
+ run: .github/workflows/helpers/free_space_on_runner.sh
- name: Build Docker container
+ if: ${{ ( ( github.event_name == 'push' || github.event_name == 'workflow_dispatch' ) && github.ref_name == 'inference' ) || matrix.cuda_version == '12.0' }}
env:
- deploy_needed: ${{ ( github.event_name == 'push' || github.event_name == 'schedule' ) && env.branch_name == 'inference' }}
- build_needed: ${{ matrix.gpu_backend == 'hip_rocm' || ( matrix.gpu_backend == 'cuda' && matrix.cuda_version == '11.8' ) }}
+ deploy_needed: ${{ ( github.event_name == 'push' || github.event_name == 'workflow_dispatch' ) && github.ref_name == 'inference' }}
+ build_needed: ${{ matrix.cuda_version == '12.0' }}
run: |
# On push to inference, build for all compatible architectures, so that we can publish
# a pre-built general-purpose image. On all other cases, only build for one architecture
@@ -74,42 +143,45 @@ jobs:
export FF_CUDA_ARCH=all
./docker/build.sh flexflow
elif [[ $build_needed == "true" ]]; then
- export FF_CUDA_ARCH=70
+ export FF_CUDA_ARCH=86
./docker/build.sh flexflow
- else
- echo "Skipping build to save time"
fi
- - name: Check availability of Python flexflow.core module
- if: ${{ matrix.gpu_backend == 'cuda' }}
- env:
- deploy_needed: ${{ ( github.event_name == 'push' || github.event_name == 'schedule' ) && env.branch_name == 'inference' }}
- build_needed: ${{ matrix.gpu_backend == 'hip_rocm' || ( matrix.gpu_backend == 'cuda' && matrix.cuda_version == '11.8' ) }}
- run: |
- if [[ $deploy_needed == "true" || $build_needed == "true" ]]; then
- docker run --env CPU_ONLY_TEST=1 --entrypoint /bin/bash flexflow-cuda-${cuda_version}:latest -c "export LD_LIBRARY_PATH=/usr/local/cuda/lib64/stubs:$LD_LIBRARY_PATH; sudo ln -s /usr/local/cuda/lib64/stubs/libcuda.so /usr/local/cuda/lib64/stubs/libcuda.so.1; python -c 'import flexflow.core; exit()'"
- else
- echo "Skipping test to save time"
- fi
+ - name: Check availability of flexflow modules in Python
+ if: ${{ ( ( github.event_name == 'push' || github.event_name == 'workflow_dispatch' ) && github.ref_name == 'inference' ) || matrix.cuda_version == '12.0' }}
+ run: docker run --entrypoint /bin/bash flexflow-${FF_GPU_BACKEND}-${cuda_version}:latest -c "export LD_LIBRARY_PATH=/usr/local/cuda/lib64/stubs:$LD_LIBRARY_PATH; sudo ln -s /usr/local/cuda/lib64/stubs/libcuda.so /usr/local/cuda/lib64/stubs/libcuda.so.1; python -c 'import flexflow.core; import flexflow.serve as ff; exit()'"
- name: Publish Docker environment image (on push to inference)
- if: github.repository_owner == 'flexflow'
+ if: ${{ github.repository_owner == 'flexflow' && ( github.event_name == 'push' || github.event_name == 'workflow_dispatch' ) && github.ref_name == 'inference' }}
env:
FLEXFLOW_CONTAINER_TOKEN: ${{ secrets.FLEXFLOW_CONTAINER_TOKEN }}
- deploy_needed: ${{ ( github.event_name == 'push' || github.event_name == 'schedule' ) && env.branch_name == 'inference' }}
run: |
- if [[ $deploy_needed == "true" ]]; then
- ./docker/publish.sh flexflow-environment
- ./docker/publish.sh flexflow
- else
- echo "No need to update Docker containers in ghrc.io registry at this time."
- fi
+ ./docker/publish.sh flexflow-environment
+ ./docker/publish.sh flexflow
+
+ rocm-builder-stop:
+ needs: [docker-build-and-publish-rocm, keep-runner-registered]
+ if: ${{ always() && ( github.event_name == 'push' || github.event_name == 'schedule' || github.event_name == 'workflow_dispatch' ) && github.ref_name == 'inference' }}
+ runs-on: ubuntu-latest
+ name: Stop the AWS instance we used to build the ROCM Docker images
+ env:
+ ROCM_BUILDER_INSTANCE_ID: ${{ secrets.ROCM_BUILDER_INSTANCE_ID }}
+ steps:
+ - name: Configure AWS credentials
+ uses: aws-actions/configure-aws-credentials@v1
+ with:
+ aws-access-key-id: ${{ secrets.AWS_ACCESS_KEY_ID }}
+ aws-secret-access-key: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
+ aws-region: us-east-2
+
+      - name: Stop EC2 instance
+ run: aws ec2 stop-instances --instance-ids $ROCM_BUILDER_INSTANCE_ID
notify-slack:
name: Notify Slack in case of failure
runs-on: ubuntu-20.04
- needs: docker-build
- if: ${{ failure() && github.event_name == 'schedule' && github.repository_owner == 'flexflow' }}
+ needs: [docker-build-cuda, docker-build-and-publish-rocm]
+ if: ${{ failure() && github.event_name == 'workflow_dispatch' && github.repository_owner == 'flexflow' }}
steps:
- name: Send Slack message
env:
diff --git a/.github/workflows/gpu-ci-daemon.yml b/.github/workflows/gpu-ci-daemon.yml
index 603b44c34e..b36e7b49e1 100644
--- a/.github/workflows/gpu-ci-daemon.yml
+++ b/.github/workflows/gpu-ci-daemon.yml
@@ -34,5 +34,6 @@ jobs:
run: |
pip3 install pip --upgrade
pip3 install pyopenssl --upgrade
+ pip3 install urllib3 --upgrade
pip3 install pygithub
python3 .github/workflows/helpers/gpu_ci_helper.py --daemon
diff --git a/.github/workflows/gpu-ci-skip.yml b/.github/workflows/gpu-ci-skip.yml
index 157f3c271a..f4cb950931 100644
--- a/.github/workflows/gpu-ci-skip.yml
+++ b/.github/workflows/gpu-ci-skip.yml
@@ -8,9 +8,15 @@ on:
- "python/**"
- "setup.py"
- "include/**"
+ - "inference/**"
- "src/**"
+ - "tests/inference/**"
+ - "conda/flexflow.yml"
- ".github/workflows/gpu-ci.yml"
- - "tests/multi_gpu_tests.sh"
+ - "tests/cpp_gpu_tests.sh"
+ - "tests/inference_tests.sh"
+ - "tests/training_tests.sh"
+ - "tests/python_interface_test.sh"
workflow_dispatch:
concurrency:
@@ -30,10 +36,18 @@ jobs:
needs: gpu-ci-concierge
steps:
- run: 'echo "No gpu-ci required"'
-
- gpu-ci-flexflow:
- name: Single Machine, Multiple GPUs Tests
+
+ inference-tests:
+ name: Inference Tests
runs-on: ubuntu-20.04
needs: gpu-ci-concierge
steps:
- run: 'echo "No gpu-ci required"'
+
+ training-tests:
+ name: Training Tests
+ runs-on: ubuntu-20.04
+ # if: ${{ github.event_name != 'pull_request' || github.base_ref != 'inference' }}
+ needs: inference-tests
+ steps:
+ - run: 'echo "No gpu-ci required"'
diff --git a/.github/workflows/gpu-ci.yml b/.github/workflows/gpu-ci.yml
index 3b679e9f20..00ca2df603 100644
--- a/.github/workflows/gpu-ci.yml
+++ b/.github/workflows/gpu-ci.yml
@@ -1,21 +1,10 @@
name: "gpu-ci"
on:
- pull_request:
- paths:
- - "cmake/**"
- - "config/**"
- - "deps/**"
- - "python/**"
- - "setup.py"
- - "include/**"
- - "src/**"
- - ".github/workflows/gpu-ci.yml"
- - "tests/cpp_gpu_tests.sh"
- - "tests/multi_gpu_tests.sh"
- - "tests/python_interface_test.sh"
+ schedule:
+ - cron: "0 0 1,14,28 * *" # At 00:00 on day-of-month 1, 14, and 28.
push:
branches:
- - "master"
+ - "inference"
paths:
- "cmake/**"
- "config/**"
@@ -23,10 +12,14 @@ on:
- "python/**"
- "setup.py"
- "include/**"
+ - "inference/**"
- "src/**"
+ - "tests/inference/**"
+ - "conda/flexflow.yml"
- ".github/workflows/gpu-ci.yml"
- "tests/cpp_gpu_tests.sh"
- - "tests/multi_gpu_tests.sh"
+ - "tests/inference_tests.sh"
+ - "tests/training_tests.sh"
- "tests/python_interface_test.sh"
workflow_dispatch:
@@ -48,12 +41,33 @@ jobs:
run: |
pip3 install pip --upgrade
pip3 install pyopenssl --upgrade
+ pip3 install urllib3 --upgrade
pip3 install pygithub
python3 .github/workflows/helpers/gpu_ci_helper.py
+ keep-runner-registered:
+ name: Keep runner alive
+ if: ${{ github.event_name == 'schedule' }}
+ runs-on: [self-hosted, gpu]
+ defaults:
+ run:
+ shell: bash -l {0} # required to use an activated conda environment
+ env:
+ CONDA: "3"
+ needs: gpu-ci-concierge
+ container:
+ image: ghcr.io/flexflow/flexflow-environment-cuda-11.8:latest
+ options: --gpus all --shm-size=8192m
+ steps:
+ - name: Keep alive
+ run: |
+ echo "Keep self-hosted runner registered with Github"
+ sleep 10m
+
python-interface-check:
name: Check Python Interface
- runs-on: self-hosted
+ if: ${{ github.event_name != 'schedule' }}
+ runs-on: [self-hosted, gpu]
defaults:
run:
shell: bash -l {0} # required to use an activated conda environment
@@ -77,7 +91,7 @@ jobs:
with:
miniconda-version: "latest"
activate-environment: flexflow
- environment-file: conda/flexflow-cpu.yml
+ environment-file: conda/flexflow.yml
auto-activate-base: false
auto-update-conda: false
@@ -89,7 +103,7 @@ jobs:
run: |
export PATH=$CONDA_PREFIX/bin:$PATH
export FF_HOME=$(pwd)
- export FF_USE_PREBUILT_LEGION=OFF
+ export FF_USE_PREBUILT_LEGION=OFF #remove this after fixing python path issue in Legion
mkdir build
cd build
../config/config.linux
@@ -106,6 +120,7 @@ jobs:
run: |
export PATH=$CONDA_PREFIX/bin:$PATH
export FF_HOME=$(pwd)
+ export FF_USE_PREBUILT_LEGION=OFF #remove this after fixing python path issue in Legion
cd build
../config/config.linux
make install
@@ -124,45 +139,150 @@ jobs:
export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:$CONDA_PREFIX/lib
./tests/align/test_all_operators.sh
- gpu-ci-flexflow:
- name: Single Machine, Multiple GPUs Tests
- runs-on: self-hosted
- needs: python-interface-check
+ inference-tests:
+ name: Inference Tests
+ if: ${{ github.event_name != 'schedule' }}
+ runs-on: [self-hosted, gpu]
+ defaults:
+ run:
+ shell: bash -l {0} # required to use an activated conda environment
+ env:
+ CONDA: "3"
+ HUGGINGFACE_TOKEN: ${{ secrets.HUGGINGFACE_TOKEN }}
+ needs: gpu-ci-concierge
+ container:
+ image: ghcr.io/flexflow/flexflow-environment-cuda-11.8:latest
+ options: --gpus all --shm-size=8192m
+ steps:
+ - name: Install updated git version
+ run: sudo add-apt-repository ppa:git-core/ppa -y && sudo apt update -y && sudo apt install -y --no-install-recommends git
+
+ - name: Checkout Git Repository
+ uses: actions/checkout@v3
+ with:
+ submodules: recursive
+
+ - name: Install conda and FlexFlow dependencies
+ uses: conda-incubator/setup-miniconda@v2
+ with:
+ miniconda-version: "latest"
+ activate-environment: flexflow
+ environment-file: conda/flexflow.yml
+ auto-activate-base: false
+
+ - name: Build FlexFlow
+ run: |
+ export PATH=$CONDA_PREFIX/bin:$PATH
+ export FF_HOME=$(pwd)
+ export FF_USE_PREBUILT_LEGION=OFF #remove this after fixing python path issue in Legion
+ export FF_BUILD_ALL_INFERENCE_EXAMPLES=ON
+ mkdir build
+ cd build
+ ../config/config.linux
+ make -j
+
+ - name: Run PEFT tests
+ run: |
+ export PATH=$CONDA_PREFIX/bin:$PATH
+ export CUDNN_DIR=/usr/local/cuda
+ export CUDA_DIR=/usr/local/cuda
+ export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:$CONDA_PREFIX/lib
+
+ source ./build/set_python_envs.sh
+ ./tests/peft_test.sh
+
+ - name: Run inference tests
+ env:
+ CPP_INFERENCE_TESTS: ${{ vars.CPP_INFERENCE_TESTS }}
+ run: |
+ export PATH=$CONDA_PREFIX/bin:$PATH
+ export FF_HOME=$(pwd)
+ export CUDNN_DIR=/usr/local/cuda
+ export CUDA_DIR=/usr/local/cuda
+ export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:$CONDA_PREFIX/lib
+
+ # GPT tokenizer test
+ # ./tests/gpt_tokenizer_test.sh
+
+ # Inference tests
+ source ./build/set_python_envs.sh
+ ./tests/inference_tests.sh
+
+ - name: Save inference output as an artifact
+ if: always()
+ run: |
+ cd inference
+ tar -zcvf output.tar.gz ./output
+
+ - name: Upload artifact
+ uses: actions/upload-artifact@v3
+ if: always()
+ with:
+ name: output
+ path: inference/output.tar.gz
+
+ # Github persists the .cache folder across different runs/containers
+ - name: Clear cache
+ if: always()
+ run: sudo rm -rf ~/.cache
+
+ training-tests:
+ name: Training Tests
+ if: ${{ github.event_name != 'schedule' }}
+ runs-on: [self-hosted, gpu]
+ # skip this time-consuming test for PRs to the inference branch
+ # if: ${{ github.event_name != 'pull_request' || github.base_ref != 'inference' }}
+ defaults:
+ run:
+ shell: bash -l {0} # required to use an activated conda environment
+ env:
+ CONDA: "3"
+ needs: inference-tests
container:
image: ghcr.io/flexflow/flexflow-environment-cuda-11.8:latest
options: --gpus all --shm-size=8192m
steps:
- name: Install updated git version
run: sudo add-apt-repository ppa:git-core/ppa -y && sudo apt update -y && sudo apt install -y --no-install-recommends git
+
- name: Checkout Git Repository
uses: actions/checkout@v3
with:
submodules: recursive
+
+ - name: Install conda and FlexFlow dependencies
+ uses: conda-incubator/setup-miniconda@v2
+ with:
+ miniconda-version: "latest"
+ activate-environment: flexflow
+ environment-file: conda/flexflow.yml
+ auto-activate-base: false
- name: Build and Install FlexFlow
run: |
- export PATH=/opt/conda/bin:$PATH
+ export PATH=$CONDA_PREFIX/bin:$PATH
export FF_HOME=$(pwd)
export FF_BUILD_ALL_EXAMPLES=ON
- export FF_USE_PREBUILT_LEGION=OFF
+ export FF_BUILD_ALL_INFERENCE_EXAMPLES=ON
+ export FF_USE_PREBUILT_LEGION=OFF #remove this after fixing python path issue in Legion
pip install . --verbose
- name: Check FlexFlow Python interface (pip)
run: |
- export PATH=/opt/conda/bin:$PATH
+ export PATH=$CONDA_PREFIX/bin:$PATH
export FF_HOME=$(pwd)
- export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/opt/conda/lib
+ export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:$CONDA_PREFIX/lib
./tests/python_interface_test.sh after-installation
- name: Run multi-gpu tests
run: |
- export PATH=/opt/conda/bin:$PATH
+ export PATH=$CONDA_PREFIX/bin:$PATH
export CUDNN_DIR=/usr/local/cuda
export CUDA_DIR=/usr/local/cuda
export FF_HOME=$(pwd)
- export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/opt/conda/lib
+ export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:$CONDA_PREFIX/lib
# C++ tests
./tests/cpp_gpu_tests.sh 4
# Python tests
- ./tests/multi_gpu_tests.sh 4
+ ./tests/training_tests.sh 4
diff --git a/.github/workflows/helpers/install_cudnn.sh b/.github/workflows/helpers/install_cudnn.sh
index 318134e331..73b8e88418 100755
--- a/.github/workflows/helpers/install_cudnn.sh
+++ b/.github/workflows/helpers/install_cudnn.sh
@@ -5,8 +5,11 @@ set -x
# Cd into directory holding this script
cd "${BASH_SOURCE[0]%/*}"
+ubuntu_version=$(lsb_release -rs)
+ubuntu_version=${ubuntu_version//./}
+
# Install CUDNN
-cuda_version=${1:-11.8.0}
+cuda_version=${1:-12.1.1}
cuda_version=$(echo "${cuda_version}" | cut -f1,2 -d'.')
echo "Installing CUDNN for CUDA version: ${cuda_version} ..."
CUDNN_LINK=http://developer.download.nvidia.com/compute/redist/cudnn/v8.0.5/cudnn-11.1-linux-x64-v8.0.5.39.tgz
@@ -44,6 +47,12 @@ elif [[ "$cuda_version" == "11.7" ]]; then
elif [[ "$cuda_version" == "11.8" ]]; then
CUDNN_LINK=https://developer.download.nvidia.com/compute/redist/cudnn/v8.7.0/local_installers/11.8/cudnn-linux-x86_64-8.7.0.84_cuda11-archive.tar.xz
CUDNN_TARBALL_NAME=cudnn-linux-x86_64-8.7.0.84_cuda11-archive.tar.xz
+elif [[ "$cuda_version" == "12.0" || "$cuda_version" == "12.1" || "$cuda_version" == "12.2" || "$cuda_version" == "12.3" || "$cuda_version" == "12.4" || "$cuda_version" == "12.5" ]]; then
+ CUDNN_LINK=https://developer.download.nvidia.com/compute/redist/cudnn/v8.8.0/local_installers/12.0/cudnn-local-repo-ubuntu2004-8.8.0.121_1.0-1_amd64.deb
+ CUDNN_TARBALL_NAME=cudnn-local-repo-ubuntu2004-8.8.0.121_1.0-1_amd64.deb
+else
+ echo "CUDNN support for CUDA version above 12.5 not yet added"
+ exit 1
fi
wget -c -q $CUDNN_LINK
if [[ "$cuda_version" == "11.6" || "$cuda_version" == "11.7" || "$cuda_version" == "11.8" ]]; then
@@ -52,6 +61,17 @@ if [[ "$cuda_version" == "11.6" || "$cuda_version" == "11.7" || "$cuda_version"
sudo cp -r "$CUDNN_EXTRACTED_TARBALL_NAME"/include/* /usr/local/include
sudo cp -r "$CUDNN_EXTRACTED_TARBALL_NAME"/lib/* /usr/local/lib
rm -rf "$CUDNN_EXTRACTED_TARBALL_NAME"
+elif [[ "$CUDNN_TARBALL_NAME" == *.deb ]]; then
+ wget -c -q "https://developer.download.nvidia.com/compute/cuda/repos/ubuntu${ubuntu_version}/x86_64/cuda-keyring_1.1-1_all.deb"
+ sudo dpkg -i cuda-keyring_1.1-1_all.deb
+ sudo apt update -y
+ rm -f cuda-keyring_1.1-1_all.deb
+ sudo dpkg -i $CUDNN_TARBALL_NAME
+ sudo cp /var/cudnn-local-repo-ubuntu2004-8.8.0.121/cudnn-local-A9E17745-keyring.gpg /usr/share/keyrings/
+ sudo apt update -y
+ sudo apt install -y libcudnn8
+ sudo apt install -y libcudnn8-dev
+ sudo apt install -y libcudnn8-samples
else
sudo tar -xzf $CUDNN_TARBALL_NAME -C /usr/local
fi
diff --git a/.github/workflows/helpers/install_dependencies.sh b/.github/workflows/helpers/install_dependencies.sh
index 5ab211c962..6435a37eea 100755
--- a/.github/workflows/helpers/install_dependencies.sh
+++ b/.github/workflows/helpers/install_dependencies.sh
@@ -7,24 +7,61 @@ cd "${BASH_SOURCE[0]%/*}"
# General dependencies
echo "Installing apt dependencies..."
-sudo apt-get update && sudo apt-get install -y --no-install-recommends wget binutils git zlib1g-dev libhdf5-dev && \
+sudo apt-get update && sudo apt-get install -y --no-install-recommends wget binutils git zlib1g-dev libhdf5-dev jq && \
sudo rm -rf /var/lib/apt/lists/*
-# Install CUDNN
-./install_cudnn.sh
-
-# Install HIP dependencies if needed
FF_GPU_BACKEND=${FF_GPU_BACKEND:-"cuda"}
+hip_version=${hip_version:-"5.6"}
if [[ "${FF_GPU_BACKEND}" != @(cuda|hip_cuda|hip_rocm|intel) ]]; then
echo "Error, value of FF_GPU_BACKEND (${FF_GPU_BACKEND}) is invalid."
exit 1
-elif [[ "$FF_GPU_BACKEND" == "hip_cuda" || "$FF_GPU_BACKEND" = "hip_rocm" ]]; then
+fi
+# Install CUDNN if needed
+if [[ "$FF_GPU_BACKEND" == "cuda" || "$FF_GPU_BACKEND" = "hip_cuda" ]]; then
+ # Install CUDNN
+ ./install_cudnn.sh
+ # Install NCCL
+ ./install_nccl.sh
+fi
+# Install HIP dependencies if needed
+if [[ "$FF_GPU_BACKEND" == "hip_cuda" || "$FF_GPU_BACKEND" = "hip_rocm" ]]; then
echo "FF_GPU_BACKEND: ${FF_GPU_BACKEND}. Installing HIP dependencies"
- wget https://repo.radeon.com/amdgpu-install/22.20.5/ubuntu/focal/amdgpu-install_22.20.50205-1_all.deb
- sudo apt-get install -y ./amdgpu-install_22.20.50205-1_all.deb
- rm ./amdgpu-install_22.20.50205-1_all.deb
+ # Check that hip_version is one of 5.3,5.4,5.5,5.6
+ if [[ "$hip_version" != "5.3" && "$hip_version" != "5.4" && "$hip_version" != "5.5" && "$hip_version" != "5.6" ]]; then
+ echo "hip_version '${hip_version}' is not supported, please choose among {5.3, 5.4, 5.5, 5.6}"
+ exit 1
+ fi
+ # Compute script name and url given the version
+ AMD_GPU_SCRIPT_NAME=amdgpu-install_5.6.50600-1_all.deb
+ if [ "$hip_version" = "5.3" ]; then
+ AMD_GPU_SCRIPT_NAME=amdgpu-install_5.3.50300-1_all.deb
+ elif [ "$hip_version" = "5.4" ]; then
+ AMD_GPU_SCRIPT_NAME=amdgpu-install_5.4.50400-1_all.deb
+ elif [ "$hip_version" = "5.5" ]; then
+ AMD_GPU_SCRIPT_NAME=amdgpu-install_5.5.50500-1_all.deb
+ fi
+ AMD_GPU_SCRIPT_URL="https://repo.radeon.com/amdgpu-install/${hip_version}/ubuntu/focal/${AMD_GPU_SCRIPT_NAME}"
+ # Download and install AMD GPU software with ROCM and HIP support
+ wget "$AMD_GPU_SCRIPT_URL"
+ sudo apt-get install -y ./${AMD_GPU_SCRIPT_NAME}
+ sudo rm ./${AMD_GPU_SCRIPT_NAME}
sudo amdgpu-install -y --usecase=hip,rocm --no-dkms
- sudo apt-get install -y hip-dev hipblas miopen-hip rocm-hip-sdk
+ sudo apt-get install -y hip-dev hipblas miopen-hip rocm-hip-sdk rocm-device-libs
+
+ # Install protobuf v3.20.x manually
+ sudo apt-get update -y && sudo apt-get install -y pkg-config zip g++ zlib1g-dev unzip python autoconf automake libtool curl make
+ git clone -b 3.20.x https://github.com/protocolbuffers/protobuf.git
+ cd protobuf/
+ git submodule update --init --recursive
+ ./autogen.sh
+ ./configure
+ cores_available=$(nproc --all)
+ n_build_cores=$(( cores_available -1 ))
+ if (( n_build_cores < 1 )) ; then n_build_cores=1 ; fi
+ make -j $n_build_cores
+ sudo make install
+ sudo ldconfig
+ cd ..
else
echo "FF_GPU_BACKEND: ${FF_GPU_BACKEND}. Skipping installing HIP dependencies"
fi
diff --git a/.github/workflows/helpers/install_nccl.sh b/.github/workflows/helpers/install_nccl.sh
new file mode 100755
index 0000000000..ae6793ea2a
--- /dev/null
+++ b/.github/workflows/helpers/install_nccl.sh
@@ -0,0 +1,51 @@
+#!/bin/bash
+set -euo pipefail
+set -x
+
+# Cd into directory holding this script
+cd "${BASH_SOURCE[0]%/*}"
+
+# Add NCCL key ring
+ubuntu_version=$(lsb_release -rs)
+ubuntu_version=${ubuntu_version//./}
+wget "https://developer.download.nvidia.com/compute/cuda/repos/ubuntu${ubuntu_version}/x86_64/cuda-keyring_1.1-1_all.deb"
+sudo dpkg -i cuda-keyring_1.1-1_all.deb
+sudo apt update -y
+rm -f cuda-keyring_1.1-1_all.deb
+
+# Install NCCL
+cuda_version=${1:-12.1.1}
+cuda_version=$(echo "${cuda_version}" | cut -f1,2 -d'.')
+echo "Installing NCCL for CUDA version: ${cuda_version} ..."
+
+# We need to run a different install command based on the CUDA version, otherwise running `sudo apt install libnccl2 libnccl-dev`
+# will automatically upgrade CUDA to the latest version.
+
+if [[ "$cuda_version" == "11.0" ]]; then
+ sudo apt install libnccl2=2.15.5-1+cuda11.0 libnccl-dev=2.15.5-1+cuda11.0
+elif [[ "$cuda_version" == "11.1" ]]; then
+ sudo apt install libnccl2=2.8.4-1+cuda11.1 libnccl-dev=2.8.4-1+cuda11.1
+elif [[ "$cuda_version" == "11.2" ]]; then
+ sudo apt install libnccl2=2.8.4-1+cuda11.2 libnccl-dev=2.8.4-1+cuda11.2
+elif [[ "$cuda_version" == "11.3" ]]; then
+ sudo apt install libnccl2=2.9.9-1+cuda11.3 libnccl-dev=2.9.9-1+cuda11.3
+elif [[ "$cuda_version" == "11.4" ]]; then
+ sudo apt install libnccl2=2.11.4-1+cuda11.4 libnccl-dev=2.11.4-1+cuda11.4
+elif [[ "$cuda_version" == "11.5" ]]; then
+ sudo apt install libnccl2=2.11.4-1+cuda11.5 libnccl-dev=2.11.4-1+cuda11.5
+elif [[ "$cuda_version" == "11.6" ]]; then
+ sudo apt install libnccl2=2.12.12-1+cuda11.6 libnccl-dev=2.12.12-1+cuda11.6
+elif [[ "$cuda_version" == "11.7" ]]; then
+ sudo apt install libnccl2=2.14.3-1+cuda11.7 libnccl-dev=2.14.3-1+cuda11.7
+elif [[ "$cuda_version" == "11.8" ]]; then
+ sudo apt install libnccl2=2.16.5-1+cuda11.8 libnccl-dev=2.16.5-1+cuda11.8
+elif [[ "$cuda_version" == "12.0" ]]; then
+ sudo apt install libnccl2=2.18.3-1+cuda12.0 libnccl-dev=2.18.3-1+cuda12.0
+elif [[ "$cuda_version" == "12.1" ]]; then
+ sudo apt install libnccl2=2.18.3-1+cuda12.1 libnccl-dev=2.18.3-1+cuda12.1
+elif [[ "$cuda_version" == "12.2" ]]; then
+ sudo apt install libnccl2=2.18.3-1+cuda12.2 libnccl-dev=2.18.3-1+cuda12.2
+else
+ echo "Installing NCCL for CUDA version ${cuda_version} is not supported"
+ exit 1
+fi
diff --git a/.github/workflows/helpers/oracle_con.py b/.github/workflows/helpers/oracle_con.py
new file mode 100644
index 0000000000..0891d66e99
--- /dev/null
+++ b/.github/workflows/helpers/oracle_con.py
@@ -0,0 +1,37 @@
+import oci
+import argparse
+import os
+
+parser = argparse.ArgumentParser(description="Program with optional flags")
+group = parser.add_mutually_exclusive_group()
+group.add_argument("--start", action="store_true", help="Start action")
+group.add_argument("--stop", action="store_true", help="Stop action")
+parser.add_argument("--instance_id", type=str, required=True, help="instance id required")
+args = parser.parse_args()
+
+oci_key_content = os.getenv("OCI_CLI_KEY_CONTENT")
+
+config = {
+ "user": os.getenv("OCI_CLI_USER"),
+ "key_content": os.getenv("OCI_CLI_KEY_CONTENT"),
+ "fingerprint": os.getenv("OCI_CLI_FINGERPRINT"),
+ "tenancy": os.getenv("OCI_CLI_TENANCY"),
+ "region": os.getenv("OCI_CLI_REGION")
+}
+
+# Initialize the OCI configuration
+oci.config.validate_config(config)
+
+# Initialize the ComputeClient to interact with VM instances
+compute = oci.core.ComputeClient(config)
+
+# The instance ID of the VM to control, passed via the --instance_id argument
+instance_id = args.instance_id
+
+# Perform the action
+if args.start:
+ # Start the VM
+ compute.instance_action(instance_id, "START")
+else:
+ # Stop the VM
+ compute.instance_action(instance_id, "STOP")
diff --git a/.github/workflows/helpers/prebuild_legion.sh b/.github/workflows/helpers/prebuild_legion.sh
new file mode 100755
index 0000000000..9f5cbe147a
--- /dev/null
+++ b/.github/workflows/helpers/prebuild_legion.sh
@@ -0,0 +1,75 @@
+#! /usr/bin/env bash
+set -euo pipefail
+
+# Parse input params
+python_version=${python_version:-"empty"}
+gpu_backend=${gpu_backend:-"empty"}
+gpu_backend_version=${gpu_backend_version:-"empty"}
+
+if [[ "${gpu_backend}" != @(cuda|hip_cuda|hip_rocm|intel) ]]; then
+ echo "Error, value of gpu_backend (${gpu_backend}) is invalid. Pick between 'cuda', 'hip_cuda', 'hip_rocm' or 'intel'."
+ exit 1
+else
+ echo "Pre-building Legion with GPU backend: ${gpu_backend}"
+fi
+
+if [[ "${gpu_backend}" == "cuda" || "${gpu_backend}" == "hip_cuda" ]]; then
+ # Check that CUDA version is supported. Versions above 12.0 not supported because we don't publish docker images for it yet.
+ if [[ "$gpu_backend_version" != @(11.1|11.2|11.3|11.4|11.5|11.6|11.7|11.8|12.0) ]]; then
+ echo "cuda_version is not supported, please choose among {11.1|11.2|11.3|11.4|11.5|11.6|11.7|11.8|12.0}"
+ exit 1
+ fi
+ export cuda_version="$gpu_backend_version"
+elif [[ "${gpu_backend}" == "hip_rocm" ]]; then
+ # Check that HIP version is supported
+ if [[ "$gpu_backend_version" != @(5.3|5.4|5.5|5.6) ]]; then
+ echo "hip_version is not supported, please choose among {5.3, 5.4, 5.5, 5.6}"
+ exit 1
+ fi
+ export hip_version="$gpu_backend_version"
+else
+ echo "gpu backend: ${gpu_backend} and gpu_backend_version: ${gpu_backend_version} not yet supported."
+ exit 1
+fi
+
+# Cd into directory holding this script
+cd "${BASH_SOURCE[0]%/*}"
+
+export FF_GPU_BACKEND="${gpu_backend}"
+export FF_CUDA_ARCH=all
+export FF_HIP_ARCH=all
+export BUILD_LEGION_ONLY=ON
+export INSTALL_DIR="/usr/legion"
+export python_version="${python_version}"
+
+# Build Docker Flexflow Container
+echo "building docker"
+../../../docker/build.sh flexflow
+
+# Cleanup any existing container with the same name
+docker rm prelegion || true
+
+# Create container to be able to copy data from the image
+docker create --name prelegion flexflow-"${gpu_backend}"-"${gpu_backend_version}":latest
+
+# Copy legion libraries to host
+echo "extract legion library assets"
+mkdir -p ../../../prebuilt_legion_assets
+rm -rf ../../../prebuilt_legion_assets/tmp || true
+docker cp prelegion:$INSTALL_DIR ../../../prebuilt_legion_assets/tmp
+
+
+# Create the tarball file
+cd ../../../prebuilt_legion_assets/tmp
+export LEGION_TARBALL="legion_ubuntu-20.04_${gpu_backend}-${gpu_backend_version}_py${python_version}.tar.gz"
+
+echo "Creating archive $LEGION_TARBALL"
+tar -zcvf "../$LEGION_TARBALL" ./
+cd ..
+echo "Checking the size of the Legion tarball..."
+du -h "$LEGION_TARBALL"
+
+
+# Cleanup
+rm -rf tmp/*
+docker rm prelegion
diff --git a/.github/workflows/multinode-test.yml b/.github/workflows/multinode-test.yml
index 37f81b615f..2fc527bf08 100644
--- a/.github/workflows/multinode-test.yml
+++ b/.github/workflows/multinode-test.yml
@@ -25,6 +25,7 @@ jobs:
run: |
pip3 install pip --upgrade
pip3 install pyopenssl --upgrade
+ pip3 install urllib3 --upgrade
pip3 install pygithub
python3 .github/workflows/helpers/gpu_ci_helper.py
@@ -37,7 +38,7 @@ jobs:
# 10h timeout, instead of default of 360min (6h)
timeout-minutes: 600
container:
- image: ghcr.io/flexflow/flexflow-environment-cuda-11.8:latest
+ image: ghcr.io/flexflow/flexflow-environment-cuda-12.0:latest
options: --gpus all --shm-size=8192m
steps:
- name: Install updated git version
@@ -77,7 +78,7 @@ jobs:
export OMPI_ALLOW_RUN_AS_ROOT=1
export OMPI_ALLOW_RUN_AS_ROOT_CONFIRM=1
export OMPI_MCA_btl_vader_single_copy_mechanism=none
- ./tests/multi_gpu_tests.sh 2 2
+ ./tests/training_tests.sh 2 2
multinode-gpu-test-ucx:
name: Multinode GPU Test with UCX
@@ -86,7 +87,7 @@ jobs:
runs-on: self-hosted
needs: gpu-ci-concierge
container:
- image: ghcr.io/flexflow/flexflow-environment-cuda-11.8:latest
+ image: ghcr.io/flexflow/flexflow-environment-cuda-12.0:latest
options: --gpus all --shm-size=8192m
# 10h timeout, instead of default of 360min (6h)
timeout-minutes: 600
@@ -128,7 +129,7 @@ jobs:
export OMPI_ALLOW_RUN_AS_ROOT=1
export OMPI_ALLOW_RUN_AS_ROOT_CONFIRM=1
export OMPI_MCA_btl_vader_single_copy_mechanism=none
- ./tests/multi_gpu_tests.sh 2 2
+ ./tests/training_tests.sh 2 2
multinode-gpu-test-native-ucx:
name: Multinode GPU Test with native UCX
@@ -137,7 +138,7 @@ jobs:
runs-on: self-hosted
needs: gpu-ci-concierge
container:
- image: ghcr.io/flexflow/flexflow-environment-cuda-11.8:latest
+ image: ghcr.io/flexflow/flexflow-environment-cuda-12.0:latest
options: --gpus all --shm-size=8192m
steps:
- name: Install updated git version
@@ -176,7 +177,7 @@ jobs:
export OMPI_ALLOW_RUN_AS_ROOT=1
export OMPI_ALLOW_RUN_AS_ROOT_CONFIRM=1
export OMPI_MCA_btl_vader_single_copy_mechanism=none
- ./tests/multi_gpu_tests.sh 2 2
+ ./tests/training_tests.sh 2 2
notify-slack:
name: Notify Slack in case of failure
diff --git a/.github/workflows/pip-deploy.yml b/.github/workflows/pip-deploy.yml
new file mode 100644
index 0000000000..66fdf00c9a
--- /dev/null
+++ b/.github/workflows/pip-deploy.yml
@@ -0,0 +1,72 @@
+name: "pip-deploy"
+on:
+ workflow_dispatch:
+
+concurrency:
+ group: pip-deploy-${{ github.head_ref || github.run_id }}
+ cancel-in-progress: true
+
+jobs:
+ build-n-publish:
+ name: Build and publish Python 🐍 distributions 📦 to PyPI and TestPyPI
+ runs-on: ubuntu-20.04
+ permissions:
+ # IMPORTANT: this permission is mandatory for trusted publishing
+ id-token: write
+
+ steps:
+ - name: Checkout Git Repository
+ uses: actions/checkout@v3
+ with:
+ submodules: recursive
+
+ - name: Free additional space on runner
+ run: .github/workflows/helpers/free_space_on_runner.sh
+
+ - name: Set up Python
+ uses: actions/setup-python@v4
+ with:
+ python-version: "3.x"
+
+ - name: Install pypa/build
+ run: >-
+ python3 -m
+ pip install
+ build
+ --user
+
+ - name: Build a source tarball
+ env:
+ DEPLOY_TO_TEST_PYPI: ${{ vars.DEPLOY_TO_TEST_PYPI }}
+ run: >-
+ python3 -m
+ build
+ --sdist
+ --outdir dist/
+ .
+
+ - name: Publish distribution 📦 to Test PyPI
+ if: ${{ vars.DEPLOY_TO_TEST_PYPI == 'true' }}
+ uses: pypa/gh-action-pypi-publish@release/v1
+ with:
+ repository-url: https://test.pypi.org/legacy/
+
+ - name: Publish distribution 📦 to PyPI
+ if: ${{ vars.DEPLOY_TO_TEST_PYPI == 'false' }}
+ uses: pypa/gh-action-pypi-publish@release/v1
+
+ - name: Get package version
+ if: ${{ vars.DEPLOY_TO_TEST_PYPI == 'false' }}
+ run: |
+ # when running setup.py outside of pip install, we need to manually install the modules that are imported in the script
+ pip install setuptools requests cmake-build-extension
+ version=$(python setup.py --version)
+ echo "PY_VERSION=${version}" >> $GITHUB_ENV
+
+ - name: Create Git tag
+ if: ${{ vars.DEPLOY_TO_TEST_PYPI == 'false' }}
+ uses: mathieudutour/github-tag-action@v6.1
+ with:
+ github_token: ${{ secrets.FLEXFLOW_TOKEN }}
+ custom_tag: ${{ env.PY_VERSION }}
+
diff --git a/.github/workflows/pip-install-skip.yml b/.github/workflows/pip-install-skip.yml
index f2606b94d8..92c3223e32 100644
--- a/.github/workflows/pip-install-skip.yml
+++ b/.github/workflows/pip-install-skip.yml
@@ -7,6 +7,7 @@ on:
- "deps/**"
- "python/**"
- "setup.py"
+ - "requirements.txt"
- ".github/workflows/helpers/install_dependencies.sh"
- ".github/workflows/pip-install.yml"
workflow_dispatch:
diff --git a/.github/workflows/pip-install.yml b/.github/workflows/pip-install.yml
index 7d60d3bf52..d5acbfc2e1 100644
--- a/.github/workflows/pip-install.yml
+++ b/.github/workflows/pip-install.yml
@@ -7,6 +7,7 @@ on:
- "deps/**"
- "python/**"
- "setup.py"
+ - "requirements.txt"
- ".github/workflows/helpers/install_dependencies.sh"
- ".github/workflows/pip-install.yml"
push:
@@ -18,6 +19,7 @@ on:
- "deps/**"
- "python/**"
- "setup.py"
+ - "requirements.txt"
- ".github/workflows/helpers/install_dependencies.sh"
- ".github/workflows/pip-install.yml"
workflow_dispatch:
@@ -42,10 +44,10 @@ jobs:
run: .github/workflows/helpers/free_space_on_runner.sh
- name: Install CUDA
- uses: Jimver/cuda-toolkit@v0.2.11
+ uses: Jimver/cuda-toolkit@v0.2.16
id: cuda-toolkit
with:
- cuda: "11.8.0"
+ cuda: "12.1.1"
# Disable caching of the CUDA binaries, since it does not give us any significant performance improvement
use-github-cache: "false"
@@ -64,10 +66,11 @@ jobs:
export FF_HOME=$(pwd)
export FF_CUDA_ARCH=70
pip install . --verbose
+ # Remove build folder to check that the installed version can run independently of the build files
+ rm -rf build
- - name: Check availability of Python flexflow.core module
+ - name: Check availability of flexflow modules in Python
run: |
export LD_LIBRARY_PATH="$CUDA_PATH/lib64/stubs:$LD_LIBRARY_PATH"
sudo ln -s "$CUDA_PATH/lib64/stubs/libcuda.so" "$CUDA_PATH/lib64/stubs/libcuda.so.1"
- export CPU_ONLY_TEST=1
- python -c "import flexflow.core; exit()"
+ python -c 'import flexflow.core; import flexflow.serve as ff; exit()'
diff --git a/.github/workflows/prebuild-legion.yml b/.github/workflows/prebuild-legion.yml
new file mode 100644
index 0000000000..633fb00eb8
--- /dev/null
+++ b/.github/workflows/prebuild-legion.yml
@@ -0,0 +1,84 @@
+name: "prebuild-legion"
+on:
+ push:
+ branches:
+ - "inference"
+ paths:
+ - "cmake/**"
+ - "config/**"
+ - "deps/legion/**"
+ - ".github/workflows/helpers/install_dependencies.sh"
+ workflow_dispatch:
+concurrency:
+ group: prebuild-legion-${{ github.head_ref || github.run_id }}
+ cancel-in-progress: true
+
+jobs:
+ prebuild-legion:
+ name: Prebuild Legion with CMake
+ runs-on: ubuntu-20.04
+ defaults:
+ run:
+ shell: bash -l {0} # required to use an activated conda environment
+ strategy:
+ matrix:
+ gpu_backend: ["cuda", "hip_rocm"]
+ gpu_backend_version: ["12.0", "5.6"]
+ python_version: ["3.11"]
+ exclude:
+ - gpu_backend: "cuda"
+ gpu_backend_version: "5.6"
+ - gpu_backend: "hip_rocm"
+ gpu_backend_version: "12.0"
+ fail-fast: false
+ steps:
+ - name: Checkout Git Repository
+ uses: actions/checkout@v3
+ with:
+ submodules: recursive
+
+ - name: Free additional space on runner
+ run: .github/workflows/helpers/free_space_on_runner.sh
+
+ - name: Build Legion
+ env:
+ gpu_backend: ${{ matrix.gpu_backend }}
+ gpu_backend_version: ${{ matrix.gpu_backend_version }}
+ python_version: ${{ matrix.python_version }}
+ run: .github/workflows/helpers/prebuild_legion.sh
+
+ - name: Archive compiled Legion library (CUDA)
+ uses: actions/upload-artifact@v3
+ with:
+ name: legion_ubuntu-20.04_${{ matrix.gpu_backend }}-${{ matrix.gpu_backend_version }}_py${{ matrix.python_version }}
+ path: prebuilt_legion_assets/legion_ubuntu-20.04_${{ matrix.gpu_backend }}-${{ matrix.gpu_backend_version }}_py${{ matrix.python_version }}.tar.gz
+
+ create-release:
+ name: Create new release
+ runs-on: ubuntu-20.04
+ needs: prebuild-legion
+ steps:
+ - name: Checkout Git Repository
+ uses: actions/checkout@v3
+ - name: Free additional space on runner
+ run: .github/workflows/helpers/free_space_on_runner.sh
+ - name: Create folder for artifacts
+ run: mkdir artifacts unwrapped_artifacts
+ - name: Download artifacts
+ uses: actions/download-artifact@v3
+ with:
+ path: ./artifacts
+ - name: Display structure of downloaded files
+ working-directory: ./artifacts
+ run: ls -R
+ - name: Unwrap all artifacts
+ working-directory: ./artifacts
+ run: find . -maxdepth 2 -mindepth 2 -type f -name "*.tar.gz" -exec mv {} ../unwrapped_artifacts/ \;
+ - name: Get datetime
+ run: echo "RELEASE_DATETIME=$(date '+%Y-%m-%dT%H-%M-%S')" >> $GITHUB_ENV
+ - name: Release
+ env:
+ NAME: ${{ env.RELEASE_DATETIME }}
+ TAG_NAME: ${{ env.RELEASE_DATETIME }}
+ GITHUB_TOKEN: ${{ secrets.FLEXFLOW_TOKEN }}
+ run: gh release create $TAG_NAME ./unwrapped_artifacts/*.tar.gz --repo flexflow/flexflow-third-party
diff --git a/.gitignore b/.gitignore
index b2e3c59ced..cc34c1a7b6 100644
--- a/.gitignore
+++ b/.gitignore
@@ -15,6 +15,11 @@ __pycache__/
# C extensions
*.so
+/inference/weights/*
+/inference/tokenizer/*
+/inference/prompt/*
+/inference/output/*
+
# Distribution / packaging
.Python
build/
@@ -83,10 +88,7 @@ docs/build/
# Doxygen documentation
docs/doxygen/output/
-
-# Exhale documentation
-docs/source/_doxygen/
-docs/source/c++_api/
+docs/doxygen/cpp_api/
# PyBuilder
.pybuilder/
@@ -179,3 +181,15 @@ train-labels-idx1-ubyte
# Logs
logs/
+gpt_tokenizer
+
+# pip version
+python/flexflow/version.txt
+
+inference_tensors
+hf_peft_tensors
+lora_training_logs
+
+Untitled-1.ipynb
+Untitled-2.ipynb
+tests/inference/python_test_configs/*.json
diff --git a/.gitmodules b/.gitmodules
index b8419fda94..c68582d4ac 100644
--- a/.gitmodules
+++ b/.gitmodules
@@ -19,3 +19,7 @@
[submodule "deps/json"]
path = deps/json
url = https://github.com/nlohmann/json.git
+[submodule "deps/tokenizers-cpp"]
+ path = deps/tokenizers-cpp
+ url = https://github.com/mlc-ai/tokenizers-cpp.git
+ fetchRecurseSubmodules = true
\ No newline at end of file
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 81845dd7b3..f06969ae04 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -1,6 +1,7 @@
cmake_minimum_required(VERSION 3.10)
project(FlexFlow)
+
include(ExternalProject)
# Set policy CMP0074 to eliminate cmake warnings
@@ -12,7 +13,21 @@ if (CMAKE_VERSION VERSION_GREATER_EQUAL "3.24.0")
endif()
set(CMAKE_MODULE_PATH ${CMAKE_MODULE_PATH} ${CMAKE_CURRENT_LIST_DIR}/cmake)
set(FLEXFLOW_ROOT ${CMAKE_CURRENT_LIST_DIR})
-set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -UNDEBUG")
+set(CMAKE_CXX_FLAGS "-std=c++17 ${CMAKE_CXX_FLAGS} -fPIC -UNDEBUG")
+set(CMAKE_HIP_FLAGS "-std=c++17 ${CMAKE_HIP_FLAGS} -fPIC -UNDEBUG")
+
+# set std 17
+#set(CMAKE_CXX_STANDARD 17)
+#set(CMAKE_CUDA_STANDARD 17)
+
+option(INFERENCE_TESTS "Run inference tests" OFF)
+set(LIBTORCH_PATH "${CMAKE_CURRENT_SOURCE_DIR}/../libtorch" CACHE STRING "LibTorch Path")
+if (INFERENCE_TESTS)
+ find_package(Torch REQUIRED PATHS ${LIBTORCH_PATH} NO_DEFAULT_PATH)
+ set(CMAKE_CXX_FLAGS "-std=c++17 ${CMAKE_CXX_FLAGS} -fPIC ${TORCH_CXX_FLAGS}")
+ message(STATUS "LIBTORCH_PATH: ${LIBTORCH_PATH}")
+ message(STATUS "TORCH_LIBRARIES: ${TORCH_LIBRARIES}")
+endif()
# Set a default build type if none was specified
set(default_build_type "Debug")
@@ -22,8 +37,33 @@ if(NOT CMAKE_BUILD_TYPE AND NOT CMAKE_CONFIGURATION_TYPES)
STRING "Choose the type of build." FORCE)
endif()
+# option for using Python
+option(FF_USE_PYTHON "Enable Python" ON)
+if (FF_USE_PYTHON)
+ find_package(Python3 COMPONENTS Interpreter Development)
+endif()
+
+if(INSTALL_DIR)
+ message(STATUS "INSTALL_DIR: ${INSTALL_DIR}")
+ set(CMAKE_INSTALL_PREFIX ${INSTALL_DIR} CACHE PATH "Installation directory" FORCE)
+else()
+ # Install DIR not set. Use default, unless a conda environment is in use
+ if ((DEFINED ENV{CONDA_PREFIX} OR (Python3_EXECUTABLE AND Python3_EXECUTABLE MATCHES "conda")) AND NOT FF_BUILD_FROM_PYPI)
+ if (DEFINED ENV{CONDA_PREFIX})
+ set(CONDA_PREFIX $ENV{CONDA_PREFIX})
+ else()
+ get_filename_component(CONDA_PREFIX "${Python3_EXECUTABLE}" DIRECTORY)
+ get_filename_component(CONDA_PREFIX "${CONDA_PREFIX}" DIRECTORY)
+ endif()
+ # Set CMAKE_INSTALL_PREFIX to the Conda environment's installation path
+ set(CMAKE_INSTALL_PREFIX ${CONDA_PREFIX} CACHE PATH "Installation directory" FORCE)
+ message(STATUS "Active conda environment detected. Setting CMAKE_INSTALL_PREFIX: ${CMAKE_INSTALL_PREFIX}")
+ endif()
+endif()
+
# do not disable assertions even if in release mode
set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} -UNDEBUG")
+set(CMAKE_HIP_FLAGS_RELEASE "${CMAKE_HIP_FLAGS_RELEASE} -UNDEBUG")
if(${CMAKE_SYSTEM_NAME} MATCHES "Linux")
set(LIBEXT ".so")
@@ -35,114 +75,23 @@ option(FF_BUILD_FROM_PYPI "Build from pypi" OFF)
# build shared or static flexflow lib
option(BUILD_SHARED_LIBS "Build shared libraries instead of static ones" ON)
-# option for using Python
-option(FF_USE_PYTHON "Enable Python" ON)
+# option for building legion only
+option(BUILD_LEGION_ONLY "Build Legion only" OFF)
# option to download pre-compiled NCCL/Legion libraries
option(FF_USE_PREBUILT_NCCL "Enable use of NCCL pre-compiled library, if available" ON)
option(FF_USE_PREBUILT_LEGION "Enable use of Legion pre-compiled library, if available" ON)
option(FF_USE_ALL_PREBUILT_LIBRARIES "Enable use of all pre-compiled libraries, if available" OFF)
-# option for using Python
-set(FF_GASNET_CONDUITS aries udp mpi ibv ucx)
+# option for using network
+set(FF_GASNET_CONDUITS aries udp mpi ibv)
set(FF_GASNET_CONDUIT "mpi" CACHE STRING "Select GASNet conduit ${FF_GASNET_CONDUITS}")
set_property(CACHE FF_GASNET_CONDUIT PROPERTY STRINGS ${FF_GASNET_CONDUITS})
set(FF_LEGION_NETWORKS "" CACHE STRING "Network backend(s) to use")
-if ((FF_LEGION_NETWORKS STREQUAL "gasnet" AND FF_GASNET_CONDUIT STREQUAL "ucx") OR FF_LEGION_NETWORKS STREQUAL "ucx")
- if("${FF_UCX_URL}" STREQUAL "")
- set(UCX_URL "https://github.com/openucx/ucx/releases/download/v1.14.0-rc1/ucx-1.14.0.tar.gz")
- else()
- set(UCX_URL "${FF_UCX_URL}")
- endif()
-
- set(UCX_DIR ${CMAKE_CURRENT_BINARY_DIR}/ucx)
- get_filename_component(UCX_COMPRESSED_FILE_NAME "${UCX_URL}" NAME)
- # message(STATUS "UCX_URL: ${UCX_URL}")
- # message(STATUS "UCX_COMPRESSED_FILE_NAME: ${UCX_COMPRESSED_FILE_NAME}")
- set(UCX_COMPRESSED_FILE_PATH "${CMAKE_CURRENT_BINARY_DIR}/${UCX_COMPRESSED_FILE_NAME}")
- set(UCX_BUILD_NEEDED OFF)
- set(UCX_CONFIG_FILE ${UCX_DIR}/config.txt)
- set(UCX_BUILD_OUTPUT ${UCX_DIR}/build.log)
-
- if(EXISTS ${UCX_CONFIG_FILE})
- file(READ ${UCX_CONFIG_FILE} PREV_UCX_CONFIG)
- # message(STATUS "PREV_UCX_CONFIG: ${PREV_UCX_CONFIG}")
- if("${UCX_URL}" STREQUAL "${PREV_UCX_CONFIG}")
- # configs match - no build needed
- set(UCX_BUILD_NEEDED OFF)
- else()
- message(STATUS "UCX configuration has changed - rebuilding...")
- set(UCX_BUILD_NEEDED ON)
- endif()
- else()
- message(STATUS "Configuring and building UCX...")
- set(UCX_BUILD_NEEDED ON)
- endif()
-
- if(UCX_BUILD_NEEDED)
- if(NOT EXISTS "${UCX_COMPRESSED_FILE_PATH}")
- message(STATUS "Downloading openucx/ucx from: ${UCX_URL}")
- file(
- DOWNLOAD
- "${UCX_URL}" "${UCX_COMPRESSED_FILE_PATH}"
- SHOW_PROGRESS
- STATUS status
- LOG log
- )
-
- list(GET status 0 status_code)
- list(GET status 1 status_string)
-
- if(status_code EQUAL 0)
- message(STATUS "Downloading... done")
- else()
- message(FATAL_ERROR "error: downloading '${UCX_URL}' failed
- status_code: ${status_code}
- status_string: ${status_string}
- log:
- --- LOG BEGIN ---
- ${log}
- --- LOG END ---"
- )
- endif()
- else()
- message(STATUS "${UCX_COMPRESSED_FILE_NAME} already exists")
- endif()
-
- execute_process(COMMAND mkdir -p ${UCX_DIR})
- execute_process(COMMAND tar xzf ${UCX_COMPRESSED_FILE_PATH} -C ${UCX_DIR} --strip-components 1)
- message(STATUS "Building UCX...")
- execute_process(
- COMMAND sh -c "cd ${UCX_DIR} && ${UCX_DIR}/contrib/configure-release --prefix=${UCX_DIR}/install --enable-mt && make -j8 && make install"
- RESULT_VARIABLE UCX_BUILD_STATUS
- OUTPUT_FILE ${UCX_BUILD_OUTPUT}
- ERROR_FILE ${UCX_BUILD_OUTPUT}
- )
-
- if(UCX_BUILD_STATUS)
- message(FATAL_ERROR "UCX build result = ${UCX_BUILD_STATUS} - see ${UCX_BUILD_OUTPUT} for more details")
- endif()
-
- # Currently, we use default build configurations for UCX and therefore only save URL as configuration settings
- file(WRITE ${UCX_CONFIG_FILE} "${UCX_URL}")
- endif()
-
- if (FF_LEGION_NETWORKS STREQUAL "gasnet" AND FF_GASNET_CONDUIT STREQUAL "ucx")
- set(ENV{UCX_HOME} "${UCX_DIR}/install")
- install(DIRECTORY ${UCX_DIR}/install/bin/ DESTINATION bin)
- install(DIRECTORY ${UCX_DIR}/install/include/ DESTINATION include)
- install(DIRECTORY ${UCX_DIR}/install/lib/ DESTINATION lib)
- install(DIRECTORY ${UCX_DIR}/install/share/ DESTINATION share)
- endif()
-
- if (FF_LEGION_NETWORKS STREQUAL "ucx")
- set(ucx_DIR ${UCX_DIR}/cmake)
- set(ENV{Legion_NETWORKS} "ucx")
- message(STATUS "Legion_NETWORKS: $ENV{Legion_NETWORKS}")
- endif()
-else()
- message(STATUS "FF_GASNET_CONDUIT: ${FF_GASNET_CONDUIT}")
+message(STATUS "FF_LEGION_NETWORKS: ${FF_LEGION_NETWORKS}")
+if (FF_LEGION_NETWORKS STREQUAL "gasnet")
+ message(STATUS "FF_GASNET_CONDUIT: ${FF_GASNET_CONDUIT}")
endif()
set(FF_GPU_BACKENDS cuda hip_cuda hip_rocm intel)
@@ -151,9 +100,14 @@ set_property(CACHE FF_GPU_BACKEND PROPERTY STRINGS ${FF_GPU_BACKENDS})
# option for cuda arch
set(FF_CUDA_ARCH "autodetect" CACHE STRING "Target CUDA Arch")
-if (FF_CUDA_ARCH STREQUAL "")
+if ((FF_GPU_BACKEND STREQUAL "cuda" OR FF_GPU_BACKEND STREQUAL "hip_cuda") AND FF_CUDA_ARCH STREQUAL "")
message(FATAL_ERROR "FF_CUDA_ARCH cannot be an empty string. Set it to `autodetect`, `all`, or pass one or multiple valid CUDA archs.")
endif()
+# option for hip arch
+set(FF_HIP_ARCH "all" CACHE STRING "Target HIP Arch")
+if (FF_GPU_BACKEND STREQUAL "hip_rocm" AND FF_HIP_ARCH STREQUAL "")
+ message(FATAL_ERROR "FF_HIP_ARCH cannot be an empty string. Set it to `all`, or pass one or multiple valid HIP archs.")
+endif()
# option for nccl
option(FF_USE_NCCL "Run FlexFlow with NCCL" OFF)
@@ -166,6 +120,7 @@ set(FF_MAX_DIM "4" CACHE STRING "Maximum dimention of tensors")
# option for legion
option(FF_USE_EXTERNAL_LEGION "Use pre-installed Legion" OFF)
+set(LEGION_MAX_RETURN_SIZE "32768" CACHE STRING "Maximum Legion return size")
set(FLEXFLOW_EXT_LIBRARIES "")
set(FLEXFLOW_INCLUDE_DIRS "")
@@ -177,10 +132,10 @@ set(LD_FLAGS $ENV{LD_FLAGS})
# Set global FLAGS
list(APPEND CC_FLAGS
- -std=c++11)
-
+ -std=c++17)
list(APPEND NVCC_FLAGS
- -std=c++11)
+ -std=c++17)
+
add_compile_options(${CC_FLAGS})
set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS} ${NVCC_FLAGS})
@@ -205,354 +160,442 @@ if(FF_GPU_BACKEND STREQUAL "hip_cuda" OR FF_GPU_BACKEND STREQUAL "hip_rocm")
set(ROCM_PATH "/opt/rocm" CACHE STRING "Default ROCM installation directory.")
endif()
-# ZLIB
-include(zlib)
-
# CUDA
if (FF_GPU_BACKEND STREQUAL "cuda" OR FF_GPU_BACKEND STREQUAL "hip_cuda")
include(cuda)
endif()
+# HIP
+if (FF_GPU_BACKEND STREQUAL "hip_rocm" OR FF_GPU_BACKEND STREQUAL "hip_cuda")
+ enable_language(HIP)
+ include(hip)
+endif()
+
# CUDNN
if (FF_GPU_BACKEND STREQUAL "cuda" OR FF_GPU_BACKEND STREQUAL "hip_cuda")
include(cudnn)
endif()
-# NCCL
-if(FF_USE_NCCL)
- if(FF_GPU_BACKEND STREQUAL "hip_cuda" OR FF_GPU_BACKEND STREQUAL "cuda")
- include(nccl)
- endif()
- list(APPEND FF_CC_FLAGS
- -DFF_USE_NCCL)
- list(APPEND FF_NVCC_FLAGS
- -DFF_USE_NCCL)
-endif()
-
# Legion
include(legion)
-# json
-include(json)
-
-# variant
-include(variant)
-
-# optional
-include(optional)
-
-if (FF_GPU_BACKEND STREQUAL "cuda")
- list(APPEND FF_CC_FLAGS
- -DFF_USE_CUDA)
- list(APPEND FF_NVCC_FLAGS
- -DFF_USE_CUDA)
-elseif (FF_GPU_BACKEND STREQUAL "hip_cuda")
- list(APPEND FF_CC_FLAGS
- -DFF_USE_HIP_CUDA)
- list(APPEND FF_HIPCC_FLAGS
- -DFF_USE_HIP_CUDA)
-elseif (FF_GPU_BACKEND STREQUAL "hip_rocm")
- list(APPEND FF_CC_FLAGS
- -DFF_USE_HIP_ROCM)
- list(APPEND FF_HIPCC_FLAGS
- -DFF_USE_HIP_ROCM)
-else()
-endif()
+# Do not build FlexFlow if BUILD_LEGION_ONLY is ON
+if(NOT BUILD_LEGION_ONLY)
+ # NCCL
+ if(FF_USE_NCCL)
+ if(FF_GPU_BACKEND STREQUAL "hip_cuda" OR FF_GPU_BACKEND STREQUAL "cuda")
+ include(nccl)
+ endif()
+ list(APPEND FF_CC_FLAGS
+ -DFF_USE_NCCL)
+ list(APPEND FF_NVCC_FLAGS
+ -DFF_USE_NCCL)
+ endif()
-# Start build FlexFlow
-if (CMAKE_BUILD_TYPE STREQUAL "Debug")
+ # Inference tests
+ if(INFERENCE_TESTS)
list(APPEND FF_CC_FLAGS
- -DFF_DEBUG)
+ -DINFERENCE_TESTS)
list(APPEND FF_NVCC_FLAGS
- -DFF_DEBUG)
-endif()
+ -DINFERENCE_TESTS)
+ endif()
+
+ # json
+ include(json)
+
+ # variant
+ include(variant)
+
+ # optional
+ include(optional)
+
+ if (FF_GPU_BACKEND STREQUAL "cuda")
+ list(APPEND FF_CC_FLAGS
+ -DFF_USE_CUDA)
+ list(APPEND FF_NVCC_FLAGS
+ -DFF_USE_CUDA)
+ elseif (FF_GPU_BACKEND STREQUAL "hip_cuda")
+ list(APPEND FF_CC_FLAGS
+ -DFF_USE_HIP_CUDA)
+ list(APPEND FF_HIPCC_FLAGS
+ -DFF_USE_HIP_CUDA)
+ elseif (FF_GPU_BACKEND STREQUAL "hip_rocm")
+ list(APPEND FF_CC_FLAGS
+ -DFF_USE_HIP_ROCM)
+ list(APPEND FF_HIPCC_FLAGS
+ -DFF_USE_HIP_ROCM)
+ else()
+ endif()
-message(STATUS "FlexFlow MAX_DIM: ${FF_MAX_DIM}")
+ # Start build FlexFlow
+ if (CMAKE_BUILD_TYPE STREQUAL "Debug")
+ list(APPEND FF_CC_FLAGS
+ -DFF_DEBUG)
+ list(APPEND FF_NVCC_FLAGS
+ -DFF_DEBUG)
+ endif()
-list(APPEND FF_CC_FLAGS
- -DMAX_TENSOR_DIM=${FF_MAX_DIM})
+ message(STATUS "FlexFlow MAX_DIM: ${FF_MAX_DIM}")
+ message(STATUS "LEGION_MAX_RETURN_SIZE: ${LEGION_MAX_RETURN_SIZE}")
-if(FF_USE_AVX2)
list(APPEND FF_CC_FLAGS
- -DFF_USE_AVX2
- -mavx2)
-endif()
-
-list(APPEND FF_NVCC_FLAGS
- -Wno-deprecated-gpu-targets
- -DMAX_TENSOR_DIM=${FF_MAX_DIM})
-
-list(APPEND FF_LD_FLAGS
- -lrt
- -ldl
- -rdynamic)
-
-# Set FF FLAGS
-add_compile_options(${FF_CC_FLAGS})
-set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS} ${FF_NVCC_FLAGS} -UNDEBUG)
-link_libraries(${FF_LD_FLAGS})
-
-list(APPEND FLEXFLOW_INCLUDE_DIRS
- ${FLEXFLOW_ROOT}/include
- ${FLEXFLOW_ROOT})
-
-file(GLOB_RECURSE FLEXFLOW_HDR
- LIST_DIRECTORIES False
- ${FLEXFLOW_ROOT}/include/*.h)
-
-file(GLOB_RECURSE FLEXFLOW_SRC
- LIST_DIRECTORIES False
- ${FLEXFLOW_ROOT}/src/*.cc)
-list(REMOVE_ITEM FLEXFLOW_SRC "${FLEXFLOW_ROOT}/src/runtime/cpp_driver.cc")
-
-set(FLEXFLOW_CPP_DRV_SRC
- ${FLEXFLOW_ROOT}/src/runtime/cpp_driver.cc)
-
-add_library(substitution_loader SHARED
- ${FLEXFLOW_ROOT}/src/runtime/substitution_loader.cc)
-target_include_directories(substitution_loader PRIVATE ${FLEXFLOW_INCLUDE_DIRS})
-target_link_libraries(substitution_loader nlohmann_json::nlohmann_json)
+ -DMAX_TENSOR_DIM=${FF_MAX_DIM}
+ -DLEGION_MAX_RETURN_SIZE=${LEGION_MAX_RETURN_SIZE})
+ if(FF_USE_AVX2)
+ list(APPEND FF_CC_FLAGS
+ -DFF_USE_AVX2
+ -mavx2)
+ endif()
-#message("FLEXFLOW_INCLUDE_DIRS: ${FLEXFLOW_INCLUDE_DIRS}")
+ list(APPEND FF_NVCC_FLAGS
+ -Wno-deprecated-gpu-targets
+ -DMAX_TENSOR_DIM=${FF_MAX_DIM}
+ -DLEGION_MAX_RETURN_SIZE=${LEGION_MAX_RETURN_SIZE})
+
+ list(APPEND FF_LD_FLAGS
+ -lrt
+ -ldl
+ -rdynamic
+ -lstdc++fs)
+
+ # Set FF FLAGS
+ add_compile_options(${FF_CC_FLAGS})
+ set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS} ${FF_NVCC_FLAGS} -UNDEBUG)
+ link_libraries(${FF_LD_FLAGS})
+
+ list(APPEND FLEXFLOW_INCLUDE_DIRS
+ ${FLEXFLOW_ROOT}/include
+ ${FLEXFLOW_ROOT})
+
+ file(GLOB_RECURSE FLEXFLOW_HDR
+ LIST_DIRECTORIES False
+ ${FLEXFLOW_ROOT}/include/*.h)
+
+ #list(APPEND FLEXFLOW_HDR ${FLEXFLOW_ROOT}/inference/file_loader.h)
-# compile flexflow lib
-if (FF_GPU_BACKEND STREQUAL "cuda")
- file(GLOB_RECURSE FLEXFLOW_GPU_SRC
+ file(GLOB_RECURSE FLEXFLOW_SRC
LIST_DIRECTORIES False
- ${FLEXFLOW_ROOT}/src/*.cu)
+ ${FLEXFLOW_ROOT}/src/*.cc)
+
+ list(REMOVE_ITEM FLEXFLOW_SRC "${FLEXFLOW_ROOT}/src/runtime/cpp_driver.cc")
+ #list(APPEND FLEXFLOW_SRC ${FLEXFLOW_ROOT}/inference/file_loader.cc)
- add_compile_definitions(FF_USE_CUDA)
+ set(FLEXFLOW_CPP_DRV_SRC
+ ${FLEXFLOW_ROOT}/src/runtime/cpp_driver.cc)
- if(BUILD_SHARED_LIBS)
- cuda_add_library(flexflow SHARED ${FLEXFLOW_GPU_SRC} ${FLEXFLOW_SRC} OPTIONS ${CUDA_GENCODE})
- else()
- cuda_add_library(flexflow STATIC ${FLEXFLOW_GPU_SRC} ${FLEXFLOW_SRC} OPTIONS ${CUDA_GENCODE})
- endif()
-elseif(FF_GPU_BACKEND STREQUAL "hip_cuda" OR FF_GPU_BACKEND STREQUAL "hip_rocm")
- file(GLOB_RECURSE FLEXFLOW_GPU_SRC
- LIST_DIRECTORIES False
- ${FLEXFLOW_ROOT}/src/*.cpp)
+ add_library(substitution_loader SHARED
+ ${FLEXFLOW_ROOT}/src/runtime/substitution_loader.cc)
+ target_include_directories(substitution_loader PRIVATE ${FLEXFLOW_INCLUDE_DIRS})
+ target_link_libraries(substitution_loader nlohmann_json::nlohmann_json)
- if(BUILD_SHARED_LIBS)
- add_library(flexflow SHARED ${FLEXFLOW_GPU_SRC} ${FLEXFLOW_SRC})
- else()
- add_library(flexflow STATIC ${FLEXFLOW_GPU_SRC} ${FLEXFLOW_SRC})
- endif()
- list(APPEND CMAKE_PREFIX_PATH ${ROCM_PATH}/hip ${ROCM_PATH})
+ #message("FLEXFLOW_INCLUDE_DIRS: ${FLEXFLOW_INCLUDE_DIRS}")
- find_package(hip REQUIRED)
+ # compile flexflow lib
+ if (FF_GPU_BACKEND STREQUAL "cuda")
+ file(GLOB_RECURSE FLEXFLOW_GPU_SRC
+ LIST_DIRECTORIES False
+ ${FLEXFLOW_ROOT}/src/*.cu)
- if (FF_GPU_BACKEND STREQUAL "hip_cuda")
- # The targets defined by the hip cmake config only target amd devices.
- # For targeting nvidia devices, we'll make our own interface target,
- # hip_device_nvidia, that includes the rocm and hip headers.
- add_library(hip_device_nvidia INTERFACE)
+ add_compile_definitions(FF_USE_CUDA)
- if (NOT FF_CUDA_ARCH STREQUAL "")
- target_compile_options(hip_device_nvidia INTERFACE -arch=compute_${FF_CUDA_ARCH})
+ if(BUILD_SHARED_LIBS)
+ cuda_add_library(flexflow SHARED ${FLEXFLOW_GPU_SRC} ${FLEXFLOW_SRC} OPTIONS ${CUDA_GENCODE})
+ else()
+ cuda_add_library(flexflow STATIC ${FLEXFLOW_GPU_SRC} ${FLEXFLOW_SRC} OPTIONS ${CUDA_GENCODE})
endif()
-
- target_include_directories(hip_device_nvidia SYSTEM INTERFACE ${HIP_INCLUDE_DIRS} ${ROCM_PATH}/include)
- target_include_directories(hip_device_nvidia INTERFACE ${HIP_INCLUDE_DIRS} ${ROCM_PATH}/include)
-
- add_compile_definitions(FF_USE_HIP_CUDA)
-
- # Linking cuda:
- # We do not explicitly link cuda. hipcc when targeting nvidia will
- # use nvcc under the hood. nvcc when used for linking will handle
- # linking cuda dependencies
- target_link_libraries(flexflow hip_device_nvidia)
- elseif(FF_GPU_BACKEND STREQUAL "hip_rocm")
- find_package(hipblas REQUIRED)
- find_package(miopen REQUIRED)
- if(FF_USE_NCCL)
- find_package(rccl REQUIRED)
+ elseif(FF_GPU_BACKEND STREQUAL "hip_cuda" OR FF_GPU_BACKEND STREQUAL "hip_rocm")
+ file(GLOB_RECURSE FLEXFLOW_GPU_SRC
+ LIST_DIRECTORIES False
+ ${FLEXFLOW_ROOT}/src/*.cpp)
+
+ set_source_files_properties(${FLEXFLOW_GPU_SRC} PROPERTIES LANGUAGE HIP)
+ set_source_files_properties(${FLEXFLOW_SRC} PROPERTIES LANGUAGE HIP)
+
+ if(BUILD_SHARED_LIBS)
+ add_library(flexflow SHARED ${FLEXFLOW_GPU_SRC} ${FLEXFLOW_SRC})
+ else()
+ add_library(flexflow STATIC ${FLEXFLOW_GPU_SRC} ${FLEXFLOW_SRC})
endif()
- # find_package(rocrand REQUIRED)
- find_library(HIP_RAND_LIBRARY hiprand REQUIRED)
-
- add_compile_definitions(FF_USE_HIP_ROCM)
- # The hip cmake config module defines three targets,
- # hip::amdhip64, hip::host, and hip::device.
- #
- # hip::host and hip::device are interface targets. hip::amdhip64 is an
- # imported target for libamdhip.
- #
- # You do not directly link to hip::amdhip64. hip::host links to hip::amdhip64
- # and hip::device links to hip::host. Link to hip::host to just use hip without
- # compiling any GPU code. Link to hip::device to compile the GPU device code.
- #
- # Docs (outdated):
- # https://rocmdocs.amd.com/en/latest/Installation_Guide/Using-CMake-with-AMD-ROCm.html
- target_link_libraries(flexflow hip::device roc::hipblas MIOpen ${HIP_RAND_LIBRARY})
- if(FF_USE_NCCL)
+
+ list(APPEND CMAKE_PREFIX_PATH ${ROCM_PATH}/hip ${ROCM_PATH})
+
+ find_package(hip REQUIRED)
+
+ if (FF_GPU_BACKEND STREQUAL "hip_cuda")
+ # The targets defined by the hip cmake config only target amd devices.
+ # For targeting nvidia devices, we'll make our own interface target,
+ # hip_device_nvidia, that includes the rocm and hip headers.
+ add_library(hip_device_nvidia INTERFACE)
+
+ if (NOT FF_CUDA_ARCH STREQUAL "")
+ target_compile_options(hip_device_nvidia INTERFACE -arch=compute_${FF_CUDA_ARCH})
+ endif()
+
+ target_include_directories(hip_device_nvidia SYSTEM INTERFACE ${HIP_INCLUDE_DIRS} ${ROCM_PATH}/include)
+ target_include_directories(hip_device_nvidia INTERFACE ${HIP_INCLUDE_DIRS} ${ROCM_PATH}/include)
+
+ add_compile_definitions(FF_USE_HIP_CUDA)
+
+ # Linking cuda:
+ # We do not explicitly link cuda. hipcc when targeting nvidia will
+ # use nvcc under the hood. nvcc when used for linking will handle
+ # linking cuda dependencies
+ target_link_libraries(flexflow hip_device_nvidia)
+ elseif(FF_GPU_BACKEND STREQUAL "hip_rocm")
+ find_package(hipblas REQUIRED)
+ find_package(miopen REQUIRED)
+ if(FF_USE_NCCL)
+ find_package(rccl REQUIRED)
+ endif()
+ # find_package(rocrand REQUIRED)
+ find_library(HIP_RAND_LIBRARY hiprand REQUIRED)
+
+ add_compile_definitions(FF_USE_HIP_ROCM)
+
+ if (FF_HIP_ARCH STREQUAL "")
+ message(FATAL_ERROR "FF_HIP_ARCH is undefined")
+ endif()
+ set_property(TARGET flexflow PROPERTY HIP_ARCHITECTURES "${HIP_ARCH_LIST}")
+
+ message(STATUS "FF_GPU_BACKEND: ${FF_GPU_BACKEND}")
+ message(STATUS "FF_HIP_ARCH: ${FF_HIP_ARCH}")
+ message(STATUS "HIP_ARCH_LIST: ${HIP_ARCH_LIST}")
+ get_property(CHECK_HIP_ARCHS TARGET flexflow PROPERTY HIP_ARCHITECTURES)
+ message(STATUS "CHECK_HIP_ARCHS: ${CHECK_HIP_ARCHS}")
+ message(STATUS "HIP_CLANG_PATH: ${HIP_CLANG_PATH}")
+
+ # The hip cmake config module defines three targets,
+ # hip::amdhip64, hip::host, and hip::device.
+ #
+ # hip::host and hip::device are interface targets. hip::amdhip64 is an
+ # imported target for libamdhip.
+ #
+ # You do not directly link to hip::amdhip64. hip::host links to hip::amdhip64
+ # and hip::device links to hip::host. Link to hip::host to just use hip without
+ # compiling any GPU code. Link to hip::device to compile the GPU device code.
+ #
+ # Docs (outdated):
+ # https://rocmdocs.amd.com/en/latest/Installation_Guide/Using-CMake-with-AMD-ROCm.html
+ target_link_libraries(flexflow hip::device roc::hipblas MIOpen ${HIP_RAND_LIBRARY})
+ if(FF_USE_NCCL)
target_link_libraries(flexflow rccl)
+ endif()
endif()
+ else()
+ message(FATAL_ERROR "Unsupported FF_GPU_BACKEND for cmake: ${FF_GPU_BACKEND}")
endif()
-else()
- message(FATAL_ERROR "Unsupported FF_GPU_BACKEND for cmake: ${FF_GPU_BACKEND}")
-endif()
-if(FF_USE_NCCL AND (FF_GPU_BACKEND STREQUAL "hip_cuda" OR FF_GPU_BACKEND STREQUAL "cuda"))
- add_dependencies(flexflow ${NCCL_NAME})
-endif()
+ if(FF_USE_NCCL AND (FF_GPU_BACKEND STREQUAL "hip_cuda" OR FF_GPU_BACKEND STREQUAL "cuda"))
+ add_dependencies(flexflow ${NCCL_NAME})
+ endif()
-target_include_directories(flexflow PUBLIC ${FLEXFLOW_INCLUDE_DIRS})
-# LEGION_URL is defined if we found a precompiled Legion library to download
-if(LEGION_URL)
- # Legion builds produce two library files: one for the Legion runtime and one for the Realm runtime.
- # When linking FlexFlow to a precompiled version of Legion, we need to manually link to both library files.
- target_link_libraries(flexflow ${LEGION_LIBRARY} ${REALM_LIBRARY} ${FLEXFLOW_EXT_LIBRARIES} nlohmann_json::nlohmann_json mpark_variant optional)
- add_dependencies(flexflow ${LEGION_NAME})
-else()
- # When building Legion from source, we do so by calling add_subdirectory(), and obtain a library with both the
- # Legion and Realm runtimes. The library's name is saved into the LEGION_LIBRARY variable. Hence, we only need
- # to link FlexFlow to ${LEGION_LIBRARY}
- target_link_libraries(flexflow ${LEGION_LIBRARY} ${FLEXFLOW_EXT_LIBRARIES} nlohmann_json::nlohmann_json mpark_variant optional)
-endif()
+ target_include_directories(flexflow PUBLIC ${FLEXFLOW_INCLUDE_DIRS})
+ # LEGION_URL is defined if we found a precompiled Legion library to download
+ if(LEGION_URL)
+ # Legion builds produce two library files: one for the Legion runtime and one for the Realm runtime.
+ # When linking FlexFlow to a precompiled version of Legion, we need to manually link to both library files.
+ target_link_libraries(flexflow ${LEGION_LIBRARY} ${REALM_LIBRARY} ${FLEXFLOW_EXT_LIBRARIES} nlohmann_json::nlohmann_json mpark_variant optional)
+ add_dependencies(flexflow ${LEGION_NAME})
+ else()
+ # When building Legion from source, we do so by calling add_subdirectory(), and obtain a library with both the
+ # Legion and Realm runtimes. The library's name is saved into the LEGION_LIBRARY variable. Hence, we only need
+ # to link FlexFlow to ${LEGION_LIBRARY}
+ target_link_libraries(flexflow ${LEGION_LIBRARY} ${FLEXFLOW_EXT_LIBRARIES} nlohmann_json::nlohmann_json mpark_variant optional)
+ endif()
-#library api version, bump from time to time
-set(SOVERSION 1)
-
-set_target_properties(flexflow PROPERTIES POSITION_INDEPENDENT_CODE ON)
-set_target_properties(flexflow PROPERTIES OUTPUT_NAME "flexflow${INSTALL_SUFFIX}")
-set_target_properties(flexflow PROPERTIES SOVERSION ${SOVERSION})
-if (CMAKE_SYSTEM_NAME STREQUAL "Linux")
- set_target_properties(flexflow PROPERTIES BUILD_RPATH "\$ORIGIN")
- set_target_properties(flexflow PROPERTIES INSTALL_RPATH "\$ORIGIN")
-elseif (CMAKE_SYSTEM_NAME STREQUAL "Darwin")
- set_target_properties(flexflow PROPERTIES BUILD_RPATH "@loader_path")
- set_target_properties(flexflow PROPERTIES INSTALL_RPATH "@loader_path")
-endif()
+ #library api version, bump from time to time
+ set(SOVERSION 1)
+
+ set_target_properties(flexflow PROPERTIES POSITION_INDEPENDENT_CODE ON)
+ set_target_properties(flexflow PROPERTIES OUTPUT_NAME "flexflow${INSTALL_SUFFIX}")
+ set_target_properties(flexflow PROPERTIES SOVERSION ${SOVERSION})
+ if (CMAKE_SYSTEM_NAME STREQUAL "Linux")
+ set_target_properties(flexflow PROPERTIES BUILD_RPATH "\$ORIGIN")
+ set_target_properties(flexflow PROPERTIES INSTALL_RPATH "\$ORIGIN")
+ elseif (CMAKE_SYSTEM_NAME STREQUAL "Darwin")
+ set_target_properties(flexflow PROPERTIES BUILD_RPATH "@loader_path")
+ set_target_properties(flexflow PROPERTIES INSTALL_RPATH "@loader_path")
+ endif()
-# python related
-if (FF_USE_PYTHON)
- # create flexflow_cffi_header.py
- add_custom_command(TARGET flexflow
- PRE_BUILD
- COMMAND ${FLEXFLOW_ROOT}/python/flexflow_cffi_build.py --ffhome-dir ${FLEXFLOW_ROOT} --output-dir ${FLEXFLOW_ROOT}/python/flexflow/core
- WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}
- COMMENT "Creating flexflow_cffi_header.py..."
- )
- # generate the Legion Python bindings library
- add_custom_command(TARGET flexflow
- POST_BUILD
- COMMAND ${PYTHON_EXECUTABLE} ${CMAKE_CURRENT_SOURCE_DIR}/deps/legion/bindings/python/setup.py build --cmake-build-dir ${Legion_BINARY_DIR}/runtime --prefix ${Legion_BINARY_DIR} --build-lib=${Legion_BINARY_DIR}/bindings/python ${Legion_PYTHON_EXTRA_INSTALL_ARGS}
- WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}/deps/legion/bindings/python
- )
- # create flexflow_python interpreter. When building from pip, we install the FF_HOME/python/flexflow_python script instead.
- if (NOT FF_BUILD_FROM_PYPI)
+ # python related
+ if (FF_USE_PYTHON)
+ find_package(Python COMPONENTS Interpreter Development)
+ # create flexflow_cffi_header.py
add_custom_command(TARGET flexflow
PRE_BUILD
- COMMAND ${PYTHON_EXECUTABLE} ${FLEXFLOW_ROOT}/python/flexflow_python_build.py --build-dir ${CMAKE_BINARY_DIR}
+ COMMAND ${FLEXFLOW_ROOT}/python/flexflow_cffi_build.py --ffhome-dir ${FLEXFLOW_ROOT} --output-dir ${FLEXFLOW_ROOT}/python/flexflow/core
WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}
- COMMENT "Creating flexflow_python interpreter..."
+ COMMENT "Creating flexflow_cffi_header.py..."
)
- install(PROGRAMS ${CMAKE_BINARY_DIR}/flexflow_python DESTINATION "bin")
+ if (NOT FF_BUILD_FROM_PYPI)
+ # generate the Legion Python bindings library. When building from pip, we need to do this post-install to prevent Legion from overwriting the path to the Legion shared library
+ add_custom_command(TARGET flexflow
+ POST_BUILD
+ COMMAND CMAKE_BUILD_DIR=${Legion_BINARY_DIR}/runtime CMAKE_INSTALL_PREFIX=${Legion_BINARY_DIR} ${Python_EXECUTABLE} ${CMAKE_CURRENT_SOURCE_DIR}/deps/legion/bindings/python/setup.py build --build-lib=${Legion_BINARY_DIR}/bindings/python ${Legion_PYTHON_EXTRA_INSTALL_ARGS}
+ WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}/deps/legion/bindings/python
+ )
+ # create flexflow_python interpreter. When building from pip, we install the FF_HOME/python/flexflow_python script instead.
+ add_custom_command(TARGET flexflow
+ PRE_BUILD
+ COMMAND ${Python_EXECUTABLE} ${FLEXFLOW_ROOT}/python/flexflow_python_build.py --build-dir ${CMAKE_BINARY_DIR}
+ WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}
+ COMMENT "Creating flexflow_python interpreter..."
+ )
+ install(PROGRAMS ${CMAKE_BINARY_DIR}/flexflow_python DESTINATION "bin")
+ endif()
+ endif()
+
+ if (INFERENCE_TESTS)
+ target_link_libraries(flexflow "${TORCH_LIBRARIES}")
+ set_property(TARGET flexflow PROPERTY CXX_STANDARD 14)
endif()
-endif()
-# build binary
-option(FF_BUILD_RESNET "build resnet example" OFF)
-option(FF_BUILD_RESNEXT "build resnext example" OFF)
-option(FF_BUILD_ALEXNET "build alexnet example" OFF)
-option(FF_BUILD_DLRM "build DLRM example" OFF)
-option(FF_BUILD_XDL "build XDL example" OFF)
-option(FF_BUILD_INCEPTION "build inception example" OFF)
-option(FF_BUILD_CANDLE_UNO "build candle uno example" OFF)
-option(FF_BUILD_TRANSFORMER "build transformer example" OFF)
-option(FF_BUILD_MOE "build mixture of experts example" OFF)
-option(FF_BUILD_MLP_UNIFY "build mlp unify example" OFF)
-option(FF_BUILD_SPLIT_TEST "build split test example" OFF)
-option(FF_BUILD_SPLIT_TEST_2 "build split test 2 example" OFF)
-option(FF_BUILD_ALL_EXAMPLES "build all examples. Overrides others" OFF)
-option(FF_BUILD_UNIT_TESTS "build non-operator unit tests" OFF)
-option(FF_BUILD_SUBSTITUTION_TOOL "build substitution conversion tool" OFF)
-option(FF_BUILD_VISUALIZATION_TOOL "build substitution visualization tool" OFF)
-
-if(FF_BUILD_UNIT_TESTS)
- set(BUILD_GMOCK OFF)
- add_subdirectory(deps/googletest)
- enable_testing()
- add_subdirectory(tests/unit)
-endif()
+ # build binary
+ option(FF_BUILD_TOKENIZER "build tokenizers-cpp for LLM serving" OFF)
+ option(FF_BUILD_RESNET "build resnet example" OFF)
+ option(FF_BUILD_RESNEXT "build resnext example" OFF)
+ option(FF_BUILD_ALEXNET "build alexnet example" OFF)
+ option(FF_BUILD_DLRM "build DLRM example" OFF)
+ option(FF_BUILD_XDL "build XDL example" OFF)
+ option(FF_BUILD_INCEPTION "build inception example" OFF)
+ option(FF_BUILD_CANDLE_UNO "build candle uno example" OFF)
+ option(FF_BUILD_TRANSFORMER "build transformer example" OFF)
+ option(FF_BUILD_MOE "build mixture of experts example" OFF)
+ option(FF_BUILD_MLP_UNIFY "build mlp unify example" OFF)
+ option(FF_BUILD_SPLIT_TEST "build split test example" OFF)
+ option(FF_BUILD_SPLIT_TEST_2 "build split test 2 example" OFF)
+ option(FF_BUILD_MLP_UNIFY_INFERENCE "build mlp unify inference example" OFF)
+ option(FF_BUILD_ALL_INFERENCE_EXAMPLES "build all inference examples. Overrides others" OFF)
+ option(FF_BUILD_ALL_EXAMPLES "build all examples. Overrides others" OFF)
+ option(FF_BUILD_UNIT_TESTS "build non-operator unit tests" OFF)
+ option(FF_BUILD_SUBSTITUTION_TOOL "build substitution conversion tool" OFF)
+ option(FF_BUILD_VISUALIZATION_TOOL "build substitution visualization tool" OFF)
+
+ if(FF_BUILD_UNIT_TESTS)
+ set(BUILD_GMOCK OFF)
+ add_subdirectory(deps/googletest)
+ enable_testing()
+ add_subdirectory(tests/unit)
+ endif()
-if(FF_BUILD_SUBSTITUTION_TOOL)
- add_subdirectory(tools/protobuf_to_json)
-endif()
+ if(FF_BUILD_SUBSTITUTION_TOOL)
+ add_subdirectory(tools/protobuf_to_json)
+ endif()
-if(FF_BUILD_VISUALIZATION_TOOL)
- add_subdirectory(tools/substitutions_to_dot)
-endif()
+ if(FF_BUILD_VISUALIZATION_TOOL)
+ add_subdirectory(tools/substitutions_to_dot)
+ endif()
-if(FF_BUILD_RESNET OR FF_BUILD_ALL_EXAMPLES)
- add_subdirectory(examples/cpp/ResNet)
-endif()
+ if(FF_BUILD_ALL_INFERENCE_EXAMPLES OR FF_BUILD_TOKENIZER)
+ # Ensure Rust is installed
+ execute_process(COMMAND rustc --version
+ RESULT_VARIABLE RUST_COMMAND_RESULT
+ OUTPUT_VARIABLE RUSTC_OUTPUT
+ ERROR_QUIET)
+ if(NOT RUST_COMMAND_RESULT EQUAL 0)
+ message(FATAL_ERROR "Rust is not installed on the system. Please install it by running: 'curl https://sh.rustup.rs -sSf | sh -s -- -y' and following the instructions on the screen.")
+ endif()
+ # Ensure Cargo is installed
+ execute_process(COMMAND cargo --version
+ RESULT_VARIABLE CARGO_RESULT
+ OUTPUT_QUIET ERROR_QUIET)
+ if(NOT CARGO_RESULT EQUAL 0)
+ message(FATAL_ERROR "Rust is installed, but cargo is not. Please install it by running: 'curl https://sh.rustup.rs -sSf | sh -s -- -y' and following the instructions on the screen.")
+ endif()
+ set(MLC_ENABLE_SENTENCEPIECE_TOKENIZER ON)
+ add_subdirectory(deps/tokenizers-cpp tokenizers EXCLUDE_FROM_ALL)
+ target_include_directories(flexflow PUBLIC deps/tokenizers-cpp/include)
+ target_link_libraries(flexflow tokenizers_cpp)
+ endif()
+ if(FF_BUILD_RESNET OR FF_BUILD_ALL_EXAMPLES)
+ add_subdirectory(examples/cpp/ResNet)
+ endif()
-if(FF_BUILD_RESNEXT OR FF_BUILD_ALL_EXAMPLES)
- add_subdirectory(examples/cpp/resnext50)
-endif()
+ if(FF_BUILD_RESNEXT OR FF_BUILD_ALL_EXAMPLES)
+ add_subdirectory(examples/cpp/resnext50)
+ endif()
-if(FF_BUILD_ALEXNET OR FF_BUILD_ALL_EXAMPLES)
- add_subdirectory(examples/cpp/AlexNet)
-endif()
+ if(FF_BUILD_ALEXNET OR FF_BUILD_ALL_EXAMPLES)
+ add_subdirectory(examples/cpp/AlexNet)
+ endif()
-if(FF_BUILD_MLP_UNIFY OR FF_BUILD_ALL_EXAMPLES)
- add_subdirectory(examples/cpp/MLP_Unify)
-endif()
+ if(FF_BUILD_MLP_UNIFY OR FF_BUILD_ALL_EXAMPLES)
+ add_subdirectory(examples/cpp/MLP_Unify)
+ endif()
-if(FF_BUILD_SPLIT_TEST OR FF_BUILD_ALL_EXAMPLES)
- add_subdirectory(examples/cpp/split_test)
-endif()
+ if(FF_BUILD_SPLIT_TEST OR FF_BUILD_ALL_EXAMPLES)
+ add_subdirectory(examples/cpp/split_test)
+ endif()
-if(FF_BUILD_SPLIT_TEST_2 OR FF_BUILD_ALL_EXAMPLES)
- add_subdirectory(examples/cpp/split_test_2)
-endif()
+ if(FF_BUILD_SPLIT_TEST_2 OR FF_BUILD_ALL_EXAMPLES)
+ add_subdirectory(examples/cpp/split_test_2)
+ endif()
-if(FF_BUILD_INCEPTION OR FF_BUILD_ALL_EXAMPLES)
- add_subdirectory(examples/cpp/InceptionV3)
-endif()
+ if(FF_BUILD_INCEPTION OR FF_BUILD_ALL_EXAMPLES)
+ add_subdirectory(examples/cpp/InceptionV3)
+ endif()
-#TODO: Once functional add to BUILD_ALL_EXAMPLES
-if(FF_BUILD_CANDLE_UNO OR FF_BUILD_ALL_EXAMPLES)
- add_subdirectory(examples/cpp/candle_uno)
-endif()
+ #TODO: Once functional add to BUILD_ALL_EXAMPLES
+ if(FF_BUILD_CANDLE_UNO OR FF_BUILD_ALL_EXAMPLES)
+ add_subdirectory(examples/cpp/candle_uno)
+ endif()
-if(FF_BUILD_DLRM OR FF_BUILD_ALL_EXAMPLES)
- add_subdirectory(examples/cpp/DLRM)
+ if(FF_BUILD_DLRM OR FF_BUILD_ALL_EXAMPLES)
+ add_subdirectory(examples/cpp/DLRM)
- #add_executable(generate_dlrm_hetero_strategy src/runtime/dlrm_strategy_hetero.cc)
- #target_include_directories(generate_dlrm_hetero_strategy PUBLIC ${FLEXFLOW_INCLUDE_DIRS})
+ #add_executable(generate_dlrm_hetero_strategy src/runtime/dlrm_strategy_hetero.cc)
+ #target_include_directories(generate_dlrm_hetero_strategy PUBLIC ${FLEXFLOW_INCLUDE_DIRS})
- #add_executable(generate_dlrm_strategy src/runtime/dlrm_strategy.cc)
- #target_include_directories(generate_dlrm_strategy PUBLIC ${FLEXFLOW_INCLUDE_DIRS})
-endif()
+ #add_executable(generate_dlrm_strategy src/runtime/dlrm_strategy.cc)
+ #target_include_directories(generate_dlrm_strategy PUBLIC ${FLEXFLOW_INCLUDE_DIRS})
+ endif()
-if(FF_BUILD_XDL OR FF_BUILD_ALL_EXAMPLES)
- add_subdirectory(examples/cpp/XDL)
-endif()
+ if(FF_BUILD_XDL OR FF_BUILD_ALL_EXAMPLES)
+ add_subdirectory(examples/cpp/XDL)
+ endif()
-if(FF_BUILD_TRANSFORMER OR FF_BUILD_ALL_EXAMPLES)
- add_subdirectory(examples/cpp/Transformer)
-endif()
+ if(FF_BUILD_TRANSFORMER OR FF_BUILD_ALL_EXAMPLES)
+ add_subdirectory(examples/cpp/Transformer)
+ endif()
-if(FF_BUILD_MOE OR FF_BUILD_ALL_EXAMPLES)
- add_subdirectory(examples/cpp/mixture_of_experts)
-endif()
+ if(FF_BUILD_MOE OR FF_BUILD_ALL_EXAMPLES)
+ add_subdirectory(examples/cpp/mixture_of_experts)
+ endif()
-# installation
-set(INCLUDE_DEST "include")
-set(LIB_DEST "lib")
-install(FILES ${FLEXFLOW_HDR} DESTINATION ${INCLUDE_DEST})
-install(TARGETS flexflow DESTINATION ${LIB_DEST})
-# install python
-if (FF_USE_PYTHON)
- execute_process(COMMAND python -c "from distutils import sysconfig; print(sysconfig.get_python_lib(plat_specific=False,standard_lib=False))" OUTPUT_VARIABLE PY_DEST OUTPUT_STRIP_TRAILING_WHITESPACE)
- install(
- DIRECTORY ${FLEXFLOW_ROOT}/python/flexflow/
- DESTINATION ${PY_DEST}/flexflow
- FILES_MATCHING
- PATTERN "*.py")
-endif()
+ if(FF_BUILD_ALL_INFERENCE_EXAMPLES OR FF_BUILD_ALL_EXAMPLES)
+ add_subdirectory(inference/spec_infer)
+ add_subdirectory(inference/incr_decoding)
+ add_subdirectory(inference/peft)
+ endif()
+
+
+ # installation
+ set(INCLUDE_DEST "include")
+ set(LIB_DEST "lib")
+ install(FILES ${FLEXFLOW_HDR} DESTINATION ${INCLUDE_DEST})
+ install(TARGETS flexflow DESTINATION ${LIB_DEST})
+ # install python
+ if (FF_USE_PYTHON)
+ find_package(Python COMPONENTS Interpreter Development)
+ execute_process(COMMAND ${Python_EXECUTABLE} -c "import site, os; print([pkg for func in (site.getsitepackages(), site.getusersitepackages()) for pkg in ([func] if isinstance(func, str) else func) if os.access(pkg, os.W_OK)][0])" OUTPUT_VARIABLE PY_DEST OUTPUT_STRIP_TRAILING_WHITESPACE)
+ if (NOT FF_BUILD_FROM_PYPI)
+ install(
+ DIRECTORY ${FLEXFLOW_ROOT}/python/flexflow/
+ DESTINATION ${PY_DEST}/flexflow
+ FILES_MATCHING
+ PATTERN "*.py")
+ else()
+ # pip automatically installs all *.py files in the python/flexflow folder, but because flexflow_cffi_header.py is generated at build time, we have to install it manually.
+ install(
+ PROGRAMS ${FLEXFLOW_ROOT}/python/flexflow/core/flexflow_cffi_header.py
+ DESTINATION ${PY_DEST}/flexflow/core
+ )
+ # Use setup.py script to re-install the Python bindings library with the right library paths.
+ # Need to put the instructions in a subfolder because of issue below:
+ # https://stackoverflow.com/questions/43875499/do-post-processing-after-make-install-in-cmake
+ add_subdirectory(cmake/pip_install)
+ endif()
+ endif()
+endif() # if(NOT BUILD_LEGION_ONLY)
diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md
index ff77cb4612..c3c0b5173f 100644
--- a/CONTRIBUTING.md
+++ b/CONTRIBUTING.md
@@ -119,7 +119,26 @@ After adding the DNN layers, the next step before compiling the model for traini
#### Model compilation
-TODO
+Model compilation consists of the following steps:
+
+1. We initialize an operator for each layer in the model, via the function `create_operators_from_layers()`. Layers work with `Tensor` input/weights/outputs, and are created directly by the user when writing a FlexFlow program. Operators work with `ParallelTensor` objects and they are responsible for running computations by launching kernels on GPUs.
+2. Launch the graph optimize task (`GRAPH_OPTIMIZE_TASK_ID`), implemented by `PCG::Graph::graph_optimize_task`, which returns `PCG::GraphOptimalViewSerialized`
+ 1. call `deserialize_graph_optimal_view(...)` to get `PCG::Graph *best_graph` and `std::unordered_map optimal_views` from the deserialized `PCG::GraphOptimalViewSerialized`
+ 2. `convert_graph_to_operators()`
+ 3. print the dot representation of the best graph obtained
+ 4. map the inputs and weights to parallel tensors (the surrounding for loop needs to be understood better)
+3. Initialize the performance metrics via `FFModel::update_metrics_task`
+4. Perform inplace optimizations (if enabled)
+5. Loop through the operators to do the following (to be understood better):
+ 1. `parameters.push_back(op->weights[i]);` for each weight in each operator
+ 2. `op->map_output_tensors(*this);`
+ 3. `((ParallelOp *)op)->create_input_partition(*this);` if the operator is a parallel operator
+6. Check the correctness of each operator's input and output tensor settings
+7. Perform fusion optimizations, if enabled
+8. Print all operators and their input and output regions
+9. Create the tensor for the label
+10. Initialize the optimizer
+11. In training mode, if NCCL is enabled, initialize all the communicators and other objects
## Continuous Integration
@@ -131,8 +150,9 @@ We currently implement CI testing using Github Workflows. Each workflow is defin
4. `gpu-ci.yml`: runs all the tests that require a GPU to run.
5. `gpu-ci-daemon.yml`: a helper workflow that turns on/off the GPU instance used by the test above
6. `multinode-test.yml`: runs the same GPU tests from the `gpu-ci.yml` workflow, but using multiple (simulated) nodes. The test currently simulates two nodes, each with 2 GPUs. To run FlexFlow on multiple nodes, we compile Legion with GASNET enabled, and choose MPI as the GASNET conduit. Compared to the single-node version, this test is much more time-consuming (about 4h instead of 40mins at the time of writing), so we only run the test on the FlexFlow `master` branch every other day.
-7. `pip-install.yml`: checks the build & installation of FlexFlow using `pip`
-8. `shell-check.yml`: runs shellcheck on all bash scripts in the repo
+7. `pip-deploy.yml`: builds the `flexflow` pip package and publishes it on `TestPyPI` (if the repository variable `DEPLOY_TO_TEST_PYPI` is set to `true`) or on `PyPI` (if `DEPLOY_TO_TEST_PYPI` is set to `false`). When deploying to `PyPI`, a new git tag (with the pip package version) will also be created, and associated with the commit hash that triggered the workflow. The `pip-deploy.yml` workflow can only be launched manually via workflow dispatch. More on the pip packaging in the [section below](#pip-packages).
+8. `pip-install.yml`: checks the build & installation of FlexFlow using `pip`
+9. `shell-check.yml`: runs shellcheck on all bash scripts in the repo
We also have four placeholder workflows: `build-skip.yml`, `docker-build-skip.yml`, `gpu-ci-skip` and `pip-install-skip.yml`. These always pass and are used only in the case of skipped workflows whose status is required to merge a PR; we implement the "hack" officially recommended by Github ([see here](https://docs.github.com/en/repositories/configuring-branches-and-merges-in-your-repository/defining-the-mergeability-of-pull-requests/troubleshooting-required-status-checks#handling-skipped-but-required-checks)).
@@ -208,12 +228,82 @@ Finally, we define the jobs that will run when the workflow is triggered. Each j
Each step in a job will be executed sequentially, and if it fails, the remaining steps will be cancelled and the job will be marked as `failed`. Each step is specified by either reusing a Github action or running a shell command (or a script file). For instance, in the example above, the first step uses the Github Action `actions/checkout@v3` to check out the repository, the second step uses the `Jimver/cuda-toolkit@v0.2.11` action to install CUDA, whereas the third step runs a bash script stored in the repo at the path `.github/workflows/helpers/install_dependencies.sh`.
+## Pip packages
+This section illustrates how we support the automatic deployment of FlexFlow to the `PyPI` and `Test PyPI` repositories. Publishing FlexFlow on `PyPI` makes it possible for users to install FlexFlow on their machine by simply running:
+
+```bash
+pip install flexflow
+```
+
+To install from `Test PyPI`, on the other hand, one can use:
+
+```bash
+pip install flexflow --extra-index-url https://test.pypi.org/simple/
+```
+
+The installation process currently takes approximately the same time as installing from source by running the command `pip install .` from `FF_HOME` after having cloned the repo. However, installing directly from PyPI allows the user to automatically install the Python dependencies, and removes the step of having to manually clone the repo with all its submodules.
+
+Below, we discuss some important properties of PyPI.
+
+### Packaging
+When building a `pip` package from a repository, we can decide what files from the repository will be included in the package, and which ones will be left out. To do that, we write a [MANIFEST.in](https://github.com/flexflow/FlexFlow/blob/master/MANIFEST.in) file, according to the syntax from the [official instructions](https://packaging.python.org/en/latest/guides/using-manifest-in/). In particular, we manually include the submodules (which would otherwise be left out by default), we remove the `.git` folders, which are not needed to build FlexFlow, as well as the `triton` folder, whose contents are not currently in use. Finally, we request that the `version.txt` file, whose role is described in the section below, be included in the package distribution. Because this file is generated at build time, it would be left out by default if we didn't manually include it.
+
+### Source VS Wheel distribution
+PyPI allows you to upload a source distribution, together with one (or more) binary distributions of your package. A `pip` package's pre-compiled binary is called a Wheel (formerly, Egg). The advantage of publishing Wheel distributions instead of just the source code is that the installation of the package will be much faster for the user, who will just need to download the binary, and extract its files in the proper locations (all of this is handled automatically when running `pip install `). If only the source code is available, on the other hand, `pip install ` will first need to compile the package, and then install it.
+
+`PyPI` allows you to upload multiple Wheels to support different Python versions (the Wheel compatible with the version of Python installed on the user's machine is downloaded automatically when the user runs `pip install `), but unfortunately does not yet support uploading a Wheel for each CUDA version, and automatically downloading the relevant one depending on the user's machine configuration. Instead, one needs to upload a Wheel with a distinct name for each CUDA version, and the user will need to specify the name manually at download time. For this reason, to keep things simple, we only publish the source distribution at this moment, and plan to upload Wheels that are specific to each Python version and CUDA version at a later time.
+
+### Versioning
+
+PyPI imposes some strict versioning requirements. Among other things, versions need to follow a specific format, and once a given version of a package is published, it can never be replaced. In addition, even if the publisher deletes a version, nobody can ever upload a source distribution / Wheel with that same version number again. Finally, when multiple versions of the same package are published, the one with the highest version number (not the one that was uploaded last) will be installed by default.
+
+When publishing a package on PyPI, the version attached to the upload is determined by the `setup.py` script. You can check which version string will be used by running `python setup.py --version`.
+
+The simplest way to version a `pip` package is to hard-code the version number in the `setup.py` script, and commit a change to the repository every time the `pip` package is to be updated. This approach, however, is incompatible with having a script or workflow to automatically update the `pip` package.
+
+If we intend to deploy the latest code to PyPI automatically, we need a way to automatically assign a properly-formatted version string to the code we want to upload. Further, we need to ensure that the assigned version is (1) different from any version (of the same package) already published on PyPI and (2) larger than any previous version. Finally, a trickier requirement is that: (3) at any point in time, the `setup.py` script included in a given version of our package should output a version string that exactly matches the version string recorded in the metadata attached to the package's version at publication time. More about this below.
+
+We follow a simple approach to automatically version the latest code: use the publication's date to generate the version string. For example, on Aug 12, 2023, we can use version string 23.08.12. Assuming that we publish at most one version per day, and that we always publish from the same timezone, we will be able to meet requirements (1) and (2). As an additional enhancement, to support updating the package more than once per day (which may be needed during development, or if a mistake is made), instead of using the day of the month (12 for August 12, 2023) as the sub-sub-version, we use an index that starts at 0 every month and is incremented by 1 every time we upload a new version of the package within the same calendar month. So if on Aug 12, 2023 we are updating the package for the first time in the month, we will use version string 23.08.0; if later the same day (or any time before Sept 1, 2023) we wish to upload a new version, we will use string 23.08.1, and so forth.
+
+Having illustrated the general versioning policy, we will need to implement it carefully in `setup.py` to ensure that we meet requirement (3). You can take a look at the `compute_version()` function to see how this is done in practice. The key realization is that we cannot simply compute today's date (using any of the Python libraries that let us do that) and transform it into a string, nor simply get from PyPI the latest available version of our package, and, if it was published in the same calendar month, increment the sub-sub-version by +1 to generate the version string of the new upload. We can best illustrate why we cannot do that with an example:
+- Today, Aug 12, 2023, we wish to upload a new version to PyPI. As we said above, the version string is computed by `setup.py`. A naive way to do so in `setup.py` would be to compute the date using `date.today()`, and transform the year and month into digit form to generate the version (23) and sub-version (08) parts of the version string. We could then check on PyPI what was the latest published version of the package as of today, and if we found that it was, say, 23.08.05, we would use 5+1=6 as the sub-sub-version for the new upload (so the final version string would be 23.08.06).
+- Over the next few days, we upload 3 more versions
+- A week later, on Aug 18, 2023, a user trying to install our package runs `pip install `. To determine which version it should install, the `pip install` script downloads the most recent X versions of `` onto the user's machine, and, for each version, re-computes the version string by running `python setup.py --version`. When the script attempts to recompute the version string on the package 23.08.06 (which we uploaded on Aug 12, 2023), it will reconstruct the version string by replaying the same instructions that were run on Aug. 12, and obtain a different version string, as follows. Using the current date, the user will obtain: version=23, sub-version=08, which match the metadata. Checking the latest version of the package available on PyPI, the script finds version 23.08.09 (there were three more submissions since Aug 12). This will translate to sub-sub-version=9+1=10. Noticing that the version included in the Aug 12 package's metadata (23.08.06) does not match the recomputed version (23.08.10), the script will generate unexpected and undesired behavior.
+
+To prevent accidentally breaking requirement (3) as illustrated in the example above, we employ a simple hack: the first time `setup.py` computes the package's version string, it saves the string to a file, `python/flexflow/version.txt`, which is listed in `.gitignore` and, as such, never committed to the repo. As long as `version.txt` exists, any subsequent run of `setup.py` will simply read the file and output the same version string, regardless of the current date or of how many new versions of the package have been uploaded to PyPI since then. When packaging our code for upload to PyPI, we make sure to delete any existing `version.txt` file, compute the version string, and then include the freshly generated `version.txt` in the source distribution that we upload. In this way, when a user attempts to install the package, `pip install` will download the most recent available versions and run `setup.py` from each distribution, and for each distribution `setup.py` will always output the correct version string, because it will just read the string recorded in that distribution's `version.txt`.
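+
+A minimal sketch of this caching behavior is shown below; it is not the actual `setup.py` code, and `compute_fresh_version()` is a hypothetical helper that applies the date-based policy described above:
+
+```python
+# Sketch of the version.txt caching hack (not the actual setup.py implementation).
+import os
+
+VERSION_FILE = os.path.join("python", "flexflow", "version.txt")
+
+def compute_version() -> str:
+    # If version.txt already exists, always return the recorded string, so that
+    # re-running setup.py later can never change a distribution's version.
+    if os.path.isfile(VERSION_FILE):
+        with open(VERSION_FILE) as f:
+            return f.read().strip()
+    version = compute_fresh_version()  # hypothetical helper: applies the YY.MM.INDEX policy
+    with open(VERSION_FILE, "w") as f:
+        f.write(version)
+    return version
+```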
+
+### Test PyPI
+Given all the complexities and restrictions of PyPI, Test PyPI was created as a "copy" of PyPI to be used for testing, so that mistakes can be made without affecting users or permanently losing the opportunity to use a given package name and/or version. We take advantage of Test PyPI as follows: if we intend to deploy to PyPI, we can first deploy to Test PyPI, check the results, fix any issues, and only later deploy to PyPI. All the `pip`-related scripts in the repo have been designed to support both Test PyPI and PyPI. To let `setup.py` know that it should package a distribution for Test PyPI, one can simply export the following environment variable:
+
+```bash
+export DEPLOY_TO_TEST_PYPI=true
+```
+
+Conversely, to upload to PyPI, one can either leave `DEPLOY_TO_TEST_PYPI` unset, or export
+
+```bash
+export DEPLOY_TO_TEST_PYPI=false
+```
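+
+Inside `setup.py`, the switch can then be read from the environment along these lines (a sketch; the exact values accepted by the actual script may differ):
+
+```python
+# Sketch: decide whether the distribution being built targets Test PyPI or PyPI.
+import os
+
+deploy_to_test_pypi = os.environ.get("DEPLOY_TO_TEST_PYPI", "false").lower() in (
+    "true",
+    "1",
+    "yes",
+)
+```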
+
+WARNING!!! More likely than not, the latest version of the `flexflow` package on Test PyPI and PyPI will be out of sync. This is to be expected, because one may need to upload a few drafts to Test PyPI to detect and correct bugs before publishing the definitive version on PyPI. Having different latest versions on the two repositories should not cause any issue. However, after uploading to Test PyPI and before uploading to PyPI (or vice versa), **it is EXTREMELY IMPORTANT** to delete the `python/flexflow/version.txt` file.
+
+An easy way to avoid forgetting this is to deploy to Test PyPI/PyPI only through the `pip-deploy.yml` workflow, which is designed to upload to only one of the two repositories at a time.
+
+### Build vs install dependencies
+
+FlexFlow depends on several other Python packages at runtime. In addition, building FlexFlow requires some packages of its own: you cannot even run `setup.py` without those build requirements. We can specify these _install_ and _build_ requirements in such a way that `pip` will detect when they are missing and install them. We record the build requirements in the `pyproject.toml` file, whereas we specify the installation requirements by passing a list of package names to the `install_requires` key of the `setup()` function in `setup.py`; this list is automatically read from the `requirements.txt` file.
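+
+As a rough sketch of the install-requirements side (the actual `setup.py` may differ in detail), the `requirements.txt` file can be read and passed to `setup()` as follows; the build requirements, by contrast, live in the `requires` list of the `[build-system]` table in `pyproject.toml`:
+
+```python
+# Sketch: populate install_requires from requirements.txt.
+from setuptools import setup
+
+with open("requirements.txt") as f:
+    install_requires = [
+        line.strip() for line in f if line.strip() and not line.startswith("#")
+    ]
+
+setup(
+    name="flexflow",
+    install_requires=install_requires,
+    # ... version, packages, and the other arguments are omitted in this sketch
+)
+```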
+
+
## Contributing to FlexFlow
We want to make contributing to this project as easy and transparent as possible.
### Formatting
We use `clang-format` to format our C++ code. If you make changes to the code and the Clang format CI test is failing, you can lint your code by running: `./scripts/format.sh` from the main folder of this repo.
+### Documenting the code
+We follow the Python docstring conventions for documenting the Python code. We document the C++ code using comments in any of the conventions supported by Doxygen ([see here](https://doxygen.nl/manual/docblocks.html)).
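+
+For example, a Python function could be documented with a docstring along the following lines (the function and its parameters are made up for illustration):
+
+```python
+import json
+
+def load_prompts(path):
+    """Load a list of prompts from a JSON file.
+
+    :param path: path to the JSON file containing the prompts
+    :returns: the list of prompt strings read from the file
+    """
+    with open(path) as f:
+        return json.load(f)
+```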
+
+
### Pull Requests
We actively welcome your pull requests.
diff --git a/FlexFlow.mk b/FlexFlow.mk
index b434045893..14f32a7639 100644
--- a/FlexFlow.mk
+++ b/FlexFlow.mk
@@ -59,7 +59,8 @@ GEN_SRC += $(shell find $(FF_HOME)/src/loss_functions/ -name '*.cc')\
$(shell find $(FF_HOME)/src/runtime/ -name '*.cc')\
$(shell find $(FF_HOME)/src/utils/dot/ -name '*.cc')\
$(shell find $(FF_HOME)/src/dataloader/ -name '*.cc')\
- $(shell find $(FF_HOME)/src/c/ -name '*.cc')
+ $(shell find $(FF_HOME)/src/c/ -name '*.cc')\
+ $(shell find $(FF_HOME)/inference/ -name 'file_loader.cc')
GEN_SRC := $(filter-out $(FF_HOME)/src/runtime/cpp_driver.cc, $(GEN_SRC))
FF_CUDA_SRC += $(shell find $(FF_HOME)/src/loss_functions/ -name '*.cu')\
@@ -94,15 +95,17 @@ ifneq ($(strip $(FF_USE_PYTHON)), 1)
endif
-INC_FLAGS += -I${FF_HOME}/include -I${FF_HOME}/deps/optional/include -I${FF_HOME}/deps/variant/include -I${FF_HOME}/deps/json/include
+INC_FLAGS += -I${FF_HOME}/include -I${FF_HOME}/inference -I${FF_HOME}/deps/optional/include -I${FF_HOME}/deps/variant/include -I${FF_HOME}/deps/json/include -I${FF_HOME}/deps/tokenizers-cpp/include -I${FF_HOME}/deps/tokenizers-cpp/sentencepiece/src
CC_FLAGS += -DMAX_TENSOR_DIM=$(MAX_DIM) -DLEGION_MAX_RETURN_SIZE=32768
NVCC_FLAGS += -DMAX_TENSOR_DIM=$(MAX_DIM) -DLEGION_MAX_RETURN_SIZE=32768
HIPCC_FLAGS += -DMAX_TENSOR_DIM=$(MAX_DIM) -DLEGION_MAX_RETURN_SIZE=32768
GASNET_FLAGS +=
# For Point and Rect typedefs
-CC_FLAGS += -std=c++11
-NVCC_FLAGS += -std=c++11
-HIPCC_FLAGS += -std=c++11
+CC_FLAGS += -std=c++17
+NVCC_FLAGS += -std=c++17
+HIPCC_FLAGS += -std=c++17
+
+LD_FLAGS += -L$(FF_HOME)/deps/tokenizers-cpp/example/tokenizers -ltokenizers_cpp -ltokenizers_c -L$(FF_HOME)/deps/tokenizers-cpp/example/tokenizers/sentencepiece/src -lsentencepiece
ifeq ($(strip $(FF_USE_NCCL)), 1)
INC_FLAGS += -I$(MPI_HOME)/include -I$(NCCL_HOME)/include
diff --git a/INSTALL.md b/INSTALL.md
index d2e3c1d2f6..1734319540 100644
--- a/INSTALL.md
+++ b/INSTALL.md
@@ -1,4 +1,4 @@
-# Installing FlexFlow
+# Building from source
To build and install FlexFlow, follow the instructions below.
## 1. Download the source code
@@ -30,7 +30,7 @@ If you are planning to build the Python interface, you will need to install seve
The `conda` environment can be created and activated as:
```
-conda env create -f conda/environment.yml
+conda env create -f conda/flexflow.yml
conda activate flexflow
```
@@ -42,7 +42,7 @@ You can configure a FlexFlow build by running the `config/config.linux` file in
3. `FF_CUDA_ARCH` is used to set the architecture of targeted GPUs, for example, the value can be 60 if the GPU architecture is Pascal. To build for more than one architecture, pass a list of comma separated values (e.g. `FF_CUDA_ARCH=70,75`). To compile FlexFlow for all GPU architectures that are detected on the machine, pass `FF_CUDA_ARCH=autodetect` (this is the default value, so you can also leave `FF_CUDA_ARCH` unset. If you want to build for all GPU architectures compatible with FlexFlow, pass `FF_CUDA_ARCH=all`. **If your machine does not have any GPU, you have to set FF_CUDA_ARCH to at least one valid architecture code (or `all`)**, since the compiler won't be able to detect the architecture(s) automatically.
4. `FF_USE_PYTHON` controls whether to build the FlexFlow Python interface.
5. `FF_USE_NCCL` controls whether to build FlexFlow with NCCL support. By default, it is set to ON.
-6. `FF_LEGION_NETWORKS` is used to enable distributed run of FlexFlow. If you want to run FlexFlow on multiple nodes, follow instructions in [MULTI-NODE.md](MULTI-NODE.md) and set the corresponding parameters as follows:
+6. `FF_LEGION_NETWORKS` is used to enable distributed run of FlexFlow. If you want to run FlexFlow on multiple nodes, follow instructions in the [Multinode tutorial](https://flexflow.readthedocs.io/en/latest/multinode.html) and set the corresponding parameters as follows:
* To build FlexFlow with GASNet, set `FF_LEGION_NETWORKS=gasnet` and `FF_GASNET_CONDUIT` as a specific conduit (e.g. `ibv`, `mpi`, `udp`, `ucx`) in `config/config.linux` when configuring the FlexFlow build. Set `FF_UCX_URL` when you want to customize the URL to download UCX.
* To build FlexFlow with native UCX, set `FF_LEGION_NETWORKS=ucx` in `config/config.linux` when configuring the FlexFlow build. Set `FF_UCX_URL` when you want to customize the URL to download UCX.
8. `FF_BUILD_EXAMPLES` controls whether to build all C++ example programs.
@@ -85,10 +85,11 @@ export FF_HOME=/path/to/FlexFlow
### Run FlexFlow Python examples
The Python examples are in the [examples/python](https://github.com/flexflow/FlexFlow/tree/master/examples/python). The native, Keras integration and PyTorch integration examples are listed in `native`, `keras` and `pytorch` respectively.
-To run the Python examples, you have two options: you can use the `flexflow_python` interpreter, available in the `build` folder, or you can use the native Python interpreter. If you choose to use the native Python interpreter, you should either install FlexFlow, or, if you prefer to build without installing, export the following flags:
+To run the Python examples, you have two options: you can use the `flexflow_python` interpreter, available in the `build` folder, or you can use the native Python interpreter. If you choose to use the native Python interpreter, you should either install FlexFlow, or, if you prefer to build without installing, export the required environment flags by running the following command (edit the path if your build folder is not named `build`):
-* `export PYTHONPATH="${FF_HOME}/python:${FF_HOME}/build/deps/legion/bindings/python:${PYTHONPATH}"`
-* `export LD_LIBRARY_PATH="${FF_HOME}/build:${FF_HOME}/build/deps/legion/lib:${LD_LIBRARY_PATH}"`
+```
+source ./build/set_python_envs.sh
+```
**We recommend that you run the** `mnist_mlp` **test under** `native` **using the following cmd to check if FlexFlow has been installed correctly:**
@@ -96,7 +97,7 @@ To run the Python examples, you have two options: you can use the `flexflow_pyth
cd "$FF_HOME"
./python/flexflow_python examples/python/native/mnist_mlp.py -ll:py 1 -ll:gpu 1 -ll:fsize -ll:zsize
```
-A script to run all the Python examples is available at `tests/multi_gpu_tests.sh`
+A script to run all the Python examples is available at `tests/training_tests.sh`
### Run FlexFlow C++ examples
diff --git a/MANIFEST.in b/MANIFEST.in
new file mode 100644
index 0000000000..64f20c1890
--- /dev/null
+++ b/MANIFEST.in
@@ -0,0 +1,4 @@
+graft deps
+recursive-exclude . .git
+prune triton
+include python/flexflow/version.txt
diff --git a/MULTI-NODE.md b/MULTI-NODE.md
index 78edba62c0..28f2eab8ed 100644
--- a/MULTI-NODE.md
+++ b/MULTI-NODE.md
@@ -1,29 +1,90 @@
-# Running FlexFlow On Multiple Nodes
-To build, install, and run FlexFlow on multiple nodes, follow the instructions below. We take AWS as an example to present the instructions.
+# Running FlexFlow on Multiple Nodes
+
+To build, install, and run FlexFlow on multiple nodes, follow the instructions below. We will use AWS as an example to present the instructions.
## 1. Spin up instances
-Spin up multiple instances with GPU support. We choose p3.2xlarge with [Deep Learning AMI GPU PyTorch 1.13.1 (Ubuntu 20.04)](https://aws.amazon.com/releasenotes/aws-deep-learning-ami-neuron-pytorch-1-13-ubuntu-20-04/) to simplify the procedure.
-Place the instances in a [placement group](https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/placement-groups.html) which utilizes `cluster` as strategy to achieve the low-latency network performance.
+Spin up multiple instances with GPU support. For AWS, we recommend using p3.2xlarge with [Deep Learning AMI GPU PyTorch 1.13.1 (Ubuntu 20.04)](https://aws.amazon.com/releasenotes/aws-deep-learning-ami-neuron-pytorch-1-13-ubuntu-20-04/) to simplify the procedure.
+
+Place the instances in a [placement group](https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/placement-groups.html) that utilizes the `cluster` strategy to achieve low-latency network performance.
-To enable the communications between instances, you should attach the same security group to all instances and add an inbound rule in the security group to enable all the incoming traffic from the same security group. An example inbound rule is as follows:
+To enable communication between instances, attach the same security group to all instances and add an inbound rule in the security group to allow all incoming traffic from the same security group. An example inbound rule is as follows:
```
Type: Custom TCP
Port range: 1 - 65535
Source: Custom (use the security group ID)
```
-## 2. Configure and build FlexFlow
-Follow steps 1 to 5 in [INSTALL.md](INSTALL.md) to download the source code, install system dependencies, install the Python dependencies, configure the FlexFlow build, and build FlexFlow **on each instance**. You can skip the step 2 (Install system dependencies) if you have spun up instances with Deep Learning AMI which comes preconfigured with CUDA. Otherwise, you need to install system dependencies on each instance.
+You can also use your own GPU cluster, as long as all machines are interconnected with a low-latency network.
+
+## 2. Configure and build UCX
+
+Find the latest UCX source release at https://github.com/openucx/ucx/releases. As of this writing, the latest release was UCX 1.15.0, available at https://github.com/openucx/ucx/releases/download/v1.15.0/ucx-1.15.0.tar.gz. Extract it, switch to the directory containing the UCX source code, and run:
+
+```
+CUDA_PATH=/usr/local/cuda
+PREFIX=$PWD/install
+./contrib/configure-release-mt --prefix="$PREFIX" --without-go --enable-mt --with-cuda="$CUDA_PATH"
+make -j install
+echo "$PREFIX"
+```
+
+Adjust `CUDA_PATH` in the commands above if your CUDA installation is not at `/usr/local/cuda`. If you don't know the path, try `which nvcc`. Take note of the UCX installation path echoed by the last command.
+
+## 3. Configure and build FlexFlow
+
+Follow steps 1 to 5 in [INSTALL.md](INSTALL.md#1-download-the-source-code) to download the source code, install system dependencies, install the Python dependencies, configure the FlexFlow build, and build FlexFlow **on each instance at the same path**. Alternatively, you can use NFS to mount the home directory on each instance so that only a single build is necessary.
+
+You can skip step 2 (Install system dependencies) if you have spun up instances with Deep Learning AMI, which comes preconfigured with CUDA. Otherwise, you need to install system dependencies on each instance.
+
+For step 4 (Configuring the FlexFlow build), here are the parameters that need to be configured:
+* Set `FF_LEGION_NETWORKS=ucx`
+* Set `UCX_DIR` to the UCX installation path mentioned in [Configure and build UCX](#2-configure-and-build-ucx)
+
+Other configuration options are optional.
+
+## 4. Configure MPI
+
+MPI is an easy way to launch FlexFlow across all instances simultaneously and set up communication between them.
+
+To use MPI, enable non-interactive `ssh` logins between instances. This can be done by referring to the [Open MPI documentation](https://docs.open-mpi.org/en/v5.0.0rc9/running-apps/ssh.html). Here are the detailed steps:
+
+1. Choose one of the nodes as the main instance and create a public/private key pair on the instance. This will be the instance from which you launch MPI commands. Run the following command:
+
+```
+ssh-keygen -t ed25519
+```
+
+This will create a public key at `~/.ssh/id_ed25519.pub` and a private key at `~/.ssh/id_ed25519`.
-## 3. Test FlexFlow
-Follow the step 6 in [INSTALL.md](INSTALL.md) to set environment variables.
+2. Append the contents of the **public key** to `~/.ssh/authorized_keys` on all machines (if the file does not exist, create one). Execute the following command on **all instances**:
-A script to run a Python example on multiple nodes is available at `scripts/mnist_mlp_run.sh` and you can run the script using [`mpirun`](https://www.open-mpi.org/doc/current/man1/mpirun.1.php) or [`srun`](https://slurm.schedmd.com/srun.html). For example, to run the script with MPI, you need to first enable non-interactive `ssh` logins (refer to [Open MPI doc](https://docs.open-mpi.org/en/v5.0.0rc9/running-apps/ssh.html)) between instances and then run:
```
-mpirun --host :,: -np ./scripts/mnist_mlp_run.sh
+mkdir -p ~/.ssh
+echo '<public key>' >> ~/.ssh/authorized_keys
```
-If you encounter some errors like `WARNING: Open MPI accepted a TCP connection from what appears to be a
-another Open MPI process but cannot find a corresponding process
-entry for that peer.`, add the parameter `--mca btl_tcp_if_include` in the `mpirun` command. (refer to [stack overflow question](https://stackoverflow.com/questions/15072563/running-mpi-on-two-hosts))
\ No newline at end of file
+Replace `<public key>` with the public key from `~/.ssh/id_ed25519.pub` on the main instance. It should be a single line containing a string like:
+```
+ssh-ed25519 AAAAC3NzaC1lZDI1NTE5AAAAIOy5NKYdE8Cwgid59rx6xMqyj9vLaWuXIwy/BSRiK4su instance
+```
+
+3. Create a hostfile at `~/hostfile`, with one line for each instance (add more lines if you have more instances):
+
+```
+<ip address of instance 1> slots=<number of slots on instance 1>
+<ip address of instance 2> slots=<number of slots on instance 2>
+```
+
+`<number of slots on instance 1>` and `<number of slots on instance 2>` refer to the number of slots available on each instance, respectively. Set them to 1 if each instance has one GPU.
+
+4. SSH into each host and make sure you can log into them. It may ask you to verify the public key. Make sure to trust the public key so that it doesn't ask you again.
+
+5. Test MPI by running `mpirun -N 1 --hostfile ~/hostfile hostname`. It should display the hostnames of all your nodes. If you encounter any errors like `WARNING: Open MPI accepted a TCP connection from what appears to be another Open MPI process but cannot find a corresponding process entry for that peer.`, add the parameter `--mca btl_tcp_if_include` to the `mpirun` command (refer to [this Stack Overflow question](https://stackoverflow.com/questions/15072563/running-mpi-on-two-hosts)).
+
+## 5. Test FlexFlow
+
+Follow step 6 in the [Build from source guide](https://flexflow.readthedocs.io/en/latest/installation.html) to set the environment variables.
+
+A script to run a Python example on multiple nodes is available at `scripts/mnist_mlp_run.sh`. Run the script to test FlexFlow on MNIST MLP training; you can adjust it to run any other program. Make sure to change the `FLEXFLOW_DIR` and `UCX_DIR` variables in it to the appropriate paths.
+
diff --git a/README.md b/README.md
index 9ad900fb3c..95790a90e5 100644
--- a/README.md
+++ b/README.md
@@ -1,72 +1,54 @@
-# FlexFlow
-![build](https://github.com/flexflow/flexflow/workflows/build/badge.svg?branch=master) ![gpu tests](https://github.com/flexflow/flexflow/workflows/gpu-ci/badge.svg?branch=master) ![multinode gpu tests](https://github.com/flexflow/flexflow/workflows/multinode-test/badge.svg?branch=master) ![docker](https://github.com/flexflow/flexflow/workflows/docker-build/badge.svg?branch=master) ![pip](https://github.com/flexflow/flexflow/workflows/pip-install/badge.svg?branch=master) ![shell-check](https://github.com/flexflow/flexflow/workflows/Shell%20Check/badge.svg?branch=master) ![clang-format](https://github.com/flexflow/flexflow/workflows/clang-format%20Check/badge.svg?branch=master) [![Documentation Status](https://readthedocs.org/projects/flexflow/badge/?version=latest)](https://flexflow.readthedocs.io/en/latest/?badge=latest)
+# FlexFlow: Low-Latency, High-Performance Training and Serving
+![build](https://github.com/flexflow/flexflow/workflows/build/badge.svg?branch=inference) ![gpu tests](https://github.com/flexflow/flexflow/workflows/gpu-ci/badge.svg?branch=inference) ![multinode gpu tests](https://github.com/flexflow/flexflow/workflows/multinode-test/badge.svg?branch=master) ![docker](https://github.com/flexflow/flexflow/workflows/docker-build/badge.svg?branch=inference) ![pip](https://github.com/flexflow/flexflow/workflows/pip-install/badge.svg?branch=inference) ![shell-check](https://github.com/flexflow/flexflow/workflows/Shell%20Check/badge.svg?branch=inference) ![clang-format](https://github.com/flexflow/flexflow/workflows/clang-format%20Check/badge.svg?branch=inference) [![Documentation Status](https://readthedocs.org/projects/flexflow/badge/?version=latest)](https://flexflow.readthedocs.io/en/latest/?badge=latest)
-FlexFlow is a deep learning framework that accelerates distributed DNN training by automatically searching for efficient parallelization strategies. FlexFlow provides a drop-in replacement for PyTorch and TensorFlow Keras. Running existing PyTorch and Keras programs in FlexFlow only requires [a few lines of changes to the program](https://flexflow.ai/keras).
-## Install FlexFlow
-To install FlexFlow from source code, please read the [instructions](https://flexflow.readthedocs.io/en/latest/installation.html). If you would like to quickly try FlexFlow, we also provide pre-built Docker packages for several versions of CUDA and for the `hip_rocm` backend, together with [Dockerfiles](./docker) if you wish to build the containers manually. More info on the Docker images can be found [here](./docker/README.md). You can also use `conda` to install the FlexFlow Python package (coming soon).
+---
-## PyTorch Support
-Users can also use FlexFlow to optimize the parallelization performance of existing PyTorch models in two steps. First, a PyTorch model can be exported to the FlexFlow model format using `flexflow.torch.fx.torch_to_flexflow`.
-```python
-import torch
-import flexflow.torch.fx as fx
+## News 🔥:
-model = MyPyTorchModule()
-fx.torch_to_flexflow(model, "mymodel.ff")
-```
+* [09/02/2023] Added AMD GPU support and released Docker images for ROCm 5.3-5.6
+* [08/16/2023] Added StarCoder model support
+* [08/14/2023] Released Docker images for different CUDA versions
+
+## Install FlexFlow
-Second, a FlexFlow program can directly import a previously saved PyTorch model and [autotune](https://www.usenix.org/conference/osdi22/presentation/unger) the parallelization performance for a given parallel machine.
-```python
-from flexflow.pytorch.model import PyTorchModel
+### Requirements
+* OS: Linux
+* GPU backend: Hip-ROCm or CUDA
+ * CUDA version: 10.2 – 12.0
+ * NVIDIA compute capability: 6.0 or higher
+* Python: 3.6 or higher
+* Package dependencies: [see here](https://github.com/flexflow/FlexFlow/blob/inference/requirements.txt)
-def top_level_task():
- torch_model = PyTorchModel("mymodel.ff")
- output_tensor = torch_model.apply(ffmodel, input_tensor)
- ## Model compilation
- ffmodel.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])
- ## Model training
- (x_train, y_train) = cifar10.load_data()
- ffmodel.fit(x_train, y_train, epochs=30)
+### Install with pip
+You can install FlexFlow using pip:
+
+```bash
+pip install flexflow
```
-**More FlexFlow PyTorch examples**: see the [pytorch examples folder](https://github.com/flexflow/FlexFlow/tree/master/examples/python/pytorch).
+### Try it in Docker
+If you run into any issue during the install, or if you would like to use the C++ API without needing to install from source, you can also use our pre-built Docker package for different CUDA versions and the `hip_rocm` backend. To download and run our pre-built Docker container:
+
+```bash
+docker run --gpus all -it --rm --shm-size=8g ghcr.io/flexflow/flexflow-cuda-12.0:latest
+```
-## TensorFlow Keras and ONNX Support
-FlexFlow prioritizes PyTorch compatibility, but also includes frontends for [Tensorflow Keras](./docs/source/keras.rst) and [ONNX](./docs/source/onnx.rst) models.
+To download a Docker container for a backend other than CUDA v12.0, you can replace the `cuda-12.0` suffix with any of the following backends: `cuda-11.1`, `cuda-11.6`, `cuda-11.7`, `cuda-11.8`, `cuda-12.1`, and `hip_rocm-5.3`, `hip_rocm-5.4`, `hip_rocm-5.5`, `hip_rocm-5.6`. More info on the Docker images, with instructions to build a new image from source, or run with additional configurations, can be found [here](./docker/README.md).
-## C++ Interface
-For users that prefer to program in C/C++. FlexFlow supports a C++ program inference that is equivalent to its Python APIs.
+### Build from source
-**More FlexFlow C++ examples**: see the [C++ examples folder](https://github.com/flexflow/FlexFlow/tree/master/examples/cpp).
+You can install FlexFlow Serve from source code by building the inference branch of FlexFlow. Please follow these [instructions](https://flexflow.readthedocs.io/en/latest/installation.html).
-## Command-Line Flags
-In addition to setting runtime configurations in a FlexFlow Python/C++ program, the FlexFlow runtime also accepts command-line arguments for various runtime parameters:
+## Get Started!
-FlexFlow training flags:
-* `-e` or `--epochs`: number of total epochs to run (default: 1)
-* `-b` or `--batch-size`: global batch size in each iteration (default: 64)
-* `-p` or `--print-freq`: print frequency (default: 10)
-* `-d` or `--dataset`: path to the training dataset. If not set, synthetic data is used to conduct training.
+To get started, check out the quickstart guides below for the FlexFlow training and serving libraries.
-Legion runtime flags:
-* `-ll:gpu`: number of GPU processors to use on each node (default: 0)
-* `-ll:fsize`: size of device memory on each GPU (in MB)
-* `-ll:zsize`: size of zero-copy memory (pinned DRAM with direct GPU access) on each node (in MB). This is used for prefecthing training images from disk.
-* `-ll:cpu`: number of data loading workers (default: 4)
-* `-ll:util`: number of utility threads to create per process (default: 1)
-* `-ll:bgwork`: number of background worker threads to create per process (default: 1)
+* [FlexFlow Train](./TRAIN.md)
+* [FlexFlow Serve](./SERVE.md)
-Performance auto-tuning flags:
-* `--search-budget` or `--budget`: the number of iterations for the MCMC search (default: 0)
-* `--search-alpha` or `--alpha`: a hyper-parameter for the search procedure (default: 0.05)
-* `--export-strategy` or `--export`: path to export the best discovered strategy (default: None)
-* `--import-strategy` or `--import`: path to import a previous saved strategy (default: None)
-* `--enable-parameter-parallel`: allow FlexFlow to explore parameter parallelism for performance auto-tuning. (By default FlexFlow only considers data and model parallelism.)
-* `--enable-attribute-parallel`: allow FlexFlow to explore attribute parallelism for performance auto-tuning. (By default FlexFlow only considers data and model parallelism.)
-For performance tuning related flags: see [performance autotuning](https://flexflow.ai/search).
## Contributing
@@ -75,6 +57,14 @@ Please let us know if you encounter any bugs or have any suggestions by [submitt
We welcome all contributions to FlexFlow from bug fixes to new features and extensions.
## Citations
+
+**FlexFlow Serve:**
+
+* Xupeng Miao, Gabriele Oliaro, Zhihao Zhang, Xinhao Cheng, Zeyu Wang, Rae Ying Yee Wong, Alan Zhu, Lijie Yang, Xiaoxiang Shi, Chunan Shi, Zhuoming Chen, Daiyaan Arfeen, Reyna Abhyankar, Zhihao Jia. [SpecInfer: Accelerating Generative Large Language Model Serving with Speculative Inference and Token Tree Verification](https://arxiv.org/abs/2305.09781). In ArXiV, May 2023.
+
+
+**FlexFlow Train:**
+
* Colin Unger, Zhihao Jia, Wei Wu, Sina Lin, Mandeep Baines, Carlos Efrain Quintero Narvaez, Vinay Ramakrishnaiah, Nirmal Prajapati, Pat McCormick, Jamaludin Mohd-Yusof, Xi Luo, Dheevatsa Mudigere, Jongsoo Park, Misha Smelyanskiy, and Alex Aiken. [Unity: Accelerating DNN Training Through Joint Optimization of Algebraic Transformations and Parallelization](https://www.usenix.org/conference/osdi22/presentation/unger). In Proceedings of the Symposium on Operating Systems Design and Implementation (OSDI), July 2022.
* Zhihao Jia, Matei Zaharia, and Alex Aiken. [Beyond Data and Model Parallelism for Deep Neural Networks](https://cs.stanford.edu/~zhihao/papers/sysml19a.pdf). In Proceedings of the 2nd Conference on Machine Learning and Systems (MLSys), April 2019.
@@ -86,3 +76,4 @@ FlexFlow is developed and maintained by teams at CMU, Facebook, Los Alamos Natio
## License
FlexFlow uses Apache License 2.0.
+
diff --git a/SERVE.md b/SERVE.md
new file mode 100644
index 0000000000..9472d50a62
--- /dev/null
+++ b/SERVE.md
@@ -0,0 +1,275 @@
+# FlexFlow Serve: Low-Latency, High-Performance LLM Serving
+
+
+## What is FlexFlow Serve
+
+The high computational and memory requirements of generative large language
+models (LLMs) make it challenging to serve them quickly and cheaply.
+FlexFlow Serve is an open-source compiler and distributed system for
+__low latency__, __high performance__ LLM serving. FlexFlow Serve outperforms
+existing systems by 1.3-2.0x for single-node, multi-GPU inference and by
+1.4-2.4x for multi-node, multi-GPU inference.
+
+
+
+
+
+
+## Quickstart
+The following example shows how to deploy an LLM using FlexFlow Serve and accelerate its serving using [speculative inference](#speculative-inference). First, we import `flexflow.serve` and initialize the FlexFlow Serve runtime. Note that `memory_per_gpu` and `zero_copy_memory_per_node` specify the size of device memory on each GPU (in MB) and zero-copy memory on each node (in MB), respectively.
+We need to make sure the aggregated GPU memory and zero-copy memory are **both** sufficient to store LLM parameters in non-offloading serving. FlexFlow Serve combines tensor and pipeline model parallelism for LLM serving.
+```python
+import flexflow.serve as ff
+
+ff.init(
+ num_gpus=4,
+ memory_per_gpu=14000,
+ zero_copy_memory_per_node=30000,
+ tensor_parallelism_degree=4,
+ pipeline_parallelism_degree=1
+ )
+```
+Second, we specify the LLM to serve and the SSM(s) used to accelerate LLM serving. The list of supported LLMs and SSMs is available at [supported models](#supported-llms-and-ssms).
+```python
+# Specify the LLM
+llm = ff.LLM("meta-llama/Llama-2-7b-hf")
+
+# Specify a list of SSMs (just one in this case)
+ssms=[]
+ssm = ff.SSM("JackFram/llama-68m")
+ssms.append(ssm)
+```
+Next, we declare the generation configuration and compile both the LLM and SSMs. Note that all SSMs should run in the **beam search** mode, and the LLM should run in the **tree verification** mode to verify the speculated tokens from SSMs.
+```python
+# Create the sampling configs
+generation_config = ff.GenerationConfig(
+ do_sample=False, temperature=0.9, topp=0.8, topk=1
+)
+
+# Compile the SSMs for inference and load the weights into memory
+for ssm in ssms:
+ ssm.compile(generation_config)
+
+# Compile the LLM for inference and load the weights into memory
+llm.compile(generation_config, ssms=ssms)
+```
+Finally, we call `llm.generate` to generate the output, which is organized as a list of `GenerationResult` objects that include the output tokens and text.
+```python
+result = llm.generate("Here are some travel tips for Tokyo:\n")
+```
+
+### Incremental decoding
+
+
+
+
+```python
+
+import flexflow.serve as ff
+
+# Initialize the FlexFlow runtime. ff.init() takes a dictionary (as a positional argument) or named key-value parameters
+ff.init(
+ num_gpus=4,
+ memory_per_gpu=14000,
+ zero_copy_memory_per_node=30000,
+ tensor_parallelism_degree=4,
+ pipeline_parallelism_degree=1
+ )
+
+# Create the FlexFlow LLM
+llm = ff.LLM("meta-llama/Llama-2-7b-hf")
+
+# Create the sampling configs
+generation_config = ff.GenerationConfig(
+ do_sample=True, temperature=0.9, topp=0.8, topk=1
+)
+
+# Compile the LLM for inference and load the weights into memory
+llm.compile(generation_config)
+
+# Generation begins!
+result = llm.generate("Here are some travel tips for Tokyo:\n")
+
+```
+
+
+
+### C++ interface
+If you'd like to use the C++ interface (mostly used for development and benchmarking purposes), you should install from source, and follow the instructions below.
+
+
+
+
+#### Downloading models
+
+Before running FlexFlow Serve, you should manually download the LLM and SSM(s) of interest using the [inference/utils/download_hf_model.py](https://github.com/flexflow/FlexFlow/blob/inference/inference/utils/download_hf_model.py) script (see example below). By default, the script will download all of a model's assets (weights, configs, tokenizer files, etc.) into the cache folder `~/.cache/flexflow`. If you would like to use a different folder, you can specify it via the `--cache-folder` parameter.
+
+```bash
+python3 ./inference/utils/download_hf_model.py ...
+```
+
+#### Running the C++ examples
+A C++ example is available at [this folder](../inference/spec_infer/). After building FlexFlow Serve, the executable will be available at `/build_dir/inference/spec_infer/spec_infer`. You can use the following command-line arguments to run FlexFlow Serve:
+
+* `-ll:gpu`: number of GPU processors to use on each node for serving an LLM (default: 0)
+* `-ll:fsize`: size of device memory on each GPU in MB
+* `-ll:zsize`: size of zero-copy memory (pinned DRAM with direct GPU access) in MB. FlexFlow Serve keeps a replica of the LLM parameters on zero-copy memory, and therefore requires that the zero-copy memory is sufficient for storing the LLM parameters.
+* `-llm-model`: the LLM model ID from HuggingFace (e.g. "meta-llama/Llama-2-7b-hf")
+* `-ssm-model`: the SSM model ID from HuggingFace (e.g. "JackFram/llama-160m"). You can use multiple `-ssm-model`s in the command line to launch multiple SSMs.
+* `-cache-folder`: the folder where the downloaded model assets are cached (default: `~/.cache/flexflow`)
+* `-data-parallelism-degree`, `-tensor-parallelism-degree` and `-pipeline-parallelism-degree`: parallelization degrees in the data, tensor, and pipeline dimensions. Their product must equal the number of GPUs available on the machine. When any of the three parallelism degree arguments is omitted, a default value of 1 will be used.
+* `-prompt`: (optional) path to the prompt file. FlexFlow Serve expects a JSON-formatted file for prompts. In addition, requests can also be registered programmatically through the FlexFlow Serve API.
+* `-output-file`: (optional) filepath to use to save the output of the model, together with the generation latency
+
+For example, you can use the following command line to serve a LLaMA-7B or LLaMA-13B model on 4 GPUs and use a collectively boost-tuned LLaMA-68M model for speculative inference.
+
+```bash
+./inference/spec_infer/spec_infer -ll:gpu 4 -ll:cpu 4 -ll:fsize 14000 -ll:zsize 30000 -llm-model meta-llama/Llama-2-7b-hf -ssm-model JackFram/llama-68m -prompt /path/to/prompt.json -tensor-parallelism-degree 4 --fusion
+```
+
+
+## Speculative Inference
+A key technique that enables FlexFlow Serve to accelerate LLM serving is speculative
+inference, which combines various collectively boost-tuned small speculative
+models (SSMs) to jointly predict the LLM’s outputs; the predictions are organized as a
+token tree, whose nodes each represent a candidate token sequence. The correctness
+of all candidate token sequences represented by a token tree is verified against the
+LLM’s output in parallel using a novel tree-based parallel decoding mechanism.
+FlexFlow Serve uses an LLM as a token tree verifier instead of an incremental decoder,
+which largely reduces the end-to-end inference latency and computational requirement
+for serving generative LLMs while provably preserving model quality.
+
+
+
+
+
+### Supported LLMs and SSMs
+
+FlexFlow Serve currently supports all HuggingFace models with the following architectures:
+* `LlamaForCausalLM` / `LLaMAForCausalLM` (e.g. LLaMA/LLaMA-2, Guanaco, Vicuna, Alpaca, ...)
+* `OPTForCausalLM` (models from the OPT family)
+* `RWForCausalLM` (models from the Falcon family)
+* `GPTBigCodeForCausalLM` (models from the Starcoder family)
+
+Below is a list of models that we have explicitly tested and for which an SSM may be available:
+
+| Model | Model id on HuggingFace | Boost-tuned SSMs |
+| :---- | :---- | :---- |
+| LLaMA-7B | meta-llama/Llama-2-7b-hf | [LLaMA-68M](https://huggingface.co/JackFram/llama-68m) , [LLaMA-160M](https://huggingface.co/JackFram/llama-160m) |
+| LLaMA-13B | decapoda-research/llama-13b-hf | [LLaMA-68M](https://huggingface.co/JackFram/llama-68m) , [LLaMA-160M](https://huggingface.co/JackFram/llama-160m) |
+| LLaMA-30B | decapoda-research/llama-30b-hf | [LLaMA-68M](https://huggingface.co/JackFram/llama-68m) , [LLaMA-160M](https://huggingface.co/JackFram/llama-160m) |
+| LLaMA-65B | decapoda-research/llama-65b-hf | [LLaMA-68M](https://huggingface.co/JackFram/llama-68m) , [LLaMA-160M](https://huggingface.co/JackFram/llama-160m) |
+| LLaMA-2-7B | meta-llama/Llama-2-7b-hf | [LLaMA-68M](https://huggingface.co/JackFram/llama-68m) , [LLaMA-160M](https://huggingface.co/JackFram/llama-160m) |
+| LLaMA-2-13B | meta-llama/Llama-2-13b-hf | [LLaMA-68M](https://huggingface.co/JackFram/llama-68m) , [LLaMA-160M](https://huggingface.co/JackFram/llama-160m) |
+| LLaMA-2-70B | meta-llama/Llama-2-70b-hf | [LLaMA-68M](https://huggingface.co/JackFram/llama-68m) , [LLaMA-160M](https://huggingface.co/JackFram/llama-160m) |
+| OPT-6.7B | facebook/opt-6.7b | [OPT-125M](https://huggingface.co/facebook/opt-125m) |
+| OPT-13B | facebook/opt-13b | [OPT-125M](https://huggingface.co/facebook/opt-125m) |
+| OPT-30B | facebook/opt-30b | [OPT-125M](https://huggingface.co/facebook/opt-125m) |
+| OPT-66B | facebook/opt-66b | [OPT-125M](https://huggingface.co/facebook/opt-125m) |
+| Falcon-7B | tiiuae/falcon-7b | |
+| Falcon-40B | tiiuae/falcon-40b | |
+| StarCoder-15.5B | bigcode/starcoder | |
+
+
+### CPU Offloading
+FlexFlow Serve also offers offloading-based inference for running large models (e.g., llama-7B) on a single GPU. CPU offloading stores selected tensors in CPU memory and only copies them to the GPU when they are needed for computation. Currently, we selectively offload only the largest weight tensors (the weight tensors of the Linear and Attention operators). Moreover, since the small (speculative) models occupy considerably less space and do not pose a bottleneck for GPU memory, while offloading adds extra runtime and computational cost, we only offload the weights of the large model. [TODO: update instructions] You can run the offloading example by enabling the `-offload` and `-offload-reserve-space-size` flags.
+
+### Quantization
+FlexFlow Serve supports int4 and int8 quantization. The compressed tensors are stored on the CPU side. Once copied to the GPU, these tensors are decompressed and converted back to their original precision. Please find the compressed weight files in our S3 bucket, or use [this script](../inference/utils/compress_llama_weights.py) from the [FlexGen](https://github.com/FMInference/FlexGen) project to do the compression manually. [TODO: update instructions for quantization].
+
+### Prompt Datasets
+We provide five prompt datasets for evaluating FlexFlow Serve: [Chatbot instruction prompts](https://specinfer.s3.us-east-2.amazonaws.com/prompts/chatbot.json), [ChatGPT Prompts](https://specinfer.s3.us-east-2.amazonaws.com/prompts/chatgpt.json), [WebQA](https://specinfer.s3.us-east-2.amazonaws.com/prompts/webqa.json), [Alpaca](https://specinfer.s3.us-east-2.amazonaws.com/prompts/alpaca.json), and [PIQA](https://specinfer.s3.us-east-2.amazonaws.com/prompts/piqa.json).
+
+
+
+
+## Python Interface Features and Interaction Methods
+
+FlexFlow Serve provides a comprehensive Python interface for serving with low latency and high performance. This interface facilitates the deployment and interaction with the serving platform for a variety of applications, from chatbots and prompt templates to retrieval augmented generation and API services.
+
+### Chatbot with Gradio
+
+The Python interface allows setting up a chatbot application using Gradio, enabling interactive dialogues with users through a user-friendly web interface.
+
+#### Implementation Steps
+1. **FlexFlow Initialization:** Configure and initialize FlexFlow Serve with the desired settings and the specific LLM.
+```python
+import gradio as gr
+import flexflow.serve as ff
+
+ff.init(num_gpus=2, memory_per_gpu=14000, ...)
+```
+2. **Gradio Interface Setup:** Implement a function to generate responses from user inputs and set up the Gradio Chat Interface for interaction.
+```python
+def generate_response(user_input):
+ result = llm.generate(user_input)
+ return result.output_text.decode('utf-8')
+```
+3. **Running the Interface:** Launch the Gradio interface to interact with the LLM through a web-based chat interface.
+```python
+iface = gr.ChatInterface(fn=generate_response)
+iface.launch()
+```
+4. **Shutdown:** Properly stop the FlexFlow server after interaction is complete. A complete sketch combining these steps is shown below.
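+
+Putting the steps together, a minimal end-to-end sketch might look as follows. The model name, memory sizes, and parallelism degrees are assumptions to adjust for your machine, and `llm.stop_server()` is assumed to be the shutdown call that mirrors the `llm.start_server()` used in the FastAPI example below:
+
+```python
+# Minimal end-to-end chatbot sketch (assumed settings; adjust for your machine).
+import gradio as gr
+import flexflow.serve as ff
+
+# 1. Initialize FlexFlow Serve and compile the LLM.
+ff.init(
+    num_gpus=2,
+    memory_per_gpu=14000,
+    zero_copy_memory_per_node=30000,
+    tensor_parallelism_degree=2,
+    pipeline_parallelism_degree=1,
+)
+llm = ff.LLM("meta-llama/Llama-2-7b-hf")
+generation_config = ff.GenerationConfig(do_sample=False, temperature=0.9, topp=0.8, topk=1)
+llm.compile(generation_config)
+llm.start_server()
+
+# 2. Generate a response for each user message. gr.ChatInterface passes the
+#    message and the chat history; the history is unused in this sketch.
+def generate_response(message, history):
+    result = llm.generate(message)
+    return result.output_text.decode("utf-8")
+
+# 3. Launch the web-based chat interface.
+iface = gr.ChatInterface(fn=generate_response)
+iface.launch()
+
+# 4. Stop the FlexFlow server once the interface is closed (assumed API call).
+llm.stop_server()
+```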
+
+
+
+### LangChain Use Cases
+FlexFlow Serve supports LangChain use cases, including dynamic prompt-template handling and Retrieval Augmented Generation (RAG), enabling the customization of model responses based on structured input templates and retrieved context.
+
+#### Implementation Steps
+1. **FlexFlow Initialization**: Start by initializing FlexFlow Serve with the appropriate configurations.
+2. **LLM Setup**: Compile and load the LLM for text generation.
+3. **Prompt Template/RAG Setup**: Configure prompt templates to guide the model's responses.
+4. **Response Generation**: Use the LLM with the prompt template to generate responses (see the sketch below).
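+
+Below is a hedged sketch of steps 3 and 4 using a plain Python prompt template (the full LangChain/RAG integration is not shown); it assumes `llm` has already been initialized, compiled, and started as in the chatbot example above:
+
+```python
+# Sketch: prompt-template-driven generation (assumes `llm` is already serving).
+template = (
+    "You are a helpful assistant.\n"
+    "Question: {question}\n"
+    "Answer:"
+)
+
+def answer(question):
+    prompt = template.format(question=question)
+    result = llm.generate(prompt)
+    return result.output_text.decode("utf-8")
+
+print(answer("What are some travel tips for Tokyo?"))
+```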
+
+
+### Python FastAPI Entrypoint
+FlexFlow Serve also supports deploying and managing LLMs with FastAPI, offering a RESTful API interface for generating responses from models.
+
+```python
+@app.on_event("startup")
+async def startup_event():
+ global llm
+ # Initialize and compile the LLM model
+ llm.compile(
+ generation_config,
+ # ... other params as needed
+ )
+ llm.start_server()
+
+@app.post("/generate/")
+async def generate(prompt_request: PromptRequest):
+ # ... exception handling
+ full_output = llm.generate([prompt_request.prompt])[0].output_text.decode('utf-8')
+ # ... split prompt and response text for returning results
+ return {"prompt": prompt_request.prompt, "response": full_output}
+```
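+
+Assuming the app above is served locally (for example with `uvicorn` on port 8000; the host and port here are assumptions), a client could query the endpoint like this:
+
+```python
+# Sketch: query the /generate/ endpoint of the FastAPI app shown above.
+import requests
+
+resp = requests.post(
+    "http://localhost:8000/generate/",
+    json={"prompt": "Here are some travel tips for Tokyo:\n"},
+)
+print(resp.json()["response"])
+```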
+
+
+
+
+## TODOs
+
+FlexFlow Serve is still under active development. We currently focus on the following tasks and strongly welcome all contributions from bug fixes to new features and extensions.
+
+* AMD benchmarking. We are actively working on benchmarking FlexFlow Serve on AMD GPUs and comparing it with the performance on NVIDIA GPUs.
+
+## Acknowledgements
+This project was initiated by members from CMU, Stanford, and UCSD. We will continue developing and supporting FlexFlow Serve. Please cite FlexFlow Serve as:
+
+``` bibtex
+@misc{miao2023specinfer,
+ title={SpecInfer: Accelerating Generative Large Language Model Serving with Speculative Inference and Token Tree Verification},
+ author={Xupeng Miao and Gabriele Oliaro and Zhihao Zhang and Xinhao Cheng and Zeyu Wang and Rae Ying Yee Wong and Alan Zhu and Lijie Yang and Xiaoxiang Shi and Chunan Shi and Zhuoming Chen and Daiyaan Arfeen and Reyna Abhyankar and Zhihao Jia},
+ year={2023},
+ eprint={2305.09781},
+ archivePrefix={arXiv},
+ primaryClass={cs.CL}
+}
+```
+
+## License
+FlexFlow uses Apache License 2.0.
diff --git a/TRAIN.md b/TRAIN.md
new file mode 100644
index 0000000000..1595274a4c
--- /dev/null
+++ b/TRAIN.md
@@ -0,0 +1,65 @@
+# FlexFlow Train: Distributed DNN Training with Flexible Parallelization Strategies
+FlexFlow Train is a deep learning framework that accelerates distributed DNN training by automatically searching for efficient parallelization strategies. FlexFlow Train provides a drop-in replacement for PyTorch and TensorFlow Keras. Running existing PyTorch and Keras programs in FlexFlow Train only requires [a few lines of changes to the program](https://flexflow.ai/keras).
+
+
+## PyTorch Support
+Users can use FlexFlow Train to optimize the parallelization performance of existing PyTorch models in two steps. First, a PyTorch model can be exported to the FlexFlow model format using `flexflow.torch.fx.torch_to_flexflow`.
+```python
+import torch
+import flexflow.torch.fx as fx
+
+model = MyPyTorchModule()
+fx.torch_to_flexflow(model, "mymodel.ff")
+```
+
+Second, a FlexFlow Train program can directly import a previously saved PyTorch model and [autotune](https://www.usenix.org/conference/osdi22/presentation/unger) the parallelization performance for a given parallel machine.
+
+```python
+from flexflow.pytorch.model import PyTorchModel
+
+def top_level_task():
+ torch_model = PyTorchModel("mymodel.ff")
+ output_tensor = torch_model.apply(ffmodel, input_tensor)
+ ## Model compilation
+ ffmodel.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])
+ ## Model training
+ (x_train, y_train) = cifar10.load_data()
+ ffmodel.fit(x_train, y_train, epochs=30)
+```
+
+**More FlexFlow PyTorch examples**: see the [pytorch examples folder](https://github.com/flexflow/FlexFlow/tree/master/examples/python/pytorch).
+
+## TensorFlow Keras and ONNX Support
+FlexFlow Train prioritizes PyTorch compatibility, but also includes frontends for [Tensorflow Keras](./docs/source/keras.rst) and [ONNX](./docs/source/onnx.rst) models.
+
+## C++ Interface
+For users who prefer to program in C/C++, FlexFlow Train provides a C++ programming interface equivalent to its Python APIs.
+
+**More FlexFlow C++ examples**: see the [C++ examples folder](https://github.com/flexflow/FlexFlow/tree/master/examples/cpp).
+
+
+## Command-Line Flags
+In addition to setting runtime configurations in a FlexFlow Train Python/C++ program, the FlexFlow Train runtime also accepts command-line arguments for various runtime parameters:
+
+FlexFlow training flags:
+* `-e` or `--epochs`: number of total epochs to run (default: 1)
+* `-b` or `--batch-size`: global batch size in each iteration (default: 64)
+* `-p` or `--print-freq`: print frequency (default: 10)
+* `-d` or `--dataset`: path to the training dataset. If not set, synthetic data is used to conduct training.
+
+Legion runtime flags:
+* `-ll:gpu`: number of GPU processors to use on each node (default: 0)
+* `-ll:fsize`: size of device memory on each GPU (in MB)
+* `-ll:zsize`: size of zero-copy memory (pinned DRAM with direct GPU access) on each node (in MB). This is used for prefetching training images from disk.
+* `-ll:cpu`: number of data loading workers (default: 4)
+* `-ll:util`: number of utility threads to create per process (default: 1)
+* `-ll:bgwork`: number of background worker threads to create per process (default: 1)
+
+Performance auto-tuning flags:
+* `--search-budget` or `--budget`: the number of iterations for the MCMC search (default: 0)
+* `--search-alpha` or `--alpha`: a hyper-parameter for the search procedure (default: 0.05)
+* `--export-strategy` or `--export`: path to export the best discovered strategy (default: None)
+* `--import-strategy` or `--import`: path to import a previous saved strategy (default: None)
+* `--enable-parameter-parallel`: allow FlexFlow Train to explore parameter parallelism for performance auto-tuning. (By default FlexFlow Train only considers data and model parallelism.)
+* `--enable-attribute-parallel`: allow FlexFlow Train to explore attribute parallelism for performance auto-tuning. (By default FlexFlow Train only considers data and model parallelism.)
+For performance-tuning-related flags, see [performance autotuning](https://flexflow.ai/search).
diff --git a/bootcamp_demo/ff_alexnet_cifar10.py b/bootcamp_demo/ff_alexnet_cifar10.py
deleted file mode 100644
index cb0b0e99ad..0000000000
--- a/bootcamp_demo/ff_alexnet_cifar10.py
+++ /dev/null
@@ -1,70 +0,0 @@
-#./flexflow_python $FF_HOME/bootcamp_demo/ff_alexnet_cifar10.py -ll:py 1 -ll:gpu 1 -ll:fsize 2048 -ll:zsize 12192
-
-from flexflow.core import *
-from flexflow.keras.datasets import cifar10
-from flexflow.torch.model import PyTorchModel
-from PIL import Image
-
-def top_level_task():
- ffconfig = FFConfig()
- ffconfig.parse_args()
- print("Python API batchSize(%d) workersPerNodes(%d) numNodes(%d)" %(ffconfig.get_batch_size(), ffconfig.get_workers_per_node(), ffconfig.get_num_nodes()))
- ffmodel = FFModel(ffconfig)
-
- dims_input = [ffconfig.get_batch_size(), 3, 229, 229]
- input_tensor = ffmodel.create_tensor(dims_input, DataType.DT_FLOAT)
-
- torch_model = PyTorchModel("alexnet.ff")
- output_tensors = torch_model.apply(ffmodel, [input_tensor])
-
- ffoptimizer = SGDOptimizer(ffmodel, 0.01)
- ffmodel.set_sgd_optimizer(ffoptimizer)
- ffmodel.compile(loss_type=LossType.LOSS_SPARSE_CATEGORICAL_CROSSENTROPY, metrics=[MetricsType.METRICS_ACCURACY, MetricsType.METRICS_SPARSE_CATEGORICAL_CROSSENTROPY])
- label_tensor = ffmodel.get_label_tensor()
-
- num_samples = 10000
-
- (x_train, y_train), (x_test, y_test) = cifar10.load_data(num_samples)
-
- full_input_np = np.zeros((num_samples, 3, 229, 229), dtype=np.float32)
-
- for i in range(0, num_samples):
- image = x_train[i, :, :, :]
- image = image.transpose(1, 2, 0)
- pil_image = Image.fromarray(image)
- pil_image = pil_image.resize((229,229), Image.NEAREST)
- image = np.array(pil_image, dtype=np.float32)
- image = image.transpose(2, 0, 1)
- full_input_np[i, :, :, :] = image
-
- full_input_np /= 255
-
- y_train = y_train.astype('int32')
- full_label_np = y_train
-
- dataloader_input = ffmodel.create_data_loader(input_tensor, full_input_np)
- dataloader_label = ffmodel.create_data_loader(label_tensor, full_label_np)
-
- num_samples = dataloader_input.num_samples
-
- ffmodel.init_layers()
-
- epochs = ffconfig.get_epochs()
-
- ts_start = ffconfig.get_current_time()
-
- ffmodel.fit(x=dataloader_input, y=dataloader_label, epochs=epochs)
-
- ts_end = ffconfig.get_current_time()
- run_time = 1e-6 * (ts_end - ts_start);
- print("epochs %d, ELAPSED TIME = %.4fs, THROUGHPUT = %.2f samples/s\n" %(epochs, run_time, num_samples * epochs / run_time));
-
- # perf_metrics = ffmodel.get_perf_metrics()
- # accuracy = perf_metrics.get_accuracy()
- # if accuracy < ModelAccuracy.CIFAR10_CNN.value:
- # assert 0, 'Check Accuracy'
-
-
-if __name__ == "__main__":
- print("cifar10 cnn")
- top_level_task()
diff --git a/bootcamp_demo/keras_cnn_cifar10.py b/bootcamp_demo/keras_cnn_cifar10.py
deleted file mode 100644
index a62f625449..0000000000
--- a/bootcamp_demo/keras_cnn_cifar10.py
+++ /dev/null
@@ -1,56 +0,0 @@
-#./flexflow_python $FF_HOME/bootcamp_demo/keras_cnn_cifar10.py -ll:py 1 -ll:gpu 1 -ll:fsize 2048 -ll:zsize 12192
-
-# from keras.models import Model, Sequential
-# from keras.layers import Input, Flatten, Dense, Activation, Conv2D, MaxPooling2D, Dropout
-# from keras.optimizers import SGD
-# from keras.datasets import cifar10
-# from keras import losses
-# from keras import metrics
-
-from flexflow.keras.models import Model, Sequential
-from flexflow.keras.layers import Input, Flatten, Dense, Activation, Conv2D, MaxPooling2D, Dropout
-from flexflow.keras.optimizers import SGD
-from flexflow.keras.datasets import cifar10
-from flexflow.keras import losses
-from flexflow.keras import metrics
-
-import numpy as np
-
-def top_level_task():
- num_classes = 10
-
- num_samples = 10000
-
- #(x_train, y_train), (x_test, y_test) = cifar10.load_data()
- (x_train, y_train), (x_test, y_test) = cifar10.load_data(num_samples)
-
- x_train = x_train.astype('float32')
- x_train /= 255
- y_train = y_train.astype('int32')
- print("shape: ", x_train.shape[1:])
-
- model = Sequential()
-
- model.add(Conv2D(filters=32, input_shape=(3,32,32), kernel_size=(3,3), strides=(1,1), padding="valid", activation="relu"))
- model.add(Conv2D(filters=32, kernel_size=(3,3), strides=(1,1), padding="valid", activation="relu"))
- model.add(MaxPooling2D(pool_size=(2,2), strides=(2,2), padding="valid"))
- model.add(Conv2D(filters=64, kernel_size=(3,3), strides=(1,1), padding="valid", activation="relu"))
- model.add(Conv2D(filters=64, kernel_size=(3,3), strides=(1,1), padding="valid"))
- model.add(Activation("relu"))
- model.add(MaxPooling2D(pool_size=(2,2), strides=(2,2), padding="valid"))
- model.add(Flatten())
- model.add(Dense(512))
- model.add(Activation("relu"))
- model.add(Dropout(0.5))
- model.add(Dense(num_classes))
- model.add(Activation("softmax"))
-
- opt = SGD(learning_rate=0.01)
- model.compile(optimizer=opt, loss='sparse_categorical_crossentropy', metrics=['accuracy', 'sparse_categorical_crossentropy'])
- print(model.summary())
-
- model.fit(x_train, y_train, batch_size=64, epochs=4)
-
-if __name__ == "__main__":
- print("Functional API, cifar10 cnn")
- top_level_task()
\ No newline at end of file
diff --git a/bootcamp_demo/torch_alexnet_cifar10.py b/bootcamp_demo/torch_alexnet_cifar10.py
deleted file mode 100644
index 394161c5a3..0000000000
--- a/bootcamp_demo/torch_alexnet_cifar10.py
+++ /dev/null
@@ -1,44 +0,0 @@
-#./flexflow_python $FF_HOME/bootcamp_demo/torch_alexnet_cifar10.py -ll:py 1 -ll:gpu 1 -ll:fsize 2048 -ll:zsize 12192
-
-# https://github.com/pytorch/vision/blob/master/torchvision/models/alexnet.py
-
-import torch.nn as nn
-import torch
-import flexflow.torch.fx as fx
-import torchvision.models as models
-
-class AlexNet(nn.Module):
- def __init__(self, num_classes: int = 1000) -> None:
- super(AlexNet, self).__init__()
- self.features = nn.Sequential(
- nn.Conv2d(3, 64, kernel_size=11, stride=4, padding=2),
- nn.ReLU(inplace=True),
- nn.MaxPool2d(kernel_size=3, stride=2),
- nn.Conv2d(64, 192, kernel_size=5, padding=2),
- nn.ReLU(inplace=True),
- nn.MaxPool2d(kernel_size=3, stride=2),
- nn.Conv2d(192, 384, kernel_size=3, padding=1),
- nn.ReLU(inplace=True),
- nn.Conv2d(384, 256, kernel_size=3, padding=1),
- nn.ReLU(inplace=True),
- nn.Conv2d(256, 256, kernel_size=3, padding=1),
- nn.ReLU(inplace=True),
- nn.MaxPool2d(kernel_size=3, stride=2),
- )
- self.classifier = nn.Sequential(
- nn.Linear(256 * 6 * 6, 4096),
- nn.ReLU(inplace=True),
- nn.Linear(4096, 4096),
- nn.ReLU(inplace=True),
- nn.Linear(4096, num_classes),
- nn.Softmax(),
- )
-
- def forward(self, x: torch.Tensor) -> torch.Tensor:
- x = self.features(x)
- x = torch.flatten(x, 1)
- x = self.classifier(x)
- return x
-
-model = AlexNet(num_classes=10)
-fx.torch_to_flexflow(model, "alexnet.ff")
\ No newline at end of file
diff --git a/cmake/cuda.cmake b/cmake/cuda.cmake
index f4111d8ea6..45ecc1798b 100644
--- a/cmake/cuda.cmake
+++ b/cmake/cuda.cmake
@@ -13,8 +13,19 @@ if(CUDA_FOUND)
# set cuda runtime and driver lib
# override cublas and curand because the FindCUDA module may not find the correct libs
set(CUDADRV_LIBRARIES ${CUDA_TOOLKIT_ROOT_DIR}/lib64/stubs/libcuda${LIBEXT})
- set(CUDA_CUBLAS_LIBRARIES ${CUDA_TOOLKIT_ROOT_DIR}/lib64/libcublas${LIBEXT})
- set(CUDA_curand_LIBRARY ${CUDA_TOOLKIT_ROOT_DIR}/lib64/libcurand${LIBEXT})
+ if(CUBLAS_PATH)
+ set(CUBLAS_ROOT ${CUBLAS_PATH})
+ else()
+ set(CUBLAS_ROOT ${CUDA_TOOLKIT_ROOT_DIR})
+ endif()
+ set(CUDA_CUBLAS_LIBRARIES ${CUBLAS_ROOT}/lib64/libcublas${LIBEXT})
+ if(CURAND_PATH)
+ set(CURAND_ROOT ${CURAND_PATH})
+ else()
+ set(CURAND_ROOT ${CUDA_TOOLKIT_ROOT_DIR})
+ endif()
+ set(CUDA_curand_LIBRARY ${CURAND_ROOT}/lib64/libcurand${LIBEXT})
+
list(APPEND FLEXFLOW_EXT_LIBRARIES
${CUDADRV_LIBRARIES}
${CUDA_CUBLAS_LIBRARIES}
@@ -53,8 +64,12 @@ if(CUDA_FOUND)
message( STATUS "CUDA Detected CUDA_ARCH : ${DETECTED_CUDA_ARCH}" )
set(FF_CUDA_ARCH ${DETECTED_CUDA_ARCH})
# Set FF_CUDA_ARCH to the list of all GPU architectures compatible with FlexFlow
- elseif("${FF_CUDA_ARCH}" STREQUAL "all")
- set(FF_CUDA_ARCH 60,61,62,70,72,75,80,86)
+ elseif("${FF_CUDA_ARCH}" STREQUAL "all")
+ if(CUDA_VERSION VERSION_GREATER_EQUAL "11.8")
+ set(FF_CUDA_ARCH 60,61,62,70,72,75,80,86,90)
+ else()
+ set(FF_CUDA_ARCH 60,61,62,70,72,75,80,86)
+ endif()
endif()
# create CUDA_GENCODE list based on FF_CUDA_ARCH
@@ -66,6 +81,7 @@ if(CUDA_FOUND)
endforeach()
string(REGEX REPLACE "([0-9]+)" "-gencode arch=compute_\\1,code=sm_\\1" CUDA_GENCODE "${CUDA_GENCODE}")
+ set(CMAKE_CUDA_COMPILER "${CUDA_NVCC_EXECUTABLE}")
#output
message( STATUS "CUDA_VERSION: ${CUDA_VERSION}")
message( STATUS "CUDA root path : ${CUDA_TOOLKIT_ROOT_DIR}" )
@@ -76,6 +92,7 @@ if(CUDA_FOUND)
message( STATUS "CURAND libraries : ${CUDA_curand_LIBRARY}" )
message( STATUS "CUDA Arch : ${FF_CUDA_ARCH}" )
message( STATUS "CUDA_GENCODE: ${CUDA_GENCODE}")
+ message( STATUS "CMAKE_CUDA_COMPILER: ${CMAKE_CUDA_COMPILER}")
list(APPEND FLEXFLOW_INCLUDE_DIRS
${CUDA_INCLUDE_DIRS})
diff --git a/cmake/hip.cmake b/cmake/hip.cmake
new file mode 100644
index 0000000000..25f2e05e19
--- /dev/null
+++ b/cmake/hip.cmake
@@ -0,0 +1,12 @@
+if (NOT FF_HIP_ARCH STREQUAL "")
+ if (FF_HIP_ARCH STREQUAL "all")
+ set(FF_HIP_ARCH "gfx900,gfx902,gfx904,gfx906,gfx908,gfx909,gfx90a,gfx90c,gfx940,gfx1010,gfx1011,gfx1012,gfx1013,gfx1030,gfx1031,gfx1032,gfx1033,gfx1034,gfx1035,gfx1036,gfx1100,gfx1101,gfx1102,gfx1103")
+ endif()
+ string(REPLACE "," "," HIP_ARCH_LIST "${FF_HIP_ARCH}")
+endif()
+
+message(STATUS "FF_HIP_ARCH: ${FF_HIP_ARCH}")
+if(FF_GPU_BACKEND STREQUAL "hip_rocm")
+ #set(HIP_CLANG_PATH ${ROCM_PATH}/llvm/bin CACHE STRING "Path to the clang compiler by ROCM" FORCE)
+ set(GPU_TARGETS "${FF_HIP_ARCH}" CACHE STRING "The GPU TARGETs")
+endif()
diff --git a/cmake/legion.cmake b/cmake/legion.cmake
index b4cfad20e2..2afb507d3b 100644
--- a/cmake/legion.cmake
+++ b/cmake/legion.cmake
@@ -132,6 +132,10 @@ else()
set(Legion_EMBED_GASNet_VERSION "GASNet-2022.3.0" CACHE STRING "GASNet version")
set(Legion_NETWORKS "gasnetex" CACHE STRING "GASNet conduit")
set(GASNet_CONDUIT ${FF_GASNET_CONDUIT})
+ elseif("${FF_LEGION_NETWORKS}" STREQUAL "ucx")
+ set(ucx_ROOT ${UCX_PATH}/lib/cmake)
+ message(STATUS "Find ucx: ${UCX_PATH}")
+ set(Legion_NETWORKS "ucx" CACHE STRING "Enable UCX")
endif()
message(STATUS "GASNET ROOT: $ENV{GASNet_ROOT_DIR}")
set(Legion_MAX_DIM ${FF_MAX_DIM} CACHE STRING "Maximum number of dimensions")
@@ -142,8 +146,11 @@ else()
set(Legion_USE_HIP ON CACHE BOOL "enable Legion_USE_HIP" FORCE)
if (FF_GPU_BACKEND STREQUAL "hip_cuda")
set(Legion_HIP_TARGET "CUDA" CACHE STRING "Legion_HIP_TARGET CUDA" FORCE)
+ set(Legion_CUDA_ARCH ${FF_CUDA_ARCH} CACHE STRING "Legion CUDA ARCH" FORCE)
elseif(FF_GPU_BACKEND STREQUAL "hip_rocm")
set(Legion_HIP_TARGET "ROCM" CACHE STRING "Legion HIP_TARGET ROCM" FORCE)
+ set(Legion_HIP_ARCH ${FF_HIP_ARCH} CACHE STRING "Legion HIP ARCH" FORCE)
+ message(STATUS "Legion_HIP_ARCH: ${Legion_HIP_ARCH}")
endif()
endif()
set(Legion_REDOP_COMPLEX OFF CACHE BOOL "disable complex")
diff --git a/cmake/nccl.cmake b/cmake/nccl.cmake
index 04a23dcb8a..82cf3b4122 100644
--- a/cmake/nccl.cmake
+++ b/cmake/nccl.cmake
@@ -2,139 +2,88 @@ set(NCCL_NAME nccl)
# set(NCCL_CUDA_ARCH "-gencode=arch=compute_${CUDA_ARCH},code=sm_${CUDA_ARCH}")
# message("NCCL_CUDA_ARCH: ${NCCL_CUDA_ARCH}")
-set(NCCL_URL "")
-if((FF_USE_PREBUILT_NCCL OR FF_USE_ALL_PREBUILT_LIBRARIES) AND CMAKE_HOST_SYSTEM_PROCESSOR MATCHES "x86_64")
- if(LINUX_VERSION MATCHES "20.04")
- if (CUDA_VERSION VERSION_EQUAL "11.0")
- set(NCCL_URL "https://github.com/flexflow/flexflow-third-party/releases/latest/download/nccl_ubuntu-20.04_11.0.3.tar.gz")
- elseif(CUDA_VERSION VERSION_EQUAL "11.1")
- set(NCCL_URL "https://github.com/flexflow/flexflow-third-party/releases/latest/download/nccl_ubuntu-20.04_11.1.1.tar.gz")
- elseif(CUDA_VERSION VERSION_EQUAL "11.2")
- set(NCCL_URL "https://github.com/flexflow/flexflow-third-party/releases/latest/download/nccl_ubuntu-20.04_11.2.2.tar.gz")
- elseif(CUDA_VERSION VERSION_EQUAL "11.3")
- set(NCCL_URL "https://github.com/flexflow/flexflow-third-party/releases/latest/download/nccl_ubuntu-20.04_11.3.1.tar.gz")
- elseif(CUDA_VERSION VERSION_EQUAL "11.4")
- set(NCCL_URL "https://github.com/flexflow/flexflow-third-party/releases/latest/download/nccl_ubuntu-20.04_11.4.3.tar.gz")
- elseif(CUDA_VERSION VERSION_EQUAL "11.5")
- set(NCCL_URL "https://github.com/flexflow/flexflow-third-party/releases/latest/download/nccl_ubuntu-20.04_11.5.2.tar.gz")
- elseif(CUDA_VERSION VERSION_EQUAL "11.6")
- set(NCCL_URL "https://github.com/flexflow/flexflow-third-party/releases/latest/download/nccl_ubuntu-20.04_11.6.2.tar.gz")
- elseif(CUDA_VERSION VERSION_EQUAL "11.7")
- set(NCCL_URL "https://github.com/flexflow/flexflow-third-party/releases/latest/download/nccl_ubuntu-20.04_11.7.0.tar.gz")
- endif()
- elseif(LINUX_VERSION MATCHES "18.04")
- if (CUDA_VERSION VERSION_EQUAL "10.1")
- set(NCCL_URL "https://github.com/flexflow/flexflow-third-party/releases/latest/download/nccl_ubuntu-18.04_10.1.243.tar.gz")
- elseif (CUDA_VERSION VERSION_EQUAL "10.2")
- set(NCCL_URL "https://github.com/flexflow/flexflow-third-party/releases/latest/download/nccl_ubuntu-18.04_10.2.89.tar.gz")
- elseif (CUDA_VERSION VERSION_EQUAL "11.0")
- set(NCCL_URL "https://github.com/flexflow/flexflow-third-party/releases/latest/download/nccl_ubuntu-18.04_11.0.3.tar.gz")
- elseif(CUDA_VERSION VERSION_EQUAL "11.1")
- set(NCCL_URL "https://github.com/flexflow/flexflow-third-party/releases/latest/download/nccl_ubuntu-18.04_11.1.1.tar.gz")
- elseif(CUDA_VERSION VERSION_EQUAL "11.2")
- set(NCCL_URL "https://github.com/flexflow/flexflow-third-party/releases/latest/download/nccl_ubuntu-18.04_11.2.2.tar.gz")
- elseif(CUDA_VERSION VERSION_EQUAL "11.3")
- set(NCCL_URL "https://github.com/flexflow/flexflow-third-party/releases/latest/download/nccl_ubuntu-18.04_11.3.1.tar.gz")
- elseif(CUDA_VERSION VERSION_EQUAL "11.4")
- set(NCCL_URL "https://github.com/flexflow/flexflow-third-party/releases/latest/download/nccl_ubuntu-18.04_11.4.3.tar.gz")
- elseif(CUDA_VERSION VERSION_EQUAL "11.5")
- set(NCCL_URL "https://github.com/flexflow/flexflow-third-party/releases/latest/download/nccl_ubuntu-18.04_11.5.2.tar.gz")
- elseif(CUDA_VERSION VERSION_EQUAL "11.6")
- set(NCCL_URL "https://github.com/flexflow/flexflow-third-party/releases/latest/download/nccl_ubuntu-18.04_11.6.2.tar.gz")
- elseif(CUDA_VERSION VERSION_EQUAL "11.7")
- set(NCCL_URL "https://github.com/flexflow/flexflow-third-party/releases/latest/download/nccl_ubuntu-18.04_11.7.0.tar.gz")
- endif()
- endif()
+if(NCCL_PATH)
+ set(NCCL_ROOT ${NCCL_PATH})
+else()
+ # if NCCL_PATH is not set, let's try to find it in the CUDA root
+ set(NCCL_ROOT ${CUDA_TOOLKIT_ROOT_DIR})
endif()
-if(NCCL_URL)
- # Download and import pre-compiled NCCL library
- message(STATUS "Using pre-compiled NCCL library")
- message(STATUS "NCCL_URL: ${NCCL_URL}")
+find_library(NCCL_LIBRARY
+ NAMES libnccl${LIBEXT}
+ PATHS ${NCCL_ROOT} ${CUDA_ROOT}
+ PATH_SUFFIXES lib lib64
+ DOC "NCCL library." )
- include(FetchContent)
- FetchContent_Declare(${NCCL_NAME}
- URL ${NCCL_URL}
- CONFIGURE_COMMAND ""
- BUILD_COMMAND ""
- )
- FetchContent_GetProperties(${NCCL_NAME})
- if(NOT ${NCCL_NAME}_POPULATED)
- FetchContent_Populate(${NCCL_NAME})
- endif()
-
- set(NCCL_FOLDER_PATH ${${NCCL_NAME}_SOURCE_DIR}/deps/${NCCL_NAME})
- set(NCCL_INCLUDE_DIR ${NCCL_FOLDER_PATH}/include)
- set(NCCL_LIB_DIR ${NCCL_FOLDER_PATH}/lib)
- message(STATUS "NCCL library path: ${NCCL_FOLDER_PATH}")
- add_library(nccl SHARED IMPORTED)
- set_target_properties(nccl PROPERTIES IMPORTED_LOCATION ${NCCL_FOLDER_PATH})
+find_path(NCCL_INCLUDE_DIR
+ NAMES nccl.h
+ HINTS ${NCCL_ROOT}
+ PATH_SUFFIXES include
+ DOC "NCCL include directory.")
- list(APPEND FLEXFLOW_INCLUDE_DIRS ${NCCL_INCLUDE_DIR})
- list(APPEND FLEXFLOW_EXT_LIBRARIES ${NCCL_LIB_DIR}/libnccl${LIBEXT})
- install(DIRECTORY ${NCCL_INCLUDE_DIR}/ DESTINATION include)
- install(DIRECTORY ${NCCL_LIB_DIR}/ DESTINATION lib PATTERN "pkgconfig" EXCLUDE)
-
-else()
- if(NCCL_PATH)
- set(NCCL_ROOT ${NCCL_PATH})
+# find NCCL, set NCCL lib and include
+if(NCCL_LIBRARY AND NCCL_INCLUDE_DIR)
+ set(NCCL_FOUND ON)
+ set(NCCL_LIBRARIES ${NCCL_LIBRARY})
+ set(NCCL_INCLUDE_DIRS ${NCCL_INCLUDE_DIR})
+
+ # Check NCCL version
+ if(EXISTS "${NCCL_INCLUDE_DIR}/nccl.h")
+ file(STRINGS "${NCCL_INCLUDE_DIR}/nccl.h" NCCL_VERSION_DEFINES
+ REGEX "#define NCCL_MAJOR [0-9]+" )
+ file(STRINGS "${NCCL_INCLUDE_DIR}/nccl.h" NCCL_VERSION_DEFINES2
+ REGEX "#define NCCL_MINOR [0-9]+" )
+ string(REGEX MATCH "([0-9]+)" NCCL_MAJOR ${NCCL_VERSION_DEFINES})
+ string(REGEX MATCH "([0-9]+)" NCCL_MINOR ${NCCL_VERSION_DEFINES2})
+ set(NCCL_VERSION "${NCCL_MAJOR}.${NCCL_MINOR}")
+ if(NCCL_VERSION VERSION_LESS 2.23)
+ set(NCCL_OLD TRUE)
+ else()
+ set(NCCL_OLD FALSE)
+ endif()
+ message(STATUS "Found NCCL version: ${NCCL_VERSION}")
else()
- # if NCCL_PATH is not set, let's try to find it in the CUDA root
- set(NCCL_ROOT ${CUDA_TOOLKIT_ROOT_DIR})
+ message(WARNING "NCCL header not found, unable to determine version")
+ set(NCCL_OLD TRUE) # Assume old version if we can't determine
endif()
-
- find_library(NCCL_LIBRARY
- NAMES libnccl${LIBEXT}
- PATHS ${NCCL_ROOT} ${CUDA_ROOT}
- PATH_SUFFIXES lib lib64
- DOC "NCCL library." )
+endif()
- find_path(NCCL_INCLUDE_DIR
- NAMES nccl.h
- HINTS ${NCCL_ROOT}
- PATH_SUFFIXES include
- DOC "NCCL include directory.")
-
- # find NCCL, set NCCL lib and include
- if(NCCL_LIBRARY AND NCCL_INCLUDE_DIR)
- set(NCCL_FOUND ON)
- set(NCCL_LIBRARIES ${NCCL_LIBRARY})
- set(NCCL_INCLUDE_DIRS ${NCCL_INCLUDE_DIR})
- endif()
-
- # find NCCL
- if(NCCL_FOUND)
- list(APPEND FLEXFLOW_EXT_LIBRARIES ${NCCL_LIBRARIES})
- list(APPEND FLEXFLOW_INCLUDE_DIRS ${NCCL_INCLUDE_DIRS})
- message( STATUS "NCCL include : ${NCCL_INCLUDE_DIRS}" )
- message( STATUS "NCCL libraries : ${NCCL_LIBRARIES}" )
- add_library(nccl SHARED IMPORTED)
- else()
- # Build NCCL from source
- message(STATUS "Building NCCL from source")
- list(TRANSFORM CUDA_GENCODE PREPEND "NVCC_GENCODE=" OUTPUT_VARIABLE NCCL_BUILD_NVCC_GENCODE)
-
- ExternalProject_Add(${NCCL_NAME}
- SOURCE_DIR ${PROJECT_SOURCE_DIR}/deps/${NCCL_NAME}
- PREFIX ${CMAKE_BINARY_DIR}/deps/${NCCL_NAME}
- INSTALL_DIR ${CMAKE_BINARY_DIR}/deps/${NCCL_NAME}
- BUILD_BYPRODUCTS ${CMAKE_BINARY_DIR}/deps/${NCCL_NAME}/lib/libnccl${LIBEXT}
- INSTALL_COMMAND ""
- CONFIGURE_COMMAND ""
- BUILD_COMMAND make src.build "${NCCL_BUILD_NVCC_GENCODE}" "CUDA_HOME=${CUDA_TOOLKIT_ROOT_DIR}" "BUILDDIR=${CMAKE_BINARY_DIR}/deps/${NCCL_NAME}"
- BUILD_IN_SOURCE 1
- )
+# find NCCL
+if(NCCL_FOUND AND (NOT NCCL_OLD OR CUDA_VERSION VERSION_LESS 12.0))
+ list(APPEND FLEXFLOW_EXT_LIBRARIES ${NCCL_LIBRARIES})
+ list(APPEND FLEXFLOW_INCLUDE_DIRS ${NCCL_INCLUDE_DIRS})
+ message( STATUS "NCCL include : ${NCCL_INCLUDE_DIRS}" )
+ message( STATUS "NCCL libraries : ${NCCL_LIBRARIES}" )
+ add_library(nccl SHARED IMPORTED)
+
+# Build NCCL from source
+else()
+ message(STATUS "Building NCCL from source")
+ list(TRANSFORM CUDA_GENCODE PREPEND "NVCC_GENCODE=" OUTPUT_VARIABLE NCCL_BUILD_NVCC_GENCODE)
- ExternalProject_Get_Property(${NCCL_NAME} INSTALL_DIR)
- message(STATUS "NCCL install dir: ${INSTALL_DIR}")
- list(APPEND FLEXFLOW_INCLUDE_DIRS
- ${INSTALL_DIR}/include)
- list(APPEND FLEXFLOW_EXT_LIBRARIES
- ${INSTALL_DIR}/lib/libnccl${LIBEXT})
- set_directory_properties(PROPERTIES ADDITIONAL_CLEAN_FILES "${CMAKE_BINARY_DIR}/deps/${NCCL_NAME}/lib/")
-
- install(DIRECTORY ${CMAKE_BINARY_DIR}/deps/${NCCL_NAME}/include/ DESTINATION include)
- install(DIRECTORY ${CMAKE_BINARY_DIR}/deps/${NCCL_NAME}/lib/ DESTINATION lib PATTERN "pkgconfig" EXCLUDE)
+ set(NCCL_BUILD_CMD make src.build "${NCCL_BUILD_NVCC_GENCODE}" "CUDA_HOME=${CUDA_TOOLKIT_ROOT_DIR}" "BUILDDIR=${CMAKE_BINARY_DIR}/deps/${NCCL_NAME}")
+ if(DEFINED ENV{MAKEFLAGS})
+ set(NCCL_BUILD_CMD ${CMAKE_COMMAND} -E env MAKEFLAGS=$ENV{MAKEFLAGS} ${NCCL_BUILD_CMD})
endif()
+ ExternalProject_Add(${NCCL_NAME}
+ SOURCE_DIR ${PROJECT_SOURCE_DIR}/deps/${NCCL_NAME}
+ PREFIX ${CMAKE_BINARY_DIR}/deps/${NCCL_NAME}
+ INSTALL_DIR ${CMAKE_BINARY_DIR}/deps/${NCCL_NAME}
+ BUILD_BYPRODUCTS ${CMAKE_BINARY_DIR}/deps/${NCCL_NAME}/lib/libnccl${LIBEXT}
+ INSTALL_COMMAND ""
+ CONFIGURE_COMMAND ""
+ BUILD_COMMAND ${NCCL_BUILD_CMD}
+ BUILD_IN_SOURCE 1
+ )
+ ExternalProject_Get_Property(${NCCL_NAME} INSTALL_DIR)
+ message(STATUS "NCCL install dir: ${INSTALL_DIR}")
+ list(APPEND FLEXFLOW_INCLUDE_DIRS
+ ${INSTALL_DIR}/include)
+ list(APPEND FLEXFLOW_EXT_LIBRARIES
+ ${INSTALL_DIR}/lib/libnccl${LIBEXT})
+ set_directory_properties(PROPERTIES ADDITIONAL_CLEAN_FILES "${CMAKE_BINARY_DIR}/deps/${NCCL_NAME}/lib/")
+
+ install(DIRECTORY ${CMAKE_BINARY_DIR}/deps/${NCCL_NAME}/include/ DESTINATION include)
+ install(DIRECTORY ${CMAKE_BINARY_DIR}/deps/${NCCL_NAME}/lib/ DESTINATION lib PATTERN "pkgconfig" EXCLUDE)
endif()
diff --git a/cmake/pip_install/CMakeLists.txt b/cmake/pip_install/CMakeLists.txt
new file mode 100644
index 0000000000..217d7e14f0
--- /dev/null
+++ b/cmake/pip_install/CMakeLists.txt
@@ -0,0 +1,26 @@
+# Use setup.py script to re-install the Python bindings library with the right library paths
+if (FF_USE_PYTHON)
+ execute_process(COMMAND ${Python_EXECUTABLE} -c "import site, os; print([pkg for func in (site.getsitepackages(), site.getusersitepackages()) for pkg in ([func] if isinstance(func, str) else func) if os.access(pkg, os.W_OK)][0])" OUTPUT_VARIABLE PY_DEST OUTPUT_STRIP_TRAILING_WHITESPACE)
+ if(FF_BUILD_FROM_PYPI)
+ cmake_path(SET CMAKE_SOURCE_DIR_ NORMALIZE ${CMAKE_CURRENT_SOURCE_DIR}/../../deps/legion)
+ cmake_path(SET CMAKE_BUILD_DIR_ NORMALIZE ${Legion_BINARY_DIR}/runtime)
+ cmake_path(SET CMAKE_INSTALL_PREFIX_ NORMALIZE ${PY_DEST}/../../..)
+ cmake_path(SET WORKING_DIRECTORY_ NORMALIZE ${CMAKE_CURRENT_SOURCE_DIR}/../../deps/legion/bindings/python/)
+ # CMAKE_CURRENT_SOURCE_DIR=/usr/FlexFlow/cmake/pip_install
+ # Legion_BINARY_DIR=/usr/FlexFlow/build//deps/legion
+ # CMAKE_SOURCE_DIR_=/usr/FlexFlow/deps/legion
+ # CMAKE_BUILD_DIR_: /usr/FlexFlow/build//deps/legion/runtime
+ # CMAKE_INSTALL_PREFIX_: /opt/conda/ or /usr/local
+ # WORKING_DIRECTORY_: /usr/FlexFlow/deps/legion/bindings/python/
+ # PY_DEST: /python3.11/site-packages
+ message(STATUS "CMAKE_CURRENT_SOURCE_DIR: ${CMAKE_CURRENT_SOURCE_DIR}")
+ message(STATUS "Legion_BINARY_DIR: ${Legion_BINARY_DIR}")
+ message(STATUS "CMAKE_SOURCE_DIR_: ${CMAKE_SOURCE_DIR_}")
+ message(STATUS "CMAKE_BUILD_DIR_: ${CMAKE_BUILD_DIR_}")
+ message(STATUS "CMAKE_INSTALL_PREFIX_: ${CMAKE_INSTALL_PREFIX_}")
+ message(STATUS "WORKING_DIRECTORY_: ${WORKING_DIRECTORY_}")
+ message(STATUS "PY_DEST: ${PY_DEST}")
+ install(CODE "execute_process(COMMAND ${CMAKE_COMMAND} -E echo \"Editing path to Legion library using path: ${CMAKE_INSTALL_PREFIX_} \")")
+ install(CODE "execute_process(COMMAND ${CMAKE_COMMAND} -E env CMAKE_SOURCE_DIR=${CMAKE_SOURCE_DIR_} CMAKE_BUILD_DIR=${CMAKE_BUILD_DIR_} CMAKE_INSTALL_PREFIX=${PY_DEST}/flexflow ${Python3_EXECUTABLE} setup.py install --prefix ${CMAKE_INSTALL_PREFIX_} ${Legion_PYTHON_EXTRA_INSTALL_ARGS} WORKING_DIRECTORY ${WORKING_DIRECTORY_} COMMAND_ECHO STDOUT COMMAND_ERROR_IS_FATAL ANY)")
+ endif()
+endif()
diff --git a/cmake/zlib.cmake b/cmake/zlib.cmake
deleted file mode 100644
index 0281e02b88..0000000000
--- a/cmake/zlib.cmake
+++ /dev/null
@@ -1,8 +0,0 @@
-find_package(ZLIB REQUIRED)
-if(ZLIB_FOUND)
- list(APPEND FLEXFLOW_EXT_LIBRARIES
- ${ZLIB_LIBRARIES})
- message( STATUS "ZLIB libraries : ${ZLIB_LIBRARIES}" )
-else()
- message( FATAL_ERROR "ZLIB package not found")
-endif()
\ No newline at end of file
diff --git a/conda/build.sh b/conda/build.sh
deleted file mode 100755
index 0e84b7489a..0000000000
--- a/conda/build.sh
+++ /dev/null
@@ -1,30 +0,0 @@
-#! /usr/bin/env bash
-set -euo pipefail
-
-# Cd into FF_HOME
-cd "${BASH_SOURCE[0]%/*}/../"
-
-# build flexflow
-# "search and replace" bash syntax used below to make shellcheck happy.
-# see here: https://wiki-dev.bash-hackers.org/syntax/pe
-CXXFLAGS="${CXXFLAGS//-O2/}"
-CXXFLAGS="${CXXFLAGS//-std=c++17/}"
-CXXFLAGS="${CXXFLAGS//-DNDEBUG/}"
-CXXFLAGS="${CXXFLAGS//-D_FORTIFY_SOURCE=2/}"
-export CXXFLAGS
-CPPFLAGS="${CPPFLAGS//-O2/}"
-CPPFLAGS="${CPPFLAGS//-std=c++17/}"
-CPPFLAGS="${CPPFLAGS//-DNDEBUG/}"
-CPPFLAGS="${CPPFLAGS//-D_FORTIFY_SOURCE=2/}"
-export CPPFLAGS
-
-#export CUDNN_HOME=/projects/opt/centos7/cuda/10.1
-#export CUDA_HOME=/projects/opt/centos7/cuda/10.1
-export PROTOBUF_DIR=$BUILD_PREFIX
-export FF_HOME=$SRC_DIR
-export LG_RT_DIR=$SRC_DIR/legion/runtime
-#export FF_ENABLE_DEBUG=1
-#export DEBUG=0
-
-cd python
-make
diff --git a/conda/environment.yml b/conda/environment.yml
index 05992a8bf7..48cd8ddb33 100644
--- a/conda/environment.yml
+++ b/conda/environment.yml
@@ -3,13 +3,16 @@ channels:
- defaults
- conda-forge
dependencies:
- - python>=3.6
+ - python>=3.6,<3.12
- cffi>=1.11.0
- Pillow
- pybind11
+ - rust
- cmake-build-extension
+ - jq
- pip
- pip:
- qualname>=0.1.0
- keras_preprocessing>=1.1.2
- numpy>=1.16.0
+ - requests
diff --git a/conda/flexflow-cpu.yml b/conda/flexflow-cpu.yml
deleted file mode 100644
index ced02b9db4..0000000000
--- a/conda/flexflow-cpu.yml
+++ /dev/null
@@ -1,19 +0,0 @@
-name: flexflow
-channels:
- - defaults
- - conda-forge
-dependencies:
- - python>=3.6
- - cffi>=1.11.0
- - Pillow
- - pybind11
- - cmake-build-extension
- - pytest
- - pip
- - pip:
- - qualname>=0.1.0
- - keras_preprocessing>=1.1.2
- - numpy>=1.16.0
- - torch --index-url https://download.pytorch.org/whl/cpu
- - torchaudio --index-url https://download.pytorch.org/whl/cpu
- - torchvision --index-url https://download.pytorch.org/whl/cpu
diff --git a/conda/flexflow.yml b/conda/flexflow.yml
new file mode 100644
index 0000000000..091ba929e4
--- /dev/null
+++ b/conda/flexflow.yml
@@ -0,0 +1,34 @@
+name: flexflow
+channels:
+ - defaults
+ - conda-forge
+dependencies:
+ - python>=3.6,<3.12
+ - cffi>=1.11.0
+ - Pillow
+ - pybind11
+ - rust
+ - cmake-build-extension
+ - jq
+ - pytest
+ - pip
+ - pip:
+ - qualname>=0.1.0
+ - keras_preprocessing>=1.1.2
+ - numpy>=1.16.0
+ - torch>=1.13.1 --index-url https://download.pytorch.org/whl/cpu
+ - torchaudio>=0.13.1 --index-url https://download.pytorch.org/whl/cpu
+ - torchvision>=0.14.1 --index-url https://download.pytorch.org/whl/cpu
+ - regex
+ - onnx
+ - transformers>=4.31.0
+ - sentencepiece
+ - einops
+ - requests
+ - scipy
+ - bitsandbytes
+ - datasets
+ - accelerate
+ - loralib
+ - triton
+ - peft
diff --git a/conda/meta.yaml b/conda/meta.yaml
deleted file mode 100644
index b6e14b2957..0000000000
--- a/conda/meta.yaml
+++ /dev/null
@@ -1,28 +0,0 @@
-package:
- name: flexflow
- version: "1.0"
-
-source:
- git_rev: master
- git_url: https://github.com/flexflow/FlexFlow.git
-
-build:
- number: 0
-
-requirements:
- build:
- - make
- - git
- - zlib
- - protobuf
- - {{ compiler('c') }}
- - {{ compiler('cxx') }}
- host:
- - python
- - cffi
- run:
- - cffi
- - numpy
- - python
- - zlib
- - keras-preprocessing
diff --git a/conda/pytorch-gpu.yml b/conda/pytorch-gpu.yml
index 677e71d73f..85d24ced17 100644
--- a/conda/pytorch-gpu.yml
+++ b/conda/pytorch-gpu.yml
@@ -3,7 +3,7 @@ channels:
- defaults
- conda-forge
dependencies:
- - python>=3.6
+ - python>=3.6,<3.12
- pip
- pip:
- numpy>=1.16.0
diff --git a/config/config.inc b/config/config.inc
index 6497dae40a..6431eaf136 100644
--- a/config/config.inc
+++ b/config/config.inc
@@ -24,7 +24,20 @@ fi
#set installation dir
if [ -n "$INSTALL_DIR" ]; then
- SET_INSTALL_DIR="-DCMAKE_INSTALL_PREFIX=${INSTALL_DIR}"
+ SET_INSTALL_DIR="-DINSTALL_DIR=${INSTALL_DIR}"
+fi
+
+if [ "$INFERENCE_TESTS" = "ON" ]; then
+ SET_INFERENCE_TESTS="-DINFERENCE_TESTS=ON"
+else
+ SET_INFERENCE_TESTS="-DINFERENCE_TESTS=OFF"
+fi
+
+#set cmake prefix path dir
+if [ -n "$LIBTORCH_PATH" ]; then
+ SET_LIBTORCH_PATH="-DLIBTORCH_PATH=${LIBTORCH_PATH}"
+else
+ SET_LIBTORCH_PATH=""
fi
# set build type
@@ -37,6 +50,11 @@ if [ -n "$FF_CUDA_ARCH" ]; then
SET_CUDA_ARCH="-DFF_CUDA_ARCH=${FF_CUDA_ARCH}"
fi
+# set HIP Arch
+if [ -n "$FF_HIP_ARCH" ]; then
+ SET_HIP_ARCH="-DFF_HIP_ARCH=${FF_HIP_ARCH}"
+fi
+
# set CUDA dir
if [ -n "$CUDA_DIR" ]; then
SET_CUDA="-DCUDA_PATH=${CUDA_DIR}"
@@ -44,11 +62,30 @@ if [ -n "$CUDA_DIR" ]; then
SET_CUDA_LIB_PATH="CUDA_PATH=${CUDA_PATH}"
fi
+# set cublas dir
+if [ -n "$CUBLAS_DIR" ]; then
+ SET_CUBLAS="-DCUBLAS_PATH=${CUBLAS_DIR}"
+fi
+
+# set curand dir
+if [ -n "$CURAND_DIR" ]; then
+ SET_CURAND="-DCURAND_PATH=${CURAND_DIR}"
+fi
+
# set cudnn dir
if [ -n "$CUDNN_DIR" ]; then
SET_CUDNN="-DCUDNN_PATH=${CUDNN_DIR}"
fi
+# build legion only
+if [ "$BUILD_LEGION_ONLY" = "ON" ]; then
+ SET_BUILD_LEGION_ONLY="-DBUILD_LEGION_ONLY=ON"
+elif [ "$BUILD_LEGION_ONLY" = "OFF" ]; then
+ SET_BUILD_LEGION_ONLY="-DBUILD_LEGION_ONLY=OFF"
+else
+ SET_BUILD_LEGION_ONLY="-DBUILD_LEGION_ONLY=OFF"
+fi
+
# enable Python
if [ "$FF_USE_PYTHON" = "ON" ]; then
SET_PYTHON="-DFF_USE_PYTHON=ON"
@@ -81,14 +118,13 @@ if [ "$FF_LEGION_NETWORKS" = "gasnet" ]; then
SET_LEGION_NETWORKS+=" -DFF_GASNET_CONDUIT=mpi"
elif [ "$FF_GASNET_CONDUIT" = "udp" ]; then
SET_LEGION_NETWORKS+=" -DFF_GASNET_CONDUIT=udp"
- elif [ "$FF_GASNET_CONDUIT" = "ucx" ]; then
- SET_LEGION_NETWORKS+=" -DFF_GASNET_CONDUIT=ucx"
- SET_LEGION_NETWORKS+=" -DFF_UCX_URL=$FF_UCX_URL"
- elif [ "$FF_GASNET_CONDUIT" = "ofi" ]; then
- SET_LEGION_NETWORKS+=" -DFF_GASNET_CONDUIT=ofi"
fi
elif [ "$FF_LEGION_NETWORKS" = "ucx" ]; then
SET_LEGION_NETWORKS+=" -DFF_LEGION_NETWORKS=ucx"
+ # set ucx dir
+ if [ -n "$UCX_DIR" ]; then
+ SET_UCX="-DUCX_PATH=${UCX_DIR}"
+ fi
fi
# build C++ examples
@@ -99,6 +135,13 @@ elif [ "$FF_BUILD_ALL_EXAMPLES" = "OFF" ]; then
else
SET_EXAMPLES="-DFF_BUILD_ALL_EXAMPLES=ON"
fi
+if [ "$FF_BUILD_ALL_INFERENCE_EXAMPLES" = "ON" ]; then
+ SET_INFERENCE_EXAMPLES="-DFF_BUILD_ALL_INFERENCE_EXAMPLES=ON"
+elif [ "$FF_BUILD_ALL_INFERENCE_EXAMPLES" = "OFF" ]; then
+ SET_INFERENCE_EXAMPLES="-DFF_BUILD_ALL_INFERENCE_EXAMPLES=OFF"
+else
+ SET_INFERENCE_EXAMPLES="-DFF_BUILD_ALL_INFERENCE_EXAMPLES=ON"
+fi
# enable C++ unit tests
if [ "$FF_BUILD_UNIT_TESTS" = "ON" ]; then
@@ -147,11 +190,18 @@ if [ -n "$FF_MAX_DIM" ]; then
SET_MAX_DIM="-DFF_MAX_DIM=${FF_MAX_DIM}"
fi
+#set LEGION_MAX_RETURN_SIZE
+if [ -n "$LEGION_MAX_RETURN_SIZE" ]; then
+ SET_LEGION_MAX_RETURN_SIZE="-DLEGION_MAX_RETURN_SIZE=${LEGION_MAX_RETURN_SIZE}"
+fi
+
# set ROCM path
if [ -n "$ROCM_PATH" ]; then
- SET_ROCM_PATH="-DROCM_PATH=${ROCM_PATH}"
+ SET_ROCM_PATH="-DROCM_PATH=${ROCM_PATH} -DHIP_ROOT_DIR=${ROCM_PATH}"
fi
+ADD_ROCM_TO_PATH=""
+
# set GPU backend
if [ -n "$FF_GPU_BACKEND" ]; then
SET_FF_GPU_BACKEND="-DFF_GPU_BACKEND=${FF_GPU_BACKEND}"
@@ -184,17 +234,18 @@ if [ -n "$FF_GPU_BACKEND" ]; then
chmod +x "$(pwd)/nvidia_hipcc"
SET_CXX="-DCMAKE_CXX_COMPILER=$(pwd)/nvidia_hipcc -DCMAKE_CXX_LINKER=$(pwd)/nvidia_hipcc"
else
- SET_CXX="-DCMAKE_CXX_COMPILER=$ROCM_PATH/bin/hipcc -DCMAKE_CXX_LINKER=$ROCM_PATH/bin/hipcc -DHIP_PATH=$ROCM_PATH/hip -DCMAKE_CXX_FLAGS='-I${MPICH_DIR}/include' -DCMAKE_EXE_LINKER_FLAGS='-L${MPICH_DIR}/lib -lmpi' -DCMAKE_SHARED_LINKER_FLAGS='-L${MPICH_DIR}/lib -lmpi'"
+ ADD_ROCM_TO_PATH="PATH=${PATH}:${ROCM_PATH}/bin"
+ #SET_CXX="-DCMAKE_CXX_COMPILER=/opt/rocm/bin/hipcc -DCMAKE_CXX_LINKER=/opt/rocm/bin/hipcc"
fi
fi
fi
fi
-CMAKE_FLAGS="-DCUDA_USE_STATIC_CUDA_RUNTIME=OFF -DLegion_HIJACK_CUDART=OFF ${SET_CC} ${SET_CXX} ${SET_INSTALL_DIR} ${SET_BUILD} ${SET_CUDA_ARCH} ${SET_CUDA} ${SET_CUDNN} ${SET_PYTHON} ${SET_NCCL} ${SET_NCCL_DIR} ${SET_LEGION_NETWORKS} ${SET_EXAMPLES} ${SET_USE_PREBUILT_LEGION} ${SET_USE_PREBUILT_NCCL} ${SET_USE_ALL_PREBUILT_LIBRARIES} ${SET_BUILD_UNIT_TESTS} ${SET_AVX2} ${SET_MAX_DIM} ${SET_ROCM_PATH} ${SET_FF_GPU_BACKEND}"
+CMAKE_FLAGS="-DCUDA_USE_STATIC_CUDA_RUNTIME=OFF -DLegion_HIJACK_CUDART=OFF ${SET_CC} ${SET_CXX} ${SET_INSTALL_DIR} ${SET_INFERENCE_TESTS} ${SET_LIBTORCH_PATH} ${SET_BUILD} ${SET_CUDA_ARCH} ${SET_CUDA} ${SET_CUBLAS} ${SET_CURAND} ${SET_CUDNN} ${SET_HIP_ARCH} ${SET_PYTHON} ${SET_BUILD_LEGION_ONLY} ${SET_NCCL} ${SET_NCCL_DIR} ${SET_LEGION_NETWORKS} ${SET_UCX} ${SET_EXAMPLES} ${SET_INFERENCE_EXAMPLES} ${SET_USE_PREBUILT_LEGION} ${SET_USE_PREBUILT_NCCL} ${SET_USE_ALL_PREBUILT_LIBRARIES} ${SET_BUILD_UNIT_TESTS} ${SET_AVX2} ${SET_MAX_DIM} ${SET_LEGION_MAX_RETURN_SIZE} ${SET_ROCM_PATH} ${SET_FF_GPU_BACKEND}"
function run_cmake() {
SRC_LOCATION=${SRC_LOCATION:=`dirname $0`/../}
-CMAKE_COMMAND="${SET_CC_FLAGS} ${SET_NVCC_FLAGS} ${SET_LD_FLAGS} ${SET_CUDA_LIB_PATH} cmake ${CMAKE_FLAGS} $* ${SRC_LOCATION}"
+CMAKE_COMMAND="${SET_CC_FLAGS} ${SET_NVCC_FLAGS} ${SET_LD_FLAGS} ${SET_CUDA_LIB_PATH} ${ADD_ROCM_TO_PATH} cmake ${CMAKE_FLAGS} $* ${SRC_LOCATION}"
echo $CMAKE_COMMAND
eval $CMAKE_COMMAND
}
diff --git a/config/config.linux b/config/config.linux
index d3729aea4c..a4b903ef15 100755
--- a/config/config.linux
+++ b/config/config.linux
@@ -1,5 +1,4 @@
#!/bin/bash
-
# set the CC and CXX, usually it is not needed as cmake can detect it
# set CC and CXX to mpicc and mpic++ when enable gasnet
# CC=mpicc
@@ -11,24 +10,46 @@
#LD_FLAGS=${LD_FLAGS+=""}
#set install dir
-#INSTALL_DIR=
+INSTALL_DIR=${INSTALL_DIR:-}
# set build type
BUILD_TYPE=${BUILD_TYPE:-Release}
+INFERENCE_TESTS=${INFERENCE_TESTS:-OFF}
+LIBTORCH_PATH=${LIBTORCH_PATH:-"$(realpath ../..)/libtorch"}
+if [[ "$INFERENCE_TESTS" == "ON" && ! -d "$LIBTORCH_PATH" ]]; then
+ cwd="$(pwd)"
+ cd ../..
+ wget https://download.pytorch.org/libtorch/nightly/cpu/libtorch-shared-with-deps-latest.zip
+ unzip libtorch-shared-with-deps-latest.zip
+ rm libtorch-shared-with-deps-latest.zip
+ LIBTORCH_PATH="$(pwd)/libtorch"
+ cd "$cwd"
+fi
+
# set CUDA Arch to the desired GPU architecture(s) to target (e.g. pass "FF_CUDA_ARCH=60" for Pascal).
# To pass more than one value, separate architecture numbers with a comma (e.g. FF_CUDA_ARCH=70,75).
# Alternatively, set "FF_CUDA_ARCH=autodetect" to build FlexFlow for all architectures detected on the machine,
# or set "FF_CUDA_ARCH=all" to build FlexFlow for all supported GPU architectures
FF_CUDA_ARCH=${FF_CUDA_ARCH:-"autodetect"}
-
-# set CUDNN dir in case cmake cannot autodetect a path
-CUDNN_DIR=${CUDNN_DIR:-"/usr/local/cuda"}
+# FF_HIP_ARCH only supports building for a specific AMD architecture, a list of architectures separated by a comma
+# or all available architectures. TODO: support autodetect
+FF_HIP_ARCH=${FF_HIP_ARCH:-"all"}
# set CUDA dir in case cmake cannot autodetect a path
CUDA_DIR=${CUDA_DIR:-"/usr/local/cuda"}
-#set NCCL dir
+# set CUBLAS dir in case it is not stored in the CUDA DIR
+CUBLAS_DIR=${CUBLAS_DIR:-"/usr/local/cuda"}
+
+# set CURAND dir in case it is not stored in the CUDA DIR
+CURAND_DIR=${CURAND_DIR:-"/usr/local/cuda"}
+
+# set CUDNN dir in case cmake cannot autodetect a path
+CUDNN_DIR=${CUDNN_DIR:-"/usr/local/cuda"}
+
+# if not use PREBUILD_NCCL, you can set NCCL_DIR to use external nccl lib,
+# otherwise, we will build nccl from source
NCCL_DIR=${NCCL_DIR:-"/usr/local/cuda"}
# enable Python
@@ -40,11 +61,12 @@ FF_LEGION_NETWORKS=${FF_LEGION_NETWORKS:-}
# select GASNET conduit
FF_GASNET_CONDUIT=${FF_GASNET_CONDUIT:-ofi}
-# set UCX URL
-FF_UCX_URL=${FF_UCX_URL:-""}
+# set UCX dir if Legion networks is set to ucx
+UCX_DIR=${UCX_DIR:-""}
# build C++ examples
FF_BUILD_ALL_EXAMPLES=${FF_BUILD_ALL_EXAMPLES:-OFF}
+FF_BUILD_ALL_INFERENCE_EXAMPLES=${FF_BUILD_ALL_INFERENCE_EXAMPLES:-ON}
# build C++ unit tests
FF_BUILD_UNIT_TESTS=${FF_BUILD_UNIT_TESTS:-OFF}
@@ -52,6 +74,7 @@ FF_BUILD_UNIT_TESTS=${FF_BUILD_UNIT_TESTS:-OFF}
# use precompiled NCCL and Legion libraries, where available
FF_USE_PREBUILT_NCCL=${FF_USE_PREBUILT_NCCL:-OFF}
FF_USE_PREBUILT_LEGION=${FF_USE_PREBUILT_LEGION:-OFF}
+
# use the flag below to use both the NCCL and Legion pre-built libraries.
# when the flag below is set to ON, the two flags above are ignored.
FF_USE_ALL_PREBUILT_LIBRARIES=${FF_USE_ALL_PREBUILT_LIBRARIES:-OFF}
@@ -62,6 +85,12 @@ FF_USE_AVX2=${FF_USE_AVX2:-OFF}
# set MAX_DIM
FF_MAX_DIM=${FF_MAX_DIM:-5}
+# set BUILD_LEGION_ONLY
+BUILD_LEGION_ONLY=${BUILD_LEGION_ONLY:-OFF}
+
+# set LEGION_MAX_RETURN_SIZE
+LEGION_MAX_RETURN_SIZE=${LEGION_MAX_RETURN_SIZE:-262144}
+
# set ROCM path
ROCM_PATH=${ROCM_PATH:-"/opt/rocm"}
@@ -70,14 +99,14 @@ FF_GPU_BACKEND=${FF_GPU_BACKEND:-cuda}
if [[ "${FF_GPU_BACKEND}" != @(cuda|hip_cuda|hip_rocm|intel) ]]; then
echo "Error, value of FF_GPU_BACKEND (${FF_GPU_BACKEND}) is invalid."
exit 1
-elif [["$FF_GPU_BACKEND" == "cuda" || "$FF_GPU_BACKEND" = "hip_cuda" || "$FF_GPU_BACKEND" == "hip_rocm"]]; then
+elif [[ "$FF_GPU_BACKEND" == "cuda" || "$FF_GPU_BACKEND" = "hip_cuda" || "$FF_GPU_BACKEND" == "hip_rocm" ]]; then
# enable NCCL
FF_USE_NCCL=${FF_USE_NCCL:-ON}
fi
function get_build_configs() {
# Create a string with the values of the variables set in this script
- BUILD_CONFIGS="FF_CUDA_ARCH=${FF_CUDA_ARCH} CUDNN_DIR=${CUDNN_DIR} CUDA_DIR=${CUDA_DIR} NCCL_DIR=${NCCL_DIR} FF_USE_PYTHON=${FF_USE_PYTHON} FF_GASNET_CONDUIT=${FF_GASNET_CONDUIT} FF_UCX_URL=${FF_UCX_URL} FF_LEGION_NETWORKS=${FF_LEGION_NETWORKS} FF_BUILD_ALL_EXAMPLES=${FF_BUILD_ALL_EXAMPLES} FF_BUILD_UNIT_TESTS=${FF_BUILD_UNIT_TESTS} FF_USE_PREBUILT_NCCL=${FF_USE_PREBUILT_NCCL} FF_USE_PREBUILT_LEGION=${FF_USE_PREBUILT_LEGION} FF_USE_ALL_PREBUILT_LIBRARIES=${FF_USE_ALL_PREBUILT_LIBRARIES} FF_USE_AVX2=${FF_USE_AVX2} FF_MAX_DIM=${FF_MAX_DIM} ROCM_PATH=${ROCM_PATH} FF_GPU_BACKEND=${FF_GPU_BACKEND}"
+ BUILD_CONFIGS="FF_CUDA_ARCH=${FF_CUDA_ARCH} FF_HIP_ARCH=${FF_HIP_ARCH} CUDA_DIR=${CUDA_DIR} CUDNN_DIR=${CUDNN_DIR} CUBLAS_DIR=${CUBLAS_DIR} CURAND_DIR=${CURAND_DIR} NCCL_DIR=${NCCL_DIR} FF_USE_PYTHON=${FF_USE_PYTHON} BUILD_LEGION_ONLY=${BUILD_LEGION_ONLY} FF_GASNET_CONDUIT=${FF_GASNET_CONDUIT} UCX_DIR=${UCX_DIR} FF_LEGION_NETWORKS=${FF_LEGION_NETWORKS} FF_BUILD_ALL_EXAMPLES=${FF_BUILD_ALL_EXAMPLES} FF_BUILD_ALL_INFERENCE_EXAMPLES=${FF_BUILD_ALL_INFERENCE_EXAMPLES} FF_BUILD_UNIT_TESTS=${FF_BUILD_UNIT_TESTS} FF_USE_PREBUILT_NCCL=${FF_USE_PREBUILT_NCCL} FF_USE_PREBUILT_LEGION=${FF_USE_PREBUILT_LEGION} FF_USE_ALL_PREBUILT_LIBRARIES=${FF_USE_ALL_PREBUILT_LIBRARIES} FF_USE_AVX2=${FF_USE_AVX2} FF_MAX_DIM=${FF_MAX_DIM} ROCM_PATH=${ROCM_PATH} FF_GPU_BACKEND=${FF_GPU_BACKEND} INSTALL_DIR=${INSTALL_DIR}"
}
if [[ -n "$1" && ( "$1" == "CMAKE_FLAGS" || "$1" == "CUDA_PATH" ) ]]; then
diff --git a/deps/nccl b/deps/nccl
index 6e24ef4e1f..2ea4ee94bf 160000
--- a/deps/nccl
+++ b/deps/nccl
@@ -1 +1 @@
-Subproject commit 6e24ef4e1f1eac9f104d115ef65429f179924ee7
+Subproject commit 2ea4ee94bfb04c886c79ccae60ac9961000fdee2
diff --git a/docker/README.md b/docker/README.md
index 916b78acf6..010aadf762 100644
--- a/docker/README.md
+++ b/docker/README.md
@@ -2,50 +2,61 @@
This folder contains the Dockerfiles and scripts that you can use to quickly run FlexFlow with no manual installation required. To use the containers, follow the steps below.
## Prerequisites
-You will need a machine with a NVIDIA GPU, with drivers installed. You will also need to have Docker and the [Nvidia Container Toolkit](https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/install-guide.html#getting-started) installed on the host machine.
+You can build and run the FlexFlow Docker images on any machine, but if you want to train or serve a model, you will need a machine with an NVIDIA or AMD GPU with drivers installed. You will also need to have Docker and the [Nvidia Container Toolkit](https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/install-guide.html#getting-started) installed on the host machine. If using an AMD GPU, follow the [Deploy ROCm Docker containers](https://rocm.docs.amd.com/en/latest/deploy/docker.html) instructions.
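+
+As a quick sanity check that Docker can access your NVIDIA GPUs, you can run the standard `nvidia-smi` smoke test (the CUDA base image tag below is only an example):
+```
+docker run --rm --gpus all nvidia/cuda:12.0.1-base-ubuntu20.04 nvidia-smi
+```
+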
## Downloading a pre-built package
The fastest way to run FlexFlow is to use one of the pre-built containers, which we update for each commit to the `inference` branch (the `inference` branch is currently ahead of the `master` branch). The available containers are the following, and can be found [at this link](https://github.com/orgs/flexflow/packages?repo_name=FlexFlow):
-* `flexflow`: the pre-built version of FlexFlow. We currently publish one version targeting GPUs with a `hip_rocm` backend (`flexflow-hip_rocm`), and several versions for CUDA GPUs (one for each of the following CUDA versions 11.1, 11.2, 11.3, 11.5, 11.6, 11.7, and 11.8). The CUDA images are named `flexflow-cuda-`, e.g. [flexflow-cuda-11.8](https://github.com/orgs/flexflow/packages/container/package/flexflow-cuda-11.8)
-* `flexflow-environment`: this is the base layer for `flexflow`. The packages are used in CI or for internal use, and contain all the dependencies needed to build/run Flexflow. You may find them useful if you want to build FlexFlow yourself. We also publish one version of `flexflow-environment` for `hip_rocm` and one for each CUDA version in the list above. The naming convention is similar, too. For example, the `flexflow-environment` image for CUDA 11.8 is tagged [flexflow-environment-cuda-11.8](https://github.com/orgs/flexflow/packages/container/package/flexflow-environment-cuda-11.8).
+* `flexflow`: the pre-built version of FlexFlow. We currently publish four versions targeting AMD GPUs (ROCm versions: 5.3, 5.4, 5.5, and 5.6), and several versions for CUDA GPUs (CUDA versions: 11.1, 11.6, 11.7, 11.8, 12.0, 12.1, and 12.2). The images are named `flexflow-<gpu backend>-<gpu software version>`, e.g. [flexflow-hip_rocm-5.6](https://github.com/flexflow/FlexFlow/pkgs/container/flexflow-hip_rocm-5.6) or [flexflow-cuda-12.0](https://github.com/orgs/flexflow/packages/container/package/flexflow-cuda-12.0) (a direct `docker pull` example is shown below this list).
+* `flexflow-environment`: this is the base layer for `flexflow`. The packages are used in CI or for internal use, and contain all the dependencies needed to build/run Flexflow. You may find them useful if you want to build FlexFlow yourself. We also publish four versions of `flexflow-environment` for AMD GPUs and, for NVIDIA GPUs, one for each CUDA version in the list above. The naming convention is similar, too. For example, the `flexflow-environment` image for CUDA 12.0 is tagged [flexflow-environment-cuda-12.0](https://github.com/orgs/flexflow/packages/container/package/flexflow-environment-cuda-12.0).
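+
+For example, with this naming scheme the CUDA 12.0 build of the `flexflow` image can be pulled directly (the `pull.sh` script described below wraps this step):
+```
+docker pull ghcr.io/flexflow/flexflow-cuda-12.0:latest
+```
+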
The easiest way to download any of the Docker containers above is to call:
```
-FF_GPU_BACKEND= cuda_version= ./docker/pull.sh
+./docker/pull.sh <CONTAINER_NAME>
```
-where `CONTAINER_NAME` is `flexflow` (or `flexflow-environment`), and `FF_GPU_BACKEND`/`cuda_version` are optional environment variables you can use if you wish to download the docker image for a GPU backend and/or cuda version other than those installed on your machine (leaving these variables unset will let the script autodetect which version to download depending on your setup).
+where `CONTAINER_NAME` is `flexflow` (or `flexflow-environment`). By default, the script will assume an NVIDIA backend and attempt to detect the CUDA version on your machine to download the relevant container. If your machine has AMD GPUs, or no GPUs, or if you want to specify the CUDA/ROCM version to download, set the environment variables below (an example invocation follows the list):
+
+* `FF_GPU_BACKEND` (supported options: `cuda`, `hip_rocm`) to specify the GPU backend of the Docker container to be downloaded.
+* `cuda_version` (supported options: 11.1, 11.6, 11.7, 11.8, 12.0, 12.1, and 12.2) to specify the CUDA version, when using a `cuda` backend. If `FF_GPU_BACKEND` is set to `hip_rocm`, the `cuda_version` env will be ignored.
+* `hip_version` (supported options: 5.3, 5.4, 5.5, 5.6) to specify the ROCm version, when using a HIP backend. If `FF_GPU_BACKEND` is set to `cuda`, the `hip_version` env will be ignored.
+
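+For example, on an AMD machine you could request the ROCm 5.6 build of the `flexflow` container explicitly (the values shown are illustrative):
+```
+FF_GPU_BACKEND=hip_rocm hip_version=5.6 ./docker/pull.sh flexflow
+```
+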
After downloading a container you can use the `run.sh` script to run it by following the instructions in the section below.
## Building a Docker container from scratch
-If you prefer to build one of the Docker containers from scratch, you can do so with the help of the `build.sh` script. You can configure the build via the same environment variables that you'd use to configure a CMake build (refer to the [Installation guide](../INSTALL.md) and to the `config/config.linux` file). For example, to build for a CUDA backend, you can export `FF_GPU_BACKEND=cuda` (you can also omit this since `cuda` is the default value for `FF_GPU_BACKEND`). When building for the `cuda` backend, you can pick the CUDA version by setting the optional environment variable `cuda_version`, e.g.: `export cuda_version=11.8`. Leaving the `cuda_version` variable blank will let the script autodetect the CUDA version installed on the host machine, and build for that version. Setting the `cuda_version` env will have no effect when building for a GPU backend other than CUDA.
+If you prefer to build one of the Docker containers from scratch, you can do so with the help of the `build.sh` script. You can configure the build via the same environment variables that you'd use to configure a CMake build (refer to the [Installation guide](https://flexflow.readthedocs.io/en/latest/installation.html) and to the `config/config.linux` file). For example, to build for a CUDA backend, you can export `FF_GPU_BACKEND=cuda` (you can also omit this since `cuda` is the default value for `FF_GPU_BACKEND`). When building for the `cuda` backend, you can pick the CUDA version by setting the optional environment variable `cuda_version`, e.g.: `export cuda_version=12.0`. Leaving the `cuda_version` variable blank will let the script autodetect the CUDA version installed on the host machine, and build for that version. Setting the `cuda_version` env will have no effect when building for a GPU backend other than CUDA. Similarly, you can pick the ROCm version by setting `hip_version` when the backend is `FF_GPU_BACKEND=hip_rocm`, whereas the env will be ignored for non-HIP backends.
To build the FlexFlow container, run (the `flexflow` argument of the build script can be omitted):
```
-FF_GPU_BACKEND= cuda_version= ./docker/build.sh flexflow
+./docker/build.sh flexflow
```
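+
+For instance, to pin the GPU backend and CUDA version instead of relying on autodetection (values shown are illustrative):
+```
+FF_GPU_BACKEND=cuda cuda_version=12.0 ./docker/build.sh flexflow
+```
+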
If you only want to build the `flexflow-environment` image (the base layers of the `flexflow` container, used in CI and for other internal purposes), run:
```
-FF_GPU_BACKEND= cuda_version= ./docker/build.sh flexflow-environment
+./docker/build.sh flexflow-environment
```
## Running a Docker container
-After having either built or downloaded a Docker container by following the instructions above, you can run it with the following command (image name argument of the run script can be omitted). Once again, you can set the `FF_GPU_BACKEND` and `cuda_version` optional environment variables to run the docker image with the desired GPU backend and CUDA version. Leaving these variables unset will instruct the script to autodetect the GPU backend and CUDA version installed on the current machine and run the Docker container with it if available.
+After having either built or downloaded a Docker container by following the instructions above, you can run it with the following command (image name argument of the run script can be omitted). Once again, you can set the `FF_GPU_BACKEND`, `cuda_version` and `hip_version` optional environment variables to run the docker image with the desired GPU backend and CUDA/HIP version:
+
+* `FF_GPU_BACKEND` (supported options: `cuda`, `hip_rocm`) to specify the GPU backend of the Docker container to be run.
+* `cuda_version` (supported options: 11.1, 11.6, 11.7, 11.8, 12.0, 12.1, and 12.2) to specify the CUDA version, when using a `cuda` backend. If `FF_GPU_BACKEND` is set to `hip_rocm`, the `cuda_version` env will be ignored.
+* `hip_version` (supported options: 5.3, 5.4, 5.5, 5.6) to specify the ROCm version, when using a HIP backend. If `FF_GPU_BACKEND` is set to `cuda`, the `hip_version` env will be ignored.
+
+Leaving these variables unset will assume a CUDA (NVIDIA) backend, and instruct the script to autodetect the CUDA version installed on the current machine and run the Docker container with it if available.
```
-FF_GPU_BACKEND= cuda_version= ./docker/run.sh --image_name flexflow
+./docker/run.sh --image_name flexflow
```
If you wish to run the `flexflow-environment` container, run:
```
-FF_GPU_BACKEND= cuda_version= ./docker/run.sh --image_name flexflow-environment
+./docker/run.sh --image_name flexflow-environment
```
N.B.: If you don't have GPUs available on the machine, or you wish to run the docker image without attaching GPUs, you can set the environment variable `ATTACH_GPUS=false` before running the script.
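+
+For example, to start the pre-built `flexflow` image on a machine without GPUs (illustrative invocation):
+```
+ATTACH_GPUS=false ./docker/run.sh --image_name flexflow
+```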
diff --git a/docker/build.sh b/docker/build.sh
index 6ed5cbe00e..b68860712f 100755
--- a/docker/build.sh
+++ b/docker/build.sh
@@ -2,7 +2,7 @@
set -euo pipefail
# Usage: ./build.sh
-# Optional environment variables: FF_GPU_BACKEND, cuda_version
+# Optional environment variables: FF_GPU_BACKEND, cuda_version, hip_version
# Cd into $FF_HOME. Assumes this script is in $FF_HOME/docker
cd "${BASH_SOURCE[0]%/*}/.."
@@ -11,6 +11,8 @@ cd "${BASH_SOURCE[0]%/*}/.."
image=${1:-flexflow}
FF_GPU_BACKEND=${FF_GPU_BACKEND:-cuda}
cuda_version=${cuda_version:-"empty"}
+hip_version=${hip_version:-"empty"}
+python_version=${python_version:-latest}
# Check docker image name
if [[ "$image" != @(flexflow-environment|flexflow) ]]; then
@@ -28,52 +30,97 @@ else
echo "Building $image docker image with default GPU backend: cuda"
fi
+# base image to use when building the flexflow environment docker image.
+ff_environment_base_image="ubuntu:20.04"
+# gpu backend version suffix for the docker image.
+gpu_backend_version=""
+
if [[ "${FF_GPU_BACKEND}" == "cuda" || "${FF_GPU_BACKEND}" == "hip_cuda" ]]; then
# Autodetect cuda version if not specified
if [[ $cuda_version == "empty" ]]; then
- cuda_version=$(command -v nvcc >/dev/null 2>&1 && nvcc --version | grep "release" | awk '{print $NF}')
+ # shellcheck disable=SC2015
+ cuda_version=$(command -v nvcc >/dev/null 2>&1 && nvcc --version | grep "release" | awk '{print $NF}' || true)
# Change cuda_version eg. V11.7.99 to 11.7
cuda_version=${cuda_version:1:4}
+ if [[ -z "$cuda_version" ]]; then
+ echo "Could not detect CUDA version. Please specify one manually by setting the 'cuda_version' env."
+ exit 1
+ fi
fi
# Check that CUDA version is supported, and modify cuda version to include default subsubversion
- if [[ "$cuda_version" == @(11.1|11.3|11.7) ]]; then
+ if [[ "$cuda_version" == @(11.1|11.3|11.7|12.0|12.1) ]]; then
cuda_version_input=${cuda_version}.1
- elif [[ "$cuda_version" == @(11.2|11.5|11.6) ]]; then
+ elif [[ "$cuda_version" == @(11.2|11.5|11.6|12.2) ]]; then
cuda_version_input=${cuda_version}.2
+ elif [[ "$cuda_version" == @(11.4) ]]; then
+ cuda_version_input=${cuda_version}.3
elif [[ "$cuda_version" == @(11.8) ]]; then
cuda_version_input=${cuda_version}.0
+ elif [[ "$cuda_version" == @(12.3|12.4|12.5|12.6|12.7|12.8|12.9) ]]; then
+ # Use CUDA 12.2 for all versions greater or equal to 12.2 for now (the Docker machine with CUDNN is not yet available)
+ cuda_version=12.2
+ cuda_version_input=${cuda_version}.2
else
- echo "cuda_version is not supported, please choose among {11.1|11.2|11.3|11.5|11.6|11.7|11.8}"
+ echo "cuda_version is not supported, please choose among {11.1|11.2|11.3|11.4|11.5|11.6|11.7|11.8|12.0|12.1|12.2}"
exit 1
fi
- # Set cuda version suffix to docker image name
echo "Building $image docker image with CUDA $cuda_version"
- cuda_version="-${cuda_version}"
-else
- # Empty cuda version suffix for non-CUDA images
- cuda_version=""
- # Pick a default CUDA version for the base docker image from NVIDIA
- cuda_version_input="11.8.0"
+ ff_environment_base_image="nvidia/cuda:${cuda_version_input}-cudnn8-devel-ubuntu20.04"
+ gpu_backend_version="-${cuda_version}"
fi
-docker build --build-arg "FF_GPU_BACKEND=${FF_GPU_BACKEND}" --build-arg "cuda_version=${cuda_version_input}" -t "flexflow-environment-${FF_GPU_BACKEND}${cuda_version}" -f docker/flexflow-environment/Dockerfile .
+if [[ "${FF_GPU_BACKEND}" == "hip_rocm" || "${FF_GPU_BACKEND}" == "hip_cuda" ]]; then
+ # Autodetect HIP version if not specified
+ if [[ $hip_version == "empty" ]]; then
+ # shellcheck disable=SC2015
+ hip_version=$(command -v hipcc >/dev/null 2>&1 && hipcc --version | grep "HIP version:" | awk '{print $NF}' || true)
+ # Change hip_version eg. 5.6.31061-8c743ae5d to 5.6
+ hip_version=${hip_version:0:3}
+ if [[ -z "$hip_version" ]]; then
+ echo "Could not detect HIP version. Please specify one manually by setting the 'hip_version' env."
+ exit 1
+ fi
+ fi
+ # Check that HIP version is supported
+ if [[ "$hip_version" != @(5.3|5.4|5.5|5.6) ]]; then
+ echo "hip_version is not supported, please choose among {5.3, 5.4, 5.5, 5.6}"
+ exit 1
+ fi
+ echo "Building $image docker image with HIP $hip_version"
+ if [[ "${FF_GPU_BACKEND}" == "hip_rocm" ]]; then
+ gpu_backend_version="-${hip_version}"
+ fi
+fi
+
+# Get number of cores available on the machine. Build with all cores but one, to prevent RAM choking
+cores_available=$(nproc --all)
+n_build_cores=$(( cores_available -1 ))
+
+# check python_version
+if [[ "$python_version" != @(3.8|3.9|3.10|3.11|latest) ]]; then
+ echo "python_version '${python_version}' is not supported, please choose among {3.8, 3.9, 3.10, 3.11, latest}"
+ exit 1
+fi
+
+docker build --build-arg "ff_environment_base_image=${ff_environment_base_image}" --build-arg "N_BUILD_CORES=${n_build_cores}" --build-arg "FF_GPU_BACKEND=${FF_GPU_BACKEND}" --build-arg "hip_version=${hip_version}" --build-arg "python_version=${python_version}" -t "flexflow-environment-${FF_GPU_BACKEND}${gpu_backend_version}" -f docker/flexflow-environment/Dockerfile .
# If the user only wants to build the environment image, we are done
if [[ "$image" == "flexflow-environment" ]]; then
exit 0
fi
-# Gather arguments needed to build the FlexFlow image
-# Get number of cores available on the machine. Build with all cores but one, to prevent RAM choking
-cores_available=$(nproc --all)
-n_build_cores=$(( cores_available -1 ))
+# Done with flexflow-environment image
-# If FF_CUDA_ARCH is set to autodetect, we need to perform the autodetection here because the Docker
-# image will not have access to GPUs during the build phase (due to a Docker restriction). In all other
-# cases, we pass the value of FF_CUDA_ARCH directly to Cmake.
-if [[ "${FF_CUDA_ARCH:-autodetect}" == "autodetect" ]]; then
- # Get CUDA architecture(s), if GPUs are available
- cat << EOF > ./get_gpu_arch.cu
+###########################################################################################
+
+# Build flexflow image if requested
+if [[ "${FF_GPU_BACKEND}" == "cuda" || "${FF_GPU_BACKEND}" == "hip_cuda" ]]; then
+ # If FF_CUDA_ARCH is set to autodetect, we need to perform the autodetection here because the Docker
+ # image will not have access to GPUs during the build phase (due to a Docker restriction). In all other
+ # cases, we pass the value of FF_CUDA_ARCH directly to Cmake.
+ if [[ "${FF_CUDA_ARCH:-autodetect}" == "autodetect" ]]; then
+ # Get CUDA architecture(s), if GPUs are available
+ cat << EOF > ./get_gpu_arch.cu
#include
int main() {
int count = 0;
@@ -87,24 +134,25 @@ int main() {
return 0;
}
EOF
- gpu_arch_codes=""
- if command -v nvcc &> /dev/null
- then
- nvcc ./get_gpu_arch.cu -o ./get_gpu_arch
- gpu_arch_codes="$(./get_gpu_arch)"
- fi
- gpu_arch_codes="$(echo "$gpu_arch_codes" | xargs -n1 | sort -u | xargs)"
- gpu_arch_codes="${gpu_arch_codes// /,}"
- rm -f ./get_gpu_arch.cu ./get_gpu_arch
-
- if [[ -n "$gpu_arch_codes" ]]; then
- echo "Host machine has GPUs with architecture codes: $gpu_arch_codes"
- echo "Configuring FlexFlow to build for the $gpu_arch_codes code(s)."
- FF_CUDA_ARCH="${gpu_arch_codes}"
- export FF_CUDA_ARCH
- else
- echo "FF_CUDA_ARCH is set to 'autodetect', but the host machine does not have any compatible GPUs."
- exit 1
+ gpu_arch_codes=""
+ if command -v nvcc &> /dev/null
+ then
+ nvcc ./get_gpu_arch.cu -o ./get_gpu_arch
+ gpu_arch_codes="$(./get_gpu_arch)"
+ fi
+ gpu_arch_codes="$(echo "$gpu_arch_codes" | xargs -n1 | sort -u | xargs)"
+ gpu_arch_codes="${gpu_arch_codes// /,}"
+ rm -f ./get_gpu_arch.cu ./get_gpu_arch
+
+ if [[ -n "$gpu_arch_codes" ]]; then
+ echo "Host machine has GPUs with architecture codes: $gpu_arch_codes"
+ echo "Configuring FlexFlow to build for the $gpu_arch_codes code(s)."
+ FF_CUDA_ARCH="${gpu_arch_codes}"
+ export FF_CUDA_ARCH
+ else
+ echo "FF_CUDA_ARCH is set to 'autodetect', but the host machine does not have any compatible GPUs."
+ exit 1
+ fi
fi
fi
@@ -114,4 +162,4 @@ fi
# Set value of BUILD_CONFIGS
get_build_configs
-docker build --build-arg "N_BUILD_CORES=${n_build_cores}" --build-arg "FF_GPU_BACKEND=${FF_GPU_BACKEND}" --build-arg "BUILD_CONFIGS=${BUILD_CONFIGS}" --build-arg "cuda_version=${cuda_version}" -t "flexflow-${FF_GPU_BACKEND}${cuda_version}" -f docker/flexflow/Dockerfile .
+docker build --build-arg "N_BUILD_CORES=${n_build_cores}" --build-arg "FF_GPU_BACKEND=${FF_GPU_BACKEND}" --build-arg "BUILD_CONFIGS=${BUILD_CONFIGS}" --build-arg "gpu_backend_version=${gpu_backend_version}" -t "flexflow-${FF_GPU_BACKEND}${gpu_backend_version}" -f docker/flexflow/Dockerfile .
diff --git a/docker/flexflow-environment/Dockerfile b/docker/flexflow-environment/Dockerfile
index 50497197c9..ee13a07375 100644
--- a/docker/flexflow-environment/Dockerfile
+++ b/docker/flexflow-environment/Dockerfile
@@ -1,11 +1,11 @@
-ARG cuda_version
-FROM nvidia/cuda:${cuda_version}-cudnn8-devel-ubuntu20.04
+ARG ff_environment_base_image
+FROM ${ff_environment_base_image}
LABEL org.opencontainers.image.source=https://github.com/flexflow/FlexFlow
LABEL org.opencontainers.image.description="FlexFlow environment container"
# Install basic dependencies
-RUN apt-get update && apt-get install -y --no-install-recommends wget sudo binutils git zlib1g-dev lsb-release nano libhdf5-dev && \
+RUN apt-get update && apt-get install -y --no-install-recommends wget sudo binutils git zlib1g-dev lsb-release nano gdb libhdf5-dev jq && \
rm -rf /var/lib/apt/lists/* /etc/apt/sources.list.d/cuda.list /etc/apt/sources.list.d/nvidia-ml.list && \
apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends software-properties-common && \
apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends build-essential apt-utils \
@@ -16,43 +16,105 @@ RUN apt-get update && apt-get install -y --no-install-recommends wget sudo binut
apt-get upgrade -y libstdc++6
# Install Python3 with Miniconda
-RUN wget -c -q https://repo.continuum.io/miniconda/Miniconda3-latest-Linux-x86_64.sh && \
- mv Miniconda3-latest-Linux-x86_64.sh ~/Miniconda3-latest-Linux-x86_64.sh && \
- chmod +x ~/Miniconda3-latest-Linux-x86_64.sh && \
- bash ~/Miniconda3-latest-Linux-x86_64.sh -b -p /opt/conda && \
- rm ~/Miniconda3-latest-Linux-x86_64.sh && \
- /opt/conda/bin/conda upgrade --all && \
- /opt/conda/bin/conda install conda-build conda-verify && \
- /opt/conda/bin/conda clean -ya
-
-# Optionally install HIP dependencies
+ARG python_version "latest"
+#RUN MINICONDA_SCRIPT_NAME=Miniconda3-latest-Linux-x86_64.sh; \
+RUN MINICONDA_SCRIPT_NAME=Miniconda3-py311_23.5.2-0-Linux-x86_64.sh; \
+ if [ "$python_version" != "3.8" ] && [ "$python_version" != "3.9" ] && [ "$python_version" != "3.10" ] && [ "$python_version" != "3.11" ] && [ "$python_version" != "latest" ]; then \
+ echo "python_version '${python_version}' is not supported, please choose among {3.8, 3.9, 3.10, 3.11 or latest (default)}"; \
+ exit 1; \
+ fi; \
+ if [ "${python_version}" = "3.8" ]; then \
+ MINICONDA_SCRIPT_NAME=Miniconda3-py38_23.5.2-0-Linux-x86_64.sh; \
+ elif [ "${python_version}" = "3.9" ]; then \
+ MINICONDA_SCRIPT_NAME=Miniconda3-py39_23.5.2-0-Linux-x86_64.sh; \
+ elif [ "${python_version}" = "3.10" ]; then \
+ MINICONDA_SCRIPT_NAME=Miniconda3-py310_23.5.2-0-Linux-x86_64.sh; \
+ elif [ "${python_version}" = "3.11" ]; then \
+ MINICONDA_SCRIPT_NAME=Miniconda3-py311_23.5.2-0-Linux-x86_64.sh; \
+ fi; \
+ wget -c -q https://repo.continuum.io/miniconda/${MINICONDA_SCRIPT_NAME} && \
+ mv ./${MINICONDA_SCRIPT_NAME} ~/${MINICONDA_SCRIPT_NAME} && \
+ chmod +x ~/${MINICONDA_SCRIPT_NAME} && \
+ bash ~/${MINICONDA_SCRIPT_NAME} -b -p /opt/conda && \
+ rm ~/${MINICONDA_SCRIPT_NAME} && \
+ /opt/conda/bin/conda config --set solver classic && \
+ /opt/conda/bin/conda upgrade --all && \
+ /opt/conda/bin/conda install conda-build conda-verify && \
+ /opt/conda/bin/conda clean -ya
+
+# set MAKEFLAGS to speedup any dependency that uses make
+ARG N_BUILD_CORES
+ENV MAKEFLAGS "${MAKEFLAGS} -j${N_BUILD_CORES}"
+
+# Set env vars
+ENV PATH /opt/conda/bin:$PATH
+ENV CUDNN_DIR /usr/local/cuda
+ENV CUDA_DIR /usr/local/cuda
+
+# GPU-specific dependencies
+ARG FF_GPU_BACKEND "cuda"
+
+# Update NCCL if FF_GPU_BACKEND is cuda
+RUN /bin/bash -c 'if [ "$FF_GPU_BACKEND" = "cuda" ]; then \
+ echo "FF_GPU_BACKEND: ${FF_GPU_BACKEND}. Updating NCCL"; \
+ ubuntu_version=$(lsb_release -rs); \
+ ubuntu_version=${ubuntu_version//./}; \
+ wget "https://developer.download.nvidia.com/compute/cuda/repos/ubuntu${ubuntu_version}/x86_64/cuda-keyring_1.0-1_all.deb"; \
+ DEBIAN_FRONTEND=noninteractive dpkg -i cuda-keyring_1.0-1_all.deb; \
+ DEBIAN_FRONTEND=noninteractive apt-get update -y --allow-change-held-packages; \
+ rm -f cuda-keyring_1.0-1_all.deb; \
+ DEBIAN_FRONTEND=noninteractive apt install -y --allow-change-held-packages libnccl2 libnccl-dev; \
+ else \
+ echo "FF_GPU_BACKEND: ${FF_GPU_BACKEND}. Skipping updating NCCL"; \
+ fi'
+
+# Install hip dependencies if FF_GPU_BACKEND is hip_cuda or hip_rocm
# Note that amd's docs say to also install the `hip-runtime-nvidia` package. This
# package attempts to re-install cuda even though cuda is already installed
# in the container. It also attempts to install packages for a graphical install.
# For our container, we don't need `hip-runtime-nvidia`
-ARG FF_GPU_BACKEND "cuda"
+ARG hip_version "5.6"
RUN if [ "$FF_GPU_BACKEND" = "hip_cuda" ] || [ "$FF_GPU_BACKEND" = "hip_rocm" ]; then \
echo "FF_GPU_BACKEND: ${FF_GPU_BACKEND}. Installing HIP dependencies"; \
- wget https://repo.radeon.com/amdgpu-install/22.20.5/ubuntu/bionic/amdgpu-install_22.20.50205-1_all.deb; \
- apt-get install -y ./amdgpu-install_22.20.50205-1_all.deb; \
- rm ./amdgpu-install_22.20.50205-1_all.deb; \
+ # Check that hip_version is one of 5.3,5.4,5.5,5.6
+ if [ "$hip_version" != "5.3" ] && [ "$hip_version" != "5.4" ] && [ "$hip_version" != "5.5" ] && [ "$hip_version" != "5.6" ]; then \
+ echo "hip_version '${hip_version}' is not supported, please choose among {5.3, 5.4, 5.5, 5.6}"; \
+ exit 1; \
+ fi; \
+ # Compute script name and url given the version
+ AMD_GPU_SCRIPT_NAME=amdgpu-install_5.6.50600-1_all.deb; \
+ if [ "$hip_version" = "5.3" ]; then \
+ AMD_GPU_SCRIPT_NAME=amdgpu-install_5.3.50300-1_all.deb; \
+ elif [ "$hip_version" = "5.4" ]; then \
+ AMD_GPU_SCRIPT_NAME=amdgpu-install_5.4.50400-1_all.deb; \
+ elif [ "$hip_version" = "5.5" ]; then \
+ AMD_GPU_SCRIPT_NAME=amdgpu-install_5.5.50500-1_all.deb; \
+ fi; \
+ AMD_GPU_SCRIPT_URL="https://repo.radeon.com/amdgpu-install/${hip_version}/ubuntu/focal/${AMD_GPU_SCRIPT_NAME}"; \
+ # Download and install AMD GPU software with ROCM and HIP support
+ wget $AMD_GPU_SCRIPT_URL; \
+ apt-get install -y ./${AMD_GPU_SCRIPT_NAME}; \
+ rm ./${AMD_GPU_SCRIPT_NAME}; \
amdgpu-install -y --usecase=hip,rocm --no-dkms; \
- apt-get install -y hip-dev hipblas miopen-hip rocm-hip-sdk; \
+ apt-get install -y hip-dev hipblas miopen-hip rocm-hip-sdk rocm-device-libs; \
+ # Install protobuf dependencies
+ apt-get update -y && sudo apt-get install -y pkg-config zip g++ zlib1g-dev autoconf automake libtool make; \
else \
echo "FF_GPU_BACKEND: ${FF_GPU_BACKEND}. Skipping installing HIP dependencies"; \
fi
RUN rm -rf /var/lib/apt/lists/*
-# Set env vars
-ENV PATH /opt/conda/bin:$PATH
-ENV CUDNN_DIR /usr/local/cuda
-ENV CUDA_DIR /usr/local/cuda
-
# Install python packages and other dependencies
RUN conda install -c conda-forge cmake make pillow cmake-build-extension pybind11 numpy pandas keras-preprocessing
# Install CPU-only Pytorch and related dependencies
-RUN conda install pytorch torchvision torchaudio cpuonly -c pytorch
-RUN conda install -c conda-forge onnx transformers sentencepiece
-RUN pip3 install tensorflow
+RUN conda install pytorch torchvision torchaudio -c pytorch
+RUN conda install -c conda-forge onnx transformers>=4.31.0 sentencepiece einops
+RUN pip3 install tensorflow notebook
+# PEFT-related
+RUN pip3 install scipy bitsandbytes datasets accelerate loralib triton peft
+
+# Install Rust
+RUN curl https://sh.rustup.rs -sSf | sh -s -- -y
+ENV PATH /root/.cargo/bin:$PATH
ENTRYPOINT ["/bin/bash"]
diff --git a/docker/flexflow/Dockerfile b/docker/flexflow/Dockerfile
index 0cda5cbc18..dff9259657 100644
--- a/docker/flexflow/Dockerfile
+++ b/docker/flexflow/Dockerfile
@@ -1,6 +1,6 @@
ARG FF_GPU_BACKEND "cuda"
-ARG cuda_version ""
-FROM flexflow-environment-$FF_GPU_BACKEND$cuda_version:latest
+ARG gpu_backend_version ""
+FROM flexflow-environment-$FF_GPU_BACKEND$gpu_backend_version:latest
LABEL org.opencontainers.image.source=https://github.com/flexflow/FlexFlow
LABEL org.opencontainers.image.description="FlexFlow container"
@@ -15,12 +15,19 @@ COPY . .
ARG BUILD_CONFIGS
ARG N_BUILD_CORES
+# Create install directory if needed
+RUN for pair in $BUILD_CONFIGS; do \
+ key=${pair%%=*}; \
+ value=${pair#*=}; \
+ if [ "$key" = "INSTALL_DIR" ] && [ -n "$value" ]; then \
+ mkdir -p "$value"; \
+ fi; \
+ done
+
# Build and install C++ and Python versions of FlexFlow
RUN mkdir -p build && cd build && \
eval "$BUILD_CONFIGS" ../config/config.linux && \
- make -j $N_BUILD_CORES && \
- eval "$BUILD_CONFIGS" ../config/config.linux && \
- make install && \
+ make -j $N_BUILD_CORES install && \
ldconfig
ENTRYPOINT ["/bin/bash"]
diff --git a/docker/publish.sh b/docker/publish.sh
index b8668d3c0e..c70419a9cc 100755
--- a/docker/publish.sh
+++ b/docker/publish.sh
@@ -2,7 +2,7 @@
set -euo pipefail
# Usage: ./publish.sh
-# Optional environment variables: FF_GPU_BACKEND, cuda_version
+# Optional environment variables: FF_GPU_BACKEND, cuda_version, hip_version
# Cd into directory holding this script
cd "${BASH_SOURCE[0]%/*}"
@@ -11,6 +11,7 @@ cd "${BASH_SOURCE[0]%/*}"
image=${1:-flexflow}
FF_GPU_BACKEND=${FF_GPU_BACKEND:-cuda}
cuda_version=${cuda_version:-"empty"}
+hip_version=${hip_version:-"empty"}
# Check docker image name
if [[ "${image}" != @(flexflow-environment|flexflow) ]]; then
@@ -18,6 +19,9 @@ if [[ "${image}" != @(flexflow-environment|flexflow) ]]; then
exit 1
fi
+# gpu backend version suffix for the docker image.
+gpu_backend_version=""
+
# Check GPU backend
if [[ "${FF_GPU_BACKEND}" != @(cuda|hip_cuda|hip_rocm|intel) ]]; then
echo "Error, value of FF_GPU_BACKEND (${FF_GPU_BACKEND}) is invalid. Pick between 'cuda', 'hip_cuda', 'hip_rocm' or 'intel'."
@@ -31,25 +35,50 @@ fi
if [[ "${FF_GPU_BACKEND}" == "cuda" || "${FF_GPU_BACKEND}" == "hip_cuda" ]]; then
# Autodetect cuda version if not specified
if [[ $cuda_version == "empty" ]]; then
- cuda_version=$(command -v nvcc >/dev/null 2>&1 && nvcc --version | grep "release" | awk '{print $NF}')
+ # shellcheck disable=SC2015
+ cuda_version=$(command -v nvcc >/dev/null 2>&1 && nvcc --version | grep "release" | awk '{print $NF}' || true)
# Change cuda_version eg. V11.7.99 to 11.7
cuda_version=${cuda_version:1:4}
+ if [[ -z "$cuda_version" ]]; then
+ echo "Could not detect CUDA version. Please specify one manually by setting the 'cuda_version' env."
+ exit 1
+ fi
fi
# Check that CUDA version is supported
- if [[ "$cuda_version" != @(11.1|11.3|11.7|11.2|11.5|11.6|11.8) ]]; then
- echo "cuda_version is not supported, please choose among {11.1|11.2|11.3|11.5|11.6|11.7|11.8}"
+ if [[ "$cuda_version" != @(11.1|11.2|11.3|11.4|11.5|11.6|11.7|11.8|12.0|12.1|12.2) ]]; then
+ echo "cuda_version is not supported, please choose among {11.1|11.2|11.3|11.4|11.5|11.6|11.7|11.8|12.0|12.1|12.2}"
exit 1
fi
# Set cuda version suffix to docker image name
echo "Publishing $image docker image with CUDA $cuda_version"
- cuda_version="-${cuda_version}"
-else
- # Empty cuda version suffix for non-CUDA images
- cuda_version=""
+ gpu_backend_version="-${cuda_version}"
+fi
+
+if [[ "${FF_GPU_BACKEND}" == "hip_rocm" || "${FF_GPU_BACKEND}" == "hip_cuda" ]]; then
+ # Autodetect HIP version if not specified
+ if [[ $hip_version == "empty" ]]; then
+ # shellcheck disable=SC2015
+ hip_version=$(command -v hipcc >/dev/null 2>&1 && hipcc --version | grep "HIP version:" | awk '{print $NF}' || true)
+ # Change hip_version eg. 5.6.31061-8c743ae5d to 5.6
+ hip_version=${hip_version:0:3}
+ if [[ -z "$hip_version" ]]; then
+ echo "Could not detect HIP version. Please specify one manually by setting the 'hip_version' env."
+ exit 1
+ fi
+ fi
+ # Check that HIP version is supported
+ if [[ "$hip_version" != @(5.3|5.4|5.5|5.6) ]]; then
+ echo "hip_version is not supported, please choose among {5.3, 5.4, 5.5, 5.6}"
+ exit 1
+ fi
+ echo "Pubilishing $image docker image with HIP $hip_version"
+ if [[ "${FF_GPU_BACKEND}" == "hip_rocm" ]]; then
+ gpu_backend_version="-${hip_version}"
+ fi
fi
# Check that image exists
-docker image inspect "${image}-${FF_GPU_BACKEND}${cuda_version}":latest > /dev/null
+docker image inspect "${image}-${FF_GPU_BACKEND}${gpu_backend_version}":latest > /dev/null
# Log into container registry
FLEXFLOW_CONTAINER_TOKEN=${FLEXFLOW_CONTAINER_TOKEN:-}
@@ -59,8 +88,8 @@ echo "$FLEXFLOW_CONTAINER_TOKEN" | docker login ghcr.io -u flexflow --password-s
# Tag image to be uploaded
git_sha=${GITHUB_SHA:-$(git rev-parse HEAD)}
if [ -z "$git_sha" ]; then echo "Commit hash cannot be detected, cannot publish the docker image to ghrc.io"; exit; fi
-docker tag "${image}-${FF_GPU_BACKEND}${cuda_version}":latest ghcr.io/flexflow/"${image}-${FF_GPU_BACKEND}${cuda_version}":latest
+docker tag "${image}-${FF_GPU_BACKEND}${gpu_backend_version}":latest ghcr.io/flexflow/"${image}-${FF_GPU_BACKEND}${gpu_backend_version}":latest
# Upload image
-docker push ghcr.io/flexflow/"${image}-${FF_GPU_BACKEND}${cuda_version}":latest
+docker push ghcr.io/flexflow/"${image}-${FF_GPU_BACKEND}${gpu_backend_version}":latest
diff --git a/docker/pull.sh b/docker/pull.sh
index f8624a1072..f641e1a591 100755
--- a/docker/pull.sh
+++ b/docker/pull.sh
@@ -2,7 +2,7 @@
set -euo pipefail
# Usage: ./pull.sh
-# Optional environment variables: FF_GPU_BACKEND, cuda_version
+# Optional environment variables: FF_GPU_BACKEND, cuda_version, hip_version
# Cd into directory holding this script
cd "${BASH_SOURCE[0]%/*}"
@@ -11,6 +11,7 @@ cd "${BASH_SOURCE[0]%/*}"
image=${1:-flexflow}
FF_GPU_BACKEND=${FF_GPU_BACKEND:-cuda}
cuda_version=${cuda_version:-"empty"}
+hip_version=${hip_version:-"empty"}
# Check docker image name
if [[ "${image}" != @(flexflow-environment|flexflow) ]]; then
@@ -28,31 +29,63 @@ else
echo "Downloading $image docker image with default GPU backend: cuda"
fi
+# gpu backend version suffix for the docker image.
+gpu_backend_version=""
+
if [[ "${FF_GPU_BACKEND}" == "cuda" || "${FF_GPU_BACKEND}" == "hip_cuda" ]]; then
# Autodetect cuda version if not specified
if [[ $cuda_version == "empty" ]]; then
- cuda_version=$(command -v nvcc >/dev/null 2>&1 && nvcc --version | grep "release" | awk '{print $NF}')
+ # shellcheck disable=SC2015
+ cuda_version=$(command -v nvcc >/dev/null 2>&1 && nvcc --version | grep "release" | awk '{print $NF}' || true)
# Change cuda_version eg. V11.7.99 to 11.7
cuda_version=${cuda_version:1:4}
+ if [[ -z "$cuda_version" ]]; then
+ echo "Could not detect CUDA version. Please specify one manually by setting the 'cuda_version' env."
+ exit 1
+ fi
fi
# Check that CUDA version is supported
- if [[ "$cuda_version" != @(11.1|11.3|11.7|11.2|11.5|11.6|11.8) ]]; then
- echo "cuda_version is not supported, please choose among {11.1|11.2|11.3|11.5|11.6|11.7|11.8}"
+ if [[ "$cuda_version" != @(11.1|11.6|11.7|11.8|12.0|12.1|12.2) ]]; then
+ echo "cuda_version is not available for download, please choose among {11.1|11.6|11.7|11.8|12.0|12.1|12.2}"
exit 1
fi
+ # Use CUDA 12.2 for all versions greater or equal to 12.2 for now
+ if [[ "$cuda_version" == @(12.3|12.4|12.5|12.6|12.7|12.8|12.9) ]]; then
+ cuda_version=12.2
+ fi
# Set cuda version suffix to docker image name
echo "Downloading $image docker image with CUDA $cuda_version"
- cuda_version="-${cuda_version}"
-else
- # Empty cuda version suffix for non-CUDA images
- cuda_version=""
+ gpu_backend_version="-${cuda_version}"
+fi
+
+if [[ "${FF_GPU_BACKEND}" == "hip_rocm" || "${FF_GPU_BACKEND}" == "hip_cuda" ]]; then
+ # Autodetect HIP version if not specified
+ if [[ $hip_version == "empty" ]]; then
+ # shellcheck disable=SC2015
+ hip_version=$(command -v hipcc >/dev/null 2>&1 && hipcc --version | grep "HIP version:" | awk '{print $NF}' || true)
+ # Change hip_version eg. 5.6.31061-8c743ae5d to 5.6
+ hip_version=${hip_version:0:3}
+ if [[ -z "$hip_version" ]]; then
+ echo "Could not detect HIP version. Please specify one manually by setting the 'hip_version' env."
+ exit 1
+ fi
+ fi
+ # Check that HIP version is supported
+ if [[ "$hip_version" != @(5.3|5.4|5.5|5.6) ]]; then
+ echo "hip_version is not supported, please choose among {5.3, 5.4, 5.5, 5.6}"
+ exit 1
+ fi
+ echo "Downloading $image docker image with HIP $hip_version"
+ if [[ "${FF_GPU_BACKEND}" == "hip_rocm" ]]; then
+ gpu_backend_version="-${hip_version}"
+ fi
fi
# Download image
-docker pull ghcr.io/flexflow/"$image-${FF_GPU_BACKEND}${cuda_version}"
+docker pull ghcr.io/flexflow/"$image-${FF_GPU_BACKEND}${gpu_backend_version}"
# Tag downloaded image
-docker tag ghcr.io/flexflow/"$image-${FF_GPU_BACKEND}${cuda_version}":latest "$image-${FF_GPU_BACKEND}${cuda_version}":latest
+docker tag ghcr.io/flexflow/"$image-${FF_GPU_BACKEND}${gpu_backend_version}":latest "$image-${FF_GPU_BACKEND}${gpu_backend_version}":latest
# Check that image exists
-docker image inspect "${image}-${FF_GPU_BACKEND}${cuda_version}":latest > /dev/null
+docker image inspect "${image}-${FF_GPU_BACKEND}${gpu_backend_version}":latest > /dev/null
diff --git a/docker/run.sh b/docker/run.sh
index 307628f4fd..cdf9383052 100755
--- a/docker/run.sh
+++ b/docker/run.sh
@@ -2,7 +2,7 @@
set -euo pipefail
# Usage: ./run.sh
-# Optional environment variables: FF_GPU_BACKEND, cuda_version, ATTACH_GPUS, SHM_SIZE
+# Optional environment variables: FF_GPU_BACKEND, cuda_version, hip_version, ATTACH_GPUS, SHM_SIZE
# Cd into directory holding this script
cd "${BASH_SOURCE[0]%/*}"
@@ -11,12 +11,14 @@ cd "${BASH_SOURCE[0]%/*}"
image=${1:-flexflow}
FF_GPU_BACKEND=${FF_GPU_BACKEND:-cuda}
cuda_version=${cuda_version:-"empty"}
+hip_version=${hip_version:-"empty"}
# Parameter controlling whether to attach GPUs to the Docker container
ATTACH_GPUS=${ATTACH_GPUS:-true}
gpu_arg=""
if $ATTACH_GPUS ; then gpu_arg="--gpus all" ; fi
+
# Amount of shared memory to give the Docker container access to
# If you get a Bus Error, increase this value. If you don't have enough memory
# on your machine, decrease this value.
@@ -38,35 +40,84 @@ else
echo "Running $image docker image with default GPU backend: cuda"
fi
+# gpu backend version suffix for the docker image.
+gpu_backend_version=""
+
if [[ "${FF_GPU_BACKEND}" == "cuda" || "${FF_GPU_BACKEND}" == "hip_cuda" ]]; then
# Autodetect cuda version if not specified
if [[ $cuda_version == "empty" ]]; then
- cuda_version=$(command -v nvcc >/dev/null 2>&1 && nvcc --version | grep "release" | awk '{print $NF}')
+ # shellcheck disable=SC2015
+ cuda_version=$(command -v nvcc >/dev/null 2>&1 && nvcc --version | grep "release" | awk '{print $NF}' || true)
# Change cuda_version eg. V11.7.99 to 11.7
cuda_version=${cuda_version:1:4}
+ if [[ -z "$cuda_version" ]]; then
+ echo "Could not detect CUDA version. Please specify one manually by setting the 'cuda_version' env."
+ exit 1
+ fi
fi
# Check that CUDA version is supported
- if [[ "$cuda_version" != @(11.1|11.3|11.7|11.2|11.5|11.6|11.8) ]]; then
- echo "cuda_version is not supported, please choose among {11.1|11.2|11.3|11.5|11.6|11.7|11.8}"
+ if [[ "$cuda_version" != @(11.1|11.2|11.3|11.4|11.5|11.6|11.7|11.8|12.0|12.1|12.2|12.3|12.4|12.5|12.6|12.7|12.8|12.9) ]]; then
+ echo "cuda_version is not supported, please choose among {11.1|11.2|11.3|11.4|11.5|11.6|11.7|11.8|12.0|12.1|12.2}"
exit 1
fi
+ # Use CUDA 12.2 for all versions greater or equal to 12.2 for now
+ if [[ "$cuda_version" == @(12.3|12.4|12.5|12.6|12.7|12.8|12.9) ]]; then
+ cuda_version=12.2
+ fi
# Set cuda version suffix to docker image name
echo "Running $image docker image with CUDA $cuda_version"
- cuda_version_hyphen="-${cuda_version}"
-else
- # Empty cuda version suffix for non-CUDA images
- cuda_version_hyphen=""
+ gpu_backend_version="-${cuda_version}"
+fi
+
+if [[ "${FF_GPU_BACKEND}" == "hip_rocm" || "${FF_GPU_BACKEND}" == "hip_cuda" ]]; then
+ # Autodetect HIP version if not specified
+ if [[ $hip_version == "empty" ]]; then
+ # shellcheck disable=SC2015
+ hip_version=$(command -v hipcc >/dev/null 2>&1 && hipcc --version | grep "HIP version:" | awk '{print $NF}' || true)
+ # Change hip_version eg. 5.6.31061-8c743ae5d to 5.6
+ hip_version=${hip_version:0:3}
+ if [[ -z "$hip_version" ]]; then
+ echo "Could not detect HIP version. Please specify one manually by setting the 'hip_version' env."
+ exit 1
+ fi
+ fi
+ # Check that HIP version is supported
+ if [[ "$hip_version" != @(5.3|5.4|5.5|5.6) ]]; then
+ echo "hip_version is not supported, please choose among {5.3, 5.4, 5.5, 5.6}"
+ exit 1
+ fi
+ echo "Running $image docker image with HIP $hip_version"
+ if [[ "${FF_GPU_BACKEND}" == "hip_rocm" ]]; then
+ gpu_backend_version="-${hip_version}"
+ fi
fi
# Check that image exists, if fails, print the default error message.
-if [[ "$(docker images -q "$image"-"$FF_GPU_BACKEND""$cuda_version_hyphen":latest 2> /dev/null)" == "" ]]; then
- echo ""
- echo "To download the docker image, run:"
- echo " FF_GPU_BACKEND=${FF_GPU_BACKEND} cuda_version=${cuda_version} $(pwd)/pull.sh $image"
- echo "To build the docker image from source, run:"
- echo " FF_GPU_BACKEND=${FF_GPU_BACKEND} cuda_version=${cuda_version} $(pwd)/build.sh $image"
- echo ""
+if [[ "$(docker images -q "${image}-${FF_GPU_BACKEND}${gpu_backend_version}":latest 2> /dev/null)" == "" ]]; then
+ echo "Error, ${image}-${FF_GPU_BACKEND}${gpu_backend_version}:latest does not exist!"
+ if [[ "${FF_GPU_BACKEND}" == "cuda" ]]; then
+ echo ""
+ echo "To download the docker image, run:"
+ echo " FF_GPU_BACKEND=${FF_GPU_BACKEND} cuda_version=${cuda_version} $(pwd)/pull.sh $image"
+ echo "To build the docker image from source, run:"
+ echo " FF_GPU_BACKEND=${FF_GPU_BACKEND} cuda_version=${cuda_version} $(pwd)/build.sh $image"
+ echo ""
+ elif [[ "${FF_GPU_BACKEND}" == "hip_rocm" ]]; then
+ echo ""
+ echo "To download the docker image, run:"
+ echo " FF_GPU_BACKEND=${FF_GPU_BACKEND} hip_version=${hip_version} $(pwd)/pull.sh $image"
+ echo "To build the docker image from source, run:"
+ echo " FF_GPU_BACKEND=${FF_GPU_BACKEND} hip_version=${hip_version} $(pwd)/build.sh $image"
+ echo ""
+ fi
exit 1
fi
-eval docker run -it "$gpu_arg" "--shm-size=${SHM_SIZE}" "${image}-${FF_GPU_BACKEND}${cuda_version_hyphen}:latest"
+hf_token_volume=""
+hf_token_path="$HOME/.cache/huggingface/token"
+if [ -f "$hf_token_path" ]; then
+ # If the token exists, add the volume mount to the Docker command
+ hf_token_volume+="-v $hf_token_path:/root/.cache/huggingface/token"
+fi
+
+eval docker run -it "$gpu_arg" "--shm-size=${SHM_SIZE}" "${hf_token_volume}" "${image}-${FF_GPU_BACKEND}${gpu_backend_version}:latest"
diff --git a/docs/Makefile b/docs/Makefile
index 5424c5bc9f..d14c2ef91f 100644
--- a/docs/Makefile
+++ b/docs/Makefile
@@ -15,7 +15,7 @@ help:
.PHONY: help Makefile clean
clean:
- rm -rf build source/_doxygen/ source/c++_api/ doxygen/output
+ rm -rf build doxygen/output doxygen/cpp_api
@$(SPHINXBUILD) -M clean "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
# Catch-all target: route all unknown targets to Sphinx using the new
diff --git a/docs/doxygen/Doxyfile b/docs/doxygen/Doxyfile
index b38bfc12b5..aafa65d79b 100644
--- a/docs/doxygen/Doxyfile
+++ b/docs/doxygen/Doxyfile
@@ -44,7 +44,7 @@ PROJECT_NUMBER =
# for a project that appears at the top of each page and should give viewer a
# quick idea about the purpose of the project. Keep the description short.
-PROJECT_BRIEF = A distributed deep learning framework that supports flexible parallelization strategies.
+PROJECT_BRIEF = "A distributed deep learning framework that supports flexible parallelization strategies."
# With the PROJECT_LOGO tag one can specify a logo or an icon that is included
# in the documentation. The maximum height of the logo should not exceed 55
@@ -150,7 +150,7 @@ INLINE_INHERITED_MEMB = NO
# shortest path that makes the file name unique will be used
# The default value is: YES.
-FULL_PATH_NAMES = YES
+FULL_PATH_NAMES = NO
# The STRIP_FROM_PATH tag can be used to strip a user-defined part of the path.
# Stripping is only done if one of the specified strings matches the left-hand
@@ -874,12 +874,7 @@ WARN_LOGFILE =
# spaces. See also FILE_PATTERNS and EXTENSION_MAPPING
# Note: If this tag is empty the current directory is searched.
-INPUT = $(FF_HOME)/align
-INPUT += $(FF_HOME)/bootcamp_demo
-INPUT += $(FF_HOME)/examples
INPUT += $(FF_HOME)/include
-INPUT += $(FF_HOME)/nmt
-INPUT += $(FF_HOME)/python
INPUT += $(FF_HOME)/src
# This tag can be used to specify the character encoding of the source files
@@ -911,12 +906,10 @@ INPUT_ENCODING = UTF-8
FILE_PATTERNS = *.c \
*.cc \
- *.cpp \
*.cu \
+ *.cpp \
*.h \
- *.hpp \
- *.md \
- *.py
+ *.hpp
# The RECURSIVE tag can be used to specify whether or not subdirectories should
# be searched for input files as well.
@@ -2110,7 +2103,7 @@ MAN_LINKS = NO
# captures the structure of the code including all documentation.
# The default value is: NO.
-GENERATE_XML = YES
+GENERATE_XML = NO
# The XML_OUTPUT tag is used to specify where the XML pages will be put. If a
# relative path is entered the value of OUTPUT_DIRECTORY will be put in front of
diff --git a/docs/source/chatbot.rst b/docs/source/chatbot.rst
new file mode 100644
index 0000000000..c41307e231
--- /dev/null
+++ b/docs/source/chatbot.rst
@@ -0,0 +1,64 @@
+:tocdepth: 1
+********
+Chatbot
+********
+
+The chatbot use case involves using FlexFlow Serve to set up a conversational AI model capable of engaging in interactive dialogues with users.
+
+Requirements
+============
+
+- FlexFlow Serve setup with required configurations.
+- Gradio or any interactive interface tool.
+
+Implementation
+==============
+
+1. FlexFlow Initialization
+ Initialize FlexFlow Serve with the desired configurations and the specific LLM to serve.
+
+2. Gradio Interface Setup
+ Define a function for response generation based on user inputs, then set up the Gradio Chat Interface for interaction.
+
+ .. code-block:: python
+
+ def generate_response(user_input):
+ result = llm.generate(user_input)
+ return result.output_text.decode('utf-8')
+
+
+3. Running the Interface
+ Launch the Gradio interface and interact with the model by entering text inputs.
+
+ .. image:: /imgs/gradio_interface.png
+ :alt: Gradio Chatbot Interface
+ :align: center
+
+4. Shutdown
+ Stop the FlexFlow server after the interaction; a minimal sketch of this step follows the list.
+
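+A minimal sketch of steps 1 and 4 (model setup and shutdown), assuming the ``ff.LLM`` interface from the FlexFlow Serve Python API; the model name and configuration values are illustrative:
+
+ .. code-block:: python
+
+    import flexflow.serve as ff
+
+    # Step 1: initialize the runtime and load an LLM (values are examples)
+    ff.init(num_gpus=2, memory_per_gpu=14000, zero_copy_memory_per_node=30000)
+    llm = ff.LLM("meta-llama/Llama-2-7b-hf")
+    llm.compile(ff.GenerationConfig())
+    llm.start_server()
+
+    # Steps 2-3: interact with the model through the Gradio interface
+
+    # Step 4: stop the FlexFlow server after the interaction
+    llm.stop_server()
+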
+Example
+=======
+
+Complete code examples can be found here:
+
+1. `Chatbot Example with incremental decoding `__
+
+2. `Chatbot Example with speculative inference `__
+
+
+Example Implementation:
+
+ .. code-block:: python
+
+ import gradio as gr
+ import flexflow.serve as ff
+
+ ff.init(num_gpus=2, memory_per_gpu=14000, ...)
+
+ def generate_response(user_input):
+ result = llm.generate(user_input)
+ return result.output_text.decode('utf-8')
+
+ iface = gr.ChatInterface(fn=generate_response)
+ iface.launch()
\ No newline at end of file
diff --git a/docs/source/conf.py b/docs/source/conf.py
index 0e614f37c2..f67c0dae01 100644
--- a/docs/source/conf.py
+++ b/docs/source/conf.py
@@ -13,28 +13,42 @@
import os
import sys
import subprocess
+import shutil
+import sphinx # only needed for the manual post processing
+from pathlib import Path
+from m2r2 import convert
+from docutils.core import publish_string
+import re
def get_parent_dir_path(path):
return os.path.abspath(os.path.join(path, ".."))
docs_path = get_parent_dir_path(os.path.dirname(os.path.abspath(__file__)))
doxygen_path = os.path.join(docs_path, "doxygen")
+doxygen_output = os.path.join(doxygen_path, "output")
+doxygen_cpp_api_out = os.path.join(doxygen_path, "cpp_api")
FF_HOME = get_parent_dir_path(docs_path)
python_package_path = os.path.join(FF_HOME, "python")
sys.path.insert(0, os.path.abspath(python_package_path))
# Build the Doxygen docs
-#subprocess.call(f'cd {doxygen_path}; FF_HOME={FF_HOME} doxygen', shell=True)
+shutil.rmtree(doxygen_cpp_api_out, ignore_errors=True)
+for gpu_backend in ("cuda", "hip"):
+ doxygen_dest = os.path.join(doxygen_cpp_api_out, f"{gpu_backend}_api")
+ os.makedirs(doxygen_dest, exist_ok=True)
+ exclude_extension = ".cu" if gpu_backend == "hip" else ".cpp"
+ doxygen_cmd = f'export FF_HOME={FF_HOME}; ( cat Doxyfile ; echo "EXCLUDE_PATTERNS+=*{exclude_extension}" ) | doxygen -'
+ subprocess.check_call(doxygen_cmd, cwd=doxygen_path, shell=True)
+ subprocess.check_call(f'mv {os.path.join(doxygen_output, "html")}/* {doxygen_dest}/', shell=True)
import sphinx_rtd_theme
# -- Project information -----------------------------------------------------
project = 'FlexFlow'
-copyright = '2020, Stanford, LANL, CMU, Facebook'
-author = 'Stanford, LANL, CMU, Facebook'
-
+copyright = '2023 CMU, Facebook, LANL, MIT, NVIDIA, and Stanford (alphabetical)'
+author = 'CMU, Facebook, LANL, MIT, NVIDIA, and Stanford (alphabetical)'
# -- General configuration ---------------------------------------------------
@@ -45,8 +59,6 @@ def get_parent_dir_path(path):
'sphinx_rtd_theme',
'sphinx.ext.autodoc',
'm2r2',
- 'breathe',
- 'exhale',
]
# Theme options are theme-specific and customize the look and feel of a theme
@@ -55,6 +67,7 @@ def get_parent_dir_path(path):
html_theme_options = {
"collapse_navigation" : False
}
+html_extra_path = [doxygen_cpp_api_out]
# Add any paths that contain templates here, relative to this directory.
# templates_path = ['_templates']
@@ -86,27 +99,50 @@ def get_parent_dir_path(path):
# so a file named "default.css" will overwrite the builtin "default.css".
# html_static_path = ['_static']
-# Breathe + Exhale configuration
-# Setup the breathe extension
-breathe_projects = {
- "FlexFlow": "./_doxygen/xml"
-}
-breathe_default_project = "FlexFlow"
-
-c_plus_plus_src_dirs = " ".join([f"\"{os.path.join(FF_HOME, 'src', dirname)}\"" for dirname in ("loss_functions", "mapper", "metrics_functions", "ops", "parallel_ops", "recompile", "runtime", "utils")])
-# Setup the exhale extension
-exhale_args = {
- # These arguments are required
- "containmentFolder": "./c++_api",
- "rootFileName": "c++_api_root.rst",
- "doxygenStripFromPath": "..",
- # Heavily encouraged optional argument (see docs)
- #"rootFileTitle": "Library API",
- # Suggested optional arguments
- "createTreeView": True,
- # TIP: if using the sphinx-bootstrap-theme, you need
- # "treeViewIsBootstrap": True,
- "exhaleExecutesDoxygen": True,
- "exhaleDoxygenStdin": f'INPUT = {c_plus_plus_src_dirs}'
-}
+def manual_post_processing(app, exception):
+ if exception is None and app.builder.name == 'html': # build succeeded
+ print(f'Post-processing HTML docs at path {app.outdir}')
+ build_dir = Path(app.outdir)
+
+ # List of subfolders to search
+ folder_paths = [build_dir, build_dir / 'developers_guide']
+
+ for folder_path in folder_paths:
+
+ # Only get HTML files in build dir, not subfolders
+ html_files = folder_path.glob('*.html')
+
+ for html_file in html_files:
+ content = html_file.read_text()
+
+ # Find dropdown menus, and manually convert their contents
+ pattern = r'<details>\n<summary>Expand here</summary>\n<br>\n(.*?)</details>'
+ blocks = re.findall(pattern, content, re.DOTALL)
+
+ for block in blocks:
+ # Convert Markdown to HTML
+ rst = convert(block, github_markdown=True)
+ html = publish_string(rst, writer_name='html')
+ html_str = html.decode('utf-8')
+
+ # Replace block with converted HTML
+ content = content.replace(block, html_str)
+
+ # Add space after dropdown menu block
+ content = content.replace('</details>', '</details>\n')
+
+ # Replace incorrect links
+ content = content.replace('href="../docker/README.md"', 'href="docker.html"')
+ content = content.replace('href="./TRAIN.md"', 'href="train_overview.html"')
+ content = content.replace('href="./SERVE.md"', 'href="serve_overview.html"')
+ content = content.replace('href="./docs/source/keras.rst"', 'href="keras.html"')
+ content = content.replace('href="./docs/source/onnx.rst"', 'href="onnx.html"')
+
+
+ html_file.write_text(content)
+
+
+def setup(app):
+ app.connect('build-finished', manual_post_processing)
diff --git a/docs/source/cpp_api.rst b/docs/source/cpp_api.rst
new file mode 100644
index 0000000000..b5d39be62e
--- /dev/null
+++ b/docs/source/cpp_api.rst
@@ -0,0 +1,10 @@
+*************
+C++ API
+*************
+
+The FlexFlow backend is at the core of FlexFlow Train and FlexFlow Serve. It is written entirely in C/C++ and CUDA/HIP. This section documents the API, which is generated by Doxygen and available at the following links:
+
+* `CUDA version <./cuda_api/index.html>`_ (default version)
+* `HIP version <./hip_api/index.html>`_
+
+The two versions differ only in the GPU kernels, so the vast majority of the entries are identical. If you are unsure which version to use, take a look at the CUDA version.
diff --git a/docs/source/developers_guide.rst b/docs/source/developers_guide/developers_guide.rst
similarity index 64%
rename from docs/source/developers_guide.rst
rename to docs/source/developers_guide/developers_guide.rst
index 107135fae4..a125e60460 100644
--- a/docs/source/developers_guide.rst
+++ b/docs/source/developers_guide/developers_guide.rst
@@ -2,5 +2,5 @@
Developers Guide
******************
-.. mdinclude:: ../../CONTRIBUTING.md
+.. mdinclude:: ../../../CONTRIBUTING.md
:start-line: 2
diff --git a/docs/source/developers_guide/ff_internals.rst b/docs/source/developers_guide/ff_internals.rst
new file mode 100644
index 0000000000..15c0804255
--- /dev/null
+++ b/docs/source/developers_guide/ff_internals.rst
@@ -0,0 +1,6 @@
+*******************
+FlexFlow Internals
+*******************
+
+.. mdinclude:: internals.md
+ :start-line: 2
diff --git a/docs/source/developers_guide/internals.md b/docs/source/developers_guide/internals.md
new file mode 100644
index 0000000000..243b14a174
--- /dev/null
+++ b/docs/source/developers_guide/internals.md
@@ -0,0 +1,15 @@
+# FlexFlow Internals
+
+## The Parallel Computation Graph (PCG)
+
+FlexFlow uses a _Parallel Computation Graph (PCG)_ to simultaneously represent tensor operations, parallelism choices, and data movement across nodes.
+
+### Tensor representations
+
+There are two types of tensor representations in FlexFlow: a [Tensor](./cuda_api/de/da9/structFlexFlow_1_1TensorBase.html) and a [ParallelTensor](./cuda_api/d3/dfc/structFlexFlow_1_1ParallelTensorBase.html). The first variant is used when writing a FlexFlow DNN program, whereas the second is used by the runtime to run all the computations in a distributed fashion. `Tensor` and `ParallelTensor` are implemented as typedef-ed pointers to, respectively, the `TensorBase` (defined in `include/flexflow/tensor.h`) and `ParallelTensorBase` (defined in `include/flexflow/parallel_tensor.h`) structs.
+
+The `ParallelTensor` struct contains all the information that a `Tensor` stores and, in addition, codifies how the tensor should be parallelized. For instance, a `ParallelTensor` records how each dimension is *partitioned*, how many *replicas* of the tensor have been created, and the *mapping* between the partitions of the tensor and the physical machines that will store them.
+
+## Transformation generation
+
+## Joint optimization
diff --git a/docs/source/docker.rst b/docs/source/docker.rst
index 4a457a8dcc..63f84e460c 100644
--- a/docs/source/docker.rst
+++ b/docs/source/docker.rst
@@ -1,3 +1,4 @@
+:tocdepth: 1
*************
Docker
*************
diff --git a/docs/source/imgs/gradio_api.png b/docs/source/imgs/gradio_api.png
new file mode 100644
index 0000000000..7bf1b99a5e
Binary files /dev/null and b/docs/source/imgs/gradio_api.png differ
diff --git a/docs/source/imgs/gradio_interface.png b/docs/source/imgs/gradio_interface.png
new file mode 100644
index 0000000000..9584d76fb3
Binary files /dev/null and b/docs/source/imgs/gradio_interface.png differ
diff --git a/docs/source/index.rst b/docs/source/index.rst
index 7af62e417e..6aa47d157b 100644
--- a/docs/source/index.rst
+++ b/docs/source/index.rst
@@ -7,47 +7,40 @@ Welcome to FlexFlow's documentation!
====================================
.. toctree::
- :maxdepth: 2
:caption: Getting Started
welcome
installation
docker
- jupyter
+ multinode
.. toctree::
- :maxdepth: 2
- :caption: Interoperability
+ :caption: FlexFlow Serve
- keras
- pytorch
- onnx
+ serve_overview
+ serve_usecases
+ serve_api
.. toctree::
- :maxdepth: 2
- :caption: Examples
-
- mt5
+ :caption: FlexFlow Train
-.. toctree::
- :maxdepth: 3
- :caption: Python API
+ train_overview
+ train_interface
+ train_examples
- python/models
- python/layers
- python/dataloader
+ train_python_api
.. toctree::
- :maxdepth: 2
- :caption: C++ API
+ :caption: FlexFlow Backend
- c++_api/c++_api_root
+ cpp_api
.. toctree::
- :maxdepth: 2
+ :maxdepth: 3
:caption: Developers Guide
- developers_guide
+ developers_guide/developers_guide.rst
+.. developers_guide/ff_internals.rst
.. Indices and tables
diff --git a/docs/source/installation.rst b/docs/source/installation.rst
index 109b546834..95ec8596e6 100644
--- a/docs/source/installation.rst
+++ b/docs/source/installation.rst
@@ -1,5 +1,6 @@
+:tocdepth: 1
*************
-Installing FlexFlow
+Building from source
*************
.. mdinclude:: ../../INSTALL.md
diff --git a/docs/source/jupyter.rst b/docs/source/jupyter.rst
deleted file mode 100644
index 2e37bfb183..0000000000
--- a/docs/source/jupyter.rst
+++ /dev/null
@@ -1,6 +0,0 @@
-*****************
-Jupyter Notebook
-*****************
-
-.. mdinclude:: ../../jupyter_notebook/README.md
- :start-line: 2
diff --git a/docs/source/keras.rst b/docs/source/keras.rst
index eb4f2d7fa7..f1c0743c70 100644
--- a/docs/source/keras.rst
+++ b/docs/source/keras.rst
@@ -1,6 +1,7 @@
-*************
-Keras Support
-*************
+:tocdepth: 1
+****************
+Keras Interface
+****************
FlexFlow provides a drop-in replacement for TensorFlow Keras. Running an existing Keras program on the FlexFlow backend only requires a few lines of changes to the program. The detailed instructions are as follows:
diff --git a/docs/source/mt5.rst b/docs/source/mt5.rst
index c9c3af080a..8a632b90d6 100644
--- a/docs/source/mt5.rst
+++ b/docs/source/mt5.rst
@@ -1,6 +1,6 @@
-****************
-HuggingFace mT5
-****************
+************************
+mT5 Model
+************************
.. mdinclude:: ../../examples/python/pytorch/mt5/README.md
:start-line: 2
diff --git a/docs/source/multinode.rst b/docs/source/multinode.rst
new file mode 100644
index 0000000000..8827200582
--- /dev/null
+++ b/docs/source/multinode.rst
@@ -0,0 +1,8 @@
+:tocdepth: 1
+******************
+Multinode tutorial
+******************
+
+
+.. mdinclude:: ../../MULTI-NODE.md
+ :start-line: 3
diff --git a/docs/source/onnx.rst b/docs/source/onnx.rst
index 91b314ac96..b6bc49b146 100644
--- a/docs/source/onnx.rst
+++ b/docs/source/onnx.rst
@@ -1,3 +1,4 @@
+:tocdepth: 1
*************
ONNX Support
*************
diff --git a/docs/source/prompt_template.rst b/docs/source/prompt_template.rst
new file mode 100644
index 0000000000..7f987b0f18
--- /dev/null
+++ b/docs/source/prompt_template.rst
@@ -0,0 +1,55 @@
+:tocdepth: 1
+****************
+Prompt Template
+****************
+
+Prompt templates guide the model's response generation. This use case demonstrates setting up FlexFlow Serve to integrate with LangChain and using prompt templates to handle dynamic prompts.
+
+Requirements
+============
+
+- FlexFlow Serve setup with appropriate configurations.
+- Langchain integration with templates for prompt management.
+
+Implementation
+==============
+
+1. FlexFlow Initialization
+ Initialize and configure FlexFlow Serve.
+
+2. LLM Setup
+ Compile and start the server for text generation.
+
+3. Prompt Template Setup
+ Set up a prompt template to guide the model's responses.
+
+4. Response Generation
+ Use the LLM with the prompt template to generate a response.
+
+5. Shutdown
+ Stop the FlexFlow server after generating the response.
+
+Example
+=======
+
+Complete code examples can be found here:
+
+1. `Prompt Template Example with incremental decoding `__
+
+2. `Prompt Template Example with speculative inference `__
+
+
+Example Implementation:
+
+ .. code-block:: python
+
+ import flexflow.serve as ff
+ from langchain.prompts import PromptTemplate
+
+ ff_llm = FlexFlowLLM(...)
+ ff_llm.compile_and_start(...)
+
+ template = "Question: {question}\nAnswer:"
+ prompt = PromptTemplate(template=template, input_variables=["question"])
+
+ response = ff_llm.generate("Who was the US president in 1997?")
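+
+To guide the generation with the template, one possible follow-up (a sketch, assuming the ``FlexFlowLLM`` wrapper from the linked examples) is to format the template before calling ``generate``:
+
+ .. code-block:: python
+
+    question = "Who was the US president in 1997?"
+    formatted_prompt = prompt.format(question=question)
+
+    # Generate a response guided by the template, then stop the server
+    response = ff_llm.generate(formatted_prompt)
+    ff_llm.stop_server()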
diff --git a/docs/source/python/layers.rst b/docs/source/python/layers.rst
index 91f12094e6..1be91a8b17 100644
--- a/docs/source/python/layers.rst
+++ b/docs/source/python/layers.rst
@@ -3,7 +3,7 @@ Layers API
**********
Layers are the basic building blocks of neural networks in FlexFlow. The inputs of a layer consists of a tensor or a list of tensors and some state variables,
-and the outputs of a layer is a tensor or a list of tensors.
+and the output of a layer is a tensor or a list of tensors. See https://github.com/flexflow/FlexFlow/examples/python/native/ops for an example of each layer.
.. automodule:: flexflow.core.flexflow_cffi
:noindex:
diff --git a/docs/source/pytorch.rst b/docs/source/pytorch.rst
index a6d4e23311..3dbe337d55 100644
--- a/docs/source/pytorch.rst
+++ b/docs/source/pytorch.rst
@@ -1,6 +1,7 @@
-***************
-PyTorch Support
-***************
+:tocdepth: 1
+******************
+PyTorch Interface
+******************
Users can use FlexFlow to optimize the parallelization performance of existing PyTorch models in two steps.
The PyTorch support requires the `PyTorch FX module `_, so make sure your PyTorch is up to date.
diff --git a/docs/source/rag.rst b/docs/source/rag.rst
new file mode 100644
index 0000000000..640b2fe131
--- /dev/null
+++ b/docs/source/rag.rst
@@ -0,0 +1,90 @@
+:tocdepth: 1
+********
+RAG Q&A
+********
+
+Retrieval Augmented Generation (RAG) combines language models with external knowledge. This use case integrates RAG with FlexFlow Serve for question answering over documents.
+
+Requirements
+============
+
+- FlexFlow Serve setup.
+- Retriever setup for RAG.
+
+Implementation
+==============
+
+1. FlexFlow Initialization
+ Initialize and configure FlexFlow Serve.
+
+2. Data Retrieval Setup
+ Set up a retriever for sourcing information relevant to user queries.
+
+3. RAG Integration
+ Integrate the retriever with FlexFlow Serve.
+
+4. Response Generation
+ Use the LLM with RAG to generate responses based on the model's knowledge and the retrieved information.
+
+5. Shutdown
+ The FlexFlow server automatically shuts down after generating the response.
+
+Example
+=======
+
+Complete code examples for a web-document Q&A using FlexFlow can be found here:
+
+1. `RAG Q&A Example with incremental decoding `__
+
+2. `RAG Q&A Example with speculative inference `__
+
+
+Example Implementation:
+
+ .. code-block:: python
+
+ # imports
+
+ # compile and start server
+ ff_llm = FlexFlowLLM(...)
+ gen_config = ff.GenerationConfig(...)
+ ff_llm.compile_and_start(...)
+ ff_llm_wrapper = FF_LLM_wrapper(flexflow_llm=ff_llm)
+
+
+ # Load web page content
+ loader = WebBaseLoader("https://example.com/data")
+ data = loader.load()
+
+ # Split text
+ text_splitter = RecursiveCharacterTextSplitter(...)
+ all_splits = text_splitter.split_documents(data)
+
+ # Initialize embeddings
+ embeddings = OpenAIEmbeddings(...)
+
+ # Create VectorStore
+ vectorstore = Chroma.from_documents(all_splits, embeddings)
+
+ # Use VectorStore as a retriever
+ retriever = vectorstore.as_retriever()
+
+ # Apply similarity search
+ question = "Example Question"
+ docs = vectorstore.similarity_search(question)
+ max_chars_per_doc = 100
+ docs_text = ''.join([docs[i].page_content[:max_chars_per_doc] for i in range(len(docs))])
+
+ # Using a Prompt Template
+ prompt_rag = PromptTemplate.from_template(
+ "Summarize the main themes in these retrieved docs: {docs_text}"
+ )
+
+ # Build Chain
+ llm_chain_rag = LLMChain(llm=ff_llm_wrapper, prompt=prompt_rag)
+
+ # Run
+ rag_result = llm_chain_rag(docs_text)
+
+ # Stop the server
+ ff_llm.stop_server()
\ No newline at end of file
diff --git a/docs/source/serve_api.rst b/docs/source/serve_api.rst
new file mode 100644
index 0000000000..6a607cbf0c
--- /dev/null
+++ b/docs/source/serve_api.rst
@@ -0,0 +1,7 @@
+**************************
+FlexFlow Serve Python API
+**************************
+
+.. toctree::
+ serve_fastapi
+ serve_gradioapi
\ No newline at end of file
diff --git a/docs/source/serve_fastapi.rst b/docs/source/serve_fastapi.rst
new file mode 100644
index 0000000000..62a28e5937
--- /dev/null
+++ b/docs/source/serve_fastapi.rst
@@ -0,0 +1,106 @@
+:tocdepth: 1
+***********************
+FlexFlow Serve FastAPI
+***********************
+
+Introduction
+============
+
+The Python API for FlexFlow Serve enables users to initialize, manage and interact with large language models (LLMs) via FastAPI or Gradio.
+
+Requirements
+------------
+
+- FlexFlow Serve setup with necessary configurations.
+- FastAPI and Uvicorn for running the API server.
+
+API Configuration
+=================
+
+Users can configure the API using FastAPI to handle requests and manage the model.
+
+1. FastAPI Application Initialization
+ Initialize the FastAPI application to create API endpoints.
+
+2. Request Model Definition
+ Define the model for API requests using Pydantic.
+
+3. Global Variable for LLM Model
+ Declare a global variable to store the LLM model.
+
+Example
+-------
+
+.. code-block:: python
+
+ from fastapi import FastAPI
+ from pydantic import BaseModel
+ import flexflow.serve as ff
+
+ app = FastAPI()
+
+ class PromptRequest(BaseModel):
+ prompt: str
+
+ llm = None
+
+Endpoint Creation
+=================
+
+Create API endpoints for LLM interactions to handle generation requests.
+
+1. Initialize Model on Startup
+ Use the FastAPI event handler to initialize and compile the LLM model when the API server starts.
+
+2. Generate Response Endpoint
+ Create a POST endpoint to generate responses based on the user's prompt.
+
+Example
+-------
+
+.. code-block:: python
+
+ @app.on_event("startup")
+ async def startup_event():
+ global llm
+ # Create, initialize, and compile the LLM (the model name below is illustrative)
+ llm = ff.LLM("meta-llama/Llama-2-7b-hf")
+ llm.compile(
+ generation_config,
+ # ... other params as needed
+ )
+ llm.start_server()
+
+ @app.post("/generate/")
+ async def generate(prompt_request: PromptRequest):
+ # ... exception handling
+ full_output = llm.generate([prompt_request.prompt])[0].output_text.decode('utf-8')
+ # ... split prompt and response text for returning results
+ return {"prompt": prompt_request.prompt, "response": full_output}
+
+Running and Testing
+===================
+
+Instructions for running and testing the FastAPI server.
+
+1. Run the FastAPI Server
+ Use Uvicorn to run the FastAPI server with specified host and port.
+
+2. Testing the API
+ Make requests to the API endpoints and verify the responses.
+
+Example
+-------
+
+.. code-block:: bash
+
+ # Running within the inference/python folder:
+ uvicorn entrypoint.fastapi_incr:app --reload --port 3000
+
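+To verify the endpoint, a short request sketch (the port and JSON field match the snippets above; the prompt text is illustrative):
+
+.. code-block:: python
+
+    import requests
+
+    # POST a prompt to the /generate/ endpoint served by uvicorn above
+    resp = requests.post(
+        "http://localhost:3000/generate/",
+        json={"prompt": "What is machine learning?"},
+    )
+    print(resp.json())  # expected keys: "prompt" and "response"
+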
+Full API Entrypoint Code
+=========================
+
+Complete code examples for the FastAPI entrypoint can be found here:
+
+1. `FastAPI Example with incremental decoding `__
+
+2. `FastAPI Example with speculative inference `__
diff --git a/docs/source/serve_gradioapi.rst b/docs/source/serve_gradioapi.rst
new file mode 100644
index 0000000000..ed19e05347
--- /dev/null
+++ b/docs/source/serve_gradioapi.rst
@@ -0,0 +1,30 @@
+:tocdepth: 1
+*************************
+FlexFlow Serve Gradio API
+*************************
+
+Introduction
+============
+
+Users can also set up the API endpoints with a Gradio Chatbot Interface.
+
+Requirements
+------------
+
+- FlexFlow Serve setup with necessary configurations.
+- A running Gradio chatbot interface.
+
+Example
+========
+
+In a running Gradio chatbot interface, click the "Use via API" button at the bottom left.
+
+ .. image:: /imgs/gradio_interface.png
+ :alt: Gradio Chatbot Interface
+ :align: center
+
+Users can easily access an API endpoint for sending prompts to the model.
+
+ .. image:: /imgs/gradio_api.png
+ :alt: Gradio API
+ :align: center
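+
+For programmatic access, a short sketch using the ``gradio_client`` package (the endpoint name is an assumption; use whatever the "Use via API" page reports for your Gradio version):
+
+ .. code-block:: python
+
+    from gradio_client import Client
+
+    # Connect to the running Gradio chatbot interface
+    client = Client("http://localhost:7860/")
+
+    # Send a prompt to the chat endpoint advertised on the "Use via API" page
+    result = client.predict("What is machine learning?", api_name="/chat")
+    print(result)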
\ No newline at end of file
diff --git a/docs/source/serve_overview.rst b/docs/source/serve_overview.rst
new file mode 100644
index 0000000000..35c992a853
--- /dev/null
+++ b/docs/source/serve_overview.rst
@@ -0,0 +1,7 @@
+:tocdepth: 1
+****************
+Serving Overview
+****************
+
+.. mdinclude:: ../../SERVE.md
+ :start-line: 3
diff --git a/docs/source/serve_usecases.rst b/docs/source/serve_usecases.rst
new file mode 100644
index 0000000000..4aa3fd2807
--- /dev/null
+++ b/docs/source/serve_usecases.rst
@@ -0,0 +1,8 @@
+*******************
+Serving Use Cases
+*******************
+
+.. toctree::
+ chatbot
+ prompt_template
+ rag
\ No newline at end of file
diff --git a/docs/source/train_examples.rst b/docs/source/train_examples.rst
new file mode 100644
index 0000000000..84d58c3465
--- /dev/null
+++ b/docs/source/train_examples.rst
@@ -0,0 +1,6 @@
+*****************
+Training Examples
+*****************
+
+.. toctree::
+ mt5
\ No newline at end of file
diff --git a/docs/source/train_interface.rst b/docs/source/train_interface.rst
new file mode 100644
index 0000000000..ce81fc1f3c
--- /dev/null
+++ b/docs/source/train_interface.rst
@@ -0,0 +1,8 @@
+*******************
+Training Interface
+*******************
+
+.. toctree::
+ keras
+ pytorch
+ onnx
\ No newline at end of file
diff --git a/docs/source/train_overview.rst b/docs/source/train_overview.rst
new file mode 100644
index 0000000000..58898ad35c
--- /dev/null
+++ b/docs/source/train_overview.rst
@@ -0,0 +1,7 @@
+:tocdepth: 1
+*****************
+Training Overview
+*****************
+
+.. mdinclude:: ../../TRAIN.md
+ :start-line: 3
diff --git a/docs/source/train_python_api.rst b/docs/source/train_python_api.rst
new file mode 100644
index 0000000000..40451dedf9
--- /dev/null
+++ b/docs/source/train_python_api.rst
@@ -0,0 +1,11 @@
+*******************
+Python API
+*******************
+This section documents the Python API for FlexFlow Train.
+
+.. toctree::
+ :maxdepth: 3
+
+ python/models
+ python/layers
+ python/dataloader
\ No newline at end of file
diff --git a/docs/source/welcome.rst b/docs/source/welcome.rst
index 8108b1dd67..7f73f15563 100644
--- a/docs/source/welcome.rst
+++ b/docs/source/welcome.rst
@@ -1,3 +1,4 @@
+:tocdepth: 1
*************
Overview
*************
diff --git a/examples/cpp/AlexNet/alexnet.cc b/examples/cpp/AlexNet/alexnet.cc
index 128496eab1..3507882329 100644
--- a/examples/cpp/AlexNet/alexnet.cc
+++ b/examples/cpp/AlexNet/alexnet.cc
@@ -26,7 +26,7 @@ using FlexFlow::ParallelTensor;
using FlexFlow::SGDOptimizer;
using FlexFlow::Tensor;
-LegionRuntime::Logger::Category log_app("AlexNet");
+Legion::Logger log_app("AlexNet");
void parse_input_args(char **argv, int argc, AlexNetConfig &config) {
for (int i = 1; i < argc; i++) {
diff --git a/examples/cpp/DLRM/dlrm.cc b/examples/cpp/DLRM/dlrm.cc
index 7dc49215b3..d7dc167557 100644
--- a/examples/cpp/DLRM/dlrm.cc
+++ b/examples/cpp/DLRM/dlrm.cc
@@ -19,7 +19,7 @@
using namespace Legion;
-LegionRuntime::Logger::Category log_app("DLRM");
+Legion::Logger log_app("DLRM");
void parse_input_args(char **argv, int argc, DLRMConfig &apConfig);
diff --git a/examples/cpp/InceptionV3/inception.cc b/examples/cpp/InceptionV3/inception.cc
index b2070cc52d..6d0fa7ee53 100644
--- a/examples/cpp/InceptionV3/inception.cc
+++ b/examples/cpp/InceptionV3/inception.cc
@@ -21,7 +21,7 @@
using namespace Legion;
using namespace FlexFlow;
-LegionRuntime::Logger::Category log_app("Inceptionv3");
+Legion::Logger log_app("Inceptionv3");
Tensor InceptionA(FFModel &ff, Tensor input, int pool_features) {
Tensor t1 = input;
diff --git a/examples/cpp/ResNet/resnet.cc b/examples/cpp/ResNet/resnet.cc
index 455eb743ae..49ce934a6a 100644
--- a/examples/cpp/ResNet/resnet.cc
+++ b/examples/cpp/ResNet/resnet.cc
@@ -24,7 +24,7 @@ using FlexFlow::Optimizer;
using FlexFlow::SGDOptimizer;
using FlexFlow::Tensor;
-LegionRuntime::Logger::Category log_app("ResNet");
+Legion::Logger log_app("ResNet");
void parse_input_args(char **argv, int argc, ResNetConfig &config) {
for (int i = 1; i < argc; i++) {
diff --git a/examples/cpp/Transformer/transformer.cc b/examples/cpp/Transformer/transformer.cc
index d61a63cd03..b04093b0a9 100644
--- a/examples/cpp/Transformer/transformer.cc
+++ b/examples/cpp/Transformer/transformer.cc
@@ -17,7 +17,7 @@
using namespace Legion;
-LegionRuntime::Logger::Category log_app("Transformer");
+Legion::Logger log_app("Transformer");
Tensor create_emb(FFModel *model,
Tensor const &input,
diff --git a/examples/cpp/XDL/xdl.cc b/examples/cpp/XDL/xdl.cc
index 2e6c3cec98..a2272f36e5 100644
--- a/examples/cpp/XDL/xdl.cc
+++ b/examples/cpp/XDL/xdl.cc
@@ -18,7 +18,7 @@
using namespace Legion;
-LegionRuntime::Logger::Category log_app("XDL");
+Legion::Logger log_app("XDL");
void parse_input_args(char **argv, int argc, XDLConfig &apConfig);
diff --git a/examples/cpp/candle_uno/candle_uno.cc b/examples/cpp/candle_uno/candle_uno.cc
index 779b8e9c14..e9f4bf876a 100644
--- a/examples/cpp/candle_uno/candle_uno.cc
+++ b/examples/cpp/candle_uno/candle_uno.cc
@@ -21,7 +21,7 @@
using namespace Legion;
using namespace std;
-LegionRuntime::Logger::Category log_app("Candle_Uno");
+Legion::Logger log_app("Candle_Uno");
void parse_input_args(char **argv, int argc, CandleConfig &apConfig);
diff --git a/examples/cpp/mixture_of_experts/moe.cc b/examples/cpp/mixture_of_experts/moe.cc
index a707310885..a25f94abd9 100644
--- a/examples/cpp/mixture_of_experts/moe.cc
+++ b/examples/cpp/mixture_of_experts/moe.cc
@@ -20,7 +20,7 @@
using namespace Legion;
-LegionRuntime::Logger::Category log_app("MoE");
+Legion::Logger log_app("MoE");
void parse_input_args(char **argv, int argc, MoeConfig &config) {
for (int i = 1; i < argc; i++) {
diff --git a/examples/cpp/resnext50/resnext.cc b/examples/cpp/resnext50/resnext.cc
index 3c28ca27b8..9b71b37cce 100644
--- a/examples/cpp/resnext50/resnext.cc
+++ b/examples/cpp/resnext50/resnext.cc
@@ -7,7 +7,7 @@ using FlexFlow::Optimizer;
using FlexFlow::SGDOptimizer;
using FlexFlow::Tensor;
-LegionRuntime::Logger::Category log_app("resnext");
+Legion::Logger log_app("resnext");
Tensor resnext_block(FFModel &ff,
Tensor input,
diff --git a/examples/cpp/split_test/split_test.cc b/examples/cpp/split_test/split_test.cc
index 97b98c3214..ac9d516a59 100644
--- a/examples/cpp/split_test/split_test.cc
+++ b/examples/cpp/split_test/split_test.cc
@@ -3,7 +3,7 @@
using namespace Legion;
using namespace FlexFlow;
-LegionRuntime::Logger::Category log_app("split_test");
+Legion::Logger log_app("split_test");
void FlexFlow::top_level_task(Task const *task,
std::vector const ®ions,
diff --git a/examples/cpp/split_test_2/split_test_2.cc b/examples/cpp/split_test_2/split_test_2.cc
index 69385d14cb..fef078adbc 100644
--- a/examples/cpp/split_test_2/split_test_2.cc
+++ b/examples/cpp/split_test_2/split_test_2.cc
@@ -9,7 +9,7 @@ using FlexFlow::PCG::Graph;
using FlexFlow::PCG::GraphSearchHelper;
using FlexFlow::PCG::Node;
-LegionRuntime::Logger::Category log_app("split_test_2");
+Legion::Logger log_app("split_test_2");
void top_level_task(Task const *task,
std::vector const ®ions,
diff --git a/examples/python/keras/callback.py b/examples/python/keras/callback.py
index f4ebc03d17..c647822957 100644
--- a/examples/python/keras/callback.py
+++ b/examples/python/keras/callback.py
@@ -20,6 +20,7 @@
from flexflow.keras.datasets import cifar10
from flexflow.keras import backend as K
from accuracy import ModelAccuracy
+import flexflow.core as ff
import numpy as np
@@ -68,4 +69,6 @@ def top_level_task():
if __name__ == "__main__":
print("Functional API, cifar10 cnn callback")
+ configs = ff.get_configs()
+ ff.init_flexflow_runtime(configs)
top_level_task()
diff --git a/examples/python/keras/elementwise_max_min.py b/examples/python/keras/elementwise_max_min.py
index 95291f1273..52a80b431b 100644
--- a/examples/python/keras/elementwise_max_min.py
+++ b/examples/python/keras/elementwise_max_min.py
@@ -1,5 +1,6 @@
from flexflow.keras.layers import Dense, Input, Maximum, Minimum
import flexflow.keras.optimizers
+import flexflow.core as ff
import numpy as np
@@ -54,7 +55,8 @@ def elementwise_min():
epochs = 2
)
-
if __name__ == '__main__':
+ configs = ff.get_configs()
+ ff.init_flexflow_runtime(configs)
elementwise_max()
elementwise_min()
diff --git a/examples/python/keras/elementwise_mul_broadcast.py b/examples/python/keras/elementwise_mul_broadcast.py
index d68476a6cb..1405871a7a 100644
--- a/examples/python/keras/elementwise_mul_broadcast.py
+++ b/examples/python/keras/elementwise_mul_broadcast.py
@@ -1,6 +1,6 @@
from flexflow.keras.layers import Dense, Input, Reshape, Multiply
import flexflow.keras.optimizers
-
+import flexflow.core as ff
import numpy as np
def broadcast1():
@@ -92,8 +92,9 @@ def broadcast_both():
epochs = 2
)
-
if __name__ == '__main__':
+ configs = ff.get_configs()
+ ff.init_flexflow_runtime(configs)
broadcast1()
broadcast2()
broadcast_both()
diff --git a/examples/python/keras/func_cifar10_alexnet.py b/examples/python/keras/func_cifar10_alexnet.py
index c0ade0b722..a4f8dc61ac 100644
--- a/examples/python/keras/func_cifar10_alexnet.py
+++ b/examples/python/keras/func_cifar10_alexnet.py
@@ -77,5 +77,7 @@ def top_level_task():
if __name__ == "__main__":
print("Functional API, cifar10 alexnet")
+ configs = ff.get_configs()
+ ff.init_flexflow_runtime(configs)
top_level_task()
gc.collect()
diff --git a/examples/python/keras/func_cifar10_cnn.py b/examples/python/keras/func_cifar10_cnn.py
index 423541386f..ce0358da53 100644
--- a/examples/python/keras/func_cifar10_cnn.py
+++ b/examples/python/keras/func_cifar10_cnn.py
@@ -61,7 +61,10 @@ def top_level_task():
model.fit(x_train, y_train, epochs=160, callbacks=[VerifyMetrics(ModelAccuracy.CIFAR10_CNN), EpochVerifyMetrics(ModelAccuracy.CIFAR10_CNN)])
+
if __name__ == "__main__":
print("Functional API, cifar10 cnn")
+ configs = ff.get_configs()
+ ff.init_flexflow_runtime(configs)
top_level_task()
gc.collect()
diff --git a/examples/python/keras/func_cifar10_cnn_concat.py b/examples/python/keras/func_cifar10_cnn_concat.py
index 72dfdeffaf..4fe0f5ce18 100644
--- a/examples/python/keras/func_cifar10_cnn_concat.py
+++ b/examples/python/keras/func_cifar10_cnn_concat.py
@@ -75,5 +75,7 @@ def top_level_task():
if __name__ == "__main__":
print("Functional API, cifar10 cnn concat")
+ configs = ff.get_configs()
+ ff.init_flexflow_runtime(configs)
top_level_task()
gc.collect()
diff --git a/examples/python/keras/func_cifar10_cnn_concat_model.py b/examples/python/keras/func_cifar10_cnn_concat_model.py
index 39885bac8c..c8838de1eb 100644
--- a/examples/python/keras/func_cifar10_cnn_concat_model.py
+++ b/examples/python/keras/func_cifar10_cnn_concat_model.py
@@ -75,7 +75,10 @@ def top_level_task():
model.fit([x_train, x_train], y_train, epochs=160, callbacks=[VerifyMetrics(ModelAccuracy.CIFAR10_CNN), EpochVerifyMetrics(ModelAccuracy.CIFAR10_CNN)])
+
if __name__ == "__main__":
print("Functional API, cifar10 cnn concat model")
+ configs = ff.get_configs()
+ ff.init_flexflow_runtime(configs)
top_level_task()
gc.collect()
diff --git a/examples/python/keras/func_cifar10_cnn_concat_seq_model.py b/examples/python/keras/func_cifar10_cnn_concat_seq_model.py
index cda95beb49..3e4f939283 100644
--- a/examples/python/keras/func_cifar10_cnn_concat_seq_model.py
+++ b/examples/python/keras/func_cifar10_cnn_concat_seq_model.py
@@ -68,7 +68,10 @@ def top_level_task():
model.fit([x_train, x_train], y_train, epochs=160, callbacks=[VerifyMetrics(ModelAccuracy.CIFAR10_CNN), EpochVerifyMetrics(ModelAccuracy.CIFAR10_CNN)])
+
if __name__ == "__main__":
print("Functional API, cifar10 cnn concat sequential model")
+ configs = ff.get_configs()
+ ff.init_flexflow_runtime(configs)
top_level_task()
gc.collect()
diff --git a/examples/python/keras/func_cifar10_cnn_nested.py b/examples/python/keras/func_cifar10_cnn_nested.py
index def8a6bcf4..7391ba5a2b 100644
--- a/examples/python/keras/func_cifar10_cnn_nested.py
+++ b/examples/python/keras/func_cifar10_cnn_nested.py
@@ -67,7 +67,10 @@ def top_level_task():
model.fit(x_train, y_train, epochs=160, callbacks=[VerifyMetrics(ModelAccuracy.CIFAR10_CNN), EpochVerifyMetrics(ModelAccuracy.CIFAR10_CNN)])
+
if __name__ == "__main__":
print("Functional API, cifar10 cnn nested")
+ configs = ff.get_configs()
+ ff.init_flexflow_runtime(configs)
top_level_task()
gc.collect()
diff --git a/examples/python/keras/func_cifar10_cnn_net2net.py b/examples/python/keras/func_cifar10_cnn_net2net.py
index 5434e28aca..695a1157dd 100644
--- a/examples/python/keras/func_cifar10_cnn_net2net.py
+++ b/examples/python/keras/func_cifar10_cnn_net2net.py
@@ -120,5 +120,7 @@ def top_level_task():
if __name__ == "__main__":
print("Functional API, cifarf10 cnn teach student")
+ configs = ff.get_configs()
+ ff.init_flexflow_runtime(configs)
top_level_task()
gc.collect()
diff --git a/examples/python/keras/func_mnist_cnn.py b/examples/python/keras/func_mnist_cnn.py
index a81ddd0f94..8f2041dfe2 100644
--- a/examples/python/keras/func_mnist_cnn.py
+++ b/examples/python/keras/func_mnist_cnn.py
@@ -70,7 +70,10 @@ def top_level_task():
model.fit(x_train, y_train, epochs=5, callbacks=[VerifyMetrics(ModelAccuracy.MNIST_CNN), EpochVerifyMetrics(ModelAccuracy.MNIST_CNN)])
+
if __name__ == "__main__":
print("Functional API, mnist cnn")
+ configs = ff.get_configs()
+ ff.init_flexflow_runtime(configs)
top_level_task()
gc.collect()
diff --git a/examples/python/keras/func_mnist_cnn_concat.py b/examples/python/keras/func_mnist_cnn_concat.py
index 54c1f32d36..64bb2cdbb0 100644
--- a/examples/python/keras/func_mnist_cnn_concat.py
+++ b/examples/python/keras/func_mnist_cnn_concat.py
@@ -61,7 +61,10 @@ def top_level_task():
model.fit(x_train, y_train, epochs=5, callbacks=[VerifyMetrics(ModelAccuracy.MNIST_CNN), EpochVerifyMetrics(ModelAccuracy.MNIST_CNN)])
+
if __name__ == "__main__":
print("Functional API, mnist cnn concat")
+ configs = ff.get_configs()
+ ff.init_flexflow_runtime(configs)
top_level_task()
gc.collect()
diff --git a/examples/python/keras/func_mnist_mlp.py b/examples/python/keras/func_mnist_mlp.py
index 5521f193c1..ddf2022366 100644
--- a/examples/python/keras/func_mnist_mlp.py
+++ b/examples/python/keras/func_mnist_mlp.py
@@ -54,7 +54,10 @@ def top_level_task():
model.fit(x_train, y_train, epochs=10, callbacks=[VerifyMetrics(ModelAccuracy.MNIST_MLP), EpochVerifyMetrics(ModelAccuracy.MNIST_MLP)])
+
if __name__ == "__main__":
print("Functional API, mnist mlp")
+ configs = ff.get_configs()
+ ff.init_flexflow_runtime(configs)
top_level_task()
gc.collect()
diff --git a/examples/python/keras/func_mnist_mlp_concat.py b/examples/python/keras/func_mnist_mlp_concat.py
index 29b982cea8..6b282f65e6 100644
--- a/examples/python/keras/func_mnist_mlp_concat.py
+++ b/examples/python/keras/func_mnist_mlp_concat.py
@@ -76,7 +76,10 @@ def top_level_task():
model.fit(x_train, y_train, epochs=5, callbacks=[VerifyMetrics(ModelAccuracy.MNIST_MLP), EpochVerifyMetrics(ModelAccuracy.MNIST_MLP)])
+
if __name__ == "__main__":
print("Functional API, mnist mlp concat")
+ configs = ff.get_configs()
+ ff.init_flexflow_runtime(configs)
top_level_task()
gc.collect()
diff --git a/examples/python/keras/func_mnist_mlp_concat2.py b/examples/python/keras/func_mnist_mlp_concat2.py
index 5a35bd9f8b..b309a00187 100644
--- a/examples/python/keras/func_mnist_mlp_concat2.py
+++ b/examples/python/keras/func_mnist_mlp_concat2.py
@@ -87,7 +87,10 @@ def top_level_task():
model.fit([x_train, x_train, x_train], y_train, epochs=10, callbacks=[VerifyMetrics(ModelAccuracy.MNIST_MLP), EpochVerifyMetrics(ModelAccuracy.MNIST_MLP)])
+
if __name__ == "__main__":
print("Functional API, mnist mlp concat with input")
+ configs = ff.get_configs()
+ ff.init_flexflow_runtime(configs)
top_level_task()
gc.collect()
diff --git a/examples/python/keras/func_mnist_mlp_net2net.py b/examples/python/keras/func_mnist_mlp_net2net.py
index ed8589e22e..0b44029938 100644
--- a/examples/python/keras/func_mnist_mlp_net2net.py
+++ b/examples/python/keras/func_mnist_mlp_net2net.py
@@ -88,7 +88,10 @@ def top_level_task():
student_model.fit(x_train, y_train, epochs=160, callbacks=[VerifyMetrics(ModelAccuracy.MNIST_MLP), EpochVerifyMetrics(ModelAccuracy.MNIST_MLP)])
+
if __name__ == "__main__":
print("Functional API, mnist mlp teach student")
+ configs = ff.get_configs()
+ ff.init_flexflow_runtime(configs)
top_level_task()
- gc.collect()
\ No newline at end of file
+ gc.collect()
diff --git a/examples/python/keras/gather.py b/examples/python/keras/gather.py
index 15ccd61579..f14d737d17 100644
--- a/examples/python/keras/gather.py
+++ b/examples/python/keras/gather.py
@@ -1,7 +1,7 @@
from flexflow.keras.layers import Dense, Input, Reshape
from flexflow.keras.backend.internal import gather
import flexflow.keras.optimizers
-
+import flexflow.core as ff
import numpy as np
@@ -42,4 +42,6 @@ def gather_example():
if __name__ == '__main__':
+ configs = ff.get_configs()
+ ff.init_flexflow_runtime(configs)
gather_example()
diff --git a/examples/python/keras/identity_loss.py b/examples/python/keras/identity_loss.py
index d0396c6d46..8e26fc246b 100644
--- a/examples/python/keras/identity_loss.py
+++ b/examples/python/keras/identity_loss.py
@@ -15,7 +15,7 @@
from flexflow.keras.layers import Dense, Input, Reshape, Multiply
import flexflow.keras.optimizers
-
+import flexflow.core as ff
import numpy as np
def test_identity_loss():
@@ -36,4 +36,6 @@ def test_identity_loss():
if __name__ == "__main__":
+ configs = ff.get_configs()
+ ff.init_flexflow_runtime(configs)
test_identity_loss()
diff --git a/examples/python/keras/reduce_sum.py b/examples/python/keras/reduce_sum.py
index 3857738d4b..33030e2cec 100644
--- a/examples/python/keras/reduce_sum.py
+++ b/examples/python/keras/reduce_sum.py
@@ -15,7 +15,7 @@
from flexflow.keras.layers import Dense, Input, Reshape, Multiply
import flexflow.keras.optimizers
-
+import flexflow.core as ff
import numpy as np
def test_reduce_sum1():
@@ -74,6 +74,8 @@ def test_reduce_sum3():
if __name__ == "__main__":
+ configs = ff.get_configs()
+ ff.init_flexflow_runtime(configs)
test_reduce_sum1()
test_reduce_sum2()
test_reduce_sum3()
diff --git a/examples/python/keras/regularizer.py b/examples/python/keras/regularizer.py
index 3b1e30d04d..3a24129db2 100644
--- a/examples/python/keras/regularizer.py
+++ b/examples/python/keras/regularizer.py
@@ -2,7 +2,7 @@
from flexflow.keras.layers import Dense, Input, Reshape
from flexflow.keras.backend.internal import gather
import flexflow.keras.optimizers
-
+import flexflow.core as ff
import numpy as np
@@ -26,4 +26,6 @@ def regularizer_example():
if __name__ == '__main__':
+ configs = ff.get_configs()
+ ff.init_flexflow_runtime(configs)
regularizer_example()
diff --git a/examples/python/keras/reshape.py b/examples/python/keras/reshape.py
index 1acce1b2b6..ae756a8f70 100644
--- a/examples/python/keras/reshape.py
+++ b/examples/python/keras/reshape.py
@@ -55,7 +55,10 @@ def top_level_task():
print(model.summary())
model.fit(x_train, y_train, epochs=10, callbacks=[VerifyMetrics(ModelAccuracy.MNIST_MLP), EpochVerifyMetrics(ModelAccuracy.MNIST_MLP)])
+
if __name__ == "__main__":
print("Functional API, mnist mlp")
+ configs = ff.get_configs()
+ ff.init_flexflow_runtime(configs)
top_level_task()
gc.collect()
diff --git a/examples/python/keras/rsqrt.py b/examples/python/keras/rsqrt.py
index be55c8a1fd..e33873ecd5 100644
--- a/examples/python/keras/rsqrt.py
+++ b/examples/python/keras/rsqrt.py
@@ -16,7 +16,7 @@
from flexflow.keras.layers import Dense, Input
from flexflow.keras.backend.internal import rsqrt
import flexflow.keras.optimizers
-
+import flexflow.core as ff
import numpy as np
def test_rsqrt():
@@ -40,4 +40,6 @@ def test_rsqrt():
if __name__ == "__main__":
+ configs = ff.get_configs()
+ ff.init_flexflow_runtime(configs)
test_rsqrt()
diff --git a/examples/python/keras/seq_cifar10_cnn.py b/examples/python/keras/seq_cifar10_cnn.py
index 80f4390d4c..66ea8530e0 100644
--- a/examples/python/keras/seq_cifar10_cnn.py
+++ b/examples/python/keras/seq_cifar10_cnn.py
@@ -54,6 +54,9 @@ def top_level_task():
model.fit(x_train, y_train, epochs=80, callbacks=[VerifyMetrics(ModelAccuracy.CIFAR10_CNN), EpochVerifyMetrics(ModelAccuracy.CIFAR10_CNN)])
+
if __name__ == "__main__":
- print("Sequantial model, cifar10 cnn")
+ print("Sequential model, cifar10 cnn")
+ configs = ff.get_configs()
+ ff.init_flexflow_runtime(configs)
top_level_task()
diff --git a/examples/python/keras/seq_mnist_cnn.py b/examples/python/keras/seq_mnist_cnn.py
index eaf0fdfc16..09ad4ea4cf 100644
--- a/examples/python/keras/seq_mnist_cnn.py
+++ b/examples/python/keras/seq_mnist_cnn.py
@@ -55,6 +55,9 @@ def top_level_task():
model.fit(x_train, y_train, epochs=5, callbacks=[VerifyMetrics(ModelAccuracy.MNIST_CNN), EpochVerifyMetrics(ModelAccuracy.MNIST_CNN)])
+
if __name__ == "__main__":
print("Sequential model, mnist cnn")
+ configs = ff.get_configs()
+ ff.init_flexflow_runtime(configs)
top_level_task()
diff --git a/examples/python/keras/seq_mnist_cnn_nested.py b/examples/python/keras/seq_mnist_cnn_nested.py
index 2c92349cd6..628129ddb9 100644
--- a/examples/python/keras/seq_mnist_cnn_nested.py
+++ b/examples/python/keras/seq_mnist_cnn_nested.py
@@ -65,6 +65,9 @@ def top_level_task():
model.fit(x_train, y_train, epochs=5, callbacks=[VerifyMetrics(ModelAccuracy.MNIST_CNN), EpochVerifyMetrics(ModelAccuracy.MNIST_CNN)])
+
if __name__ == "__main__":
print("Sequential model, mnist cnn nested model")
+ configs = ff.get_configs()
+ ff.init_flexflow_runtime(configs)
top_level_task()
diff --git a/examples/python/keras/seq_mnist_cnn_net2net.py b/examples/python/keras/seq_mnist_cnn_net2net.py
index 4b9c9c16ba..e2a04ba686 100644
--- a/examples/python/keras/seq_mnist_cnn_net2net.py
+++ b/examples/python/keras/seq_mnist_cnn_net2net.py
@@ -98,6 +98,9 @@ def top_level_task():
create_student_model_cnn(teacher_model, num_classes, x_train, y_train)
+
if __name__ == "__main__":
print("Sequential model, mnist mlp teacher student")
+ configs = ff.get_configs()
+ ff.init_flexflow_runtime(configs)
top_level_task()
diff --git a/examples/python/keras/seq_mnist_mlp.py b/examples/python/keras/seq_mnist_mlp.py
index 21c7435eb7..46b774a2e1 100644
--- a/examples/python/keras/seq_mnist_mlp.py
+++ b/examples/python/keras/seq_mnist_mlp.py
@@ -55,6 +55,9 @@ def top_level_task():
model.fit(x_train, y_train, epochs=20, callbacks=[VerifyMetrics(ModelAccuracy.MNIST_MLP), EpochVerifyMetrics(ModelAccuracy.MNIST_MLP)])
model.evaluate(x=x_train, y=y_train)
+
if __name__ == "__main__":
print("Sequential model, mnist mlp")
+ configs = ff.get_configs()
+ ff.init_flexflow_runtime(configs)
top_level_task()
diff --git a/examples/python/keras/seq_mnist_mlp_net2net.py b/examples/python/keras/seq_mnist_mlp_net2net.py
index 628f76db3a..c7a7d7a6f8 100644
--- a/examples/python/keras/seq_mnist_mlp_net2net.py
+++ b/examples/python/keras/seq_mnist_mlp_net2net.py
@@ -91,6 +91,9 @@ def top_level_task():
create_student_model_mlp(teacher_model, num_classes, x_train, y_train)
+
if __name__ == "__main__":
print("Sequential model, mnist mlp teacher student")
+ configs = ff.get_configs()
+ ff.init_flexflow_runtime(configs)
top_level_task()
diff --git a/examples/python/keras/seq_reuters_mlp.py b/examples/python/keras/seq_reuters_mlp.py
index 5412ad0599..ed748f67d8 100644
--- a/examples/python/keras/seq_reuters_mlp.py
+++ b/examples/python/keras/seq_reuters_mlp.py
@@ -19,6 +19,7 @@
from flexflow.keras.datasets import reuters
from flexflow.keras.preprocessing.text import Tokenizer
from flexflow.keras.callbacks import Callback, VerifyMetrics
+import flexflow.core as ff
import numpy as np
from accuracy import ModelAccuracy
@@ -61,6 +62,9 @@ def top_level_task():
model.fit(x_train, y_train, epochs=epochs, callbacks=[VerifyMetrics(ModelAccuracy.REUTERS_MLP)])
+
if __name__ == "__main__":
print("Sequential model, reuters mlp")
+ configs = ff.get_configs()
+ ff.init_flexflow_runtime(configs)
top_level_task()
diff --git a/examples/python/keras/unary.py b/examples/python/keras/unary.py
index 622e15dc2d..63c83b9af2 100644
--- a/examples/python/keras/unary.py
+++ b/examples/python/keras/unary.py
@@ -62,4 +62,6 @@ def top_level_task():
if __name__ == "__main__":
print("alexnet keras")
- top_level_task()
\ No newline at end of file
+ configs = ff.get_configs()
+ ff.init_flexflow_runtime(configs)
+ top_level_task()
diff --git a/examples/python/native/alexnet.py b/examples/python/native/alexnet.py
index 61397cefc1..6d6e58a7f2 100644
--- a/examples/python/native/alexnet.py
+++ b/examples/python/native/alexnet.py
@@ -3,7 +3,7 @@
from accuracy import ModelAccuracy
from PIL import Image
-import argparse
+import argparse, json
import numpy as np
@@ -133,7 +133,18 @@ def test_accuracy():
parser = argparse.ArgumentParser()
parser.add_argument("-a", "--test_acc",
action="store_true", help="Test accuracy flag")
+ parser.add_argument(
+ "-config-file",
+ help="The path to a JSON file with the configs. If omitted, a sample model and configs will be used instead.",
+ type=str,
+ default=None,
+ )
args, unknown = parser.parse_known_args()
+ configs_dict = None
+ if args.config_file is not None:
+ with open(args.config_file) as f:
+ configs_dict = json.load(f)
+ init_flexflow_runtime(configs_dict)
if args.test_acc:
print("Testing cifar10 alexnet training accuracy")
test_accuracy()
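Note: `init_flexflow_runtime` is handed a plain Python dict of runtime settings (or `None`, in which case the script falls back to default settings), and `-config-file` simply loads that dict from a JSON file. A minimal sketch of such a dict follows; the key names are assumptions modeled on typical FlexFlow runtime configs and are not defined in this patch:

    # Hypothetical configs dict; the keys below are assumed, not taken from this diff.
    configs_dict = {
        "num_gpus": 1,                       # GPUs to use per node (assumed key)
        "memory_per_gpu": 8192,              # MB of GPU memory to reserve (assumed key)
        "zero_copy_memory_per_node": 16384,  # MB of zero-copy host memory (assumed key)
    }
    init_flexflow_runtime(configs_dict)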
diff --git a/examples/python/native/cifar10_cnn.py b/examples/python/native/cifar10_cnn.py
index 44bdce4519..11bc936617 100644
--- a/examples/python/native/cifar10_cnn.py
+++ b/examples/python/native/cifar10_cnn.py
@@ -2,7 +2,7 @@
from flexflow.keras.datasets import cifar10
from accuracy import ModelAccuracy
-import argparse
+import argparse, json
def top_level_task():
@@ -90,7 +90,18 @@ def test_accuracy():
parser = argparse.ArgumentParser()
parser.add_argument("-a", "--test_acc",
action="store_true", help="Test accuracy flag")
+ parser.add_argument(
+ "-config-file",
+ help="The path to a JSON file with the configs. If omitted, a sample model and configs will be used instead.",
+ type=str,
+ default=None,
+ )
args, unknown = parser.parse_known_args()
+ configs_dict = None
+ if args.config_file is not None:
+ with open(args.config_file) as f:
+ configs_dict = json.load(f)
+ init_flexflow_runtime(configs_dict)
if args.test_acc:
print("Testing cifar10 cnn training accuracy")
test_accuracy()
diff --git a/examples/python/native/cifar10_cnn_attach.py b/examples/python/native/cifar10_cnn_attach.py
index ba4288c8cd..e200cc03cf 100644
--- a/examples/python/native/cifar10_cnn_attach.py
+++ b/examples/python/native/cifar10_cnn_attach.py
@@ -144,4 +144,6 @@ def top_level_task():
if __name__ == "__main__":
print("cifar10 cnn attach")
+ configs = get_configs()
+ init_flexflow_runtime(configs)
top_level_task()
diff --git a/examples/python/native/cifar10_cnn_concat.py b/examples/python/native/cifar10_cnn_concat.py
index b177295ad6..7234116b3c 100644
--- a/examples/python/native/cifar10_cnn_concat.py
+++ b/examples/python/native/cifar10_cnn_concat.py
@@ -70,6 +70,10 @@ def top_level_task():
if accuracy < ModelAccuracy.CIFAR10_CNN.value:
assert 0, 'Check Accuracy'
+
+
if __name__ == "__main__":
print("cifar10 cnn concat")
+ configs = get_configs()
+ init_flexflow_runtime(configs)
top_level_task()
diff --git a/examples/python/native/mnist_cnn.py b/examples/python/native/mnist_cnn.py
index 6eabbe57db..f6787a4827 100644
--- a/examples/python/native/mnist_cnn.py
+++ b/examples/python/native/mnist_cnn.py
@@ -18,7 +18,7 @@
from flexflow.keras.datasets import mnist
from accuracy import ModelAccuracy
-import argparse
+import argparse, json
def top_level_task():
@@ -89,7 +89,18 @@ def test_accuracy():
parser = argparse.ArgumentParser()
parser.add_argument("-a", "--test_acc",
action="store_true", help="Test accuracy flag")
+ parser.add_argument(
+ "-config-file",
+ help="The path to a JSON file with the configs. If omitted, a sample model and configs will be used instead.",
+ type=str,
+ default=None,
+ )
args, unknown = parser.parse_known_args()
+ configs_dict = None
+ if args.config_file is not None:
+ with open(args.config_file) as f:
+ configs_dict = json.load(f)
+ init_flexflow_runtime(configs_dict)
if args.test_acc:
print("Testing mnist cnn training accuracy")
test_accuracy()
diff --git a/examples/python/native/mnist_mlp.py b/examples/python/native/mnist_mlp.py
index aefe7cfd57..8763eba40c 100644
--- a/examples/python/native/mnist_mlp.py
+++ b/examples/python/native/mnist_mlp.py
@@ -3,7 +3,7 @@
from flexflow.keras.datasets import mnist
from accuracy import ModelAccuracy
-import argparse
+import argparse, json
def top_level_task():
@@ -75,7 +75,18 @@ def test_accuracy():
parser = argparse.ArgumentParser()
parser.add_argument("-a", "--test_acc",
action="store_true", help="Test accuracy flag")
+ parser.add_argument(
+ "-config-file",
+ help="The path to a JSON file with the configs. If omitted, a sample model and configs will be used instead.",
+ type=str,
+ default=None,
+ )
args, unknown = parser.parse_known_args()
+ configs_dict = None
+ if args.config_file is not None:
+ with open(args.config_file) as f:
+ configs_dict = json.load(f)
+ init_flexflow_runtime(configs_dict)
if args.test_acc:
print("Testing mnist mlp training accuracy")
test_accuracy()
diff --git a/examples/python/native/mnist_mlp_attach.py b/examples/python/native/mnist_mlp_attach.py
index 6e7c8f8405..1294432ec5 100644
--- a/examples/python/native/mnist_mlp_attach.py
+++ b/examples/python/native/mnist_mlp_attach.py
@@ -134,4 +134,6 @@ def top_level_task():
if __name__ == "__main__":
print("mnist mlp attach")
+ configs = get_configs()
+ init_flexflow_runtime(configs)
top_level_task()
diff --git a/examples/python/native/ops/add.py b/examples/python/native/ops/add.py
new file mode 100644
index 0000000000..50b9d16fd0
--- /dev/null
+++ b/examples/python/native/ops/add.py
@@ -0,0 +1,45 @@
+# The basis for this test of the 'add' operation was generated by ChatGPT, using the manually created conv2d.py as a template.
+
+
+import flexflow.core
+import numpy as np
+from flexflow.core import *
+
+def test_add(ffconfig, input_arr1: np.ndarray, input_arr2: np.ndarray) -> flexflow.core.Tensor:
+ ffmodel = FFModel(ffconfig)
+
+ input_tensor1 = ffmodel.create_tensor(input_arr1.shape, DataType.DT_FLOAT)
+ input_tensor2 = ffmodel.create_tensor(input_arr2.shape, DataType.DT_FLOAT)
+
+ out = ffmodel.add(input_tensor1, input_tensor2)
+
+ ffoptimizer = SGDOptimizer(ffmodel, 0.001)
+ ffmodel.optimizer = ffoptimizer
+ ffmodel.compile(
+ loss_type=LossType.LOSS_SPARSE_CATEGORICAL_CROSSENTROPY,
+ metrics=[MetricsType.METRICS_ACCURACY, MetricsType.METRICS_SPARSE_CATEGORICAL_CROSSENTROPY])
+ dataloader_input1 = ffmodel.create_data_loader(input_tensor1, input_arr1)
+ dataloader_input2 = ffmodel.create_data_loader(input_tensor2, input_arr2)
+
+ ffmodel.init_layers()
+
+ dataloader_input1.reset()
+ dataloader_input1.next_batch(ffmodel)
+
+ dataloader_input2.reset()
+ dataloader_input2.next_batch(ffmodel)
+
+ ffmodel.forward()
+
+ out.inline_map(ffmodel, ffconfig)
+ return out.get_array(ffmodel, ffconfig)
+
+
+if __name__ == '__main__':
+ init_flexflow_runtime()
+ ffconfig = FFConfig()
+
+ input1 = np.random.randn(ffconfig.batch_size, 5, 10, 10).astype(np.float32)
+ input2 = np.random.randn(ffconfig.batch_size, 5, 10, 10).astype(np.float32)
+
+ _ = test_add(ffconfig, input1, input2)
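If the return value of `test_add` were captured, it could be sanity-checked against NumPy; this is a sketch that assumes the array returned by `get_array` preserves the shape and ordering of the loaded batch:

    result = test_add(ffconfig, input1, input2)
    # Elementwise sum computed on the host for comparison (layout assumption as above).
    expected = input1 + input2
    np.testing.assert_allclose(result, expected, rtol=1e-3, atol=1e-5)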
diff --git a/examples/python/native/ops/add_bias_residual_layer_norm.py b/examples/python/native/ops/add_bias_residual_layer_norm.py
new file mode 100644
index 0000000000..6e8dffbc9e
--- /dev/null
+++ b/examples/python/native/ops/add_bias_residual_layer_norm.py
@@ -0,0 +1,78 @@
+from typing import List
+
+import flexflow.core
+import numpy as np
+from flexflow.core import *
+
+
+def test_add_bias_residual_layer_norm(ffconfig, input_arr: np.ndarray, residual_arr: np.ndarray, axes: List[int], elementwise_affine: bool = True, eps: float = 1e-5, use_bias: bool = True, name=None):
+ ffmodel = FFModel(ffconfig)
+
+ input_tensor = ffmodel.create_tensor(input_arr.shape, DataType.DT_FLOAT)
+ residual_tensor = ffmodel.create_tensor(residual_arr.shape, DataType.DT_FLOAT)
+
+ output_tensor, layer_norm_output = ffmodel.add_bias_residual_layer_norm(
+ input_tensor,
+ residual_tensor,
+ axes=axes,
+ elementwise_affine=elementwise_affine,
+ eps=eps,
+ use_bias=use_bias,
+ name="add_bias_residual_layer_norm_layer"
+ )
+
+ ffoptimizer = SGDOptimizer(ffmodel, 0.001)
+ ffmodel.optimizer = ffoptimizer
+ ffmodel.compile(
+ loss_type=LossType.LOSS_SPARSE_CATEGORICAL_CROSSENTROPY,
+ metrics=[MetricsType.METRICS_ACCURACY, MetricsType.METRICS_SPARSE_CATEGORICAL_CROSSENTROPY]
+ )
+
+ dataloader_input = ffmodel.create_data_loader(input_tensor, input_arr)
+ dataloader_residual = ffmodel.create_data_loader(residual_tensor, residual_arr)
+
+ ffmodel.init_layers()
+
+ dataloader_input.reset()
+ dataloader_residual.reset()
+
+ dataloader_input.next_batch(ffmodel)
+ dataloader_residual.next_batch(ffmodel)
+
+ ffmodel.forward()
+
+ output_tensor.inline_map(ffmodel, ffconfig)
+ layer_norm_output.inline_map(ffmodel, ffconfig)
+ output_result = output_tensor.get_array(ffmodel, ffconfig)
+ layer_norm_result = layer_norm_output.get_array(ffmodel, ffconfig)
+
+ return output_result, layer_norm_result
+
+
+if __name__ == '__main__':
+ init_flexflow_runtime()
+ ffconfig = FFConfig()
+
+ input_data = np.random.randn(ffconfig.batch_size, 5, 10, 10).astype(np.float32)
+ residual_data = np.random.randn(ffconfig.batch_size, 5, 10, 10).astype(np.float32)
+
+ axes_to_normalize = [1, 2] # Example axes to normalize
+
+ output_result, layer_norm_result = test_add_bias_residual_layer_norm(
+ ffconfig,
+ input_data,
+ residual_data,
+ axes=axes_to_normalize,
+ elementwise_affine=True,
+ eps=1e-5,
+ use_bias=True
+ )
+
+ print("Input Array:")
+ print(input_data)
+ print("\nResidual Array:")
+ print(residual_data)
+ print(f"\nOutput Array after applying add_bias_residual_layer_norm along axes {axes_to_normalize}:")
+ print(output_result)
+ print("\nLayer Norm Result:")
+ print(layer_norm_result)
diff --git a/examples/python/native/ops/arg_top_k.py b/examples/python/native/ops/arg_top_k.py
new file mode 100644
index 0000000000..79edc5dfad
--- /dev/null
+++ b/examples/python/native/ops/arg_top_k.py
@@ -0,0 +1,61 @@
+import flexflow.core
+import numpy as np
+from flexflow.core import *
+
+
+def test_arg_top_k(ffconfig, input_arr: np.ndarray, k: int, sorted: bool, speculative_decoding: bool, name=None):
+ ffmodel = FFModel(ffconfig)
+
+ input_tensor = ffmodel.create_tensor(input_arr.shape, DataType.DT_FLOAT)
+
+ arg_top_k_output = ffmodel.arg_top_k(
+ input_tensor,
+ k,
+ sorted,
+ speculative_decoding,
+ name="arg_top_k_layer",
+ )
+
+ ffoptimizer = SGDOptimizer(ffmodel, 0.001)
+ ffmodel.optimizer = ffoptimizer
+ ffmodel.compile(
+ loss_type=LossType.LOSS_MEAN_SQUARED_ERROR,
+ metrics=[MetricsType.METRICS_MEAN_SQUARED_ERROR],
+ )
+
+ dataloader_input = ffmodel.create_data_loader(input_tensor, input_arr)
+
+ ffmodel.init_layers()
+
+ dataloader_input.reset()
+ dataloader_input.next_batch(ffmodel)
+
+ ffmodel.forward()
+
+ arg_top_k_output.inline_map(ffmodel, ffconfig)
+ output_result = arg_top_k_output.get_array(ffmodel, ffconfig)
+
+ return output_result
+
+
+if __name__ == '__main__':
+ init_flexflow_runtime()
+ ffconfig = FFConfig()
+
+ input_data = np.random.randn(ffconfig.batch_size, 10).astype(np.float32)
+ k_value = 5
+ sorted_value = True
+ speculative_decoding_value = False # Example value for speculative_decoding
+
+ output_result = test_arg_top_k(
+ ffconfig,
+ input_data,
+ k=k_value,
+ sorted=sorted_value,
+ speculative_decoding=speculative_decoding_value,
+ )
+
+ print("Input Array:")
+ print(input_data)
+ print("\nOutput Array after applying arg_top_k:")
+ print(output_result)
diff --git a/examples/python/native/ops/argmax.py b/examples/python/native/ops/argmax.py
new file mode 100644
index 0000000000..dda0e6b0bc
--- /dev/null
+++ b/examples/python/native/ops/argmax.py
@@ -0,0 +1,55 @@
+import flexflow.core
+import numpy as np
+from flexflow.core import *
+
+
+def test_argmax(ffconfig, input_arr: np.ndarray, beam_search: bool, name=None):
+ ffmodel = FFModel(ffconfig)
+
+ input_tensor = ffmodel.create_tensor(input_arr.shape, DataType.DT_FLOAT)
+
+ argmax_output = ffmodel.argmax(
+ input_tensor,
+ beam_search,
+ name="argmax_layer",
+ )
+
+ ffoptimizer = SGDOptimizer(ffmodel, 0.001)
+ ffmodel.optimizer = ffoptimizer
+ ffmodel.compile(
+ loss_type=LossType.LOSS_SPARSE_CATEGORICAL_CROSSENTROPY,
+ metrics=[MetricsType.METRICS_ACCURACY, MetricsType.METRICS_SPARSE_CATEGORICAL_CROSSENTROPY]
+ )
+
+ dataloader_input = ffmodel.create_data_loader(input_tensor, input_arr)
+
+ ffmodel.init_layers()
+
+ dataloader_input.reset()
+ dataloader_input.next_batch(ffmodel)
+
+ ffmodel.forward()
+
+ argmax_output.inline_map(ffmodel, ffconfig)
+ output_result = argmax_output.get_array(ffmodel, ffconfig)
+
+ return output_result
+
+
+if __name__ == '__main__':
+ init_flexflow_runtime()
+ ffconfig = FFConfig()
+
+ input_data = np.random.randn(ffconfig.batch_size, 10).astype(np.float32)
+ beam_search_value = True # Set to True or False based on your requirement
+
+ output_result = test_argmax(
+ ffconfig,
+ input_data,
+ beam_search=beam_search_value,
+ )
+
+ print("Input Array:")
+ print(input_data)
+ print("\nOutput Array after applying argmax:")
+ print(output_result)
diff --git a/examples/python/native/ops/batch_matmul.py b/examples/python/native/ops/batch_matmul.py
new file mode 100644
index 0000000000..e69de29bb2
diff --git a/examples/python/native/ops/batch_norm.py b/examples/python/native/ops/batch_norm.py
new file mode 100644
index 0000000000..b243e79d37
--- /dev/null
+++ b/examples/python/native/ops/batch_norm.py
@@ -0,0 +1,36 @@
+import flexflow.core
+import numpy as np
+from flexflow.core import *
+
+
+def ff(ffconfig, input_arr: np.ndarray):
+ ffmodel = FFModel(ffconfig)
+    # Create the FF input tensor with the same shape as the NumPy array; its values are supplied by the data loader below.
+ input_tensor = ffmodel.create_tensor(input_arr.shape, DataType.DT_FLOAT)
+
+ out = ffmodel.batch_norm(
+ input_tensor
+ )
+ ffoptimizer = SGDOptimizer(ffmodel, 0.001)
+ ffmodel.optimizer = ffoptimizer
+ ffmodel.compile(
+ loss_type=LossType.LOSS_SPARSE_CATEGORICAL_CROSSENTROPY,
+ metrics=[MetricsType.METRICS_ACCURACY, MetricsType.METRICS_SPARSE_CATEGORICAL_CROSSENTROPY])
+ dataloader_input = ffmodel.create_data_loader(input_tensor, input_arr)
+
+ ffmodel.init_layers()
+
+ dataloader_input.reset()
+ dataloader_input.next_batch(ffmodel)
+ ffmodel.forward()
+
+ out.inline_map(ffmodel, ffconfig)
+ return out.get_array(ffmodel, ffconfig)
+
+
+if __name__ == '__main__':
+ init_flexflow_runtime()
+ ffconfig = FFConfig()
+
+ input = np.random.randn(ffconfig.batch_size, 5, 10, 10).astype(np.float32)
+ _ = ff(ffconfig, input)
diff --git a/examples/python/native/ops/beam_top_k.py b/examples/python/native/ops/beam_top_k.py
new file mode 100644
index 0000000000..cb2fdfb3d2
--- /dev/null
+++ b/examples/python/native/ops/beam_top_k.py
@@ -0,0 +1,58 @@
+import flexflow.core
+import numpy as np
+from flexflow.core import *
+
+
+def test_beam_top_k(ffconfig, input_arr: np.ndarray, max_beam_size: int, sorted: bool, name=None):
+ ffmodel = FFModel(ffconfig)
+
+ input_tensor = ffmodel.create_tensor(input_arr.shape, DataType.DT_FLOAT)
+
+ beam_top_k_output = ffmodel.beam_top_k(
+ input_tensor,
+ max_beam_size,
+ sorted,
+ name="beam_top_k_layer",
+ )
+
+ ffoptimizer = SGDOptimizer(ffmodel, 0.001)
+ ffmodel.optimizer = ffoptimizer
+ ffmodel.compile(
+ loss_type=LossType.LOSS_SPARSE_CATEGORICAL_CROSSENTROPY,
+ metrics=[MetricsType.METRICS_ACCURACY, MetricsType.METRICS_SPARSE_CATEGORICAL_CROSSENTROPY]
+ )
+
+ dataloader_input = ffmodel.create_data_loader(input_tensor, input_arr)
+
+ ffmodel.init_layers()
+
+ dataloader_input.reset()
+ dataloader_input.next_batch(ffmodel)
+
+ ffmodel.forward()
+
+ beam_top_k_output.inline_map(ffmodel, ffconfig)
+ output_result = beam_top_k_output.get_array(ffmodel, ffconfig)
+
+ return output_result
+
+
+if __name__ == '__main__':
+ init_flexflow_runtime()
+ ffconfig = FFConfig()
+
+ input_data = np.random.randn(ffconfig.batch_size, 10).astype(np.float32)
+ max_beam_size_value = 3
+ sorted_value = True
+
+ output_result = test_beam_top_k(
+ ffconfig,
+ input_data,
+ max_beam_size=max_beam_size_value,
+ sorted=sorted_value,
+ )
+
+ print("Input Array:")
+ print(input_data)
+ print("\nOutput Array after applying beam_top_k:")
+ print(output_result)
diff --git a/examples/python/native/ops/concat.py b/examples/python/native/ops/concat.py
new file mode 100644
index 0000000000..0088d7b848
--- /dev/null
+++ b/examples/python/native/ops/concat.py
@@ -0,0 +1,43 @@
+# The basis for this test of the 'concatenate' operation was generated by ChatGPT, using the manually created conv2d.py as a template.
+
+
+import flexflow.core
+import numpy as np
+from flexflow.core import *
+
+def test_concatenate(ffconfig, input_arr1: np.ndarray, input_arr2: np.ndarray) -> flexflow.core.Tensor:
+ ffmodel = FFModel(ffconfig)
+
+ input_tensor1 = ffmodel.create_tensor(input_arr1.shape, DataType.DT_FLOAT)
+ input_tensor2 = ffmodel.create_tensor(input_arr2.shape, DataType.DT_FLOAT)
+
+ out = ffmodel.concat([input_tensor1, input_tensor2], axis=1)
+ ffoptimizer = SGDOptimizer(ffmodel, 0.001)
+ ffmodel.optimizer = ffoptimizer
+ ffmodel.compile(
+ loss_type=LossType.LOSS_SPARSE_CATEGORICAL_CROSSENTROPY,
+ metrics=[MetricsType.METRICS_ACCURACY, MetricsType.METRICS_SPARSE_CATEGORICAL_CROSSENTROPY])
+ dataloader_input1 = ffmodel.create_data_loader(input_tensor1, input_arr1)
+ dataloader_input2 = ffmodel.create_data_loader(input_tensor2, input_arr2)
+
+ ffmodel.init_layers()
+
+ dataloader_input1.reset()
+ dataloader_input1.next_batch(ffmodel)
+
+ dataloader_input2.reset()
+ dataloader_input2.next_batch(ffmodel)
+
+ ffmodel.forward()
+
+ out.inline_map(ffmodel, ffconfig)
+ return out.get_array(ffmodel, ffconfig)
+
+
+if __name__ == '__main__':
+ init_flexflow_runtime()
+ ffconfig = FFConfig()
+
+ input1 = np.random.randn(ffconfig.batch_size, 5, 10, 10).astype(np.float32)
+ input2 = np.random.randn(ffconfig.batch_size, 5, 10, 10).astype(np.float32)
+ _ = test_concatenate(ffconfig, input1, input2)
diff --git a/examples/python/native/ops/conv2d.py b/examples/python/native/ops/conv2d.py
new file mode 100644
index 0000000000..02b3646aaa
--- /dev/null
+++ b/examples/python/native/ops/conv2d.py
@@ -0,0 +1,45 @@
+import flexflow.core
+import numpy as np
+from flexflow.core import *
+
+
+def ff(ffconfig, input_arr: np.ndarray) -> flexflow.core.Tensor:
+ ffmodel = FFModel(ffconfig)
+
+ input_tensor = ffmodel.create_tensor(input_arr.shape, DataType.DT_FLOAT)
+
+ out = ffmodel.conv2d(
+ input_tensor,
+ 32,
+ 3,
+ 3,
+ 1,
+ 1,
+ 1,
+ 1,
+ use_bias=False
+ )
+
+ ffoptimizer = SGDOptimizer(ffmodel, 0.001)
+ ffmodel.optimizer = ffoptimizer
+ ffmodel.compile(
+ loss_type=LossType.LOSS_SPARSE_CATEGORICAL_CROSSENTROPY,
+ metrics=[MetricsType.METRICS_ACCURACY, MetricsType.METRICS_SPARSE_CATEGORICAL_CROSSENTROPY])
+ dataloader_input = ffmodel.create_data_loader(input_tensor, input_arr)
+
+ ffmodel.init_layers()
+
+ dataloader_input.reset()
+ dataloader_input.next_batch(ffmodel)
+ ffmodel.forward()
+
+ out.inline_map(ffmodel, ffconfig)
+ return out.get_array(ffmodel, ffconfig)
+
+
+if __name__ == '__main__':
+ init_flexflow_runtime()
+ ffconfig = FFConfig()
+
+ input = np.random.randn(ffconfig.batch_size, 5, 10, 10).astype(np.float32)
+ _ = ff(ffconfig, input)
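The positional arguments to `ffmodel.conv2d` above appear to specify 32 output channels, a 3x3 kernel, stride 1 in each dimension, and padding 1 in each dimension; this reading is an assumption based on the call site, since the parameter names are not spelled out in this patch. Annotated for readability under that assumed reading:

    out = ffmodel.conv2d(
        input_tensor,
        32,    # output channels (assumed meaning)
        3, 3,  # kernel height, width
        1, 1,  # stride height, width
        1, 1,  # padding height, width
        use_bias=False,
    )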
diff --git a/examples/python/native/ops/cos.py b/examples/python/native/ops/cos.py
new file mode 100644
index 0000000000..26f6307685
--- /dev/null
+++ b/examples/python/native/ops/cos.py
@@ -0,0 +1,44 @@
+import flexflow.core
+import numpy as np
+from flexflow.core import *
+
+
+def test_cos(ffconfig, input_arr: np.ndarray) -> np.ndarray:
+ ffmodel = FFModel(ffconfig)
+
+ input_tensor = ffmodel.create_tensor(input_arr.shape, DataType.DT_FLOAT)
+
+ cos_output = ffmodel.cos(input_tensor, name="cos_layer")
+
+ ffoptimizer = SGDOptimizer(ffmodel, 0.001)
+ ffmodel.optimizer = ffoptimizer
+ ffmodel.compile(
+ loss_type=LossType.LOSS_SPARSE_CATEGORICAL_CROSSENTROPY,
+ metrics=[MetricsType.METRICS_ACCURACY, MetricsType.METRICS_SPARSE_CATEGORICAL_CROSSENTROPY]
+ )
+
+ dataloader_input = ffmodel.create_data_loader(input_tensor, input_arr)
+
+ ffmodel.init_layers()
+
+ dataloader_input.reset()
+ dataloader_input.next_batch(ffmodel)
+ ffmodel.forward()
+
+ cos_output.inline_map(ffmodel, ffconfig)
+ cos_result = cos_output.get_array(ffmodel, ffconfig)
+
+ return cos_result
+
+
+if __name__ == '__main__':
+ init_flexflow_runtime()
+ ffconfig = FFConfig()
+
+ input_data = np.random.randn(ffconfig.batch_size, 5, 10, 10).astype(np.float32)
+ cos_result = test_cos(ffconfig, input_data)
+
+ print("Input Array:")
+ print(input_data)
+ print("\nOutput Array after applying cos function:")
+ print(cos_result)
diff --git a/examples/python/native/ops/dense.py b/examples/python/native/ops/dense.py
new file mode 100644
index 0000000000..ec0a3dc65b
--- /dev/null
+++ b/examples/python/native/ops/dense.py
@@ -0,0 +1,38 @@
+# The basis for this test of the 'dense' layer was generated by ChatGPT, using the manually created conv2d.py as a template.
+
+
+import flexflow.core
+import numpy as np
+from flexflow.core import *
+
+
+def test_dense(ffconfig, input_arr: np.ndarray) -> flexflow.core.Tensor:
+ ffmodel = FFModel(ffconfig)
+
+ input_tensor = ffmodel.create_tensor(input_arr.shape, DataType.DT_FLOAT)
+
+ out = ffmodel.dense(input_tensor, 64, activation=ActiMode.AC_MODE_RELU)
+
+ ffoptimizer = SGDOptimizer(ffmodel, 0.001)
+ ffmodel.optimizer = ffoptimizer
+ ffmodel.compile(
+ loss_type=LossType.LOSS_SPARSE_CATEGORICAL_CROSSENTROPY,
+ metrics=[MetricsType.METRICS_ACCURACY, MetricsType.METRICS_SPARSE_CATEGORICAL_CROSSENTROPY])
+ dataloader_input = ffmodel.create_data_loader(input_tensor, input_arr)
+
+ ffmodel.init_layers()
+
+ dataloader_input.reset()
+ dataloader_input.next_batch(ffmodel)
+ ffmodel.forward()
+
+ out.inline_map(ffmodel, ffconfig)
+ return out.get_array(ffmodel, ffconfig)
+
+
+if __name__ == '__main__':
+ init_flexflow_runtime()
+ ffconfig = FFConfig()
+
+ input = np.random.randn(ffconfig.batch_size, 10).astype(np.float32)
+ _ = test_dense(ffconfig, input)
diff --git a/examples/python/native/ops/divide.py b/examples/python/native/ops/divide.py
new file mode 100644
index 0000000000..419bf714ab
--- /dev/null
+++ b/examples/python/native/ops/divide.py
@@ -0,0 +1,48 @@
+# The basis for this test of the 'divide' operation was generated by ChatGPT, using the manually created conv2d.py as a template.
+
+
+import flexflow.core
+import numpy as np
+from flexflow.core import *
+
+def test_divide(ffconfig, input_arr1: np.ndarray, input_arr2: np.ndarray) -> flexflow.core.Tensor:
+ ffmodel = FFModel(ffconfig)
+
+ input_tensor1 = ffmodel.create_tensor(input_arr1.shape, DataType.DT_FLOAT)
+ input_tensor2 = ffmodel.create_tensor(input_arr2.shape, DataType.DT_FLOAT)
+
+ out = ffmodel.divide(input_tensor1, input_tensor2)
+
+ ffoptimizer = SGDOptimizer(ffmodel, 0.001)
+ ffmodel.optimizer = ffoptimizer
+ ffmodel.compile(
+ loss_type=LossType.LOSS_SPARSE_CATEGORICAL_CROSSENTROPY,
+ metrics=[MetricsType.METRICS_ACCURACY, MetricsType.METRICS_SPARSE_CATEGORICAL_CROSSENTROPY])
+ dataloader_input1 = ffmodel.create_data_loader(input_tensor1, input_arr1)
+ dataloader_input2 = ffmodel.create_data_loader(input_tensor2, input_arr2)
+
+ ffmodel.init_layers()
+
+ dataloader_input1.reset()
+ dataloader_input1.next_batch(ffmodel)
+
+ dataloader_input2.reset()
+ dataloader_input2.next_batch(ffmodel)
+
+ ffmodel.forward()
+
+ out.inline_map(ffmodel, ffconfig)
+ return out.get_array(ffmodel, ffconfig)
+
+
+if __name__ == '__main__':
+ init_flexflow_runtime()
+ ffconfig = FFConfig()
+
+ input1 = np.random.randn(ffconfig.batch_size, 5, 10, 10).astype(np.float32)
+ input2 = np.random.randn(ffconfig.batch_size, 5, 10, 10).astype(np.float32)
+
+ # Avoid division by zero in input2
+ input2 = np.where(input2 == 0, 1e-6, input2)
+
+ _ = test_divide(ffconfig, input1, input2)
diff --git a/examples/python/native/ops/dropout.py b/examples/python/native/ops/dropout.py
new file mode 100644
index 0000000000..3aa44a5a5b
--- /dev/null
+++ b/examples/python/native/ops/dropout.py
@@ -0,0 +1,49 @@
+# The basis for this test of the 'Dropout' layer was generated by ChatGPT, using the manually created conv2d.py as a template.
+
+
+import flexflow.core
+import numpy as np
+from flexflow.core import *
+
+def test_dropout(ffconfig, input_arr: np.ndarray, dropout_rate: float = 0.5) -> flexflow.core.Tensor:
+ ffmodel = FFModel(ffconfig)
+
+ input_tensor = ffmodel.create_tensor(input_arr.shape, DataType.DT_FLOAT)
+
+ # Apply Dropout layer
+ out = ffmodel.dropout(input_tensor, dropout_rate, 0)
+
+ ffoptimizer = SGDOptimizer(ffmodel, 0.001)
+ ffmodel.optimizer = ffoptimizer
+ ffmodel.compile(
+ loss_type=LossType.LOSS_SPARSE_CATEGORICAL_CROSSENTROPY,
+ metrics=[MetricsType.METRICS_ACCURACY, MetricsType.METRICS_SPARSE_CATEGORICAL_CROSSENTROPY])
+ dataloader_input = ffmodel.create_data_loader(input_tensor, input_arr)
+
+ ffmodel.init_layers()
+
+ dataloader_input.reset()
+ dataloader_input.next_batch(ffmodel)
+
+ ffmodel.forward()
+
+ out.inline_map(ffmodel, ffconfig)
+ return out.get_array(ffmodel, ffconfig)
+
+
+if __name__ == '__main__':
+ init_flexflow_runtime()
+ ffconfig = FFConfig()
+
+ input_data = np.random.randn(ffconfig.batch_size, 5, 10, 10).astype(np.float32)
+
+ # You can adjust the dropout rate as needed
+ dropout_rate_param = 0.5
+
+ result = test_dropout(ffconfig, input_data, dropout_rate_param)
+
+ print("Input Data:")
+ print(input_data)
+
+ print("\nResult after Dropout layer:")
+ print(result)
diff --git a/examples/python/native/ops/elu.py b/examples/python/native/ops/elu.py
new file mode 100644
index 0000000000..7a6ef1f621
--- /dev/null
+++ b/examples/python/native/ops/elu.py
@@ -0,0 +1,47 @@
+# The basis for this test of the 'ELU' activation function was generated by ChatGPT, using the manually created conv2d.py as a template.
+
+
+import flexflow.core
+import numpy as np
+from flexflow.core import *
+
+
+def test_elu(ffconfig, input_arr: np.ndarray) -> flexflow.core.Tensor:
+ ffmodel = FFModel(ffconfig)
+
+ input_tensor = ffmodel.create_tensor(input_arr.shape, DataType.DT_FLOAT)
+
+ # Apply ELU activation
+ out = ffmodel.elu(input_tensor)
+
+ ffoptimizer = SGDOptimizer(ffmodel, 0.001)
+ ffmodel.optimizer = ffoptimizer
+ ffmodel.compile(
+ loss_type=LossType.LOSS_SPARSE_CATEGORICAL_CROSSENTROPY,
+ metrics=[MetricsType.METRICS_ACCURACY, MetricsType.METRICS_SPARSE_CATEGORICAL_CROSSENTROPY])
+ dataloader_input = ffmodel.create_data_loader(input_tensor, input_arr)
+
+ ffmodel.init_layers()
+
+ dataloader_input.reset()
+ dataloader_input.next_batch(ffmodel)
+
+ ffmodel.forward()
+
+ out.inline_map(ffmodel, ffconfig)
+ return out.get_array(ffmodel, ffconfig)
+
+
+if __name__ == '__main__':
+ init_flexflow_runtime()
+ ffconfig = FFConfig()
+
+ input_data = np.random.randn(ffconfig.batch_size, 5, 10, 10).astype(np.float32)
+
+ result = test_elu(ffconfig, input_data)
+
+ print("Input Data:")
+ print(input_data)
+
+ print("\nResult after ELU activation:")
+ print(result)
diff --git a/examples/python/native/ops/embedding.py b/examples/python/native/ops/embedding.py
new file mode 100644
index 0000000000..34bced3798
--- /dev/null
+++ b/examples/python/native/ops/embedding.py
@@ -0,0 +1,39 @@
+# The basis for this test of the 'embedding' layer was generated by ChatGPT, using the manually created conv2d.py as a template.
+
+
+import flexflow.core
+import numpy as np
+from flexflow.core import *
+
+def test_embedding(ffconfig, input_arr: np.ndarray, vocab_size: int, embedding_dim: int) -> flexflow.core.Tensor:
+ ffmodel = FFModel(ffconfig)
+
+ input_tensor = ffmodel.create_tensor(input_arr.shape, DataType.DT_INT32)
+
+ out = ffmodel.embedding(input_tensor, vocab_size, embedding_dim, AggrMode.AGGR_MODE_SUM)
+
+ ffoptimizer = SGDOptimizer(ffmodel, 0.001)
+ ffmodel.optimizer = ffoptimizer
+ ffmodel.compile(
+ loss_type=LossType.LOSS_SPARSE_CATEGORICAL_CROSSENTROPY,
+ metrics=[MetricsType.METRICS_ACCURACY, MetricsType.METRICS_SPARSE_CATEGORICAL_CROSSENTROPY])
+ dataloader_input = ffmodel.create_data_loader(input_tensor, input_arr)
+
+ ffmodel.init_layers()
+
+ dataloader_input.reset()
+ dataloader_input.next_batch(ffmodel)
+ ffmodel.forward()
+
+ out.inline_map(ffmodel, ffconfig)
+ return out.get_array(ffmodel, ffconfig)
+
+
+if __name__ == '__main__':
+ init_flexflow_runtime()
+ ffconfig = FFConfig()
+
+ vocab_size = 1000
+ embedding_dim = 50
+ input = np.random.randint(low=0, high=vocab_size, size=(ffconfig.batch_size, 10), dtype=np.int32)
+ _ = test_embedding(ffconfig, input, vocab_size, embedding_dim)
diff --git a/examples/python/native/ops/exp.py b/examples/python/native/ops/exp.py
new file mode 100644
index 0000000000..e69de29bb2
diff --git a/examples/python/native/ops/flat.py b/examples/python/native/ops/flat.py
new file mode 100644
index 0000000000..e69de29bb2
diff --git a/examples/python/native/ops/gather.py b/examples/python/native/ops/gather.py
new file mode 100644
index 0000000000..e13b6e4c75
--- /dev/null
+++ b/examples/python/native/ops/gather.py
@@ -0,0 +1,60 @@
+import flexflow.core
+import numpy as np
+from flexflow.core import *
+
+
+def test_gather(ffconfig, input_arr: np.ndarray, index_arr: np.ndarray, dim: int, name=None):
+ ffmodel = FFModel(ffconfig)
+
+ input_tensor = ffmodel.create_tensor(input_arr.shape, DataType.DT_FLOAT)
+ index_tensor = ffmodel.create_tensor(index_arr.shape, DataType.DT_INT32)
+
+ gather_output = ffmodel.gather(
+ input_tensor,
+ index_tensor,
+ dim,
+ name="gather_layer"
+ )
+
+ ffoptimizer = SGDOptimizer(ffmodel, 0.001)
+ ffmodel.optimizer = ffoptimizer
+ ffmodel.compile(
+ loss_type=LossType.LOSS_SPARSE_CATEGORICAL_CROSSENTROPY,
+ metrics=[MetricsType.METRICS_ACCURACY, MetricsType.METRICS_SPARSE_CATEGORICAL_CROSSENTROPY]
+ )
+
+ dataloader_input = ffmodel.create_data_loader(input_tensor, input_arr)
+ dataloader_index = ffmodel.create_data_loader(index_tensor, index_arr)
+
+ ffmodel.init_layers()
+
+ dataloader_input.reset()
+ dataloader_index.reset()
+
+ dataloader_input.next_batch(ffmodel)
+ dataloader_index.next_batch(ffmodel)
+
+ ffmodel.forward()
+
+ gather_output.inline_map(ffmodel, ffconfig)
+ output_result = gather_output.get_array(ffmodel, ffconfig)
+
+ return output_result
+
+
+if __name__ == '__main__':
+ init_flexflow_runtime()
+ ffconfig = FFConfig()
+
+ input_data = np.random.randn(ffconfig.batch_size, 5, 10, 10).astype(np.float32)
+ index_data = np.random.randint(0, 5, size=(ffconfig.batch_size,)).astype(np.int32)
+ dim_to_gather = 2 # Example dimension to gather along
+
+ output_result = test_gather(ffconfig, input_data, index_data, dim=dim_to_gather)
+
+ print("Input Array:")
+ print(input_data)
+ print("\nIndex Array:")
+ print(index_data)
+ print(f"\nOutput Array after applying gather along dimension {dim_to_gather}:")
+ print(output_result)
diff --git a/examples/python/native/ops/gelu.py b/examples/python/native/ops/gelu.py
new file mode 100644
index 0000000000..84fabd36e1
--- /dev/null
+++ b/examples/python/native/ops/gelu.py
@@ -0,0 +1,51 @@
+import flexflow.core
+import numpy as np
+from flexflow.core import *
+
+
+def test_gelu(ffconfig, input_arr: np.ndarray, inplace: bool = True, name=None):
+ ffmodel = FFModel(ffconfig)
+
+ input_tensor = ffmodel.create_tensor(input_arr.shape, DataType.DT_FLOAT)
+
+ gelu_output = ffmodel.gelu(
+ input_tensor,
+ inplace=inplace,
+ name="gelu_layer"
+ )
+
+ ffoptimizer = SGDOptimizer(ffmodel, 0.001)
+ ffmodel.optimizer = ffoptimizer
+ ffmodel.compile(
+ loss_type=LossType.LOSS_SPARSE_CATEGORICAL_CROSSENTROPY,
+ metrics=[MetricsType.METRICS_ACCURACY, MetricsType.METRICS_SPARSE_CATEGORICAL_CROSSENTROPY]
+ )
+
+ dataloader_input = ffmodel.create_data_loader(input_tensor, input_arr)
+
+ ffmodel.init_layers()
+
+ dataloader_input.reset()
+ dataloader_input.next_batch(ffmodel)
+
+ ffmodel.forward()
+
+ gelu_output.inline_map(ffmodel, ffconfig)
+ output_result = gelu_output.get_array(ffmodel, ffconfig)
+
+ return output_result
+
+
+if __name__ == '__main__':
+ init_flexflow_runtime()
+ ffconfig = FFConfig()
+
+ input_data = np.random.randn(ffconfig.batch_size, 5, 10, 10).astype(np.float32)
+ inplace_flag = True # Example inplace flag
+
+ output_result = test_gelu(ffconfig, input_data, inplace=inplace_flag)
+
+ print("Input Array:")
+ print(input_data)
+ print(f"\nOutput Array after applying gelu activation function (inplace={inplace_flag}):")
+ print(output_result)
diff --git a/examples/python/native/ops/identity.py b/examples/python/native/ops/identity.py
new file mode 100644
index 0000000000..fbf63e717c
--- /dev/null
+++ b/examples/python/native/ops/identity.py
@@ -0,0 +1,49 @@
+import flexflow.core
+import numpy as np
+from flexflow.core import *
+
+
+def test_identity(ffconfig, input_arr: np.ndarray, name=None):
+ ffmodel = FFModel(ffconfig)
+
+ input_tensor = ffmodel.create_tensor(input_arr.shape, DataType.DT_FLOAT)
+
+ identity_output = ffmodel.identity(
+ input_tensor,
+ name="identity_layer"
+ )
+
+ ffoptimizer = SGDOptimizer(ffmodel, 0.001)
+ ffmodel.optimizer = ffoptimizer
+ ffmodel.compile(
+ loss_type=LossType.LOSS_SPARSE_CATEGORICAL_CROSSENTROPY,
+ metrics=[MetricsType.METRICS_ACCURACY, MetricsType.METRICS_SPARSE_CATEGORICAL_CROSSENTROPY]
+ )
+
+ dataloader_input = ffmodel.create_data_loader(input_tensor, input_arr)
+
+ ffmodel.init_layers()
+
+ dataloader_input.reset()
+ dataloader_input.next_batch(ffmodel)
+
+ ffmodel.forward()
+
+ identity_output.inline_map(ffmodel, ffconfig)
+ output_result = identity_output.get_array(ffmodel, ffconfig)
+
+ return output_result
+
+
+if __name__ == '__main__':
+ init_flexflow_runtime()
+ ffconfig = FFConfig()
+
+ input_data = np.random.randn(ffconfig.batch_size, 5, 10, 10).astype(np.float32)
+
+ output_result = test_identity(ffconfig, input_data)
+
+ print("Input Array:")
+ print(input_data)
+ print("\nOutput Array after applying identity function:")
+ print(output_result)
diff --git a/examples/python/native/ops/inc_multihead_self_attention.py b/examples/python/native/ops/inc_multihead_self_attention.py
new file mode 100644
index 0000000000..dce7bd565d
--- /dev/null
+++ b/examples/python/native/ops/inc_multihead_self_attention.py
@@ -0,0 +1,103 @@
+import flexflow.core
+import numpy as np
+from flexflow.core import *
+
+
+def test_inc_multihead_self_attention(
+ ffconfig,
+ input_arr: np.ndarray,
+ embed_dim: int,
+ num_heads: int,
+ kdim: int = 0,
+ vdim: int = 0,
+ dropout: float = 0.0,
+ bias: bool = True,
+ add_bias_kv: bool = False,
+ add_zero_attn: bool = False,
+ data_type: DataType = DataType.DT_NONE,
+ kernel_initializer=None,
+ apply_rotary_embedding: bool = False,
+ scaling_query: bool = False,
+ scaling_factor: float = 1.0,
+ qk_prod_scaling: bool = True,
+ position_bias: bool = False,
+ name=None,
+):
+ ffmodel = FFModel(ffconfig)
+
+ input_tensor = ffmodel.create_tensor(input_arr.shape, data_type)
+
+ inc_multihead_self_attention_output = ffmodel.inc_multihead_self_attention(
+ input_tensor,
+ embed_dim,
+ num_heads,
+ kdim=kdim,
+ vdim=vdim,
+ dropout=dropout,
+ bias=bias,
+ add_bias_kv=add_bias_kv,
+ add_zero_attn=add_zero_attn,
+ data_type=data_type,
+ kernel_initializer=kernel_initializer,
+ apply_rotary_embedding=apply_rotary_embedding,
+ scaling_query=scaling_query,
+ scaling_factor=scaling_factor,
+ qk_prod_scaling=qk_prod_scaling,
+ position_bias=position_bias,
+ name="inc_multihead_self_attention_layer",
+ )
+
+ ffoptimizer = SGDOptimizer(ffmodel, 0.001)
+ ffmodel.optimizer = ffoptimizer
+ ffmodel.compile(
+ loss_type=LossType.LOSS_SPARSE_CATEGORICAL_CROSSENTROPY,
+ metrics=[MetricsType.METRICS_ACCURACY, MetricsType.METRICS_SPARSE_CATEGORICAL_CROSSENTROPY]
+ )
+
+ dataloader_input = ffmodel.create_data_loader(input_tensor, input_arr)
+
+ ffmodel.init_layers()
+
+ dataloader_input.reset()
+ dataloader_input.next_batch(ffmodel)
+
+ ffmodel.forward()
+
+ inc_multihead_self_attention_output.inline_map(ffmodel, ffconfig)
+ output_result = inc_multihead_self_attention_output.get_array(ffmodel, ffconfig)
+
+ return output_result
+
+
+if __name__ == '__main__':
+ init_flexflow_runtime()
+ ffconfig = FFConfig()
+
+ input_data = np.random.randn(ffconfig.batch_size, 10, 20).astype(np.float32)
+ embed_dim_value = 64
+ num_heads_value = 8
+
+ output_result = test_inc_multihead_self_attention(
+ ffconfig,
+ input_data,
+ embed_dim=embed_dim_value,
+ num_heads=num_heads_value,
+ kdim=0, # Example value for kdim
+ vdim=0, # Example value for vdim
+ dropout=0.1, # Example value for dropout
+ bias=True,
+ add_bias_kv=False,
+ add_zero_attn=False,
+ data_type=DataType.DT_FLOAT,
+ kernel_initializer=None, # Example value for kernel_initializer
+ apply_rotary_embedding=False,
+ scaling_query=False,
+ scaling_factor=1.0,
+ qk_prod_scaling=True,
+ position_bias=False,
+ )
+
+ print("Input Array:")
+ print(input_data)
+ print("\nOutput Array after applying inc_multihead_self_attention:")
+ print(output_result)
diff --git a/examples/python/native/ops/inc_multihead_self_attention_verify.py b/examples/python/native/ops/inc_multihead_self_attention_verify.py
new file mode 100644
index 0000000000..f6dc8e3933
--- /dev/null
+++ b/examples/python/native/ops/inc_multihead_self_attention_verify.py
@@ -0,0 +1,103 @@
+import flexflow.core
+import numpy as np
+from flexflow.core import *
+
+
+def test_inc_multihead_self_attention_verify(
+ ffconfig,
+ input_arr: np.ndarray,
+ embed_dim: int,
+ num_heads: int,
+ kdim: int = 0,
+ vdim: int = 0,
+ dropout: float = 0.0,
+ bias: bool = True,
+ add_bias_kv: bool = False,
+ add_zero_attn: bool = False,
+ data_type: DataType = DataType.DT_NONE,
+ kernel_initializer=None,
+ apply_rotary_embedding: bool = False,
+ scaling_query: bool = False,
+ scaling_factor: float = 1.0,
+ qk_prod_scaling: bool = True,
+ position_bias: bool = False,
+ name=None,
+):
+ ffmodel = FFModel(ffconfig)
+
+ input_tensor = ffmodel.create_tensor(input_arr.shape, data_type)
+
+ inc_multihead_self_attention_verify_output = ffmodel.inc_multihead_self_attention_verify(
+ input_tensor,
+ embed_dim,
+ num_heads,
+ kdim=kdim,
+ vdim=vdim,
+ dropout=dropout,
+ bias=bias,
+ add_bias_kv=add_bias_kv,
+ add_zero_attn=add_zero_attn,
+ data_type=data_type,
+ kernel_initializer=kernel_initializer,
+ apply_rotary_embedding=apply_rotary_embedding,
+ scaling_query=scaling_query,
+ scaling_factor=scaling_factor,
+ qk_prod_scaling=qk_prod_scaling,
+ position_bias=position_bias,
+ name="inc_multihead_self_attention_verify_layer",
+ )
+
+ ffoptimizer = SGDOptimizer(ffmodel, 0.001)
+ ffmodel.optimizer = ffoptimizer
+ ffmodel.compile(
+ loss_type=LossType.LOSS_SPARSE_CATEGORICAL_CROSSENTROPY,
+ metrics=[MetricsType.METRICS_ACCURACY, MetricsType.METRICS_SPARSE_CATEGORICAL_CROSSENTROPY]
+ )
+
+ dataloader_input = ffmodel.create_data_loader(input_tensor, input_arr)
+
+ ffmodel.init_layers()
+
+ dataloader_input.reset()
+ dataloader_input.next_batch(ffmodel)
+
+ ffmodel.forward()
+
+ inc_multihead_self_attention_verify_output.inline_map(ffmodel, ffconfig)
+ output_result = inc_multihead_self_attention_verify_output.get_array(ffmodel, ffconfig)
+
+ return output_result
+
+
+if __name__ == '__main__':
+ init_flexflow_runtime()
+ ffconfig = FFConfig()
+
+ input_data = np.random.randn(ffconfig.batch_size, 10, 20).astype(np.float32)
+ embed_dim_value = 64
+ num_heads_value = 8
+
+ output_result = test_inc_multihead_self_attention_verify(
+ ffconfig,
+ input_data,
+ embed_dim=embed_dim_value,
+ num_heads=num_heads_value,
+ kdim=0, # Example value for kdim
+ vdim=0, # Example value for vdim
+ dropout=0.1, # Example value for dropout
+ bias=True,
+ add_bias_kv=False,
+ add_zero_attn=False,
+ data_type=DataType.DT_FLOAT,
+ kernel_initializer=None, # Example value for kernel_initializer
+ apply_rotary_embedding=False,
+ scaling_query=False,
+ scaling_factor=1.0,
+ qk_prod_scaling=True,
+ position_bias=False,
+ )
+
+ print("Input Array:")
+ print(input_data)
+ print("\nOutput Array after applying inc_multihead_self_attention_verify:")
+ print(output_result)
diff --git a/examples/python/native/ops/inc_multiquery_self_attention.py b/examples/python/native/ops/inc_multiquery_self_attention.py
new file mode 100644
index 0000000000..33390ab1f6
--- /dev/null
+++ b/examples/python/native/ops/inc_multiquery_self_attention.py
@@ -0,0 +1,107 @@
+import flexflow.core
+import numpy as np
+from flexflow.core import *
+
+
+def test_inc_multiquery_self_attention(
+ ffconfig,
+ input_arr: np.ndarray,
+ embed_dim: int,
+ num_q_heads: int,
+ num_kv_heads: int,
+ kdim: int = 0,
+ vdim: int = 0,
+ dropout: float = 0.0,
+ bias: bool = True,
+ add_bias_kv: bool = False,
+ add_zero_attn: bool = False,
+ data_type: DataType = DataType.DT_NONE,
+ kernel_initializer=None,
+ apply_rotary_embedding: bool = False,
+ scaling_query: bool = False,
+ scaling_factor: float = 1.0,
+ qk_prod_scaling: bool = True,
+ position_bias: bool = False,
+ name=None,
+):
+ ffmodel = FFModel(ffconfig)
+
+ input_tensor = ffmodel.create_tensor(input_arr.shape, data_type)
+
+ inc_multiquery_self_attention_output = ffmodel.inc_multiquery_self_attention(
+ input_tensor,
+ embed_dim,
+ num_q_heads,
+ num_kv_heads,
+ kdim=kdim,
+ vdim=vdim,
+ dropout=dropout,
+ bias=bias,
+ add_bias_kv=add_bias_kv,
+ add_zero_attn=add_zero_attn,
+ data_type=data_type,
+ kernel_initializer=kernel_initializer,
+ apply_rotary_embedding=apply_rotary_embedding,
+ scaling_query=scaling_query,
+ scaling_factor=scaling_factor,
+ qk_prod_scaling=qk_prod_scaling,
+ position_bias=position_bias,
+ name="inc_multiquery_self_attention_layer",
+ )
+
+ ffoptimizer = SGDOptimizer(ffmodel, 0.001)
+ ffmodel.optimizer = ffoptimizer
+ ffmodel.compile(
+ loss_type=LossType.LOSS_SPARSE_CATEGORICAL_CROSSENTROPY,
+ metrics=[MetricsType.METRICS_ACCURACY, MetricsType.METRICS_SPARSE_CATEGORICAL_CROSSENTROPY]
+ )
+
+ dataloader_input = ffmodel.create_data_loader(input_tensor, input_arr)
+
+ ffmodel.init_layers()
+
+ dataloader_input.reset()
+ dataloader_input.next_batch(ffmodel)
+
+ ffmodel.forward()
+
+ inc_multiquery_self_attention_output.inline_map(ffmodel, ffconfig)
+ output_result = inc_multiquery_self_attention_output.get_array(ffmodel, ffconfig)
+
+ return output_result
+
+
+if __name__ == '__main__':
+ init_flexflow_runtime()
+ ffconfig = FFConfig()
+
+ input_data = np.random.randn(ffconfig.batch_size, 10, 20).astype(np.float32)
+ embed_dim_value = 64
+ num_q_heads_value = 4
+ num_kv_heads_value = 4
+
+ output_result = test_inc_multiquery_self_attention(
+ ffconfig,
+ input_data,
+ embed_dim=embed_dim_value,
+ num_q_heads=num_q_heads_value,
+ num_kv_heads=num_kv_heads_value,
+ kdim=0, # Example value for kdim
+ vdim=0, # Example value for vdim
+ dropout=0.1, # Example value for dropout
+ bias=True,
+ add_bias_kv=False,
+ add_zero_attn=False,
+ data_type=DataType.DT_FLOAT,
+ kernel_initializer=None, # Example value for kernel_initializer
+ apply_rotary_embedding=False,
+ scaling_query=False,
+ scaling_factor=1.0,
+ qk_prod_scaling=True,
+ position_bias=False,
+ )
+
+ print("Input Array:")
+ print(input_data)
+ print("\nOutput Array after applying inc_multiquery_self_attention:")
+ print(output_result)
diff --git a/examples/python/native/ops/inc_multiquery_self_attention_verify.py b/examples/python/native/ops/inc_multiquery_self_attention_verify.py
new file mode 100644
index 0000000000..69a76f68bf
--- /dev/null
+++ b/examples/python/native/ops/inc_multiquery_self_attention_verify.py
@@ -0,0 +1,107 @@
+import flexflow.core
+import numpy as np
+from flexflow.core import *
+
+
+def test_inc_multiquery_self_attention_verify(
+ ffconfig,
+ input_arr: np.ndarray,
+ embed_dim: int,
+ num_q_heads: int,
+ num_kv_heads: int,
+ kdim: int = 0,
+ vdim: int = 0,
+ dropout: float = 0.0,
+ bias: bool = True,
+ add_bias_kv: bool = False,
+ add_zero_attn: bool = False,
+ data_type: DataType = DataType.DT_NONE,
+ kernel_initializer=None,
+ apply_rotary_embedding: bool = False,
+ scaling_query: bool = False,
+ scaling_factor: float = 1.0,
+ qk_prod_scaling: bool = True,
+ position_bias: bool = False,
+ name=None,
+):
+ ffmodel = FFModel(ffconfig)
+
+ input_tensor = ffmodel.create_tensor(input_arr.shape, data_type)
+
+ inc_multiquery_self_attention_verify_output = ffmodel.inc_multiquery_self_attention_verify(
+ input_tensor,
+ embed_dim,
+ num_q_heads,
+ num_kv_heads,
+ kdim=kdim,
+ vdim=vdim,
+ dropout=dropout,
+ bias=bias,
+ add_bias_kv=add_bias_kv,
+ add_zero_attn=add_zero_attn,
+ data_type=data_type,
+ kernel_initializer=kernel_initializer,
+ apply_rotary_embedding=apply_rotary_embedding,
+ scaling_query=scaling_query,
+ scaling_factor=scaling_factor,
+ qk_prod_scaling=qk_prod_scaling,
+ position_bias=position_bias,
+ name="inc_multiquery_self_attention_verify_layer",
+ )
+
+ ffoptimizer = SGDOptimizer(ffmodel, 0.001)
+ ffmodel.optimizer = ffoptimizer
+ ffmodel.compile(
+ loss_type=LossType.LOSS_SPARSE_CATEGORICAL_CROSSENTROPY,
+ metrics=[MetricsType.METRICS_ACCURACY, MetricsType.METRICS_SPARSE_CATEGORICAL_CROSSENTROPY]
+ )
+
+ dataloader_input = ffmodel.create_data_loader(input_tensor, input_arr)
+
+ ffmodel.init_layers()
+
+ dataloader_input.reset()
+ dataloader_input.next_batch(ffmodel)
+
+ ffmodel.forward()
+
+ inc_multiquery_self_attention_verify_output.inline_map(ffmodel, ffconfig)
+ output_result = inc_multiquery_self_attention_verify_output.get_array(ffmodel, ffconfig)
+
+ return output_result
+
+
+if __name__ == '__main__':
+ init_flexflow_runtime()
+ ffconfig = FFConfig()
+
+ input_data = np.random.randn(ffconfig.batch_size, 10, 20).astype(np.float32)
+ embed_dim_value = 64
+ num_q_heads_value = 4
+ num_kv_heads_value = 4
+
+ output_result = test_inc_multiquery_self_attention_verify(
+ ffconfig,
+ input_data,
+ embed_dim=embed_dim_value,
+ num_q_heads=num_q_heads_value,
+ num_kv_heads=num_kv_heads_value,
+ kdim=0, # Example value for kdim
+ vdim=0, # Example value for vdim
+ dropout=0.1, # Example value for dropout
+ bias=True,
+ add_bias_kv=False,
+ add_zero_attn=False,
+ data_type=DataType.DT_FLOAT,
+ kernel_initializer=None, # Example value for kernel_initializer
+ apply_rotary_embedding=False,
+ scaling_query=False,
+ scaling_factor=1.0,
+ qk_prod_scaling=True,
+ position_bias=False,
+ )
+
+ print("Input Array:")
+ print(input_data)
+ print("\nOutput Array after applying inc_multiquery_self_attention_verify:")
+ print(output_result)
diff --git a/examples/python/native/ops/layer_norm.py b/examples/python/native/ops/layer_norm.py
new file mode 100644
index 0000000000..b3cca93d6e
--- /dev/null
+++ b/examples/python/native/ops/layer_norm.py
@@ -0,0 +1,48 @@
+from typing import List
+
+import flexflow.core
+import numpy as np
+from flexflow.core import *
+
+
+def test_layer_norm(ffconfig, input_arr: np.ndarray, axes: List[int], elementwise_affine: bool = True, eps: float = 1e-5, use_bias: bool = True, name=None) -> np.ndarray:
+ ffmodel = FFModel(ffconfig)
+
+ input_tensor = ffmodel.create_tensor(input_arr.shape, DataType.DT_FLOAT)
+
+ layer_norm_output = ffmodel.layer_norm(input_tensor, axes=axes, elementwise_affine=elementwise_affine, eps=eps, use_bias=use_bias, name="layer_norm_layer")
+
+ ffoptimizer = SGDOptimizer(ffmodel, 0.001)
+ ffmodel.optimizer = ffoptimizer
+ ffmodel.compile(
+ loss_type=LossType.LOSS_SPARSE_CATEGORICAL_CROSSENTROPY,
+ metrics=[MetricsType.METRICS_ACCURACY, MetricsType.METRICS_SPARSE_CATEGORICAL_CROSSENTROPY]
+ )
+
+ dataloader_input = ffmodel.create_data_loader(input_tensor, input_arr)
+
+ ffmodel.init_layers()
+
+ dataloader_input.reset()
+ dataloader_input.next_batch(ffmodel)
+ ffmodel.forward()
+
+ layer_norm_output.inline_map(ffmodel, ffconfig)
+ layer_norm_result = layer_norm_output.get_array(ffmodel, ffconfig)
+
+ return layer_norm_result
+
+
+if __name__ == '__main__':
+ init_flexflow_runtime()
+ ffconfig = FFConfig()
+
+ input_data = np.random.randn(ffconfig.batch_size, 5, 10, 10).astype(np.float32)
+ axes_to_normalize = [1, 2] # Example axes to normalize
+
+ layer_norm_result = test_layer_norm(ffconfig, input_data, axes=axes_to_normalize, elementwise_affine=True, eps=1e-5, use_bias=True)
+
+ print("Input Array:")
+ print(input_data)
+ print(f"\nOutput Array after applying layer_norm function along axes {axes_to_normalize}:")
+ print(layer_norm_result)
diff --git a/examples/python/native/ops/max.py b/examples/python/native/ops/max.py
new file mode 100644
index 0000000000..bf9c629406
--- /dev/null
+++ b/examples/python/native/ops/max.py
@@ -0,0 +1,54 @@
+import flexflow.core
+import numpy as np
+from flexflow.core import *
+
+
+def test_max(ffconfig, input_arr1: np.ndarray, input_arr2: np.ndarray) -> np.ndarray:
+ ffmodel = FFModel(ffconfig)
+
+ input_tensor1 = ffmodel.create_tensor(input_arr1.shape, DataType.DT_FLOAT)
+ input_tensor2 = ffmodel.create_tensor(input_arr2.shape, DataType.DT_FLOAT)
+
+ max_output = ffmodel.max(input_tensor1, input_tensor2, name="max_layer")
+
+ ffoptimizer = SGDOptimizer(ffmodel, 0.001)
+ ffmodel.optimizer = ffoptimizer
+ ffmodel.compile(
+ loss_type=LossType.LOSS_SPARSE_CATEGORICAL_CROSSENTROPY,
+ metrics=[MetricsType.METRICS_ACCURACY, MetricsType.METRICS_SPARSE_CATEGORICAL_CROSSENTROPY]
+ )
+
+ dataloader_input1 = ffmodel.create_data_loader(input_tensor1, input_arr1)
+ dataloader_input2 = ffmodel.create_data_loader(input_tensor2, input_arr2)
+
+ ffmodel.init_layers()
+
+ dataloader_input1.reset()
+ dataloader_input2.reset()
+
+ dataloader_input1.next_batch(ffmodel)
+ dataloader_input2.next_batch(ffmodel)
+
+ ffmodel.forward()
+
+ max_output.inline_map(ffmodel, ffconfig)
+ max_result = max_output.get_array(ffmodel, ffconfig)
+
+ return max_result
+
+
+if __name__ == '__main__':
+ init_flexflow_runtime()
+ ffconfig = FFConfig()
+
+ input_data1 = np.random.randn(ffconfig.batch_size, 5, 10, 10).astype(np.float32)
+ input_data2 = np.random.randn(ffconfig.batch_size, 5, 10, 10).astype(np.float32)
+
+ max_result = test_max(ffconfig, input_data1, input_data2)
+
+ print("Input Array 1:")
+ print(input_data1)
+ print("\nInput Array 2:")
+ print(input_data2)
+ print("\nOutput Array after applying max function:")
+ print(max_result)
diff --git a/examples/python/native/ops/mean.py b/examples/python/native/ops/mean.py
new file mode 100644
index 0000000000..df8c3f642e
--- /dev/null
+++ b/examples/python/native/ops/mean.py
@@ -0,0 +1,48 @@
+from typing import List
+
+import flexflow.core
+import numpy as np
+from flexflow.core import *
+
+
+def test_mean(ffconfig, input_arr: np.ndarray, dims: List[int], keepdims: bool = False) -> np.ndarray:
+ ffmodel = FFModel(ffconfig)
+
+ input_tensor = ffmodel.create_tensor(input_arr.shape, DataType.DT_FLOAT)
+
+ mean_output = ffmodel.mean(input_tensor, dims=dims, keepdims=keepdims, name="mean_layer")
+
+ ffoptimizer = SGDOptimizer(ffmodel, 0.001)
+ ffmodel.optimizer = ffoptimizer
+ ffmodel.compile(
+ loss_type=LossType.LOSS_SPARSE_CATEGORICAL_CROSSENTROPY,
+ metrics=[MetricsType.METRICS_ACCURACY, MetricsType.METRICS_SPARSE_CATEGORICAL_CROSSENTROPY]
+ )
+
+ dataloader_input = ffmodel.create_data_loader(input_tensor, input_arr)
+
+ ffmodel.init_layers()
+
+ dataloader_input.reset()
+ dataloader_input.next_batch(ffmodel)
+ ffmodel.forward()
+
+ mean_output.inline_map(ffmodel, ffconfig)
+ mean_result = mean_output.get_array(ffmodel, ffconfig)
+
+ return mean_result
+
+
+if __name__ == '__main__':
+ init_flexflow_runtime()
+ ffconfig = FFConfig()
+
+ input_data = np.random.randn(ffconfig.batch_size, 5, 10, 10).astype(np.float32)
+ dims_to_mean = [1, 2] # Example dimensions to take the mean over
+
+ mean_result = test_mean(ffconfig, input_data, dims=dims_to_mean, keepdims=False)
+
+ print("Input Array:")
+ print(input_data)
+ print(f"\nOutput Array after applying mean function along dimensions {dims_to_mean}:")
+ print(mean_result)
diff --git a/examples/python/native/ops/min.py b/examples/python/native/ops/min.py
new file mode 100644
index 0000000000..df81f4f2d2
--- /dev/null
+++ b/examples/python/native/ops/min.py
@@ -0,0 +1,54 @@
+import flexflow.core
+import numpy as np
+from flexflow.core import *
+
+
+def test_min(ffconfig, input_arr1: np.ndarray, input_arr2: np.ndarray) -> np.ndarray:
+ ffmodel = FFModel(ffconfig)
+
+ input_tensor1 = ffmodel.create_tensor(input_arr1.shape, DataType.DT_FLOAT)
+ input_tensor2 = ffmodel.create_tensor(input_arr2.shape, DataType.DT_FLOAT)
+
+ min_output = ffmodel.min(input_tensor1, input_tensor2, name="min_layer")
+
+ ffoptimizer = SGDOptimizer(ffmodel, 0.001)
+ ffmodel.optimizer = ffoptimizer
+ ffmodel.compile(
+ loss_type=LossType.LOSS_SPARSE_CATEGORICAL_CROSSENTROPY,
+ metrics=[MetricsType.METRICS_ACCURACY, MetricsType.METRICS_SPARSE_CATEGORICAL_CROSSENTROPY]
+ )
+
+ dataloader_input1 = ffmodel.create_data_loader(input_tensor1, input_arr1)
+ dataloader_input2 = ffmodel.create_data_loader(input_tensor2, input_arr2)
+
+ ffmodel.init_layers()
+
+ dataloader_input1.reset()
+ dataloader_input2.reset()
+
+ dataloader_input1.next_batch(ffmodel)
+ dataloader_input2.next_batch(ffmodel)
+
+ ffmodel.forward()
+
+ min_output.inline_map(ffmodel, ffconfig)
+ min_result = min_output.get_array(ffmodel, ffconfig)
+
+ return min_result
+
+
+if __name__ == '__main__':
+ init_flexflow_runtime()
+ ffconfig = FFConfig()
+
+ input_data1 = np.random.randn(ffconfig.batch_size, 5, 10, 10).astype(np.float32)
+ input_data2 = np.random.randn(ffconfig.batch_size, 5, 10, 10).astype(np.float32)
+
+ min_result = test_min(ffconfig, input_data1, input_data2)
+
+ print("Input Array 1:")
+ print(input_data1)
+ print("\nInput Array 2:")
+ print(input_data2)
+ print("\nOutput Array after applying min function:")
+ print(min_result)
diff --git a/examples/python/native/ops/multihead_attention.py b/examples/python/native/ops/multihead_attention.py
new file mode 100644
index 0000000000..e69de29bb2
diff --git a/examples/python/native/ops/multiply.py b/examples/python/native/ops/multiply.py
new file mode 100644
index 0000000000..fb4f489150
--- /dev/null
+++ b/examples/python/native/ops/multiply.py
@@ -0,0 +1,45 @@
+# The basis for this test of the 'multiply' operation was generated by ChatGPT, using the manually created conv2d.py as a template.
+
+
+import flexflow.core
+import numpy as np
+from flexflow.core import *
+
+def test_multiply(ffconfig, input_arr1: np.ndarray, input_arr2: np.ndarray) -> flexflow.core.Tensor:
+ ffmodel = FFModel(ffconfig)
+
+ input_tensor1 = ffmodel.create_tensor(input_arr1.shape, DataType.DT_FLOAT)
+ input_tensor2 = ffmodel.create_tensor(input_arr2.shape, DataType.DT_FLOAT)
+
+ out = ffmodel.multiply(input_tensor1, input_tensor2)
+
+ ffoptimizer = SGDOptimizer(ffmodel, 0.001)
+ ffmodel.optimizer = ffoptimizer
+ ffmodel.compile(
+ loss_type=LossType.LOSS_SPARSE_CATEGORICAL_CROSSENTROPY,
+ metrics=[MetricsType.METRICS_ACCURACY, MetricsType.METRICS_SPARSE_CATEGORICAL_CROSSENTROPY])
+ dataloader_input1 = ffmodel.create_data_loader(input_tensor1, input_arr1)
+ dataloader_input2 = ffmodel.create_data_loader(input_tensor2, input_arr2)
+
+ ffmodel.init_layers()
+
+ dataloader_input1.reset()
+ dataloader_input1.next_batch(ffmodel)
+
+ dataloader_input2.reset()
+ dataloader_input2.next_batch(ffmodel)
+
+ ffmodel.forward()
+
+ out.inline_map(ffmodel, ffconfig)
+ return out.get_array(ffmodel, ffconfig)
+
+
+if __name__ == '__main__':
+ init_flexflow_runtime()
+ ffconfig = FFConfig()
+
+ input1 = np.random.randn(ffconfig.batch_size, 5, 10, 10).astype(np.float32)
+ input2 = np.random.randn(ffconfig.batch_size, 5, 10, 10).astype(np.float32)
+
+ _ = test_multiply(ffconfig, input1, input2)
diff --git a/examples/python/native/ops/pool2d.py b/examples/python/native/ops/pool2d.py
new file mode 100644
index 0000000000..b4dc8b219e
--- /dev/null
+++ b/examples/python/native/ops/pool2d.py
@@ -0,0 +1,36 @@
+# AI-generated from the conv2d.py example.
+import flexflow.core
+import numpy as np
+from flexflow.core import *
+
+
+def test_pool2d(ffconfig, input_arr: np.ndarray) -> flexflow.core.Tensor:
+ ffmodel = FFModel(ffconfig)
+
+ input_tensor = ffmodel.create_tensor(input_arr.shape, DataType.DT_FLOAT)
+
+ out = ffmodel.pool2d(input_tensor, 3, 3, 1, 1, 0, 0, PoolType.POOL_MAX)
+
+ ffoptimizer = SGDOptimizer(ffmodel, 0.001)
+ ffmodel.optimizer = ffoptimizer
+ ffmodel.compile(
+ loss_type=LossType.LOSS_SPARSE_CATEGORICAL_CROSSENTROPY,
+ metrics=[MetricsType.METRICS_ACCURACY, MetricsType.METRICS_SPARSE_CATEGORICAL_CROSSENTROPY])
+ dataloader_input = ffmodel.create_data_loader(input_tensor, input_arr)
+
+ ffmodel.init_layers()
+
+ dataloader_input.reset()
+ dataloader_input.next_batch(ffmodel)
+ ffmodel.forward()
+
+ out.inline_map(ffmodel, ffconfig)
+ return out.get_array(ffmodel, ffconfig)
+
+
+if __name__ == '__main__':
+ init_flexflow_runtime()
+ ffconfig = FFConfig()
+
+ input = np.random.randn(ffconfig.batch_size, 5, 10, 10).astype(np.float32)
+ _ = test_pool2d(ffconfig, input)
\ No newline at end of file
diff --git a/examples/python/native/ops/pow.py b/examples/python/native/ops/pow.py
new file mode 100644
index 0000000000..cf5bbebd80
--- /dev/null
+++ b/examples/python/native/ops/pow.py
@@ -0,0 +1,46 @@
+import flexflow.core
+import numpy as np
+from flexflow.core import *
+
+
+def test_pow(ffconfig, input_arr: np.ndarray, exponent: float) -> np.ndarray:
+ ffmodel = FFModel(ffconfig)
+
+ input_tensor = ffmodel.create_tensor(input_arr.shape, DataType.DT_FLOAT)
+
+ pow_output = ffmodel.pow(input_tensor, exponent, name="pow_layer")
+
+ ffoptimizer = SGDOptimizer(ffmodel, 0.001)
+ ffmodel.optimizer = ffoptimizer
+ ffmodel.compile(
+ loss_type=LossType.LOSS_SPARSE_CATEGORICAL_CROSSENTROPY,
+ metrics=[MetricsType.METRICS_ACCURACY, MetricsType.METRICS_SPARSE_CATEGORICAL_CROSSENTROPY]
+ )
+
+ dataloader_input = ffmodel.create_data_loader(input_tensor, input_arr)
+
+ ffmodel.init_layers()
+
+ dataloader_input.reset()
+ dataloader_input.next_batch(ffmodel)
+ ffmodel.forward()
+
+ pow_output.inline_map(ffmodel, ffconfig)
+ pow_result = pow_output.get_array(ffmodel, ffconfig)
+
+ return pow_result
+
+
+if __name__ == '__main__':
+ init_flexflow_runtime()
+ ffconfig = FFConfig()
+
+ input_data = np.random.randn(ffconfig.batch_size, 5, 10, 10).astype(np.float32)
+ exponent_value = 2.0 # Example exponent value
+
+ pow_result = test_pow(ffconfig, input_data, exponent=exponent_value)
+
+ print("Input Array:")
+ print(input_data)
+ print(f"\nOutput Array after applying pow function with exponent {exponent_value}:")
+ print(pow_result)
diff --git a/examples/python/native/ops/reduce_sum.py b/examples/python/native/ops/reduce_sum.py
new file mode 100644
index 0000000000..7e7b41b799
--- /dev/null
+++ b/examples/python/native/ops/reduce_sum.py
@@ -0,0 +1,48 @@
+from typing import List
+
+import flexflow.core
+import numpy as np
+from flexflow.core import *
+
+
+def test_reduce_sum(ffconfig, input_arr: np.ndarray, axes: List[int], keepdims: bool = False) -> np.ndarray:
+ ffmodel = FFModel(ffconfig)
+
+ input_tensor = ffmodel.create_tensor(input_arr.shape, DataType.DT_FLOAT)
+
+ reduce_sum_output = ffmodel.reduce_sum(input_tensor, axes=axes, keepdims=keepdims, name="reduce_sum_layer")
+
+ ffoptimizer = SGDOptimizer(ffmodel, 0.001)
+ ffmodel.optimizer = ffoptimizer
+ ffmodel.compile(
+ loss_type=LossType.LOSS_SPARSE_CATEGORICAL_CROSSENTROPY,
+ metrics=[MetricsType.METRICS_ACCURACY, MetricsType.METRICS_SPARSE_CATEGORICAL_CROSSENTROPY]
+ )
+
+ dataloader_input = ffmodel.create_data_loader(input_tensor, input_arr)
+
+ ffmodel.init_layers()
+
+ dataloader_input.reset()
+ dataloader_input.next_batch(ffmodel)
+ ffmodel.forward()
+
+ reduce_sum_output.inline_map(ffmodel, ffconfig)
+ reduce_sum_result = reduce_sum_output.get_array(ffmodel, ffconfig)
+
+ return reduce_sum_result
+
+
+if __name__ == '__main__':
+ init_flexflow_runtime()
+ ffconfig = FFConfig()
+
+ input_data = np.random.randn(ffconfig.batch_size, 5, 10, 10).astype(np.float32)
+ axes_to_reduce = [1, 2] # Example axes to reduce
+
+ reduce_sum_result = test_reduce_sum(ffconfig, input_data, axes=axes_to_reduce, keepdims=False)
+
+ print("Input Array:")
+ print(input_data)
+ print(f"\nOutput Array after applying reduce_sum along axes {axes_to_reduce}:")
+ print(reduce_sum_result)
diff --git a/examples/python/native/ops/relu.py b/examples/python/native/ops/relu.py
new file mode 100644
index 0000000000..d855b27164
--- /dev/null
+++ b/examples/python/native/ops/relu.py
@@ -0,0 +1,46 @@
+# The basis for this test of the 'ReLU' activation function was generated by ChatGPT, using the manually created conv2d.py as a template.
+
+
+import flexflow.core
+import numpy as np
+from flexflow.core import *
+
+def test_relu(ffconfig, input_arr: np.ndarray) -> flexflow.core.Tensor:
+ ffmodel = FFModel(ffconfig)
+
+ input_tensor = ffmodel.create_tensor(input_arr.shape, DataType.DT_FLOAT)
+
+ # Apply ReLU activation
+ out = ffmodel.relu(input_tensor)
+
+ ffoptimizer = SGDOptimizer(ffmodel, 0.001)
+ ffmodel.optimizer = ffoptimizer
+ ffmodel.compile(
+ loss_type=LossType.LOSS_SPARSE_CATEGORICAL_CROSSENTROPY,
+ metrics=[MetricsType.METRICS_ACCURACY, MetricsType.METRICS_SPARSE_CATEGORICAL_CROSSENTROPY])
+ dataloader_input = ffmodel.create_data_loader(input_tensor, input_arr)
+
+ ffmodel.init_layers()
+
+ dataloader_input.reset()
+ dataloader_input.next_batch(ffmodel)
+
+ ffmodel.forward()
+
+ out.inline_map(ffmodel, ffconfig)
+ return out.get_array(ffmodel, ffconfig)
+
+
+if __name__ == '__main__':
+ init_flexflow_runtime()
+ ffconfig = FFConfig()
+
+ input_data = np.random.randn(ffconfig.batch_size, 5, 10, 10).astype(np.float32)
+
+ result = test_relu(ffconfig, input_data)
+
+ print("Input Data:")
+ print(input_data)
+
+ print("\nResult after ReLU activation:")
+ print(result)
diff --git a/examples/python/native/ops/reshape.py b/examples/python/native/ops/reshape.py
new file mode 100644
index 0000000000..348d6bd935
--- /dev/null
+++ b/examples/python/native/ops/reshape.py
@@ -0,0 +1,41 @@
+# The basis for this test of the 'reshape' operation was generated by ChatGPT, using the manually created conv2d.py as a template.
+
+from typing import List
+
+import flexflow.core
+import numpy as np
+from flexflow.core import *
+
+def test_reshape(ffconfig, input_arr: np.ndarray, target_shape: List[int]) -> flexflow.core.Tensor:
+ ffmodel = FFModel(ffconfig)
+
+ input_tensor = ffmodel.create_tensor(input_arr.shape, DataType.DT_FLOAT)
+
+ out = ffmodel.reshape(input_tensor, target_shape)
+
+ ffoptimizer = SGDOptimizer(ffmodel, 0.001)
+ ffmodel.optimizer = ffoptimizer
+ ffmodel.compile(
+ loss_type=LossType.LOSS_SPARSE_CATEGORICAL_CROSSENTROPY,
+ metrics=[MetricsType.METRICS_ACCURACY, MetricsType.METRICS_SPARSE_CATEGORICAL_CROSSENTROPY])
+ dataloader_input = ffmodel.create_data_loader(input_tensor, input_arr)
+
+ ffmodel.init_layers()
+
+ dataloader_input.reset()
+ dataloader_input.next_batch(ffmodel)
+
+ ffmodel.forward()
+
+ out.inline_map(ffmodel, ffconfig)
+ return out.get_array(ffmodel, ffconfig)
+
+
+if __name__ == '__main__':
+ init_flexflow_runtime()
+ ffconfig = FFConfig()
+
+ input = np.random.randn(ffconfig.batch_size, 5, 10, 10).astype(np.float32)
+ target_shape = [ffconfig.batch_size, 500]
+
+ _ = test_reshape(ffconfig, input, target_shape)
diff --git a/examples/python/native/ops/residual_layer_norm.py b/examples/python/native/ops/residual_layer_norm.py
new file mode 100644
index 0000000000..e12f2e53d9
--- /dev/null
+++ b/examples/python/native/ops/residual_layer_norm.py
@@ -0,0 +1,93 @@
+from typing import List
+
+import flexflow.core
+import numpy as np
+from flexflow.core import *
+
+
+def test_residual_layer_norm(ffconfig, input_arr: np.ndarray, residual1_arr: np.ndarray, residual2_arr: np.ndarray, use_two_residuals: bool, axes: List[int], elementwise_affine: bool = True, eps: float = 1e-5, use_bias: bool = True, name=None):
+ ffmodel = FFModel(ffconfig)
+
+ input_tensor = ffmodel.create_tensor(input_arr.shape, DataType.DT_FLOAT)
+ residual1_tensor = ffmodel.create_tensor(residual1_arr.shape, DataType.DT_FLOAT)
+ residual2_tensor = ffmodel.create_tensor(residual2_arr.shape, DataType.DT_FLOAT)
+
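+    # Fused residual-add + layer norm. Two tensors are unpacked below:
+    # presumably the post-addition tensor and its layer-normalized version.
+    # When use_two_residuals is False, the second residual is passed as None.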
+ output_tensor, layer_norm_output = ffmodel.residual_layer_norm(
+ input_tensor,
+ residual1_tensor,
+ residual2_tensor if use_two_residuals else None,
+ use_two_residuals,
+ axes=axes,
+ elementwise_affine=elementwise_affine,
+ eps=eps,
+ use_bias=use_bias,
+ name="residual_layer_norm_layer"
+ )
+
+ ffoptimizer = SGDOptimizer(ffmodel, 0.001)
+ ffmodel.optimizer = ffoptimizer
+ ffmodel.compile(
+ loss_type=LossType.LOSS_SPARSE_CATEGORICAL_CROSSENTROPY,
+ metrics=[MetricsType.METRICS_ACCURACY, MetricsType.METRICS_SPARSE_CATEGORICAL_CROSSENTROPY]
+ )
+
+ dataloader_input = ffmodel.create_data_loader(input_tensor, input_arr)
+ dataloader_residual1 = ffmodel.create_data_loader(residual1_tensor, residual1_arr)
+ dataloader_residual2 = ffmodel.create_data_loader(residual2_tensor, residual2_arr)
+
+ ffmodel.init_layers()
+
+ dataloader_input.reset()
+ dataloader_residual1.reset()
+ if use_two_residuals:
+ dataloader_residual2.reset()
+
+ dataloader_input.next_batch(ffmodel)
+ dataloader_residual1.next_batch(ffmodel)
+ if use_two_residuals:
+ dataloader_residual2.next_batch(ffmodel)
+
+ ffmodel.forward()
+
+ output_tensor.inline_map(ffmodel, ffconfig)
+ layer_norm_output.inline_map(ffmodel, ffconfig)
+ output_result = output_tensor.get_array(ffmodel, ffconfig)
+ layer_norm_result = layer_norm_output.get_array(ffmodel, ffconfig)
+
+ return output_result, layer_norm_result
+
+
+if __name__ == '__main__':
+ init_flexflow_runtime()
+ ffconfig = FFConfig()
+
+ input_data = np.random.randn(ffconfig.batch_size, 5, 10, 10).astype(np.float32)
+ residual1_data = np.random.randn(ffconfig.batch_size, 5, 10, 10).astype(np.float32)
+ residual2_data = np.random.randn(ffconfig.batch_size, 5, 10, 10).astype(np.float32)
+ use_two_residuals_flag = True # Example flag
+
+ axes_to_normalize = [1, 2] # Example axes to normalize
+
+ output_result, layer_norm_result = test_residual_layer_norm(
+ ffconfig,
+ input_data,
+ residual1_data,
+ residual2_data,
+ use_two_residuals_flag,
+ axes=axes_to_normalize,
+ elementwise_affine=True,
+ eps=1e-5,
+ use_bias=True
+ )
+
+ print("Input Array:")
+ print(input_data)
+ print("\nResidual1 Array:")
+ print(residual1_data)
+ if use_two_residuals_flag:
+ print("\nResidual2 Array:")
+ print(residual2_data)
+ print(f"\nOutput Array after applying residual_layer_norm along axes {axes_to_normalize} with use_two_residuals={use_two_residuals_flag}:")
+ print(output_result)
+ print("\nLayer Norm Result:")
+ print(layer_norm_result)
diff --git a/examples/python/native/ops/residual_rms_norm.py b/examples/python/native/ops/residual_rms_norm.py
new file mode 100644
index 0000000000..9027dffada
--- /dev/null
+++ b/examples/python/native/ops/residual_rms_norm.py
@@ -0,0 +1,80 @@
+import flexflow.core
+import numpy as np
+from flexflow.core import *
+
+
+def test_residual_rms_norm(
+ ffconfig,
+ input1_arr: np.ndarray,
+ input2_arr: np.ndarray,
+ eps: float,
+ dim: int,
+ name=None,
+):
+ ffmodel = FFModel(ffconfig)
+
+ input1_tensor = ffmodel.create_tensor(input1_arr.shape, DataType.DT_FLOAT)
+ input2_tensor = ffmodel.create_tensor(input2_arr.shape, DataType.DT_FLOAT)
+
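+    # Fused residual-add + RMS norm over dimension `dim`. Two tensors are
+    # unpacked below: presumably the post-addition tensor and its
+    # RMS-normalized version.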
+ residual_rms_norm_output1, residual_rms_norm_output2 = ffmodel.residual_rms_norm(
+ input1_tensor,
+ input2_tensor,
+ eps,
+ dim,
+ name="residual_rms_norm_layer",
+ )
+
+ ffoptimizer = SGDOptimizer(ffmodel, 0.001)
+ ffmodel.optimizer = ffoptimizer
+ ffmodel.compile(
+ loss_type=LossType.LOSS_SPARSE_CATEGORICAL_CROSSENTROPY,
+ metrics=[MetricsType.METRICS_ACCURACY, MetricsType.METRICS_SPARSE_CATEGORICAL_CROSSENTROPY]
+ )
+
+ dataloader_input1 = ffmodel.create_data_loader(input1_tensor, input1_arr)
+ dataloader_input2 = ffmodel.create_data_loader(input2_tensor, input2_arr)
+
+ ffmodel.init_layers()
+
+ dataloader_input1.reset()
+ dataloader_input1.next_batch(ffmodel)
+
+ dataloader_input2.reset()
+ dataloader_input2.next_batch(ffmodel)
+
+ ffmodel.forward()
+
+ residual_rms_norm_output1.inline_map(ffmodel, ffconfig)
+ output_result1 = residual_rms_norm_output1.get_array(ffmodel, ffconfig)
+
+ residual_rms_norm_output2.inline_map(ffmodel, ffconfig)
+ output_result2 = residual_rms_norm_output2.get_array(ffmodel, ffconfig)
+
+ return output_result1, output_result2
+
+
+if __name__ == '__main__':
+ init_flexflow_runtime()
+ ffconfig = FFConfig()
+
+ input1_data = np.random.randn(ffconfig.batch_size, 10, 20).astype(np.float32)
+ input2_data = np.random.randn(ffconfig.batch_size, 10, 20).astype(np.float32)
+ eps_value = 1e-6
+ dim_value = 1 # Example value for dim
+
+ output_result1, output_result2 = test_residual_rms_norm(
+ ffconfig,
+ input1_data,
+ input2_data,
+ eps=eps_value,
+ dim=dim_value,
+ )
+
+ print("Input Array 1:")
+ print(input1_data)
+ print("\nInput Array 2:")
+ print(input2_data)
+ print("\nOutput Array 1 after applying residual_rms_norm:")
+ print(output_result1)
+ print("\nOutput Array 2 after applying residual_rms_norm:")
+ print(output_result2)
diff --git a/examples/python/native/ops/reverse.py b/examples/python/native/ops/reverse.py
new file mode 100644
index 0000000000..25394d4b9a
--- /dev/null
+++ b/examples/python/native/ops/reverse.py
@@ -0,0 +1,37 @@
+# The basis for this test of the 'reverse' operation was generated by ChatGPT, using the manually created conv2d.py as a template.
+
+
+import flexflow.core
+import numpy as np
+from flexflow.core import *
+
+def test_reverse(ffconfig, input_arr: np.ndarray) -> flexflow.core.Tensor:
+ ffmodel = FFModel(ffconfig)
+
+ input_tensor = ffmodel.create_tensor(input_arr.shape, DataType.DT_FLOAT)
+
+ out = ffmodel.reverse(input_tensor, axis=2)
+
+ ffoptimizer = SGDOptimizer(ffmodel, 0.001)
+ ffmodel.optimizer = ffoptimizer
+ ffmodel.compile(
+ loss_type=LossType.LOSS_SPARSE_CATEGORICAL_CROSSENTROPY,
+ metrics=[MetricsType.METRICS_ACCURACY, MetricsType.METRICS_SPARSE_CATEGORICAL_CROSSENTROPY])
+ dataloader_input = ffmodel.create_data_loader(input_tensor, input_arr)
+
+ ffmodel.init_layers()
+
+ dataloader_input.reset()
+ dataloader_input.next_batch(ffmodel)
+ ffmodel.forward()
+
+ out.inline_map(ffmodel, ffconfig)
+ return out.get_array(ffmodel, ffconfig)
+
+
+if __name__ == '__main__':
+ init_flexflow_runtime()
+ ffconfig = FFConfig()
+
+ input = np.random.randn(ffconfig.batch_size, 5, 10, 10).astype(np.float32)
+ _ = test_reverse(ffconfig, input)
diff --git a/examples/python/native/ops/rms_norm.py b/examples/python/native/ops/rms_norm.py
new file mode 100644
index 0000000000..3983d7f891
--- /dev/null
+++ b/examples/python/native/ops/rms_norm.py
@@ -0,0 +1,64 @@
+import flexflow.core
+import numpy as np
+from flexflow.core import *
+
+
+def test_rms_norm(
+ ffconfig,
+ input_arr: np.ndarray,
+ eps: float,
+ dim: int,
+ name=None,
+):
+ ffmodel = FFModel(ffconfig)
+
+ input_tensor = ffmodel.create_tensor(input_arr.shape, DataType.DT_FLOAT)
+
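+    # RMSNorm rescales by the root-mean-square along dimension `dim` without
+    # mean-centering; eps guards against division by zero.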
+ rms_norm_output = ffmodel.rms_norm(
+ input_tensor,
+ eps,
+ dim,
+ name="rms_norm_layer",
+ )
+
+ ffoptimizer = SGDOptimizer(ffmodel, 0.001)
+ ffmodel.optimizer = ffoptimizer
+ ffmodel.compile(
+    out = ffmodel.transpose(input_tensor, [0, 2, 1, 3])  # assuming perm is a permutation of axes (swap dims 1 and 2), not an output shape
+ metrics=[MetricsType.METRICS_MEAN_SQUARED_ERROR, MetricsType.METRICS_SPARSE_CATEGORICAL_CROSSENTROPY],
+ )
+
+ dataloader_input = ffmodel.create_data_loader(input_tensor, input_arr)
+
+ ffmodel.init_layers()
+
+ dataloader_input.reset()
+ dataloader_input.next_batch(ffmodel)
+
+ ffmodel.forward()
+
+ rms_norm_output.inline_map(ffmodel, ffconfig)
+ output_result = rms_norm_output.get_array(ffmodel, ffconfig)
+
+ return output_result
+
+
+if __name__ == '__main__':
+ init_flexflow_runtime()
+ ffconfig = FFConfig()
+
+ input_data = np.random.randn(ffconfig.batch_size, 10, 20).astype(np.float32)
+ eps_value = 1e-6
+ dim_value = 1 # Example value for dim
+
+ output_result = test_rms_norm(
+ ffconfig,
+ input_data,
+ eps=eps_value,
+ dim=dim_value,
+ )
+
+ print("Input Array:")
+ print(input_data)
+ print("\nOutput Array after applying rms_norm:")
+ print(output_result)
diff --git a/examples/python/native/ops/rsqrt.py b/examples/python/native/ops/rsqrt.py
new file mode 100644
index 0000000000..3d9ab65449
--- /dev/null
+++ b/examples/python/native/ops/rsqrt.py
@@ -0,0 +1,44 @@
+import flexflow.core
+import numpy as np
+from flexflow.core import *
+
+
+def test_rsqrt(ffconfig, input_arr: np.ndarray) -> np.ndarray:
+ ffmodel = FFModel(ffconfig)
+
+ input_tensor = ffmodel.create_tensor(input_arr.shape, DataType.DT_FLOAT)
+
+ rsqrt_output = ffmodel.rsqrt(input_tensor, name="rsqrt_layer")
+
+ ffoptimizer = SGDOptimizer(ffmodel, 0.001)
+ ffmodel.optimizer = ffoptimizer
+ ffmodel.compile(
+ loss_type=LossType.LOSS_SPARSE_CATEGORICAL_CROSSENTROPY,
+ metrics=[MetricsType.METRICS_ACCURACY, MetricsType.METRICS_SPARSE_CATEGORICAL_CROSSENTROPY]
+ )
+
+ dataloader_input = ffmodel.create_data_loader(input_tensor, input_arr)
+
+ ffmodel.init_layers()
+
+ dataloader_input.reset()
+ dataloader_input.next_batch(ffmodel)
+ ffmodel.forward()
+
+ rsqrt_output.inline_map(ffmodel, ffconfig)
+ rsqrt_result = rsqrt_output.get_array(ffmodel, ffconfig)
+
+ return rsqrt_result
+
+
+if __name__ == '__main__':
+ init_flexflow_runtime()
+ ffconfig = FFConfig()
+
+ input_data = np.random.randn(ffconfig.batch_size, 5, 10, 10).astype(np.float32)
+ rsqrt_result = test_rsqrt(ffconfig, input_data)
+
+ print("Input Array:")
+ print(input_data)
+ print("\nOutput Array after applying rsqrt function:")
+ print(rsqrt_result)
diff --git a/examples/python/native/ops/sampling.py b/examples/python/native/ops/sampling.py
new file mode 100644
index 0000000000..2219f09eff
--- /dev/null
+++ b/examples/python/native/ops/sampling.py
@@ -0,0 +1,55 @@
+import flexflow.core
+import numpy as np
+from flexflow.core import *
+
+
+def test_sampling(ffconfig, input_arr: np.ndarray, top_p: float, name=None):
+ ffmodel = FFModel(ffconfig)
+
+ input_tensor = ffmodel.create_tensor(input_arr.shape, DataType.DT_FLOAT)
+
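+    # Presumably top-p (nucleus) sampling, inferred from the parameter name:
+    # tokens are drawn from the smallest set whose cumulative probability
+    # exceeds top_p.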
+ sampling_output = ffmodel.sampling(
+ input_tensor,
+ top_p,
+ name="sampling_layer",
+ )
+
+ ffoptimizer = SGDOptimizer(ffmodel, 0.001)
+ ffmodel.optimizer = ffoptimizer
+ ffmodel.compile(
+ loss_type=LossType.LOSS_MEAN_SQUARED_ERROR,
+ metrics=[MetricsType.METRICS_MEAN_SQUARED_ERROR],
+ )
+
+ dataloader_input = ffmodel.create_data_loader(input_tensor, input_arr)
+
+ ffmodel.init_layers()
+
+ dataloader_input.reset()
+ dataloader_input.next_batch(ffmodel)
+
+ ffmodel.forward()
+
+ sampling_output.inline_map(ffmodel, ffconfig)
+ output_result = sampling_output.get_array(ffmodel, ffconfig)
+
+ return output_result
+
+
+if __name__ == '__main__':
+ init_flexflow_runtime()
+ ffconfig = FFConfig()
+
+ input_data = np.random.randn(ffconfig.batch_size, 10).astype(np.float32)
+ top_p_value = 0.8
+
+ output_result = test_sampling(
+ ffconfig,
+ input_data,
+ top_p=top_p_value,
+ )
+
+ print("Input Array:")
+ print(input_data)
+ print("\nOutput Array after applying sampling:")
+ print(output_result)
diff --git a/examples/python/native/ops/scalar_add.py b/examples/python/native/ops/scalar_add.py
new file mode 100644
index 0000000000..48a316ea8a
--- /dev/null
+++ b/examples/python/native/ops/scalar_add.py
@@ -0,0 +1,53 @@
+import flexflow.core
+import numpy as np
+from flexflow.core import *
+
+
+def test_scalar_add(ffconfig, input_arr: np.ndarray, scalar: float, inplace: bool = True, name=None):
+ ffmodel = FFModel(ffconfig)
+
+ input_tensor = ffmodel.create_tensor(input_arr.shape, DataType.DT_FLOAT)
+
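+    # inplace=True asks the runtime to overwrite the input tensor instead of
+    # allocating a new output (inferred from the flag name).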
+ scalar_add_output = ffmodel.scalar_add(
+ input_tensor,
+ scalar,
+ inplace=inplace,
+ name="scalar_add_layer"
+ )
+
+ ffoptimizer = SGDOptimizer(ffmodel, 0.001)
+ ffmodel.optimizer = ffoptimizer
+ ffmodel.compile(
+ loss_type=LossType.LOSS_SPARSE_CATEGORICAL_CROSSENTROPY,
+ metrics=[MetricsType.METRICS_ACCURACY, MetricsType.METRICS_SPARSE_CATEGORICAL_CROSSENTROPY]
+ )
+
+ dataloader_input = ffmodel.create_data_loader(input_tensor, input_arr)
+
+ ffmodel.init_layers()
+
+ dataloader_input.reset()
+ dataloader_input.next_batch(ffmodel)
+
+ ffmodel.forward()
+
+ scalar_add_output.inline_map(ffmodel, ffconfig)
+ output_result = scalar_add_output.get_array(ffmodel, ffconfig)
+
+ return output_result
+
+
+if __name__ == '__main__':
+ init_flexflow_runtime()
+ ffconfig = FFConfig()
+
+ input_data = np.random.randn(ffconfig.batch_size, 5, 10, 10).astype(np.float32)
+ scalar_value = 2.0 # Example scalar value
+ inplace_flag = True # Example inplace flag
+
+ output_result = test_scalar_add(ffconfig, input_data, scalar=scalar_value, inplace=inplace_flag)
+
+ print("Input Array:")
+ print(input_data)
+ print(f"\nOutput Array after applying scalar addition with scalar value {scalar_value} (inplace={inplace_flag}):")
+ print(output_result)
diff --git a/examples/python/native/ops/scalar_multiply.py b/examples/python/native/ops/scalar_multiply.py
new file mode 100644
index 0000000000..ebae5cce01
--- /dev/null
+++ b/examples/python/native/ops/scalar_multiply.py
@@ -0,0 +1,53 @@
+import flexflow.core
+import numpy as np
+from flexflow.core import *
+
+
+def test_scalar_multiply(ffconfig, input_arr: np.ndarray, scalar: float, inplace: bool = True, name=None):
+ ffmodel = FFModel(ffconfig)
+
+ input_tensor = ffmodel.create_tensor(input_arr.shape, DataType.DT_FLOAT)
+
+ scalar_multiply_output = ffmodel.scalar_multiply(
+ input_tensor,
+ scalar,
+ inplace=inplace,
+ name="scalar_multiply_layer"
+ )
+
+ ffoptimizer = SGDOptimizer(ffmodel, 0.001)
+ ffmodel.optimizer = ffoptimizer
+ ffmodel.compile(
+ loss_type=LossType.LOSS_SPARSE_CATEGORICAL_CROSSENTROPY,
+ metrics=[MetricsType.METRICS_ACCURACY, MetricsType.METRICS_SPARSE_CATEGORICAL_CROSSENTROPY]
+ )
+
+ dataloader_input = ffmodel.create_data_loader(input_tensor, input_arr)
+
+ ffmodel.init_layers()
+
+ dataloader_input.reset()
+ dataloader_input.next_batch(ffmodel)
+
+ ffmodel.forward()
+
+ scalar_multiply_output.inline_map(ffmodel, ffconfig)
+ output_result = scalar_multiply_output.get_array(ffmodel, ffconfig)
+
+ return output_result
+
+
+if __name__ == '__main__':
+ init_flexflow_runtime()
+ ffconfig = FFConfig()
+
+ input_data = np.random.randn(ffconfig.batch_size, 5, 10, 10).astype(np.float32)
+ scalar_value = 2.0 # Example scalar value
+ inplace_flag = True # Example inplace flag
+
+ output_result = test_scalar_multiply(ffconfig, input_data, scalar=scalar_value, inplace=inplace_flag)
+
+ print("Input Array:")
+ print(input_data)
+ print(f"\nOutput Array after applying scalar multiplication with scalar value {scalar_value} (inplace={inplace_flag}):")
+ print(output_result)
diff --git a/examples/python/native/ops/scalar_sub.py b/examples/python/native/ops/scalar_sub.py
new file mode 100644
index 0000000000..2dc467b573
--- /dev/null
+++ b/examples/python/native/ops/scalar_sub.py
@@ -0,0 +1,53 @@
+import flexflow.core
+import numpy as np
+from flexflow.core import *
+
+
+def test_scalar_sub(ffconfig, input_arr: np.ndarray, scalar: float, inplace: bool = True, name=None):
+ ffmodel = FFModel(ffconfig)
+
+ input_tensor = ffmodel.create_tensor(input_arr.shape, DataType.DT_FLOAT)
+
+ scalar_sub_output = ffmodel.scalar_sub(
+ input_tensor,
+ scalar,
+ inplace=inplace,
+ name="scalar_sub_layer"
+ )
+
+ ffoptimizer = SGDOptimizer(ffmodel, 0.001)
+ ffmodel.optimizer = ffoptimizer
+ ffmodel.compile(
+ loss_type=LossType.LOSS_SPARSE_CATEGORICAL_CROSSENTROPY,
+ metrics=[MetricsType.METRICS_ACCURACY, MetricsType.METRICS_SPARSE_CATEGORICAL_CROSSENTROPY]
+ )
+
+ dataloader_input = ffmodel.create_data_loader(input_tensor, input_arr)
+
+ ffmodel.init_layers()
+
+ dataloader_input.reset()
+ dataloader_input.next_batch(ffmodel)
+
+ ffmodel.forward()
+
+ scalar_sub_output.inline_map(ffmodel, ffconfig)
+ output_result = scalar_sub_output.get_array(ffmodel, ffconfig)
+
+ return output_result
+
+
+if __name__ == '__main__':
+ init_flexflow_runtime()
+ ffconfig = FFConfig()
+
+ input_data = np.random.randn(ffconfig.batch_size, 5, 10, 10).astype(np.float32)
+ scalar_value = 2.0 # Example scalar value
+ inplace_flag = True # Example inplace flag
+
+ output_result = test_scalar_sub(ffconfig, input_data, scalar=scalar_value, inplace=inplace_flag)
+
+ print("Input Array:")
+ print(input_data)
+ print(f"\nOutput Array after applying scalar subtraction with scalar value {scalar_value} (inplace={inplace_flag}):")
+ print(output_result)
diff --git a/examples/python/native/ops/scalar_true_divide.py b/examples/python/native/ops/scalar_true_divide.py
new file mode 100644
index 0000000000..f1b64df506
--- /dev/null
+++ b/examples/python/native/ops/scalar_true_divide.py
@@ -0,0 +1,53 @@
+import flexflow.core
+import numpy as np
+from flexflow.core import *
+
+
+def test_scalar_true_divide(ffconfig, input_arr: np.ndarray, scalar: float, inplace: bool = True, name=None):
+ ffmodel = FFModel(ffconfig)
+
+ input_tensor = ffmodel.create_tensor(input_arr.shape, DataType.DT_FLOAT)
+
+ scalar_true_divide_output = ffmodel.scalar_true_divide(
+ input_tensor,
+ scalar,
+ inplace=inplace,
+ name="scalar_true_divide_layer"
+ )
+
+ ffoptimizer = SGDOptimizer(ffmodel, 0.001)
+ ffmodel.optimizer = ffoptimizer
+ ffmodel.compile(
+ loss_type=LossType.LOSS_SPARSE_CATEGORICAL_CROSSENTROPY,
+ metrics=[MetricsType.METRICS_ACCURACY, MetricsType.METRICS_SPARSE_CATEGORICAL_CROSSENTROPY]
+ )
+
+ dataloader_input = ffmodel.create_data_loader(input_tensor, input_arr)
+
+ ffmodel.init_layers()
+
+ dataloader_input.reset()
+ dataloader_input.next_batch(ffmodel)
+
+ ffmodel.forward()
+
+ scalar_true_divide_output.inline_map(ffmodel, ffconfig)
+ output_result = scalar_true_divide_output.get_array(ffmodel, ffconfig)
+
+ return output_result
+
+
+if __name__ == '__main__':
+ init_flexflow_runtime()
+ ffconfig = FFConfig()
+
+ input_data = np.random.randn(ffconfig.batch_size, 5, 10, 10).astype(np.float32)
+ scalar_value = 2.0 # Example scalar value
+ inplace_flag = True # Example inplace flag
+
+ output_result = test_scalar_true_divide(ffconfig, input_data, scalar=scalar_value, inplace=inplace_flag)
+
+ print("Input Array:")
+ print(input_data)
+ print(f"\nOutput Array after applying scalar true division with scalar value {scalar_value} (inplace={inplace_flag}):")
+ print(output_result)
diff --git a/examples/python/native/ops/sigmoid.py b/examples/python/native/ops/sigmoid.py
new file mode 100644
index 0000000000..0fbe21df45
--- /dev/null
+++ b/examples/python/native/ops/sigmoid.py
@@ -0,0 +1,46 @@
+# The basis for this test of the 'Sigmoid' activation function was generated by ChatGPT, using the manually created conv2d.py as a template.
+
+
+import flexflow.core
+import numpy as np
+from flexflow.core import *
+
+def test_sigmoid(ffconfig, input_arr: np.ndarray) -> flexflow.core.Tensor:
+ ffmodel = FFModel(ffconfig)
+
+ input_tensor = ffmodel.create_tensor(input_arr.shape, DataType.DT_FLOAT)
+
+ # Apply Sigmoid activation
+ out = ffmodel.sigmoid(input_tensor)
+
+ ffoptimizer = SGDOptimizer(ffmodel, 0.001)
+ ffmodel.optimizer = ffoptimizer
+ ffmodel.compile(
+ loss_type=LossType.LOSS_SPARSE_CATEGORICAL_CROSSENTROPY,
+ metrics=[MetricsType.METRICS_ACCURACY, MetricsType.METRICS_SPARSE_CATEGORICAL_CROSSENTROPY])
+ dataloader_input = ffmodel.create_data_loader(input_tensor, input_arr)
+
+ ffmodel.init_layers()
+
+ dataloader_input.reset()
+ dataloader_input.next_batch(ffmodel)
+
+ ffmodel.forward()
+
+ out.inline_map(ffmodel, ffconfig)
+ return out.get_array(ffmodel, ffconfig)
+
+
+if __name__ == '__main__':
+ init_flexflow_runtime()
+ ffconfig = FFConfig()
+
+ input_data = np.random.randn(ffconfig.batch_size, 5, 10, 10).astype(np.float32)
+
+ result = test_sigmoid(ffconfig, input_data)
+
+ print("Input Data:")
+ print(input_data)
+
+ print("\nResult after Sigmoid activation:")
+ print(result)
diff --git a/examples/python/native/ops/sigmoid_silu_multi.py b/examples/python/native/ops/sigmoid_silu_multi.py
new file mode 100644
index 0000000000..cecc3e102e
--- /dev/null
+++ b/examples/python/native/ops/sigmoid_silu_multi.py
@@ -0,0 +1,58 @@
+import flexflow.core
+import numpy as np
+from flexflow.core import *
+
+
+def test_sigmoid_silu_multi(ffconfig, input1_arr: np.ndarray, input2_arr: np.ndarray, name=None):
+ ffmodel = FFModel(ffconfig)
+
+ input1_tensor = ffmodel.create_tensor(input1_arr.shape, DataType.DT_FLOAT)
+ input2_tensor = ffmodel.create_tensor(input2_arr.shape, DataType.DT_FLOAT)
+
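+    # Fused elementwise op used in SwiGLU-style MLP blocks; presumably computes
+    # silu(input1) * input2, where silu(x) = x * sigmoid(x) (inferred from the
+    # layer name).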
+ sigmoid_silu_multi_output = ffmodel.sigmoid_silu_multi(
+ input1_tensor,
+ input2_tensor,
+ name="sigmoid_silu_multi_layer"
+ )
+
+ ffoptimizer = SGDOptimizer(ffmodel, 0.001)
+ ffmodel.optimizer = ffoptimizer
+ ffmodel.compile(
+ loss_type=LossType.LOSS_SPARSE_CATEGORICAL_CROSSENTROPY,
+ metrics=[MetricsType.METRICS_ACCURACY, MetricsType.METRICS_SPARSE_CATEGORICAL_CROSSENTROPY]
+ )
+
+ dataloader_input1 = ffmodel.create_data_loader(input1_tensor, input1_arr)
+ dataloader_input2 = ffmodel.create_data_loader(input2_tensor, input2_arr)
+
+ ffmodel.init_layers()
+
+ dataloader_input1.reset()
+ dataloader_input2.reset()
+
+ dataloader_input1.next_batch(ffmodel)
+ dataloader_input2.next_batch(ffmodel)
+
+ ffmodel.forward()
+
+ sigmoid_silu_multi_output.inline_map(ffmodel, ffconfig)
+ output_result = sigmoid_silu_multi_output.get_array(ffmodel, ffconfig)
+
+ return output_result
+
+
+if __name__ == '__main__':
+ init_flexflow_runtime()
+ ffconfig = FFConfig()
+
+ input1_data = np.random.randn(ffconfig.batch_size, 5, 10, 10).astype(np.float32)
+ input2_data = np.random.randn(ffconfig.batch_size, 5, 10, 10).astype(np.float32)
+
+ output_result = test_sigmoid_silu_multi(ffconfig, input1_data, input2_data)
+
+ print("Input1 Array:")
+ print(input1_data)
+ print("\nInput2 Array:")
+ print(input2_data)
+ print("\nOutput Array after applying sigmoid_silu_multi:")
+ print(output_result)
diff --git a/examples/python/native/ops/sin.py b/examples/python/native/ops/sin.py
new file mode 100644
index 0000000000..4b60a4e1d4
--- /dev/null
+++ b/examples/python/native/ops/sin.py
@@ -0,0 +1,44 @@
+import flexflow.core
+import numpy as np
+from flexflow.core import *
+
+
+def test_sin(ffconfig, input_arr: np.ndarray) -> np.ndarray:
+ ffmodel = FFModel(ffconfig)
+
+ input_tensor = ffmodel.create_tensor(input_arr.shape, DataType.DT_FLOAT)
+
+ sin_output = ffmodel.sin(input_tensor, name="sin_layer")
+
+ ffoptimizer = SGDOptimizer(ffmodel, 0.001)
+ ffmodel.optimizer = ffoptimizer
+ ffmodel.compile(
+ loss_type=LossType.LOSS_SPARSE_CATEGORICAL_CROSSENTROPY,
+ metrics=[MetricsType.METRICS_ACCURACY, MetricsType.METRICS_SPARSE_CATEGORICAL_CROSSENTROPY]
+ )
+
+ dataloader_input = ffmodel.create_data_loader(input_tensor, input_arr)
+
+ ffmodel.init_layers()
+
+ dataloader_input.reset()
+ dataloader_input.next_batch(ffmodel)
+ ffmodel.forward()
+
+ sin_output.inline_map(ffmodel, ffconfig)
+ sin_result = sin_output.get_array(ffmodel, ffconfig)
+
+ return sin_result
+
+
+if __name__ == '__main__':
+ init_flexflow_runtime()
+ ffconfig = FFConfig()
+
+ input_data = np.random.randn(ffconfig.batch_size, 5, 10, 10).astype(np.float32)
+ sin_result = test_sin(ffconfig, input_data)
+
+ print("Input Array:")
+ print(input_data)
+ print("\nOutput Array after applying sin function:")
+ print(sin_result)
diff --git a/examples/python/native/ops/softmax.py b/examples/python/native/ops/softmax.py
new file mode 100644
index 0000000000..b5481bcc80
--- /dev/null
+++ b/examples/python/native/ops/softmax.py
@@ -0,0 +1,46 @@
+# The basis for this test of the 'Softmax' activation function was generated by ChatGPT, using the manually created conv2d.py as a template.
+
+
+import flexflow.core
+import numpy as np
+from flexflow.core import *
+
+def test_softmax(ffconfig, input_arr: np.ndarray) -> flexflow.core.Tensor:
+ ffmodel = FFModel(ffconfig)
+
+ input_tensor = ffmodel.create_tensor(input_arr.shape, DataType.DT_FLOAT)
+
+ # Apply Softmax activation
+ out = ffmodel.softmax(input_tensor)
+
+ ffoptimizer = SGDOptimizer(ffmodel, 0.001)
+ ffmodel.optimizer = ffoptimizer
+ ffmodel.compile(
+ loss_type=LossType.LOSS_SPARSE_CATEGORICAL_CROSSENTROPY,
+ metrics=[MetricsType.METRICS_ACCURACY, MetricsType.METRICS_SPARSE_CATEGORICAL_CROSSENTROPY])
+ dataloader_input = ffmodel.create_data_loader(input_tensor, input_arr)
+
+ ffmodel.init_layers()
+
+ dataloader_input.reset()
+ dataloader_input.next_batch(ffmodel)
+
+ ffmodel.forward()
+
+ out.inline_map(ffmodel, ffconfig)
+ return out.get_array(ffmodel, ffconfig)
+
+
+if __name__ == '__main__':
+ init_flexflow_runtime()
+ ffconfig = FFConfig()
+
+ input_data = np.random.randn(ffconfig.batch_size, 5, 10).astype(np.float32)
+
+ result = test_softmax(ffconfig, input_data)
+
+ print("Input Data:")
+ print(input_data)
+
+ print("\nResult after Softmax activation:")
+ print(result)
diff --git a/examples/python/native/ops/spec_inc_multihead_self_attention.py b/examples/python/native/ops/spec_inc_multihead_self_attention.py
new file mode 100644
index 0000000000..bd1aaa189b
--- /dev/null
+++ b/examples/python/native/ops/spec_inc_multihead_self_attention.py
@@ -0,0 +1,103 @@
+import flexflow.core
+import numpy as np
+from flexflow.core import *
+
+
+def test_spec_inc_multihead_self_attention(
+ ffconfig,
+ input_arr: np.ndarray,
+ embed_dim: int,
+ num_heads: int,
+ kdim: int = 0,
+ vdim: int = 0,
+ dropout: float = 0.0,
+ bias: bool = True,
+ add_bias_kv: bool = False,
+ add_zero_attn: bool = False,
+ data_type: DataType = DataType.DT_NONE,
+ kernel_initializer=None,
+ apply_rotary_embedding: bool = False,
+ scaling_query: bool = False,
+ scaling_factor: float = 1.0,
+ qk_prod_scaling: bool = True,
+ position_bias: bool = False,
+ name=None,
+):
+ ffmodel = FFModel(ffconfig)
+
+ input_tensor = ffmodel.create_tensor(input_arr.shape, data_type)
+
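+    # Speculative-inference variant of incremental multi-head self-attention;
+    # as with standard multi-head attention, embed_dim is expected to be
+    # divisible by num_heads (64 / 8 in the example below).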
+ spec_inc_multihead_self_attention_output = ffmodel.spec_inc_multihead_self_attention(
+ input_tensor,
+ embed_dim,
+ num_heads,
+ kdim=kdim,
+ vdim=vdim,
+ dropout=dropout,
+ bias=bias,
+ add_bias_kv=add_bias_kv,
+ add_zero_attn=add_zero_attn,
+ data_type=data_type,
+ kernel_initializer=kernel_initializer,
+ apply_rotary_embedding=apply_rotary_embedding,
+ scaling_query=scaling_query,
+ scaling_factor=scaling_factor,
+ qk_prod_scaling=qk_prod_scaling,
+ position_bias=position_bias,
+ name="spec_inc_multihead_self_attention_layer",
+ )
+
+ ffoptimizer = SGDOptimizer(ffmodel, 0.001)
+ ffmodel.optimizer = ffoptimizer
+ ffmodel.compile(
+ loss_type=LossType.LOSS_SPARSE_CATEGORICAL_CROSSENTROPY,
+ metrics=[MetricsType.METRICS_ACCURACY, MetricsType.METRICS_SPARSE_CATEGORICAL_CROSSENTROPY]
+ )
+
+ dataloader_input = ffmodel.create_data_loader(input_tensor, input_arr)
+
+ ffmodel.init_layers()
+
+ dataloader_input.reset()
+ dataloader_input.next_batch(ffmodel)
+
+ ffmodel.forward()
+
+ spec_inc_multihead_self_attention_output.inline_map(ffmodel, ffconfig)
+ output_result = spec_inc_multihead_self_attention_output.get_array(ffmodel, ffconfig)
+
+ return output_result
+
+
+if __name__ == '__main__':
+ init_flexflow_runtime()
+ ffconfig = FFConfig()
+
+ input_data = np.random.randn(ffconfig.batch_size, 10, 20).astype(np.float32)
+ embed_dim_value = 64
+ num_heads_value = 8
+
+ output_result = test_spec_inc_multihead_self_attention(
+ ffconfig,
+ input_data,
+ embed_dim=embed_dim_value,
+ num_heads=num_heads_value,
+ kdim=0, # Example value for kdim
+ vdim=0, # Example value for vdim
+ dropout=0.1, # Example value for dropout
+ bias=True,
+ add_bias_kv=False,
+ add_zero_attn=False,
+ data_type=DataType.DT_FLOAT,
+ kernel_initializer=None, # Example value for kernel_initializer
+ apply_rotary_embedding=False,
+ scaling_query=False,
+ scaling_factor=1.0,
+ qk_prod_scaling=True,
+ position_bias=False,
+ )
+
+ print("Input Array:")
+ print(input_data)
+ print("\nOutput Array after applying spec_inc_multihead_self_attention:")
+ print(output_result)
diff --git a/examples/python/native/ops/spec_inc_multiquery_self_attention.py b/examples/python/native/ops/spec_inc_multiquery_self_attention.py
new file mode 100644
index 0000000000..0b731c99e0
--- /dev/null
+++ b/examples/python/native/ops/spec_inc_multiquery_self_attention.py
@@ -0,0 +1,107 @@
+import flexflow.core
+import numpy as np
+from flexflow.core import *
+
+
+def test_spec_inc_multiquery_self_attention(
+ ffconfig,
+ input_arr: np.ndarray,
+ embed_dim: int,
+ num_q_heads: int,
+ num_kv_heads: int,
+ kdim: int = 0,
+ vdim: int = 0,
+ dropout: float = 0.0,
+ bias: bool = True,
+ add_bias_kv: bool = False,
+ add_zero_attn: bool = False,
+ data_type: DataType = DataType.DT_NONE,
+ kernel_initializer=None,
+ apply_rotary_embedding: bool = False,
+ scaling_query: bool = False,
+ scaling_factor: float = 1.0,
+ qk_prod_scaling: bool = True,
+ position_bias: bool = False,
+ name=None,
+):
+ ffmodel = FFModel(ffconfig)
+
+ input_tensor = ffmodel.create_tensor(input_arr.shape, data_type)
+
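+    # Multi-query / grouped-query variant: num_kv_heads key/value heads are
+    # shared across the num_q_heads query heads, so num_q_heads is typically a
+    # multiple of num_kv_heads (4 / 4 in the example below).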
+ spec_inc_multiquery_self_attention_output = ffmodel.spec_inc_multiquery_self_attention(
+ input_tensor,
+ embed_dim,
+ num_q_heads,
+ num_kv_heads,
+ kdim=kdim,
+ vdim=vdim,
+ dropout=dropout,
+ bias=bias,
+ add_bias_kv=add_bias_kv,
+ add_zero_attn=add_zero_attn,
+ data_type=data_type,
+ kernel_initializer=kernel_initializer,
+ apply_rotary_embedding=apply_rotary_embedding,
+ scaling_query=scaling_query,
+ scaling_factor=scaling_factor,
+ qk_prod_scaling=qk_prod_scaling,
+ position_bias=position_bias,
+ name="spec_inc_multiquery_self_attention_layer",
+ )
+
+ ffoptimizer = SGDOptimizer(ffmodel, 0.001)
+ ffmodel.optimizer = ffoptimizer
+ ffmodel.compile(
+ loss_type=LossType.LOSS_SPARSE_CATEGORICAL_CROSSENTROPY,
+ metrics=[MetricsType.METRICS_ACCURACY, MetricsType.METRICS_SPARSE_CATEGORICAL_CROSSENTROPY]
+ )
+
+ dataloader_input = ffmodel.create_data_loader(input_tensor, input_arr)
+
+ ffmodel.init_layers()
+
+ dataloader_input.reset()
+ dataloader_input.next_batch(ffmodel)
+
+ ffmodel.forward()
+
+ spec_inc_multiquery_self_attention_output.inline_map(ffmodel, ffconfig)
+ output_result = spec_inc_multiquery_self_attention_output.get_array(ffmodel, ffconfig)
+
+ return output_result
+
+
+if __name__ == '__main__':
+ init_flexflow_runtime()
+ ffconfig = FFConfig()
+
+ input_data = np.random.randn(ffconfig.batch_size, 10, 20).astype(np.float32)
+ embed_dim_value = 64
+ num_q_heads_value = 4
+ num_kv_heads_value = 4
+
+ output_result = test_spec_inc_multiquery_self_attention(
+ ffconfig,
+ input_data,
+ embed_dim=embed_dim_value,
+ num_q_heads=num_q_heads_value,
+ num_kv_heads=num_kv_heads_value,
+ kdim=0, # Example value for kdim
+ vdim=0, # Example value for vdim
+ dropout=0.1, # Example value for dropout
+ bias=True,
+ add_bias_kv=False,
+ add_zero_attn=False,
+ data_type=DataType.DT_FLOAT,
+ kernel_initializer=None, # Example value for kernel_initializer
+ apply_rotary_embedding=False,
+ scaling_query=False,
+ scaling_factor=1.0,
+ qk_prod_scaling=True,
+ position_bias=False,
+ )
+
+ print("Input Array:")
+ print(input_data)
+ print("\nOutput Array after applying spec_inc_multiquery_self_attention:")
+ print(output_result)
diff --git a/examples/python/native/ops/split.py b/examples/python/native/ops/split.py
new file mode 100644
index 0000000000..d03a52a769
--- /dev/null
+++ b/examples/python/native/ops/split.py
@@ -0,0 +1,47 @@
+# The basis for this test of the 'split' operation was generated by ChatGPT, using the manually created conv2d.py as a template.
+
+from typing import List
+
+import flexflow.core
+import numpy as np
+from flexflow.core import *
+
+def test_split(ffconfig, input_arr: np.ndarray) -> List[flexflow.core.Tensor]:
+ ffmodel = FFModel(ffconfig)
+
+ input_tensor = ffmodel.create_tensor(input_arr.shape, DataType.DT_FLOAT)
+
+ out1, out2 = ffmodel.split(input_tensor, 2, axis=1)
+
+ ffoptimizer = SGDOptimizer(ffmodel, 0.001)
+ ffmodel.optimizer = ffoptimizer
+ ffmodel.compile(
+ loss_type=LossType.LOSS_SPARSE_CATEGORICAL_CROSSENTROPY,
+ metrics=[MetricsType.METRICS_ACCURACY, MetricsType.METRICS_SPARSE_CATEGORICAL_CROSSENTROPY])
+ dataloader_input = ffmodel.create_data_loader(input_tensor, input_arr)
+
+ ffmodel.init_layers()
+
+ dataloader_input.reset()
+ dataloader_input.next_batch(ffmodel)
+
+ ffmodel.forward()
+
+ out1.inline_map(ffmodel, ffconfig)
+ out2.inline_map(ffmodel, ffconfig)
+
+ return [out1.get_array(ffmodel, ffconfig), out2.get_array(ffmodel, ffconfig)]
+
+
+if __name__ == '__main__':
+ init_flexflow_runtime()
+ ffconfig = FFConfig()
+
+ input = np.random.randn(ffconfig.batch_size, 10, 10, 10).astype(np.float32)
+ output_list = test_split(ffconfig, input)
+
+ print("Output Tensor 1:")
+ print(output_list[0])
+
+ print("\nOutput Tensor 2:")
+ print(output_list[1])
diff --git a/examples/python/native/ops/subtract.py b/examples/python/native/ops/subtract.py
new file mode 100644
index 0000000000..5f829cbae1
--- /dev/null
+++ b/examples/python/native/ops/subtract.py
@@ -0,0 +1,45 @@
+# The basis for this test of the 'subtract' operation was generated by ChatGPT, using the manually created conv2d.py as a template.
+
+
+import flexflow.core
+import numpy as np
+from flexflow.core import *
+
+def test_subtract(ffconfig, input_arr1: np.ndarray, input_arr2: np.ndarray) -> flexflow.core.Tensor:
+ ffmodel = FFModel(ffconfig)
+
+ input_tensor1 = ffmodel.create_tensor(input_arr1.shape, DataType.DT_FLOAT)
+ input_tensor2 = ffmodel.create_tensor(input_arr2.shape, DataType.DT_FLOAT)
+
+ out = ffmodel.subtract(input_tensor1, input_tensor2)
+
+ ffoptimizer = SGDOptimizer(ffmodel, 0.001)
+ ffmodel.optimizer = ffoptimizer
+ ffmodel.compile(
+ loss_type=LossType.LOSS_SPARSE_CATEGORICAL_CROSSENTROPY,
+ metrics=[MetricsType.METRICS_ACCURACY, MetricsType.METRICS_SPARSE_CATEGORICAL_CROSSENTROPY])
+ dataloader_input1 = ffmodel.create_data_loader(input_tensor1, input_arr1)
+ dataloader_input2 = ffmodel.create_data_loader(input_tensor2, input_arr2)
+
+ ffmodel.init_layers()
+
+ dataloader_input1.reset()
+ dataloader_input1.next_batch(ffmodel)
+
+ dataloader_input2.reset()
+ dataloader_input2.next_batch(ffmodel)
+
+ ffmodel.forward()
+
+ out.inline_map(ffmodel, ffconfig)
+ return out.get_array(ffmodel, ffconfig)
+
+
+if __name__ == '__main__':
+ init_flexflow_runtime()
+ ffconfig = FFConfig()
+
+ input1 = np.random.randn(ffconfig.batch_size, 5, 10, 10).astype(np.float32)
+ input2 = np.random.randn(ffconfig.batch_size, 5, 10, 10).astype(np.float32)
+
+ _ = test_subtract(ffconfig, input1, input2)
diff --git a/examples/python/native/ops/tanh.py b/examples/python/native/ops/tanh.py
new file mode 100644
index 0000000000..ba4ba7d6ff
--- /dev/null
+++ b/examples/python/native/ops/tanh.py
@@ -0,0 +1,46 @@
+# The basis for this test of the 'tanh' activation function was generated by ChatGPT, using the manually created conv2d.py as a template.
+
+
+import flexflow.core
+import numpy as np
+from flexflow.core import *
+
+def test_tanh(ffconfig, input_arr: np.ndarray) -> flexflow.core.Tensor:
+ ffmodel = FFModel(ffconfig)
+
+ input_tensor = ffmodel.create_tensor(input_arr.shape, DataType.DT_FLOAT)
+
+ # Apply tanh activation
+ out = ffmodel.tanh(input_tensor)
+
+ ffoptimizer = SGDOptimizer(ffmodel, 0.001)
+ ffmodel.optimizer = ffoptimizer
+ ffmodel.compile(
+ loss_type=LossType.LOSS_SPARSE_CATEGORICAL_CROSSENTROPY,
+ metrics=[MetricsType.METRICS_ACCURACY, MetricsType.METRICS_SPARSE_CATEGORICAL_CROSSENTROPY])
+ dataloader_input = ffmodel.create_data_loader(input_tensor, input_arr)
+
+ ffmodel.init_layers()
+
+ dataloader_input.reset()
+ dataloader_input.next_batch(ffmodel)
+
+ ffmodel.forward()
+
+ out.inline_map(ffmodel, ffconfig)
+ return out.get_array(ffmodel, ffconfig)
+
+
+if __name__ == '__main__':
+ init_flexflow_runtime()
+ ffconfig = FFConfig()
+
+ input_data = np.random.randn(ffconfig.batch_size, 5, 10, 10).astype(np.float32)
+
+ result = test_tanh(ffconfig, input_data)
+
+ print("Input Data:")
+ print(input_data)
+
+ print("\nResult after tanh activation:")
+ print(result)
diff --git a/examples/python/native/ops/transpose.py b/examples/python/native/ops/transpose.py
new file mode 100644
index 0000000000..6f514d660c
--- /dev/null
+++ b/examples/python/native/ops/transpose.py
@@ -0,0 +1,38 @@
+# The basis for this test of the 'transpose' operation was generated by ChatGPT, using the manually created conv2d.py as a template.
+
+
+import flexflow.core
+import numpy as np
+from flexflow.core import *
+
+
+def test_transpose(ffconfig, input_arr: np.ndarray) -> flexflow.core.Tensor:
+ ffmodel = FFModel(ffconfig)
+
+ input_tensor = ffmodel.create_tensor(input_arr.shape, DataType.DT_FLOAT)
+
+ out = ffmodel.transpose(input_tensor, [ffconfig.batch_size, 10, 5, 10])
+
+ ffoptimizer = SGDOptimizer(ffmodel, 0.001)
+ ffmodel.optimizer = ffoptimizer
+ ffmodel.compile(
+ loss_type=LossType.LOSS_SPARSE_CATEGORICAL_CROSSENTROPY,
+ metrics=[MetricsType.METRICS_ACCURACY, MetricsType.METRICS_SPARSE_CATEGORICAL_CROSSENTROPY])
+ dataloader_input = ffmodel.create_data_loader(input_tensor, input_arr)
+
+ ffmodel.init_layers()
+
+ dataloader_input.reset()
+ dataloader_input.next_batch(ffmodel)
+ ffmodel.forward()
+
+ out.inline_map(ffmodel, ffconfig)
+ return out.get_array(ffmodel, ffconfig)
+
+
+if __name__ == '__main__':
+ init_flexflow_runtime()
+ ffconfig = FFConfig()
+
+ input = np.random.randn(ffconfig.batch_size, 5, 10, 10).astype(np.float32)
+ _ = test_transpose(ffconfig, input)
diff --git a/examples/python/native/print_layers.py b/examples/python/native/print_layers.py
index 22b87e0b86..481ecc3477 100644
--- a/examples/python/native/print_layers.py
+++ b/examples/python/native/print_layers.py
@@ -119,6 +119,9 @@ def top_level_task():
# ffmodel.print_layers(0)
+
if __name__ == "__main__":
print("alexnet")
+ configs = get_configs()
+ init_flexflow_runtime(configs)
top_level_task()
diff --git a/examples/python/native/split.py b/examples/python/native/split.py
index dfd8b0e572..f79ff04e14 100644
--- a/examples/python/native/split.py
+++ b/examples/python/native/split.py
@@ -77,6 +77,9 @@ def top_level_task():
# if accuracy < ModelAccuracy.CIFAR10_CNN.value:
# assert 0, 'Check Accuracy'
+
if __name__ == "__main__":
print("cifar10 cnn split")
+ configs = get_configs()
+ init_flexflow_runtime(configs)
top_level_task()
diff --git a/examples/python/pytorch/mt5/mt5_ff.py b/examples/python/pytorch/mt5/mt5_ff.py
index 5dff7415d3..b1dc442dd1 100644
--- a/examples/python/pytorch/mt5/mt5_ff.py
+++ b/examples/python/pytorch/mt5/mt5_ff.py
@@ -5,9 +5,10 @@
import numpy as np
import torch
from flexflow.core import *
+import flexflow.core as ff
from flexflow.torch.model import PyTorchModel
#from transformers import MT5ForConditionalGeneration, T5Tokenizer
-from transformers import BertForMaskedLM, BertTokenizer
+from transformers import BertForMaskedLM, BertTokenizer, BertConfig
sys.path.append("./examples/python/pytorch/mt5")
from mt5_torch import DataPreparer, get_dataloaders, set_seed
@@ -85,6 +86,12 @@ def top_level_task():
ffconfig = FFConfig()
ffmodel = FFModel(ffconfig)
#model = MT5ForConditionalGeneration.from_pretrained("google/mt5-small")
+ # config = BertConfig.from_pretrained('bert-base-uncased')
+
+ # # Modify the configuration to set a different number of layers
+ # config.num_hidden_layers = 1 # Set the number of layers you want
+ # model = BertForMaskedLM.from_pretrained("bert-base-uncased", config=config)
+ # model.num_layers = 1
model = BertForMaskedLM.from_pretrained("bert-base-uncased")
#model = BertModel.from_pretrained("bert-base-uncased")
# Load train data as numpy arrays
@@ -195,4 +202,6 @@ def top_level_task():
#if not os.path.exists(os.path.join(NUMPY_DIR, "train_y_ids.npy")) or \
# not os.path.exists(os.path.join(NUMPY_DIR, "train_lm_labels.npy")):
# preprocess_train()
+ configs = ff.get_configs()
+ ff.init_flexflow_runtime(configs)
top_level_task()
diff --git a/ichanges.txt b/ichanges.txt
new file mode 100644
index 0000000000..aa0912640b
--- /dev/null
+++ b/ichanges.txt
@@ -0,0 +1,5 @@
+changes:
+cudnnSetTensorDescriptorFromDomain4SoftMax
+try_one_lambda in graph.cc
+
+field_space = runtime->create_field_space(lg_ctx) in model.cc
\ No newline at end of file
diff --git a/img/overview.png b/img/overview.png
new file mode 100644
index 0000000000..5264e2d41a
Binary files /dev/null and b/img/overview.png differ
diff --git a/img/performance.png b/img/performance.png
new file mode 100644
index 0000000000..668e579197
Binary files /dev/null and b/img/performance.png differ
diff --git a/img/spec_infer_demo.gif b/img/spec_infer_demo.gif
new file mode 100644
index 0000000000..c0fda87b71
Binary files /dev/null and b/img/spec_infer_demo.gif differ
diff --git a/include/flexflow/accessor.h b/include/flexflow/accessor.h
index 6f95354823..65ab33b513 100644
--- a/include/flexflow/accessor.h
+++ b/include/flexflow/accessor.h
@@ -61,6 +61,7 @@ class GenericTensorAccessorW {
float *get_float_ptr() const;
double *get_double_ptr() const;
half *get_half_ptr() const;
+ char *get_byte_ptr() const;
DataType data_type;
Legion::Domain domain;
void *ptr;
@@ -79,6 +80,7 @@ class GenericTensorAccessorR {
float const *get_float_ptr() const;
double const *get_double_ptr() const;
half const *get_half_ptr() const;
+ char const *get_byte_ptr() const;
DataType data_type;
Legion::Domain domain;
void const *ptr;
diff --git a/include/flexflow/batch_config.h b/include/flexflow/batch_config.h
new file mode 100644
index 0000000000..873fed0bdb
--- /dev/null
+++ b/include/flexflow/batch_config.h
@@ -0,0 +1,238 @@
+/* Copyright 2023 CMU, Stanford, Facebook, LANL
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include "flexflow/ffconst.h"
+#include "flexflow/fftype.h"
+#include "legion.h"
+#include <cstddef>
+#include <string>
+
+// #define MAX_SEQ_LEN 1024
+// #define BATCH_SIZE 2
+// #define BATCH_SIZE 16
+// #define MAX_REQUESTS 256
+
+namespace FlexFlow {
+
+class InferenceResult;
+class BeamInferenceResult;
+
+using BatchConfigFuture = Legion::Future;
+using InferenceResultFuture = Legion::Future;
+using BeamSearchBatchConfigFuture = Legion::Future;
+using TreeVerifyBatchConfigFuture = Legion::Future;
+using BeamInferenceResultFuture = Legion::Future;
+
+struct OptimizerTasks {
+ bool compute_gradients = true;
+ bool reset_gradients_to_zero = false;
+ bool update_weights = false;
+ bool save_updated_weights = false;
+};
+
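+// Chooses which of the optimizer sub-tasks above to run at the current
+// training step, e.g. so that weights are only updated once every
+// gradient_accumulation_steps steps; see the runtime implementation for the
+// exact policy.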
+void set_optimizer_tasks(OptimizerTasks &tasks,
+ int max_training_steps,
+ int completed_training_steps,
+ int gradient_accumulation_steps);
+
+class BatchConfig {
+public:
+ using RequestGuid = size_t;
+ using TokenId = int;
+ BatchConfig();
+ int num_active_requests() const;
+ int num_active_tokens() const;
+ int num_active_infr_tokens() const;
+ int num_active_peft_tokens() const;
+ static int max_requests_per_batch();
+ static int max_tokens_per_batch();
+ static int max_verify_tokens_per_batch();
+ static int max_spec_tree_token_num();
+ static int max_sequence_length();
+ friend std::ostream &operator<<(std::ostream &os, BatchConfig const &bc);
+ void print() const;
+ void save_to_file(std::string const &filename) const;
+ virtual InferenceMode get_mode() const;
+ static BatchConfig const *from_future(BatchConfigFuture const &future);
+ // Maximum possible values for different parameters
+ // These maximum values are used for copying BatchConfig
+ // across workers
+ static int const MAX_NUM_REQUESTS = 65;
+ static int const MAX_NUM_TOKENS = 1024;
+ static int const MAX_SPEC_TREE_TOKEN_NUM = 64;
+
+ // Set by update
+
+ int num_tokens = 0, num_peft_tokens = 0, num_peft_label_tokens = 0;
+  // Number of tokens in the generation (incremental-decoding) phase;
+  // num_generation_tokens = num_tokens - num_prompt_tokens.
+ int num_generation_tokens = 0;
+
+ struct PerRequestInfo {
+ PerRequestInfo() {
+ first_token_depth_in_request = 0;
+ first_token_offset_in_batch = 0;
+ num_tokens_in_batch = 0;
+ max_sequence_length = 0;
+ request_guid = 0;
+ prompt_phase = false;
+ batch_config_request_id = -1;
+ peft_model_id = PEFTModelID::NO_ID;
+ peft_bwd = false;
+ optimizer_tasks = {true, false, false, false};
+ }
+ int first_token_depth_in_request;
+ int first_token_offset_in_batch;
+ int num_tokens_in_batch;
+ int max_sequence_length;
+
+ // request id in batch config:
+ int batch_config_request_id = -1;
+ bool prompt_phase = false;
+ RequestGuid request_guid;
+ // PEFT fields
+ PEFTModelID peft_model_id;
+ bool peft_bwd;
+ OptimizerTasks optimizer_tasks;
+ };
+ struct PerTokenInfo {
+ int abs_depth_in_request;
+ int request_index;
+ TokenId token_id;
+ };
+
+ struct BitMask {
+ unsigned long long mask[MAX_SPEC_TREE_TOKEN_NUM] = {0};
+
+    // Number of tokens that come before the tree; every sub-request needs
+    // this part of the cache.
+ int non_tree_cache_size = 0;
+
+ // current tree size
+ int tree_size = 0;
+
+ int this_layer_size = 0;
+
+    // Input length, i.e. the prompt/root size.
+ int prompt_size = 0;
+ };
+
+ BitMask causalMask[MAX_NUM_REQUESTS];
+ PerRequestInfo requestsInfo[MAX_NUM_REQUESTS];
+ PerTokenInfo tokensInfo[MAX_NUM_TOKENS];
+ PerTokenInfo labelsInfo[MAX_NUM_TOKENS];
+
+ bool request_completed[MAX_NUM_REQUESTS];
+ bool request_running[MAX_NUM_REQUESTS];
+};
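+
+// Editorial sketch, not in the original header: a typical consumer walks the
+// flattened token array and follows request_index back to the owning request's
+// metadata. Field names are taken from the declarations above.
+//
+//   void dump_tokens(BatchConfig const &bc) {
+//     for (int i = 0; i < bc.num_tokens; i++) {
+//       BatchConfig::PerTokenInfo const &t = bc.tokensInfo[i];
+//       BatchConfig::PerRequestInfo const &r = bc.requestsInfo[t.request_index];
+//       printf("token %d (id %d) at depth %d of request %zu\n",
+//              i, t.token_id, t.abs_depth_in_request, r.request_guid);
+//     }
+//   }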
+
+class TreeVerifyBatchConfig : public BatchConfig {
+public:
+ TreeVerifyBatchConfig();
+ ~TreeVerifyBatchConfig();
+ InferenceMode get_mode() const;
+ friend std::ostream &operator<<(std::ostream &os,
+ TreeVerifyBatchConfig const &bc);
+ void print() const;
+ void save_to_file(std::string const &filename) const;
+ struct CommittedTokensInfo {
+ int token_index; // the index of the token in the previous batch
+ int request_index; // request index in the batch
+ int token_depth; // position of the token in the request's sequence
+ };
+
+ int num_tokens_to_commit;
+ CommittedTokensInfo committed_tokens[MAX_NUM_TOKENS];
+};
+
+struct InferenceResult {
+ static int const MAX_NUM_TOKENS = BatchConfig::MAX_NUM_TOKENS;
+ BatchConfig::TokenId token_ids[MAX_NUM_TOKENS];
+ float finetuning_loss;
+};
+
+class BeamSearchBatchConfig : public BatchConfig {
+public:
+ BeamSearchBatchConfig();
+ BeamSearchBatchConfig(int model_id);
+ BeamSearchBatchConfig(size_t beam_width, size_t target_iterations);
+ BeamSearchBatchConfig(BeamSearchBatchConfig const &other, int model_id);
+ InferenceMode get_mode() const;
+
+ ~BeamSearchBatchConfig();
+
+ friend std::ostream &operator<<(std::ostream &os,
+ BeamSearchBatchConfig const &bc);
+ void print() const;
+ void save_to_file(std::string const &filename) const;
+ bool done() const;
+ int max_beam_depth_all_requests() const;
+ int current_depth_all_requests() const;
+ int get_speculative_request_num() const;
+
+ size_t beam_width;
+ size_t target_iterations;
+
+ // how many requests are in the speculative phase
+ int speculative_request_num = 0;
+ inline static int const MAX_BEAM_WIDTH = 3;
+ inline static int const MAX_BEAM_DEPTH = 8;
+
+ // maximum tree branches for a request
+ inline static int const MAX_SPECULATIVE_TREE_BRANCHES = 3;
+
+ int model_id;
+
+ struct BeamSearchPerRequestInfo {
+ int beam_size;
+ int current_depth = -1;
+ int max_depth = MAX_BEAM_DEPTH;
+
+ BatchConfig::TokenId
+ tokens[BeamSearchBatchConfig::MAX_SPECULATIVE_TREE_BRANCHES];
+ float probs[BeamSearchBatchConfig::MAX_SPECULATIVE_TREE_BRANCHES];
+ int parent_id[BeamSearchBatchConfig::MAX_SPECULATIVE_TREE_BRANCHES];
+ int sub_request_num;
+ };
+
+ struct BeamSearchPerTokenInfo {
+ int sub_request_index;
+ };
+
+ BeamSearchPerRequestInfo beamRequestsInfo[MAX_NUM_REQUESTS];
+ BeamSearchPerTokenInfo
+ beamTokenInfo[MAX_NUM_TOKENS +
+ MAX_SPEC_TREE_TOKEN_NUM * MAX_NUM_REQUESTS];
+
+ int sub_requests[MAX_NUM_REQUESTS];
+
+private:
+ size_t current_iteration;
+};
+
+struct BeamInferenceResult {
+ static int const MAX_NUM_TOKENS = BatchConfig::MAX_NUM_TOKENS;
+ BatchConfig::TokenId
+ token_ids[MAX_NUM_TOKENS *
+ BeamSearchBatchConfig::MAX_SPECULATIVE_TREE_BRANCHES];
+ float probs[MAX_NUM_TOKENS *
+ BeamSearchBatchConfig::MAX_SPECULATIVE_TREE_BRANCHES];
+ int parent_id[MAX_NUM_TOKENS *
+ BeamSearchBatchConfig::MAX_SPECULATIVE_TREE_BRANCHES];
+};
+
+}; // namespace FlexFlow
diff --git a/include/flexflow/config.h b/include/flexflow/config.h
index b6a27a4f2a..2f6d22dd6f 100644
--- a/include/flexflow/config.h
+++ b/include/flexflow/config.h
@@ -16,22 +16,25 @@
#ifndef _FLEXFLOW_CONFIG_H_
#define _FLEXFLOW_CONFIG_H_
#include "ffconst.h"
+#include "flexflow/batch_config.h"
#include "legion.h"
#include
#if defined(FF_USE_CUDA) || defined(FF_USE_HIP_CUDA)
#include
#include
#elif defined(FF_USE_HIP_ROCM)
-#include
+#include
#include
#else
#error "Unknown device"
#endif
#include "tl/optional.hpp"
+#ifdef FF_USE_NCCL
#if defined(FF_USE_CUDA) || defined(FF_USE_HIP_CUDA)
#include
#else
-#include
+#include
+#endif
#endif
namespace FlexFlow {
@@ -39,14 +42,15 @@ namespace FlexFlow {
// ========================================================
// Define Runtime Constants
// ========================================================
-#define MAX_NUM_INPUTS 256
-#define MAX_NUM_WEIGHTS 64
-#define MAX_NUM_OUTPUTS 256
-#define MAX_NUM_FUSED_OPERATORS 64
-#define MAX_NUM_FUSED_TENSORS 64
+#define MAX_NUM_INPUTS 2048
+#define MAX_NUM_WEIGHTS 2048
+#define MAX_NUM_OUTPUTS 2048
+#define MAX_NUM_FUSED_OPERATORS 2048
+#define MAX_NUM_FUSED_TENSORS 2048
#define MAX_NUM_WORKERS 1024
#define MAX_FILENAME 200
#define MAX_OPNAME 128
+#define MAX_NUM_TRANSFORMER_LAYERS 100
// DataLoader
#define MAX_SAMPLES_PER_LOAD 64
#define MAX_FILE_LENGTH 128
@@ -61,6 +65,25 @@ constexpr ParameterSyncType CHOSEN_SYNC_TYPE = ParameterSyncType::PS;
#endif
class FFConfig;
+class MemoryAllocator;
+class PEFTWeightAllocator;
+
+struct CombinedBatchConfigMetaStruct {
+ BatchConfig::PerTokenInfo tokens_info[BatchConfig::MAX_NUM_TOKENS];
+ BatchConfig::PerRequestInfo requestsInfo[BatchConfig::MAX_NUM_REQUESTS];
+ BatchConfig::BitMask causalMask[BatchConfig::MAX_NUM_REQUESTS];
+ bool request_completed[BatchConfig::MAX_NUM_REQUESTS];
+
+ BeamSearchBatchConfig::BeamSearchPerTokenInfo
+ beamTokenInfo[BeamSearchBatchConfig::MAX_NUM_TOKENS +
+ BeamSearchBatchConfig::MAX_SPEC_TREE_TOKEN_NUM *
+ BeamSearchBatchConfig::MAX_NUM_REQUESTS];
+ BeamSearchBatchConfig::BeamSearchPerRequestInfo
+ beamRequestsInfo[BeamSearchBatchConfig::MAX_NUM_REQUESTS];
+
+ TreeVerifyBatchConfig::CommittedTokensInfo
+ committed_tokens[TreeVerifyBatchConfig::MAX_NUM_TOKENS];
+};
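+
+// Editorial note, not in the original source: collecting every piece of
+// per-batch metadata in one plain struct means the runtime can ship it to each
+// worker as a single contiguous copy; FFHandler::batch_config_metadata_size
+// below is just sizeof(CombinedBatchConfigMetaStruct). A hypothetical guard:
+//
+//   static_assert(std::is_trivially_copyable<CombinedBatchConfigMetaStruct>::value,
+//                 "batch-config metadata must be bitwise-copyable");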
struct FFHandler {
#if defined(FF_USE_CUDA) || defined(FF_USE_HIP_CUDA)
@@ -72,6 +95,19 @@ struct FFHandler {
#endif
void *workSpace;
size_t workSpaceSize;
+ CombinedBatchConfigMetaStruct *batch_config_metadata;
+
+ // request info + token info + topology mask info
+ size_t batch_config_metadata_size = sizeof(CombinedBatchConfigMetaStruct);
+ void *offload_reserve_space;
+ size_t offload_reserve_space_size;
+ // PEFT related fields
+ MemoryAllocator *peft_activation_allocator;
+ size_t peft_activation_reserve_space_size;
+ PEFTWeightAllocator *peft_weight_allocator;
+ size_t peft_weight_reserve_space_size;
+ // Quantization fields
+ DataType quantization_type;
bool allowTensorOpMathConversion;
#ifdef FF_USE_NCCL
ncclComm_t ncclComm;
@@ -80,6 +116,10 @@ struct FFHandler {
struct FFInitInfo {
size_t workSpaceSize;
+ size_t offload_reserve_space_size;
+ size_t peft_activation_reserve_space_size;
+ size_t peft_weight_reserve_space_size;
+ DataType quantization_type;
bool allowTensorOpMathConversion;
// int myRank, allRanks;
};
@@ -127,19 +167,31 @@ class FFConfig {
Legion::IndexSpaceT<1> all_gpu_task_is;
Legion::FieldSpace field_space;
bool syntheticInput, profiling, perform_fusion;
+ // Legion::FieldSpace field_space;
+ bool benchmarking;
+ bool inference_debugging;
size_t simulator_work_space_size;
size_t search_budget;
float search_alpha;
bool search_overlap_backward_update;
CompMode computationMode;
+ bool cpu_offload;
+ size_t offload_reserve_space_size;
+ DataType quantization_type;
+ // PEFT related fields
+ bool enable_peft;
+ size_t peft_activation_reserve_space_size;
+ size_t peft_weight_reserve_space_size;
// Control parallelizable dimensions
bool only_data_parallel;
bool enable_sample_parallel;
bool enable_parameter_parallel;
bool enable_attribute_parallel;
bool enable_inplace_optimizations;
+ // Control parallelism degrees in inference
int data_parallelism_degree;
int tensor_parallelism_degree;
+ int pipeline_parallelism_degree;
// Control Tensor Op Math Conversion
bool allow_tensor_op_math_conversion;
std::string dataset_path;
@@ -177,4 +229,4 @@ enum FieldIDs {
}; // namespace FlexFlow
-#endif //_FLEXFLOW_CONFIG_H_
+#endif //_FLEXFLOW_CONFIG_H_
\ No newline at end of file
diff --git a/include/flexflow/ffconst.h b/include/flexflow/ffconst.h
index 060983b020..24b722c36f 100644
--- a/include/flexflow/ffconst.h
+++ b/include/flexflow/ffconst.h
@@ -33,6 +33,8 @@ enum DataType {
DT_HALF = 43,
DT_FLOAT = 44,
DT_DOUBLE = 45,
+ DT_INT4 = 46,
+ DT_INT8 = 47,
DT_NONE = 49,
};
@@ -44,6 +46,12 @@ enum LossType {
LOSS_IDENTITY = 54,
};
+enum OptimizerType {
+ OPTIMIZER_TYPE_NONE = 60,
+ OPTIMIZER_TYPE_SGD = 61,
+ OPTIMIZER_TYPE_ADAM = 62,
+};
+
enum CompMode {
COMP_MODE_TRAINING = 70,
COMP_MODE_INFERENCE = 71,
@@ -64,6 +72,17 @@ enum MetricsType {
METRICS_MEAN_ABSOLUTE_ERROR = 1032,
};
+enum InferenceMode {
+ INC_DECODING_MODE = 2001,
+ BEAM_SEARCH_MODE = 2002,
+ TREE_VERIFY_MODE = 2003,
+};
+
+enum RequestType {
+ REQ_INFERENCE = 4001,
+ REQ_FINETUNING = 4002,
+};
+
// This is consistent with TASO's OpType
// https://github.com/jiazhihao/TASO/blob/master/include/taso/ops.h#L75-L138
enum OperatorType {
@@ -129,6 +148,7 @@ enum OperatorType {
OP_SHAPE, // https://github.com/onnx/onnx/blob/master/docs/Operators.md#Shape
OP_SIZE, // https://github.com/onnx/onnx/blob/master/docs/Operators.md#Size
OP_TOPK, // https://github.com/onnx/onnx/blob/master/docs/Operators.md#TopK
+ OP_ARG_TOPK,
OP_WHERE, // https://github.com/onnx/onnx/blob/master/docs/Operators.md#Where
OP_CEIL, // https://github.com/onnx/onnx/blob/master/docs/Operators.md#Ceil
OP_CAST, // https://github.com/onnx/onnx/blob/master/docs/Operators.md#Cast
@@ -150,7 +170,21 @@ enum OperatorType {
OP_POW, // https://pytorch.org/docs/stable/generated/torch.pow.html
OP_MEAN, // https://pytorch.org/docs/stable/generated/torch.mean.html
OP_LAYERNORM,
+ OP_RESIDUAL_LAYERNORM,
+ OP_ADD_BIAS_RESIDUAL_LAYERNORM,
+ OP_SIGMOID_SILU_MULTI,
+ OP_EXPERTS,
OP_GATHER, // https://pytorch.org/docs/stable/generated/torch.gather.html
+ OP_RMS_NORM,
+ OP_RESIDUAL_RMS_NORM,
+ OP_BEAM_TOPK,
+ OP_ARGMAX,
+ OP_INC_MULTIHEAD_SELF_ATTENTION,
+ OP_SPEC_INC_MULTIHEAD_SELF_ATTENTION,
+ OP_TREE_INC_MULTIHEAD_SELF_ATTENTION,
+ OP_SAMPLING,
+ // PEFT Ops
+ OP_LORA,
// Parallel Ops
OP_REPARTITION,
OP_COMBINE,
@@ -158,41 +192,52 @@ enum OperatorType {
OP_REDUCTION,
OP_PIPELINE,
OP_ALLREDUCE,
+ OP_PARALLEL_IDENTITY,
OP_FUSED_PARALLEL,
OP_INVALID,
};
+enum ModelType {
+ UNKNOWN = 3001,
+ LLAMA = 3002,
+ OPT = 3003,
+ FALCON = 3004,
+ STARCODER = 3005,
+ MPT = 3006
+};
+
enum PMParameter {
- PM_OP_TYPE, // AnyOp
- PM_NUM_INPUTS, // AnyOp
- PM_NUM_OUTPUTS, // AnyOp
- PM_GROUP, // Conv2D
- PM_KERNEL_H, // Conv2D, Pool2D
- PM_KERNEL_W, // Conv2D, Pool2D
- PM_STRIDE_H, // Conv2D, Pool2D
- PM_STRIDE_W, // Conv2D, Pool2D
- PM_PADDING_H, // Conv2D, Pool2D
- PM_PADDING_W, // Conv2D, Pool2D
- PM_ACTI, // Conv2D, Pool2D
- PM_NUMDIM, // Concat, Transpose
- PM_AXIS, // Concat, Split
- PM_PERM, // Transpose
- PM_OUTSHUFFLE, // Transpose
- PM_MERGE_GCONV_COUNT, // MergeGConv
- PM_AXES, // Squeeze, Unsqueeze, Reduce*
- PM_KEEP_DIMS, // Reduce*
- PM_EPSILON, // BatchNorm
- PM_REPARTITION_DIM, // Repartition
- PM_REPARTITION_DEGREE, // Repartition
- PM_REPLICATE_DIM, // Replicate
- PM_REPLICATE_DEGREE, // Replicate
- PM_COMBINE_DIM, // Combine
- PM_COMBINE_DEGREE, // Combine
- PM_REDUCTION_DIM, // Reduction
- PM_REDUCTION_DEGREE, // Reduction
- PM_ALLREDUCE_DIM, // AllReduce
- PM_SOFTMAX_DIM, // Softmax
- PM_NUM_HEADS, // MultiHeadAttention
+ PM_OP_TYPE, // AnyOp
+ PM_NUM_INPUTS, // AnyOp
+ PM_NUM_OUTPUTS, // AnyOp
+ PM_GROUP, // Conv2D
+ PM_KERNEL_H, // Conv2D, Pool2D
+ PM_KERNEL_W, // Conv2D, Pool2D
+ PM_STRIDE_H, // Conv2D, Pool2D
+ PM_STRIDE_W, // Conv2D, Pool2D
+ PM_PADDING_H, // Conv2D, Pool2D
+ PM_PADDING_W, // Conv2D, Pool2D
+ PM_ACTI, // Conv2D, Pool2D
+ PM_NUMDIM, // Concat, Transpose
+ PM_AXIS, // Concat, Split
+ PM_PERM, // Transpose
+ PM_OUTSHUFFLE, // Transpose
+ PM_MERGE_GCONV_COUNT, // MergeGConv
+ PM_AXES, // Squeeze, Unsqueeze, Reduce*
+ PM_KEEP_DIMS, // Reduce*
+ PM_EPSILON, // BatchNorm
+ PM_REPARTITION_DIM, // Repartition
+ PM_REPARTITION_DEGREE, // Repartition
+ PM_REPLICATE_DIM, // Replicate
+ PM_REPLICATE_DEGREE, // Replicate
+ PM_COMBINE_DIM, // Combine
+ PM_COMBINE_DEGREE, // Combine
+ PM_REDUCTION_DIM, // Reduction
+ PM_REDUCTION_DEGREE, // Reduction
+ PM_ALLREDUCE_DIM, // AllReduce
+ PM_PARALLEL_IDENTITY_DIM, // ParallelIdentity
+ PM_SOFTMAX_DIM, // Softmax
+ PM_NUM_HEADS, // MultiHeadAttention
PM_INVALID,
PM_PARALLEL_DIM,
PM_PARALLEL_DEGREE,
@@ -238,5 +283,7 @@ enum {
TENSOR_GUID_LAST_VALID = 3999999,
PARALLEL_TENSOR_GUID_FIRST_VALID = 4000000,
NODE_GUID_FIRST_VALID = 5000000,
+ PEFT_MODEL_ID_FIRST_VALID = 6000000,
+ PEFT_MODEL_ID_LAST_VALID = 6999999
};
#endif // _FLEXFLOW_CONST_H_
diff --git a/include/flexflow/ffconst_utils.h b/include/flexflow/ffconst_utils.h
index fcd881e57e..421a139d57 100644
--- a/include/flexflow/ffconst_utils.h
+++ b/include/flexflow/ffconst_utils.h
@@ -8,8 +8,16 @@ namespace FlexFlow {
std::string get_operator_type_name(OperatorType type);
+size_t data_type_size(DataType type);
+
+#define INT4_NUM_OF_ELEMENTS_PER_GROUP 32
+
+size_t get_quantization_to_byte_size(DataType type,
+ DataType quantization_type,
+ size_t num_elements);
+
std::ostream &operator<<(std::ostream &, OperatorType);
}; // namespace FlexFlow
-#endif // _FLEXFLOW_FFCONST_UTILS_H
\ No newline at end of file
+#endif // _FLEXFLOW_FFCONST_UTILS_H
diff --git a/include/flexflow/fftype.h b/include/flexflow/fftype.h
index a71c85dbc8..3e482b8d67 100644
--- a/include/flexflow/fftype.h
+++ b/include/flexflow/fftype.h
@@ -3,20 +3,46 @@
#include "flexflow/ffconst.h"
#include
+#include
+#include
namespace FlexFlow {
class LayerID {
public:
+ static const LayerID NO_ID;
LayerID();
- LayerID(size_t id);
+ LayerID(size_t id, size_t transformer_layer_id, size_t model_id);
bool is_valid_id() const;
friend bool operator==(LayerID const &lhs, LayerID const &rhs);
+public:
+ size_t id, transformer_layer_id, model_id;
+};
+
+class PEFTModelID {
+public:
+ static const PEFTModelID NO_ID;
+ PEFTModelID();
+ PEFTModelID(size_t id);
+ bool is_valid_id() const;
+ friend bool operator==(PEFTModelID const &lhs, PEFTModelID const &rhs);
+ friend std::ostream &operator<<(std::ostream &os,
+ PEFTModelID const &peft_model_id);
+
public:
size_t id;
};
}; // namespace FlexFlow
-#endif // _FF_TYPE_H
\ No newline at end of file
+namespace std {
+template <>
+struct hash<FlexFlow::PEFTModelID> {
+ size_t operator()(FlexFlow::PEFTModelID const &n) const {
+ return n.id;
+ }
+};
+} // namespace std
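+
+// Editorial note, not in the original header: together with operator== above,
+// this specialization lets PEFTModelID be used directly as a key in unordered
+// containers, e.g. (illustrative only):
+//
+//   std::unordered_map<FlexFlow::PEFTModelID, int> adapters;
+//   adapters[FlexFlow::PEFTModelID(42)] = 0;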
+
+#endif // _FF_TYPE_H
diff --git a/include/flexflow/flexflow_c.h b/include/flexflow/flexflow_c.h
index 2ddc8549fa..fbb98d090e 100644
--- a/include/flexflow/flexflow_c.h
+++ b/include/flexflow/flexflow_c.h
@@ -47,6 +47,19 @@ FF_NEW_OPAQUE_TYPE(flexflow_dlrm_config_t);
FF_NEW_OPAQUE_TYPE(flexflow_dataloader_4d_t);
FF_NEW_OPAQUE_TYPE(flexflow_dataloader_2d_t);
FF_NEW_OPAQUE_TYPE(flexflow_single_dataloader_t);
+// Inference
+FF_NEW_OPAQUE_TYPE(flexflow_batch_config_t);
+FF_NEW_OPAQUE_TYPE(flexflow_tree_verify_batch_config_t);
+FF_NEW_OPAQUE_TYPE(flexflow_beam_search_batch_config_t);
+FF_NEW_OPAQUE_TYPE(flexflow_inference_manager_t);
+FF_NEW_OPAQUE_TYPE(flexflow_request_manager_t);
+FF_NEW_OPAQUE_TYPE(flexflow_file_data_loader_t);
+FF_NEW_OPAQUE_TYPE(flexflow_generation_result_t);
+// FF_NEW_OPAQUE_TYPE(flexflow_lora_optimizer_config_t);
+// FF_NEW_OPAQUE_TYPE(flexflow_lora_sgd_optimizer_config_t);
+// FF_NEW_OPAQUE_TYPE(flexflow_lora_adam_optimizer_config_t);
+FF_NEW_OPAQUE_TYPE(flexflow_lora_linear_config_t);
+FF_NEW_OPAQUE_TYPE(flexflow_peft_model_id_t);
// -----------------------------------------------------------------------
// FFConfig
@@ -72,12 +85,31 @@ int flexflow_config_get_epochs(flexflow_config_t handle);
bool flexflow_config_get_enable_control_replication(flexflow_config_t handle);
+int flexflow_config_get_data_parallelism_degree(flexflow_config_t handle_);
+
+int flexflow_config_get_tensor_parallelism_degree(flexflow_config_t handle_);
+
+int flexflow_config_get_pipeline_parallelism_degree(flexflow_config_t handle_);
+
+void flexflow_config_set_data_parallelism_degree(flexflow_config_t handle_,
+ int value);
+
+void flexflow_config_set_tensor_parallelism_degree(flexflow_config_t handle_,
+ int value);
+
+void flexflow_config_set_pipeline_parallelism_degree(flexflow_config_t handle_,
+ int value);
+
int flexflow_config_get_python_data_loader_type(flexflow_config_t handle);
+
+bool flexflow_config_get_offload(flexflow_config_t handle);
+
// -----------------------------------------------------------------------
// FFModel
// -----------------------------------------------------------------------
-flexflow_model_t flexflow_model_create(flexflow_config_t config);
+flexflow_model_t flexflow_model_create(flexflow_config_t config,
+ bool cpu_offload);
void flexflow_model_destroy(flexflow_model_t handle);
@@ -199,9 +231,10 @@ flexflow_tensor_t
flexflow_tensor_t
flexflow_model_add_embedding(flexflow_model_t handle,
const flexflow_tensor_t input,
- int num_entires,
+ int num_entries,
int out_dim,
enum AggrMode aggr,
+ enum DataType dtype,
flexflow_op_t shared_op,
flexflow_initializer_t kernel_initializer,
char const *name);
@@ -230,8 +263,41 @@ flexflow_tensor_t flexflow_model_add_layer_norm(flexflow_model_t handle,
int *axes,
bool elementwise_affine,
float eps,
+ bool use_bias,
char const *name);
+flexflow_tensor_t *
+ flexflow_model_add_residual_layer_norm(flexflow_model_t handle,
+ const flexflow_tensor_t input,
+ const flexflow_tensor_t residual1,
+ const flexflow_tensor_t residual2,
+ bool use_two_residuals,
+ int n,
+ int *axes,
+ bool elementwise_affine,
+ float eps,
+ bool use_bias,
+ bool inplace_residual,
+ char const *name);
+
+flexflow_tensor_t *flexflow_model_add_add_bias_residual_layer_norm(
+ flexflow_model_t handle,
+ const flexflow_tensor_t input,
+ const flexflow_tensor_t residual,
+ int n,
+ int *axes,
+ bool elementwise_affine,
+ float eps,
+ bool use_bias,
+ bool inplace_residual,
+ char const *name);
+
+flexflow_tensor_t
+ flexflow_model_add_sigmoid_silu_multi(flexflow_model_t handle,
+ const flexflow_tensor_t input1,
+ const flexflow_tensor_t input2,
+ char const *name);
+
flexflow_tensor_t
flexflow_model_add_batch_matmul(flexflow_model_t handle,
const flexflow_tensor_t a,
@@ -374,6 +440,170 @@ flexflow_tensor_t flexflow_model_add_multihead_attention(
flexflow_initializer_t kernel_initializer,
char const *name);
+flexflow_tensor_t flexflow_model_add_inc_multihead_self_attention(
+ flexflow_model_t handle_,
+ const flexflow_tensor_t input_,
+ int embed_dim,
+ int num_heads,
+ int kdim,
+ int vdim,
+ float dropout,
+ bool bias,
+ bool add_bias_kv,
+ bool add_zero_attn,
+ enum DataType data_type,
+ flexflow_initializer_t kernel_initializer_,
+ bool apply_rotary_embedding,
+ bool scaling_query,
+ float scaling_factor,
+ bool qk_prod_scaling,
+ bool position_bias,
+ char const *name);
+
+flexflow_tensor_t flexflow_model_add_spec_inc_multihead_self_attention(
+ flexflow_model_t handle_,
+ const flexflow_tensor_t input_,
+ int embed_dim,
+ int num_heads,
+ int kdim,
+ int vdim,
+ float dropout,
+ bool bias,
+ bool add_bias_kv,
+ bool add_zero_attn,
+ enum DataType data_type,
+ flexflow_initializer_t kernel_initializer_,
+ bool apply_rotary_embedding,
+ bool scaling_query,
+ float scaling_factor,
+ bool qk_prod_scaling,
+ bool position_bias,
+ char const *name);
+
+flexflow_tensor_t flexflow_model_add_inc_multihead_self_attention_verify(
+ flexflow_model_t handle_,
+ const flexflow_tensor_t input_,
+ int embed_dim,
+ int num_heads,
+ int kdim,
+ int vdim,
+ float dropout,
+ bool bias,
+ bool add_bias_kv,
+ bool add_zero_attn,
+ enum DataType data_type,
+ flexflow_initializer_t kernel_initializer_,
+ bool apply_rotary_embedding,
+ bool scaling_query,
+ float scaling_factor,
+ bool qk_prod_scaling,
+ bool position_bias,
+ char const *name);
+
+flexflow_tensor_t flexflow_model_add_inc_multiquery_self_attention(
+ flexflow_model_t handle_,
+ const flexflow_tensor_t input_,
+ int embed_dim,
+ int num_q_heads,
+ int num_kv_heads,
+ int kdim,
+ int vdim,
+ float dropout,
+ bool bias,
+ bool add_bias_kv,
+ bool add_zero_attn,
+ enum DataType data_type,
+ flexflow_initializer_t kernel_initializer_,
+ bool apply_rotary_embedding,
+ bool scaling_query,
+ float scaling_factor,
+ bool qk_prod_scaling,
+ bool position_bias,
+ char const *name);
+
+flexflow_tensor_t flexflow_model_add_spec_inc_multiquery_self_attention(
+ flexflow_model_t handle_,
+ const flexflow_tensor_t input_,
+ int embed_dim,
+ int num_q_heads,
+ int num_kv_heads,
+ int kdim,
+ int vdim,
+ float dropout,
+ bool bias,
+ bool add_bias_kv,
+ bool add_zero_attn,
+ enum DataType data_type,
+ flexflow_initializer_t kernel_initializer_,
+ bool apply_rotary_embedding,
+ bool scaling_query,
+ float scaling_factor,
+ bool qk_prod_scaling,
+ bool position_bias,
+ char const *name);
+
+flexflow_tensor_t flexflow_model_add_inc_multiquery_self_attention_verify(
+ flexflow_model_t handle_,
+ const flexflow_tensor_t input_,
+ int embed_dim,
+ int num_q_heads,
+ int num_kv_heads,
+ int kdim,
+ int vdim,
+ float dropout,
+ bool bias,
+ bool add_bias_kv,
+ bool add_zero_attn,
+ enum DataType data_type,
+ flexflow_initializer_t kernel_initializer_,
+ bool apply_rotary_embedding,
+ bool scaling_query,
+ float scaling_factor,
+ bool qk_prod_scaling,
+ bool position_bias,
+ char const *name);
+
+flexflow_tensor_t flexflow_model_add_rms_norm(flexflow_model_t handle_,
+ const flexflow_tensor_t input_,
+ float eps,
+ int dim,
+ char const *name);
+
+flexflow_tensor_t *
+ flexflow_model_add_residual_rms_norm(flexflow_model_t handle_,
+ const flexflow_tensor_t input1_,
+ const flexflow_tensor_t input2_,
+ float eps,
+ int dim,
+ bool inplace_residual,
+ char const *name);
+
+flexflow_tensor_t flexflow_model_add_arg_top_k(flexflow_model_t handle_,
+ const flexflow_tensor_t input_,
+ int k,
+ bool sorted,
+ bool speculative_decoding,
+ char const *name);
+
+flexflow_tensor_t flexflow_model_add_beam_top_k(flexflow_model_t handle_,
+ const flexflow_tensor_t input_,
+ int max_beam_size,
+ bool sorted,
+ char const *name);
+
+flexflow_tensor_t flexflow_model_add_sampling(flexflow_model_t handle_,
+ const flexflow_tensor_t input_,
+ float top_p,
+ char const *name);
+
+flexflow_tensor_t flexflow_model_add_argmax(flexflow_model_t handle_,
+ const flexflow_tensor_t input_,
+ bool beam_search,
+ char const *name);
+
+flexflow_peft_model_id_t flexflow_model_add_lora_layer(
+ flexflow_model_t handle_, const flexflow_lora_linear_config_t peft_config_);
+
void flexflow_model_set_sgd_optimizer(flexflow_model_t handle,
flexflow_sgd_optimizer_t optimizer);
@@ -393,6 +623,23 @@ flexflow_tensor_t flexflow_model_get_parameter_by_id(flexflow_model_t handle,
flexflow_perf_metrics_t
flexflow_model_get_perf_metrics(flexflow_model_t handle);
+void flexflow_model_set_transformer_layer_id(flexflow_model_t handle, int id);
+
+void flexflow_model_generate(flexflow_model_t handle_,
+ int num_requests,
+ enum RequestType *request_types,
+ char const **input_texts,
+ char **output_texts,
+ int *max_seq_lengths,
+ flexflow_peft_model_id_t *peft_model_ids,
+ char const **dataset_filepaths,
+ int *training_steps,
+ int **output_length_and_tokens,
+ int *num_finetuning_losses,
+ float *finetuning_losses);
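+
+// Editorial sketch, not part of the original header: argument ordering for a
+// single inference-only request. Variable declarations are omitted, and
+// passing NULL for the PEFT/finetuning-only arguments is an assumption rather
+// than a documented guarantee:
+//
+//   enum RequestType req_type = REQ_INFERENCE;
+//   flexflow_model_generate(model, /*num_requests=*/1, &req_type,
+//                           &input_text, &output_text, &max_seq_length,
+//                           /*peft_model_ids=*/NULL, /*dataset_filepaths=*/NULL,
+//                           /*training_steps=*/NULL, &output_length_and_tokens,
+//                           &num_finetuning_losses, /*finetuning_losses=*/NULL);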
+
+void flexflow_model_set_position_offset(flexflow_model_t handle, int offset);
+
// -----------------------------------------------------------------------
// Tensor
// -----------------------------------------------------------------------
@@ -702,6 +949,222 @@ void flexflow_op_forward(flexflow_op_t handle, flexflow_model_t model);
void flexflow_perform_registration(void);
+// -----------------------------------------------------------------------
+// BatchConfig
+// -----------------------------------------------------------------------
+
+flexflow_batch_config_t flexflow_batch_config_create(void);
+
+void flexflow_batch_config_destroy(flexflow_batch_config_t handle);
+
+// -----------------------------------------------------------------------
+// TreeVerifyBatchConfig
+// -----------------------------------------------------------------------
+
+flexflow_tree_verify_batch_config_t
+ flexflow_tree_verify_batch_config_create(void);
+
+void flexflow_tree_verify_batch_config_destroy(
+ flexflow_tree_verify_batch_config_t handle);
+
+// -----------------------------------------------------------------------
+// BeamSearchBatchConfig
+// -----------------------------------------------------------------------
+
+flexflow_beam_search_batch_config_t
+ flexflow_beam_search_batch_config_create(void);
+
+void flexflow_beam_search_batch_config_destroy(
+ flexflow_beam_search_batch_config_t handle);
+
+// -----------------------------------------------------------------------
+// RequestManager
+// -----------------------------------------------------------------------
+
+flexflow_request_manager_t flexflow_request_manager_get_request_manager(void);
+
+// void flexflow_request_manager_destroy(flexflow_request_manager_t handle_);
+
+void flexflow_request_manager_set_max_requests_per_batch(
+ flexflow_request_manager_t handle_, int max_num_requests);
+
+void flexflow_request_manager_set_max_tokens_per_batch(
+ flexflow_request_manager_t handle_, int max_num_tokens);
+
+void flexflow_request_manager_set_max_spec_tree_token_num(
+ flexflow_request_manager_t handle_, int max_num_tokens);
+
+void flexflow_request_manager_set_max_sequence_length(
+ flexflow_request_manager_t handle_, int max_seq_length);
+
+void flexflow_request_manager_set_enable_peft_finetuning(
+ flexflow_request_manager_t handle_, bool enable_peft_finetuning_);
+
+void flexflow_request_manager_register_tokenizer(
+ flexflow_request_manager_t handle_,
+ enum ModelType model_type,
+ int bos_token_id,
+ int eos_token_id,
+ char const *tokenizer_filepath);
+
+void flexflow_request_manager_register_output_filepath(
+ flexflow_request_manager_t handle_, char const *output_filepath);
+
+int flexflow_request_manager_register_ssm_model(
+ flexflow_request_manager_t handle_, flexflow_model_t model_handle_);
+
+void flexflow_request_manager_start_background_server(
+ flexflow_request_manager_t handle_, flexflow_model_t model_handle_);
+
+void flexflow_request_manager_terminate_background_server(
+ flexflow_request_manager_t handle_);
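+
+// Editorial sketch, not from the original header: the declarations above
+// suggest a setup sequence along the following lines; the exact required order
+// and the variable names (bos_id, eos_id, tokenizer_path, output_path, model)
+// are assumptions:
+//
+//   flexflow_request_manager_t rm =
+//       flexflow_request_manager_get_request_manager();
+//   flexflow_request_manager_set_max_requests_per_batch(rm, 16);
+//   flexflow_request_manager_set_max_tokens_per_batch(rm, 128);
+//   flexflow_request_manager_set_max_sequence_length(rm, 1024);
+//   flexflow_request_manager_register_tokenizer(rm, LLAMA, bos_id, eos_id,
+//                                               tokenizer_path);
+//   flexflow_request_manager_register_output_filepath(rm, output_path);
+//   flexflow_request_manager_start_background_server(rm, model);
+//   /* ...submit generation requests... */
+//   flexflow_request_manager_terminate_background_server(rm);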
+
+// -----------------------------------------------------------------------
+// InferenceManager
+// -----------------------------------------------------------------------
+
+flexflow_inference_manager_t
+ flexflow_inference_manager_get_inference_manager(void);
+
+// void flexflow_inference_manager_destroy(flexflow_inference_manager_t
+// handle_);
+
+void flexflow_inference_manager_compile_model_and_allocate_buffer(
+ flexflow_inference_manager_t handle_, flexflow_model_t model_handle);
+
+void flexflow_inference_manager_init_operators_inference(
+ flexflow_inference_manager_t handle_, flexflow_model_t model_handle);
+
+void flexflow_inference_manager_register_model_weights_loader(
+ flexflow_inference_manager_t handle_,
+ flexflow_model_t model_handle,
+ flexflow_file_data_loader_t loader_handle);
+
+// -----------------------------------------------------------------------
+// FileDataLoader
+// -----------------------------------------------------------------------
+
+flexflow_file_data_loader_t
+ flexflow_file_data_loader_create(char const *weight_file_path,
+ int num_q_heads,
+ int num_kv_heads,
+ int hidden_dim,
+ int qkv_inner_dim,
+ int tensor_parallelism_degree,
+ bool use_full_precision);
+
+void flexflow_file_data_loader_destroy(flexflow_file_data_loader_t handle_);
+
+void flexflow_file_data_loader_load_weights(flexflow_file_data_loader_t handle_,
+ flexflow_model_t model_handle_);
+
+// // -----------------------------------------------------------------------
+// // LoraSGDOptimizerConfig
+// // -----------------------------------------------------------------------
+
+// flexflow_lora_sgd_optimizer_config_t
+// flexflow_lora_sgd_optimizer_config_create(
+// double lr, double momentum, bool nesterov, bool weight_decay);
+
+// void flexflow_lora_sgd_optimizer_config_destroy(
+// flexflow_lora_sgd_optimizer_config_t handle_);
+
+// // -----------------------------------------------------------------------
+// // LoraAdamOptimizerConfig
+// // -----------------------------------------------------------------------
+
+// flexflow_lora_adam_optimizer_config_t
+// flexflow_lora_adam_optimizer_config_create(double alpha,
+// double beta1,
+// double beta2,
+// double weight_decay,
+// double epsilon);
+
+// void flexflow_lora_adam_optimizer_config_destroy(
+// flexflow_lora_adam_optimizer_config_t handle_);
+
+// -----------------------------------------------------------------------
+// LoraLinearConfig
+// -----------------------------------------------------------------------
+
+flexflow_lora_linear_config_t
+ flexflow_lora_linear_config_create(char const *cache_folder_,
+ char const *peft_model_id_,
+ bool trainable,
+ bool init_lora_weights,
+ char const *base_model_name_or_path,
+ char const *precision,
+ int rank,
+ float lora_alpha,
+ float lora_dropout,
+ int num_target_modules,
+ char const **target_modules_,
+ enum OptimizerType optimizer_type,
+ float sgd_learning_rate,
+ float sgd_momentum,
+ bool sgd_nesterov,
+ float sgd_weight_decay,
+ float adam_alpha,
+ float adam_beta1,
+ float adam_beta2,
+ float adam_weight_decay,
+ float adam_epsilon);
+
+void flexflow_lora_linear_config_destroy(flexflow_lora_linear_config_t handle_);
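+
+// Editorial note, not in the original header: the per-optimizer config types
+// sketched in the commented-out section above are not exposed; instead the
+// optimizer choice is passed to flexflow_lora_linear_config_create through
+// optimizer_type together with both the SGD and Adam hyperparameter sets, of
+// which presumably only the set matching optimizer_type is read.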
+
+char const *flexflow_lora_linear_config_get_cache_folder(
+ flexflow_lora_linear_config_t handle_);
+
+char const *flexflow_lora_linear_config_get_peft_model_id(
+ flexflow_lora_linear_config_t handle_);
+
+int flexflow_lora_linear_config_get_rank(flexflow_lora_linear_config_t handle_);
+
+float flexflow_lora_linear_config_get_lora_alpha(
+ flexflow_lora_linear_config_t handle_);
+
+float flexflow_lora_linear_config_get_lora_dropout(
+ flexflow_lora_linear_config_t handle_);
+
+bool flexflow_lora_linear_config_get_trainable(
+ flexflow_lora_linear_config_t handle_);
+
+bool flexflow_lora_linear_config_get_init_lora_weights(
+ flexflow_lora_linear_config_t handle_);
+
+char const **flexflow_lora_linear_config_get_target_modules(
+ flexflow_lora_linear_config_t handle_, int *num_target_modules);
+
+char const *flexflow_lora_linear_config_get_base_model_name_or_path(
+ flexflow_lora_linear_config_t handle_);
+
+char const *flexflow_lora_linear_config_get_precision(
+ flexflow_lora_linear_config_t handle_);
+
+void flexflow_lora_linear_config_set_lora_alpha(
+ flexflow_lora_linear_config_t handle_, float value);
+
+void flexflow_lora_linear_config_set_lora_dropout(
+ flexflow_lora_linear_config_t handle_, float value);
+
+void flexflow_lora_linear_config_set_trainable(
+ flexflow_lora_linear_config_t handle_, bool value);
+
+void flexflow_lora_linear_config_set_init_lora_weights(
+ flexflow_lora_linear_config_t handle_, bool value);
+
+// -----------------------------------------------------------------------
+// PEFTModelID
+// -----------------------------------------------------------------------
+
+flexflow_peft_model_id_t flexflow_peft_model_id_create();
+
+flexflow_peft_model_id_t flexflow_peft_model_id_create_id(unsigned long id);
+
+flexflow_peft_model_id_t flexflow_peft_model_id_no_id();
+
+void flexflow_peft_model_id_destroy(flexflow_peft_model_id_t handle_);
+
#ifdef __cplusplus
}
#endif
diff --git a/include/flexflow/gpt_tokenizer.h b/include/flexflow/gpt_tokenizer.h
new file mode 100644
index 0000000000..ec08435809
--- /dev/null
+++ b/include/flexflow/gpt_tokenizer.h
@@ -0,0 +1,221 @@
+// version 0.1
+// Licensed under the MIT License.
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2019-2020 zili wang.
+
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+using json = nlohmann::json;
+
+typedef std::pair bigram_pair;
+typedef std::pair wbigram_pair;
+
+struct hash_pair {
+ template <class T1, class T2>
+ size_t operator()(std::pair<T1, T2> const &p) const {
+ auto hash1 = std::hash<T1>{}(p.first);
+ auto hash2 = std::hash<T2>{}(p.second);
+ return hash1 ^ hash2;
+ }
+};
+
+enum tokenizer_mode { GPT2_TOKENIZER, OPT_TOKENIZER };
+
+class GPT_Tokenizer {
+
+public:
+ GPT_Tokenizer(tokenizer_mode mode_,
+ std::string const &vocab_file,
+ std::string const &merge_file,
+ std::string const &bos_token_str = "",
+ const std::string eos_token_str = "",
+ const std::string pad_token_str = "",
+ const std::string unk_token_str = "",
+ const std::string mask_token_str = "") {
+ mode = mode_;
+ load_vocab(vocab_file);
+ load_merge(merge_file);
+ bos_token = bos_token_str;
+ eos_token = eos_token_str;
+ pad_token = pad_token_str;
+ unk_token = unk_token_str;
+ mask_token = mask_token_str;
+ bytes_encoder = bytes_to_unicode();
+ unicode_to_bytes();
+ };
+ // ~GPT_Tokenizer();
+ std::vector bpe(std::wstring token);
+ std::vector tokenize(std::string str);
+ int32_t convert_token_to_id(std::string token);
+ void encode(std::string str,
+ size_t max_length,
+ std::vector *input_ids,
+ std::vector *mask_ids);
+ std::string decode(std::vector input_ids,
+ std::vector mask_ids);
+ tokenizer_mode mode;
+ std::string bos_token;
+ std::string eos_token;
+ std::string pad_token;
+ std::string unk_token;
+ std::string mask_token;
+ std::string strip(std::string const &inpt);
+
+private:
+ std::unordered_map vocab;
+ std::unordered_map inverse_vocab;
+ std::unordered_map bpe_ranks;
+ wchar_t *bytes_to_unicode();
+ void unicode_to_bytes();
+ wchar_t *bytes_encoder;
+ std::unordered_map bytes_decoder;
+ uint32_t cache_max_size = 500000;
+ uint32_t cache_word_max_length = 30;
+ std::string unicode_letter_expr =
+ "\\u0041-\\u005A\\u0061-\\u007A\\u00AA-\\u00AA\\u00B5-\\u00B5"
+ "\\u00BA-\\u00BA\\u00C0-\\u00D6\\u00D8-\\u00F6\\u00F8-\\u02C1"
+ "\\u02C6-\\u02D1\\u02E0-\\u02E4\\u02EC-\\u02EC\\u02EE-\\u02EE"
+ "\\u0370-\\u0374\\u0376-\\u0377\\u037A-\\u037D\\u037F-\\u037F"
+ "\\u0386-\\u0386\\u0388-\\u038A\\u038C-\\u038C\\u038E-\\u03A1"
+ "\\u03A3-\\u03F5\\u03F7-\\u0481\\u048A-\\u052F\\u0531-\\u0556"
+ "\\u0559-\\u0559\\u0560-\\u0588\\u05D0-\\u05EA\\u05EF-\\u05F2"
+ "\\u0620-\\u064A\\u066E-\\u066F\\u0671-\\u06D3\\u06D5-\\u06D5"
+ "\\u06E5-\\u06E6\\u06EE-\\u06EF\\u06FA-\\u06FC\\u06FF-\\u06FF"
+ "\\u0710-\\u0710\\u0712-\\u072F\\u074D-\\u07A5\\u07B1-\\u07B1"
+ "\\u07CA-\\u07EA\\u07F4-\\u07F5\\u07FA-\\u07FA\\u0800-\\u0815"
+ "\\u081A-\\u081A\\u0824-\\u0824\\u0828-\\u0828\\u0840-\\u0858"
+ "\\u0860-\\u086A\\u08A0-\\u08B4\\u08B6-\\u08C7\\u0904-\\u0939"
+ "\\u093D-\\u093D\\u0950-\\u0950\\u0958-\\u0961\\u0971-\\u0980"
+ "\\u0985-\\u098C\\u098F-\\u0990\\u0993-\\u09A8\\u09AA-\\u09B0"
+ "\\u09B2-\\u09B2\\u09B6-\\u09B9\\u09BD-\\u09BD\\u09CE-\\u09CE"
+ "\\u09DC-\\u09DD\\u09DF-\\u09E1\\u09F0-\\u09F1\\u09FC-\\u09FC"
+ "\\u0A05-\\u0A0A\\u0A0F-\\u0A10\\u0A13-\\u0A28\\u0A2A-\\u0A30"
+ "\\u0A32-\\u0A33\\u0A35-\\u0A36\\u0A38-\\u0A39\\u0A59-\\u0A5C"
+ "\\u0A5E-\\u0A5E\\u0A72-\\u0A74\\u0A85-\\u0A8D\\u0A8F-\\u0A91"
+ "\\u0A93-\\u0AA8\\u0AAA-\\u0AB0\\u0AB2-\\u0AB3\\u0AB5-\\u0AB9"
+ "\\u0ABD-\\u0ABD\\u0AD0-\\u0AD0\\u0AE0-\\u0AE1\\u0AF9-\\u0AF9"
+ "\\u0B05-\\u0B0C\\u0B0F-\\u0B10\\u0B13-\\u0B28\\u0B2A-\\u0B30"
+ "\\u0B32-\\u0B33\\u0B35-\\u0B39\\u0B3D-\\u0B3D\\u0B5C-\\u0B5D"
+ "\\u0B5F-\\u0B61\\u0B71-\\u0B71\\u0B83-\\u0B83\\u0B85-\\u0B8A"
+ "\\u0B8E-\\u0B90\\u0B92-\\u0B95\\u0B99-\\u0B9A\\u0B9C-\\u0B9C"
+ "\\u0B9E-\\u0B9F\\u0BA3-\\u0BA4\\u0BA8-\\u0BAA\\u0BAE-\\u0BB9"
+ "\\u0BD0-\\u0BD0\\u0C05-\\u0C0C\\u0C0E-\\u0C10\\u0C12-\\u0C28"
+ "\\u0C2A-\\u0C39\\u0C3D-\\u0C3D\\u0C58-\\u0C5A\\u0C60-\\u0C61"
+ "\\u0C80-\\u0C80\\u0C85-\\u0C8C\\u0C8E-\\u0C90\\u0C92-\\u0CA8"
+ "\\u0CAA-\\u0CB3\\u0CB5-\\u0CB9\\u0CBD-\\u0CBD\\u0CDE-\\u0CDE"
+ "\\u0CE0-\\u0CE1\\u0CF1-\\u0CF2\\u0D04-\\u0D0C\\u0D0E-\\u0D10"
+ "\\u0D12-\\u0D3A\\u0D3D-\\u0D3D\\u0D4E-\\u0D4E\\u0D54-\\u0D56"
+ "\\u0D5F-\\u0D61\\u0D7A-\\u0D7F\\u0D85-\\u0D96\\u0D9A-\\u0DB1"
+ "\\u0DB3-\\u0DBB\\u0DBD-\\u0DBD\\u0DC0-\\u0DC6\\u0E01-\\u0E30"
+ "\\u0E32-\\u0E33\\u0E40-\\u0E46\\u0E81-\\u0E82\\u0E84-\\u0E84"
+ "\\u0E86-\\u0E8A\\u0E8C-\\u0EA3\\u0EA5-\\u0EA5\\u0EA7-\\u0EB0"
+ "\\u0EB2-\\u0EB3\\u0EBD-\\u0EBD\\u0EC0-\\u0EC4\\u0EC6-\\u0EC6"
+ "\\u0EDC-\\u0EDF\\u0F00-\\u0F00\\u0F40-\\u0F47\\u0F49-\\u0F6C"
+ "\\u0F88-\\u0F8C\\u1000-\\u102A\\u103F-\\u103F\\u1050-\\u1055"
+ "\\u105A-\\u105D\\u1061-\\u1061\\u1065-\\u1066\\u106E-\\u1070"
+ "\\u1075-\\u1081\\u108E-\\u108E\\u10A0-\\u10C5\\u10C7-\\u10C7"
+ "\\u10CD-\\u10CD\\u10D0-\\u10FA\\u10FC-\\u1248\\u124A-\\u124D"
+ "\\u1250-\\u1256\\u1258-\\u1258\\u125A-\\u125D\\u1260-\\u1288"
+ "\\u128A-\\u128D\\u1290-\\u12B0\\u12B2-\\u12B5\\u12B8-\\u12BE"
+ "\\u12C0-\\u12C0\\u12C2-\\u12C5\\u12C8-\\u12D6\\u12D8-\\u1310"
+ "\\u1312-\\u1315\\u1318-\\u135A\\u1380-\\u138F\\u13A0-\\u13F5"
+ "\\u13F8-\\u13FD\\u1401-\\u166C\\u166F-\\u167F\\u1681-\\u169A"
+ "\\u16A0-\\u16EA\\u16F1-\\u16F8\\u1700-\\u170C\\u170E-\\u1711"
+ "\\u1720-\\u1731\\u1740-\\u1751\\u1760-\\u176C\\u176E-\\u1770"
+ "\\u1780-\\u17B3\\u17D7-\\u17D7\\u17DC-\\u17DC\\u1820-\\u1878"
+ "\\u1880-\\u1884\\u1887-\\u18A8\\u18AA-\\u18AA\\u18B0-\\u18F5"
+ "\\u1900-\\u191E\\u1950-\\u196D\\u1970-\\u1974\\u1980-\\u19AB"
+ "\\u19B0-\\u19C9\\u1A00-\\u1A16\\u1A20-\\u1A54\\u1AA7-\\u1AA7"
+ "\\u1B05-\\u1B33\\u1B45-\\u1B4B\\u1B83-\\u1BA0\\u1BAE-\\u1BAF"
+ "\\u1BBA-\\u1BE5\\u1C00-\\u1C23\\u1C4D-\\u1C4F\\u1C5A-\\u1C7D"
+ "\\u1C80-\\u1C88\\u1C90-\\u1CBA\\u1CBD-\\u1CBF\\u1CE9-\\u1CEC"
+ "\\u1CEE-\\u1CF3\\u1CF5-\\u1CF6\\u1CFA-\\u1CFA\\u1D00-\\u1DBF"
+ "\\u1E00-\\u1F15\\u1F18-\\u1F1D\\u1F20-\\u1F45\\u1F48-\\u1F4D"
+ "\\u1F50-\\u1F57\\u1F59-\\u1F59\\u1F5B-\\u1F5B\\u1F5D-\\u1F5D"
+ "\\u1F5F-\\u1F7D\\u1F80-\\u1FB4\\u1FB6-\\u1FBC\\u1FBE-\\u1FBE"
+ "\\u1FC2-\\u1FC4\\u1FC6-\\u1FCC\\u1FD0-\\u1FD3\\u1FD6-\\u1FDB"
+ "\\u1FE0-\\u1FEC\\u1FF2-\\u1FF4\\u1FF6-\\u1FFC\\u2071-\\u2071"
+ "\\u207F-\\u207F\\u2090-\\u209C\\u2102-\\u2102\\u2107-\\u2107"
+ "\\u210A-\\u2113\\u2115-\\u2115\\u2119-\\u211D\\u2124-\\u2124"
+ "\\u2126-\\u2126\\u2128-\\u2128\\u212A-\\u212D\\u212F-\\u2139"
+ "\\u213C-\\u213F\\u2145-\\u2149\\u214E-\\u214E\\u2183-\\u2184"
+ "\\u2C00-\\u2C2E\\u2C30-\\u2C5E\\u2C60-\\u2CE4\\u2CEB-\\u2CEE"
+ "\\u2CF2-\\u2CF3\\u2D00-\\u2D25\\u2D27-\\u2D27\\u2D2D-\\u2D2D"
+ "\\u2D30-\\u2D67\\u2D6F-\\u2D6F\\u2D80-\\u2D96\\u2DA0-\\u2DA6"
+ "\\u2DA8-\\u2DAE\\u2DB0-\\u2DB6\\u2DB8-\\u2DBE\\u2DC0-\\u2DC6"
+ "\\u2DC8-\\u2DCE\\u2DD0-\\u2DD6\\u2DD8-\\u2DDE\\u2E2F-\\u2E2F"
+ "\\u3005-\\u3006\\u3031-\\u3035\\u303B-\\u303C\\u3041-\\u3096"
+ "\\u309D-\\u309F\\u30A1-\\u30FA\\u30FC-\\u30FF\\u3105-\\u312F"
+ "\\u3131-\\u318E\\u31A0-\\u31BF\\u31F0-\\u31FF\\u3400-\\u4DBF"
+ "\\u4E00-\\u9FFC\\uA000-\\uA48C\\uA4D0-\\uA4FD\\uA500-\\uA60C"
+ "\\uA610-\\uA61F\\uA62A-\\uA62B\\uA640-\\uA66E\\uA67F-\\uA69D"
+ "\\uA6A0-\\uA6E5\\uA717-\\uA71F\\uA722-\\uA788\\uA78B-\\uA7BF"
+ "\\uA7C2-\\uA7CA\\uA7F5-\\uA801\\uA803-\\uA805\\uA807-\\uA80A"
+ "\\uA80C-\\uA822\\uA840-\\uA873\\uA882-\\uA8B3\\uA8F2-\\uA8F7"
+ "\\uA8FB-\\uA8FB\\uA8FD-\\uA8FE\\uA90A-\\uA925\\uA930-\\uA946"
+ "\\uA960-\\uA97C\\uA984-\\uA9B2\\uA9CF-\\uA9CF\\uA9E0-\\uA9E4"
+ "\\uA9E6-\\uA9EF\\uA9FA-\\uA9FE\\uAA00-\\uAA28\\uAA40-\\uAA42"
+ "\\uAA44-\\uAA4B\\uAA60-\\uAA76\\uAA7A-\\uAA7A\\uAA7E-\\uAAAF"
+ "\\uAAB1-\\uAAB1\\uAAB5-\\uAAB6\\uAAB9-\\uAABD\\uAAC0-\\uAAC0"
+ "\\uAAC2-\\uAAC2\\uAADB-\\uAADD\\uAAE0-\\uAAEA\\uAAF2-\\uAAF4"
+ "\\uAB01-\\uAB06\\uAB09-\\uAB0E\\uAB11-\\uAB16\\uAB20-\\uAB26"
+ "\\uAB28-\\uAB2E\\uAB30-\\uAB5A\\uAB5C-\\uAB69\\uAB70-\\uABE2"
+ "\\uAC00-\\uD7A3\\uD7B0-\\uD7C6\\uD7CB-\\uD7FB\\uF900-\\uFA6D"
+ "\\uFA70-\\uFAD9\\uFB00-\\uFB06\\uFB13-\\uFB17\\uFB1D-\\uFB1D"
+ "\\uFB1F-\\uFB28\\uFB2A-\\uFB36\\uFB38-\\uFB3C\\uFB3E-\\uFB3E"
+ "\\uFB40-\\uFB41\\uFB43-\\uFB44\\uFB46-\\uFBB1\\uFBD3-\\uFD3D"
+ "\\uFD50-\\uFD8F\\uFD92-\\uFDC7\\uFDF0-\\uFDFB\\uFE70-\\uFE74"
+ "\\uFE76-\\uFEFC\\uFF21-\\uFF3A\\uFF41-\\uFF5A\\uFF66-\\uFFBE"
+ "\\uFFC2-\\uFFC7\\uFFCA-\\uFFCF\\uFFD2-\\uFFD7\\uFFDA-\\uFFDC";
+
+ std::string unicode_number_expr =
+ "\\u0030-\\u0039\\u00B2-\\u00B3\\u00B9-\\u00B9\\u00BC-\\u00BE"
+ "\\u0660-\\u0669\\u06F0-\\u06F9\\u07C0-\\u07C9\\u0966-\\u096F"
+ "\\u09E6-\\u09EF\\u09F4-\\u09F9\\u0A66-\\u0A6F\\u0AE6-\\u0AEF"
+ "\\u0B66-\\u0B6F\\u0B72-\\u0B77\\u0BE6-\\u0BF2\\u0C66-\\u0C6F"
+ "\\u0C78-\\u0C7E\\u0CE6-\\u0CEF\\u0D58-\\u0D5E\\u0D66-\\u0D78"
+ "\\u0DE6-\\u0DEF\\u0E50-\\u0E59\\u0ED0-\\u0ED9\\u0F20-\\u0F33"
+ "\\u1040-\\u1049\\u1090-\\u1099\\u1369-\\u137C\\u16EE-\\u16F0"
+ "\\u17E0-\\u17E9\\u17F0-\\u17F9\\u1810-\\u1819\\u1946-\\u194F"
+ "\\u19D0-\\u19DA\\u1A80-\\u1A89\\u1A90-\\u1A99\\u1B50-\\u1B59"
+ "\\u1BB0-\\u1BB9\\u1C40-\\u1C49\\u1C50-\\u1C59\\u2070-\\u2070"
+ "\\u2074-\\u2079\\u2080-\\u2089\\u2150-\\u2182\\u2185-\\u2189"
+ "\\u2460-\\u249B\\u24EA-\\u24FF\\u2776-\\u2793\\u2CFD-\\u2CFD"
+ "\\u3007-\\u3007\\u3021-\\u3029\\u3038-\\u303A\\u3192-\\u3195"
+ "\\u3220-\\u3229\\u3248-\\u324F\\u3251-\\u325F\\u3280-\\u3289"
+ "\\u32B1-\\u32BF\\uA620-\\uA629\\uA6E6-\\uA6EF\\uA830-\\uA835"
+ "\\uA8D0-\\uA8D9\\uA900-\\uA909\\uA9D0-\\uA9D9\\uA9F0-\\uA9F9"
+ "\\uAA50-\\uAA59\\uABF0-\\uABF9\\uFF10-\\uFF19";
+
+ std::wstring wpat_expr = utf8_to_wstring(
+ "'s|'t|'re|'ve|'m|'ll|'d| ?[" + unicode_letter_expr + "]+| ?[" +
+ unicode_number_expr + "]+| ?[^\\s" + unicode_letter_expr +
+ unicode_number_expr + "]+|\\s+(?!\\S)|\\s+");
+
+ const std::wregex pat = std::wregex(wpat_expr);
+ std::unordered_map> cache;
+ void load_vocab(std::string const &vocab_file);
+ void load_merge(std::string const &merge_file);
+
+ std::unordered_set
+ get_pairs(std::vector word);
+ std::wstring utf8_to_wstring(std::string const &src);
+ std::u32string utf8_to_utf32(std::string const &src);
+ std::string wstring_to_utf8(std::wstring const &src);
+ std::string utf32_to_utf8(std::u32string const &src);
+
+ std::vector split(std::string const &s,
+ std::regex rgx = std::regex("\\s+"));
+};
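+
+// Editorial sketch, not part of the original header; the element type of the
+// encode() output vectors (int32_t) is an assumption based on
+// convert_token_to_id() above:
+//
+//   GPT_Tokenizer tok(GPT2_TOKENIZER, "vocab.json", "merges.txt");
+//   std::vector<int32_t> input_ids, mask_ids;
+//   tok.encode("Hello world", /*max_length=*/32, &input_ids, &mask_ids);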
diff --git a/include/flexflow/graph.h b/include/flexflow/graph.h
index 2c92eeeb31..d441adef17 100644
--- a/include/flexflow/graph.h
+++ b/include/flexflow/graph.h
@@ -24,7 +24,7 @@
#include "legion/legion_utilities.h"
#include
-extern LegionRuntime::Logger::Category log_dp;
+extern Legion::Logger log_dp;
namespace FlexFlow::PCG {
@@ -91,9 +91,9 @@ struct NodeCompare {
struct GraphOptimalViewSerialized {
#ifdef LEGION_MAX_RETURN_SIZE
- static const size_t buffer_size = 4 * LEGION_MAX_RETURN_SIZE - 8;
+ static size_t const buffer_size = 4 * LEGION_MAX_RETURN_SIZE - 8;
#else
- static const size_t buffer_size = 1024 * 1024 - 8;
+ static size_t const buffer_size = 1024 * 1024 - 8;
#endif
size_t total_bytes;
char data[buffer_size];
@@ -279,7 +279,7 @@ class SearchHelper {
mutable std::unordered_map cached_graph_costs;
mutable std::unordered_map>>
+ std::unique_ptr const>>
cached_operator_valid_views;
};
@@ -332,8 +332,8 @@ class Graph {
std::vector const ®ions,
Legion::Context ctx,
Legion::Runtime *runtime);
- static GraphOptimalViewSerialized
- graph_optimize_wrapper(FFModel * model);
+ // static GraphOptimalViewSerialized
+ // graph_optimize_wrapper(FFModel * model);
Node find_bottleneck_node(Node const &sink_node,
Node const &source_node) const;
void print_strategy_computation_graph(
diff --git a/include/flexflow/inference.h b/include/flexflow/inference.h
new file mode 100644
index 0000000000..ba4101c173
--- /dev/null
+++ b/include/flexflow/inference.h
@@ -0,0 +1,51 @@
+/* Copyright 2022 CMU, Stanford, Facebook, LANL
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+#include "flexflow/batch_config.h"
+#include
+#include
+
+namespace FlexFlow {
+
+struct GenerationConfig {
+ bool do_sample = false;
+ float temperature = 0.8;
+ float topp = 0.6;
+ GenerationConfig(bool _do_sample, float _temperature, float _topp) {
+ temperature = _temperature > 0 ? _temperature : temperature;
+ topp = _topp > 0 ? _topp : topp;
+ do_sample = _do_sample;
+ }
+ GenerationConfig() {}
+};
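+
+// Illustrative only, not in the original source: enable sampling with explicit
+// temperature and top-p; per the constructor above, non-positive values fall
+// back to the defaults (0.8 and 0.6).
+//
+//   GenerationConfig sampling_cfg(/*_do_sample=*/true, /*_temperature=*/0.7,
+//                                 /*_topp=*/0.9);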
+
+struct GenerationResult {
+ using RequestGuid = BatchConfig::RequestGuid;
+ using TokenId = BatchConfig::TokenId;
+ RequestGuid guid;
+ std::string input_text;
+ std::string output_text;
+ std::vector input_tokens;
+ std::vector output_tokens;
+ std::vector finetuning_losses;
+};
+
+#include
+#include
+
+std::string join_path(std::vector const &paths);
+
+} // namespace FlexFlow
diff --git a/include/flexflow/layer.h b/include/flexflow/layer.h
index 0c1d7a6092..c3dbcac422 100644
--- a/include/flexflow/layer.h
+++ b/include/flexflow/layer.h
@@ -49,9 +49,10 @@ class Layer {
Tensor outputs[MAX_NUM_OUTPUTS];
Tensor inputs[MAX_NUM_INPUTS];
Tensor weights[MAX_NUM_WEIGHTS];
- bool trainableInputs[MAX_NUM_INPUTS];
+ // bool trainable_inputs[MAX_NUM_INPUTS];
int numInputs, numWeights, numOutputs;
bool profiling;
+ bool inference_debugging;
private:
std::unordered_map int_properties;
diff --git a/include/flexflow/machine_view.h b/include/flexflow/machine_view.h
index b843555e06..76cc05d8f5 100644
--- a/include/flexflow/machine_view.h
+++ b/include/flexflow/machine_view.h
@@ -3,10 +3,12 @@
#include "legion.h"
#include
+#ifdef FF_USE_NCCL
#if defined(FF_USE_CUDA) || defined(FF_USE_HIP_CUDA)
#include
#else
-#include
+#include
+#endif
#endif
#include "flexflow/config.h"
@@ -14,7 +16,7 @@ namespace FlexFlow {
class FFConfig;
struct MachineView {
- static const MachineView NO_VIEW;
+ static MachineView const NO_VIEW;
MachineView();
int get_device_id(Legion::DomainPoint const &p) const;
diff --git a/include/flexflow/mapper.h b/include/flexflow/mapper.h
index 71be1892aa..e8337818ec 100644
--- a/include/flexflow/mapper.h
+++ b/include/flexflow/mapper.h
@@ -83,11 +83,10 @@ class FFMapper : public NullMapper {
Task const &task,
MapTaskInput const &input,
MapTaskOutput &output);
- virtual void map_replicate_task(const MapperContext ctx,
- Task const &task,
- MapTaskInput const &input,
- MapTaskOutput const &default_output,
- MapReplicateTaskOutput &output);
+ virtual void replicate_task(const MapperContext ctx,
+ Task const &task,
+ ReplicateTaskInput const &input,
+ ReplicateTaskOutput &output);
virtual void select_task_variant(const MapperContext ctx,
Task const &task,
SelectVariantInput const &input,
diff --git a/include/flexflow/model.h b/include/flexflow/model.h
index fe73e6a0e3..46c6282a65 100644
--- a/include/flexflow/model.h
+++ b/include/flexflow/model.h
@@ -17,10 +17,12 @@
#include "accessor.h"
#include "config.h"
#include "device.h"
+#include "flexflow/inference.h"
#include "flexflow/memory_optimization.h"
#include "flexflow/node.h"
#include "flexflow/operator_params.h"
#include "flexflow/utils/hash_utils.h"
+#include "flexflow/utils/memory_allocator.h"
#include "flexflow/utils/tuple.h"
#include "initializer.h"
#include "layer.h"
@@ -30,6 +32,7 @@
#include "optimizer.h"
#include "parallel_tensor.h"
#include "recompile.h"
+#include "runtime.h"
#include "simulator.h"
#include "tensor.h"
#include "tl/optional.hpp"
@@ -50,11 +53,17 @@ enum TaskIDs {
LOAD_IMAGES_TASK_ID,
NORMALIZE_IMAGES_TASK_ID,
ELEMENTBINARY_INIT_TASK_ID,
+ ELEMENTBINARY_INF_TASK_ID,
ELEMENTBINARY_FWD_TASK_ID,
ELEMENTBINARY_BWD_TASK_ID,
ELEMENTUNARY_INIT_TASK_ID,
ELEMENTUNARY_FWD_TASK_ID,
+ ELEMENTUNARY_INF_TASK_ID,
ELEMENTUNARY_BWD_TASK_ID,
+ EXPERTS_INIT_TASK_ID,
+ EXPERTS_FWD_TASK_ID,
+ EXPERTS_BWD_TASK_ID,
+ EXPERTS_INF_TASK_ID,
CONV2D_INIT_TASK_ID,
CONV2D_INIT_PARA_TASK_ID,
CONV2D_FWD_TASK_ID,
@@ -65,6 +74,7 @@ enum TaskIDs {
DROPOUT_BWD_TASK_ID,
EMBED_INIT_TASK_ID,
EMBED_FWD_TASK_ID,
+ EMBED_INF_TASK_ID,
EMBED_BWD_TASK_ID,
GATHER_INIT_TASK_ID,
GATHER_FWD_TASK_ID,
@@ -96,19 +106,41 @@ enum TaskIDs {
BATCHMATMUL_BWD_TASK_ID,
LAYERNORM_INIT_TASK_ID,
LAYERNORM_FWD_TASK_ID,
+ LAYERNORM_INF_TASK_ID,
LAYERNORM_BWD_TASK_ID,
+ LAYERNORM_PEFT_BWD_TASK_ID,
+ RESIDUAL_LAYERNORM_INIT_TASK_ID,
+ RESIDUAL_LAYERNORM_INF_TASK_ID,
+ RESIDUAL_LAYERNORM_BWD_TASK_ID,
+ RESIDUAL_LAYERNORM_PEFT_BWD_TASK_ID,
+ ADD_BIAS_RESIDUAL_LAYERNORM_INIT_TASK_ID,
+ ADD_BIAS_RESIDUAL_LAYERNORM_INF_TASK_ID,
+ ADD_BIAS_RESIDUAL_LAYERNORM_BWD_TASK_ID,
+ ADD_BIAS_RESIDUAL_LAYERNORM_PEFT_BWD_TASK_ID,
+ SIGMOID_SILU_MULTI_INIT_TASK_ID,
+ SIGMOID_SILU_MULTI_INF_TASK_ID,
+ SIGMOID_SILU_MULTI_BWD_TASK_ID,
+ SIGMOID_SILU_MULTI_PEFT_BWD_TASK_ID,
LINEAR_INIT_TASK_ID,
LINEAR_INIT_PARA_TASK_ID,
+ LINEAR_INF_TASK_ID,
+ LINEAR_PEFT_BWD_TASK_ID,
LINEAR_FWD_TASK_ID,
LINEAR_BWD_TASK_ID,
LINEAR_BWD2_TASK_ID,
LINEAR_UPD_TASK_ID,
+ LORA_LINEAR_INIT_TASK_ID,
+ LORA_LINEAR_REG_TASK_ID,
+ LORA_LINEAR_INF_TASK_ID,
+ LORA_LINEAR_PEFT_BWD_TASK_ID,
FLAT_INIT_TASK_ID,
FLAT_FWD_TASK_ID,
FLAT_BWD_TASK_ID,
SOFTMAX_INIT_TASK_ID,
SOFTMAX_FWD_TASK_ID,
SOFTMAX_BWD_TASK_ID,
+ SOFTMAX_INF_TASK_ID,
+ SOFTMAX_PEFT_BWD_TASK_ID,
CONCAT_INIT_TASK_ID,
CONCAT_FWD_TASK_ID,
CONCAT_BWD_TASK_ID,
@@ -127,16 +159,46 @@ enum TaskIDs {
TOPK_INIT_TASK_ID,
TOPK_FWD_TASK_ID,
TOPK_BWD_TASK_ID,
+ ARG_TOPK_INIT_TASK_ID,
+ ARG_TOPK_INF_TASK_ID,
+ ARG_TOPK_INF_SPECULATIVE_TASK_ID,
+ SAMPLING_INIT_TASK_ID,
+ SAMPLING_INF_TASK_ID,
+ ARGMAX_INIT_TASK_ID,
+ ARGMAX_BEAM_INF_TASK_ID,
+ ARGMAX_NORM_INF_TASK_ID,
TRANSPOSE_INIT_TASK_ID,
TRANSPOSE_FWD_TASK_ID,
TRANSPOSE_BWD_TASK_ID,
ATTENTION_INIT_TASK_ID,
ATTENTION_FWD_TASK_ID,
ATTENTION_BWD_TASK_ID,
+ RMSNORM_INIT_TASK_ID,
+ RMSNORM_FWD_TASK_ID,
+ RMSNORM_INF_TASK_ID,
+ RMSNORM_BWD_TASK_ID,
+ RMSNORM_PEFT_BWD_TASK_ID,
+ RESIDUAL_RMSNORM_INIT_TASK_ID,
+ RESIDUAL_RMSNORM_INF_TASK_ID,
+ RESIDUAL_RMSNORM_BWD_TASK_ID,
+ RESIDUAL_RMSNORM_PEFT_BWD_TASK_ID,
+ BEAM_TOPK_INIT_TASK_ID,
+ BEAM_TOPK_INF_TASK_ID,
+ INC_MULTIHEAD_SELF_ATTENTION_INIT_TASK_ID,
+ INC_MULTIHEAD_SELF_ATTENTION_FWD_TASK_ID,
+ INC_MULTIHEAD_SELF_ATTENTION_BWD_TASK_ID,
+ INC_MULTIHEAD_SELF_ATTENTION_INF_TASK_ID,
+ INC_MULTIHEAD_SELF_ATTENTION_PEFT_BWD_TASK_ID,
+ SPEC_INC_MULTIHEAD_SELF_ATTENTION_INIT_TASK_ID,
+ SPEC_INC_MULTIHEAD_SELF_ATTENTION_INF_TASK_ID,
+ TREE_INC_MULTIHEAD_SELF_ATTENTION_INIT_TASK_ID,
+ TREE_INC_MULTIHEAD_SELF_ATTENTION_INF_TASK_ID,
MSELOSS_BWD_TASK_ID,
FUSEDOP_INIT_TASK_ID,
+ FUSEDOP_PEFT_BWD_TASK_ID,
FUSEDOP_FWD_TASK_ID,
FUSEDOP_BWD_TASK_ID,
+ FUSEDOP_INF_TASK_ID,
NOOP_INIT_TASK_ID,
// Metrics tasks
METRICS_COMP_TASK_ID,
@@ -161,6 +223,7 @@ enum TaskIDs {
// NCCL tasks
NCCL_GETUNIQUEID_TASK_ID,
NCCL_INIT_COMMS_TASK_ID,
+ NCCL_FINISH_COMMS_TASK_ID,
// Search
STRATEGY_SEARCH_TASK_ID,
// Graph
@@ -181,10 +244,13 @@ enum TaskIDs {
REPARTITION_BWD_TASK_ID,
COMBINE_INIT_TASK_ID,
COMBINE_FWD_TASK_ID,
+ COMBINE_INF_TASK_ID,
COMBINE_BWD_TASK_ID,
+ COMBINE_PEFT_BWD_TASK_ID,
REPLICATE_INIT_TASK_ID,
REPLICATE_FWD_TASK_ID,
REPLICATE_BWD_TASK_ID,
+ REPLICATE_PEFT_BWD_TASK_ID,
REDUCTION_INIT_TASK_ID,
REDUCTION_FWD_TASK_ID,
REDUCTION_BWD_TASK_ID,
@@ -192,12 +258,27 @@ enum TaskIDs {
PIPELINE_FWD_TASK_ID,
PIPELINE_BWD_TASK_ID,
ALLREDUCE_INIT_TASK_ID,
- ALLREDUCE_INF_TASK_ID,
ALLREDUCE_FWD_TASK_ID,
ALLREDUCE_BWD_TASK_ID,
+ ALLREDUCE_INF_TASK_ID,
+ ALLREDUCE_PEFT_BWD_TASK_ID,
+ PARALLEL_IDENTITY_INIT_TASK_ID,
+ PARALLEL_IDENTITY_FWD_TASK_ID,
+ PARALLEL_IDENTITY_BWD_TASK_ID,
+ PARALLEL_IDENTITY_INF_TASK_ID,
+ PARALLEL_IDENTITY_PEFT_BWD_TASK_ID,
FUSED_PARALLELOP_INIT_TASK_ID,
FUSED_PARALLELOP_FWD_TASK_ID,
FUSED_PARALLELOP_BWD_TASK_ID,
+ // InferenceManager & RequestManager
+ RM_LOAD_TOKENS_TASK_ID,
+ RM_LOAD_POSITION_TASK_ID,
+ RM_LOAD_BATCH_CONFIG_TASK_ID,
+ RM_PREPARE_NEXT_BATCH_TASK_ID,
+ RM_PREPARE_NEXT_BATCH_INIT_TASK_ID,
+ RM_PREPARE_NEXT_BATCH_BEAM_TASK_ID,
+ RM_PREPARE_NEXT_BATCH_VERIFY_TASK_ID,
+ RM_BACKGROUND_SERVING_TASK_ID,
// Custom tasks
CUSTOM_GPU_TASK_ID_FIRST,
CUSTOM_GPU_TASK_ID_1,
@@ -221,6 +302,8 @@ enum TaskIDs {
// Make sure PYTHON_TOP_LEVEL_TASK_ID is
// consistent with python/main.cc
PYTHON_TOP_LEVEL_TASK_ID = 11111,
+ // Tensor Equal Task
+ TENSOR_EQUAL_TASK_ID,
};
enum ShardingID {
@@ -264,27 +347,45 @@ class Dropout;
class ElementBinary;
class ElementUnary;
class Embedding;
+class Experts;
class Flat;
class Gather;
class Group_by;
class LayerNorm;
+class ResidualLayerNorm;
+class AddBiasResidualLayerNorm;
+class SigmoidSiluMulti;
class Linear;
+class LoraLinear;
class MultiHeadAttention;
+class IncMultiHeadSelfAttention;
+class TreeIncMultiHeadSelfAttention;
class Pool2D;
class Reduce;
class Reshape;
class Softmax;
class Split;
class TopK;
+class ArgTopK;
class Transpose;
+class RMSNorm;
+class ResidualRMSNorm;
+class BeamTopK;
+class SpecIncMultiHeadSelfAttention;
+class Sampling;
+class ArgMax;
class Combine;
class AllReduce;
class Repartition;
class Reduction;
class Replicate;
+class AllReduce;
+class ParallelIdentity;
class FusedParallelOp;
class ParallelOpInfo;
+struct Request;
+
// TODO: Move to an appropriate place
/*
This is used to create a type that recursively replaces value type
@@ -331,82 +432,84 @@ std::vector
class FFModel {
public:
- FFModel(FFConfig &config);
+ FFModel(FFConfig &config, bool cpu_offload = false);
+ ~FFModel();
static constexpr float PROPAGATION_CHANCE = 0.25;
static constexpr float CONTINUE_PROPAGATION_CHANCE = 0.75;
static constexpr float PROPAGATION_SIZE_WEIGHT = 1.0;
+ bool cpu_offload;
// C++ APIs for constructing models
// Add an exp layer
- Tensor exp(const Tensor x, char const *name = NULL);
+ Tensor exp(Tensor const x, char const *name = NULL);
// Add an add layer
- Tensor add(const Tensor x,
- const Tensor y,
+ Tensor add(Tensor const x,
+ Tensor const y,
bool inplace_a = false,
char const *name = NULL);
// Add a subtract layer
- Tensor subtract(const Tensor x,
- const Tensor y,
+ Tensor subtract(Tensor const x,
+ Tensor const y,
bool inplace_a = false,
char const *name = NULL);
// Add a multiply layer
- Tensor multiply(const Tensor x,
- const Tensor y,
+ Tensor multiply(Tensor const x,
+ Tensor const y,
bool inplace_a = false,
char const *name = NULL);
// Add a divide layer
- Tensor divide(const Tensor x,
- const Tensor y,
+ Tensor divide(Tensor const x,
+ Tensor const y,
bool inplace_a = false,
char const *name = NULL);
// Add a max layer
- Tensor max(const Tensor x,
- const Tensor y,
+ Tensor max(Tensor const x,
+ Tensor const y,
bool inplace_a = false,
char const *name = NULL);
// Add a min layer
- Tensor min(const Tensor x,
- const Tensor y,
+ Tensor min(Tensor const x,
+ Tensor const y,
bool inplace_a = false,
char const *name = NULL);
// Add a rsqrt layer
- Tensor rsqrt(const Tensor x, bool inplace = true, char const *name = NULL);
+ Tensor rsqrt(Tensor const x, bool inplace = true, char const *name = NULL);
// Add a pow layer
- Tensor pow(const Tensor x,
+ Tensor pow(Tensor const x,
float const exponent,
bool inplace = true,
char const *name = NULL);
// Add a scalar multiply layer
- Tensor scalar_multiply(const Tensor x,
+ Tensor scalar_multiply(Tensor const x,
float const scalar,
bool inplace = true,
char const *name = NULL);
- Tensor scalar_add(const Tensor x,
+ Tensor scalar_add(Tensor const x,
float const scalar,
bool inplace = true,
char const *name = NULL);
- Tensor scalar_sub(const Tensor x,
+ Tensor scalar_sub(Tensor const x,
float const scalar,
bool inplace = true,
char const *name = NULL);
- Tensor scalar_truediv(const Tensor x,
+ Tensor scalar_truediv(Tensor const x,
float const scalar,
bool inplace = true,
char const *name = NULL);
// Add a sin layer
- Tensor sin(const Tensor x, char const *name = NULL);
+ Tensor sin(Tensor const x, char const *name = NULL);
// Add a cos layer
- Tensor cos(const Tensor x, char const *name = NULL);
+ Tensor cos(Tensor const x, char const *name = NULL);
// Add an activation layer
- Tensor relu(const Tensor x, bool inplace = true, char const *name = NULL);
- Tensor identity(const Tensor x, char const *name = NULL);
- Tensor gelu(const Tensor x, char const *name = NULL);
- Tensor sigmoid(const Tensor x, char const *name = NULL);
- Tensor tanh(const Tensor x, char const *name = NULL);
- Tensor elu(const Tensor x, bool inplace = true, char const *name = NULL);
+ Tensor relu(Tensor const x, bool inplace = true, char const *name = NULL);
+ Tensor identity(Tensor const x, char const *name = NULL);
+ Tensor gelu(Tensor const x, char const *name = NULL);
+ Tensor sigmoid(Tensor const x, char const *name = NULL);
+ Tensor tanh(Tensor const x, char const *name = NULL);
+ Tensor elu(Tensor const x, bool inplace = true, char const *name = NULL);
// Add a 2D convolutional layer
- Tensor conv2d(const Tensor input,
+ Tensor conv2d(Tensor const input,
int outChannels,
int kernelH,
int kernelW,
@@ -422,13 +525,13 @@ class FFModel {
Initializer *bias_initializer = NULL,
char const *name = NULL);
// Add a dropout layer
- Tensor dropout(const Tensor input,
+ Tensor dropout(Tensor const input,
float rate,
unsigned long long seed = 0,
char const *name = NULL);
// Add an embedding layer
- Tensor embedding(const Tensor input,
- int num_entires,
+ Tensor embedding(Tensor const input,
+ int num_entries,
int outDim,
AggrMode aggr,
DataType dtype = DT_FLOAT,
@@ -436,13 +539,13 @@ class FFModel {
Initializer *kernel_initializer = NULL,
char const *name = NULL);
// Add a gather layer
- Tensor gather(const Tensor input,
- const Tensor index,
+ Tensor gather(Tensor const input,
+ Tensor const index,
int dim,
char const *name = NULL);
// Add a group_by layer
- void group_by(const Tensor data,
- const Tensor assign,
+ void group_by(Tensor const data,
+ Tensor const assign,
Tensor *outputs,
int n,
float alpha,
@@ -464,7 +567,7 @@ class FFModel {
float lambda_bal,
char const *name = NULL);
// Add a 2D pooling layer
- Tensor pool2d(const Tensor input,
+ Tensor pool2d(Tensor const input,
int kernelH,
int kernelW,
int strideH,
@@ -474,28 +577,79 @@ class FFModel {
PoolType type = POOL_MAX,
ActiMode activation = AC_MODE_NONE,
char const *name = NULL);
- // Add a batch_norm layer
- Tensor layer_norm(const Tensor input,
+ // Add a layer_norm layer
+ Tensor layer_norm(Tensor const input,
                    std::vector<int> const &axes,
bool elementwise_affine,
float eps,
+ bool use_bias = true,
DataType data_type = DT_NONE,
char const *name = NULL);
+ // Add a layer_norm layer with residual(s)
+ void residual_layer_norm(Tensor const input,
+ Tensor const residual1,
+ Tensor const residual2,
+ Tensor *outputs,
+ bool use_two_residuals,
+                           std::vector<int> const &axes,
+ bool elementwise_affine,
+ float eps,
+ bool use_bias = true,
+ bool inplace_residual = false,
+ DataType data_type = DT_NONE,
+ char const *name = NULL);
+ // Add a add_bias_residual_layer_norm layer
+ void add_bias_residual_layer_norm(Tensor const input,
+ Tensor const residual,
+ Tensor *outputs,
+                                    std::vector<int> const &axes,
+ bool elementwise_affine,
+ float eps,
+ bool use_bias = true,
+ bool inplace_residual = false,
+ DataType data_type = DT_NONE,
+ char const *name = NULL);
+ // Add a sigmoid_silu_multi layer
+ Tensor sigmoid_silu_multi(Tensor const input1,
+ Tensor const input2,
+ DataType data_type = DT_NONE,
+ char const *name = NULL);
// Add a batch_norm layer
Tensor
- batch_norm(const Tensor input, bool relu = true, char const *name = NULL);
+ batch_norm(Tensor const input, bool relu = true, char const *name = NULL);
// Add a batch_matmul layer
- Tensor batch_matmul(const Tensor A,
- const Tensor B,
+ Tensor batch_matmul(Tensor const A,
+ Tensor const B,
int a_seq_length_dim = -1,
int b_seq_length_dim = -1,
char const *name = nullptr);
+ // Add a root mean square layer
+ Tensor rms_norm(Tensor const input,
+ float eps,
+ int dim,
+ DataType data_type = DT_NONE,
+ char const *name = NULL);
+ // Add a residual root mean square layer
+ void residual_rms_norm(Tensor const input1,
+ Tensor const input2,
+ Tensor *outputs,
+ float eps,
+ int dim,
+ bool inplace_residual = false,
+ DataType data_type = DT_NONE,
+ char const *name = NULL);
+ // Add a beam search top k layer
+ Tensor beam_top_k(Tensor const input,
+ int max_beam_size,
+ bool sorted,
+ char const *name = NULL);
+
// Add a dense layer
- Tensor dense(const Tensor input,
+ Tensor dense(Tensor const input,
int outDim,
ActiMode activation = AC_MODE_NONE,
bool use_bias = true,
- DataType data_type = DT_FLOAT,
+ DataType data_type = DT_NONE,
Layer const *shared_op = NULL,
Initializer *kernel_initializer = NULL,
Initializer *bias_initializer = NULL,
@@ -503,55 +657,74 @@ class FFModel {
float regularizer_lambda = 0.0,
char const *name = NULL);
// Add a cast layer
- Tensor cast(const Tensor input, DataType dtype, char const *name = nullptr);
+ Tensor cast(Tensor const input, DataType dtype, char const *name = nullptr);
// Add a concat layer
Tensor
concat(int n, Tensor const *tensors, int axis, char const *name = NULL);
+ // Add an experts layer
+ Tensor experts(
+ Tensor const *inputs,
+ int num_experts,
+ int experts_start_idx,
+ int experts_output_dim_size,
+ float alpha,
+ int experts_num_layers = 1, // number of linear layers per expert
+ int experts_internal_dim_size = 0, // hidden dimension for internal layers
+ char const *name = NULL);
// Add a mean layer
- Tensor mean(const Tensor input,
+ Tensor mean(Tensor const input,
              std::vector<int> const &dims,
bool keepdims,
char const *name);
// Add a moe layer (wrapping topk, group_by and aggregate operators)
- Tensor moe(const Tensor input,
+ Tensor moe(Tensor const input,
int num_exp,
int num_select,
int expert_hidden_size,
float alpha,
float lambda);
// Add a split layer
- void split(const Tensor input,
+ void split(Tensor const input,
Tensor *outputs,
             std::vector<int> const &split,
int axis,
char const *name = NULL);
// Add a flat layer
- Tensor flat(const Tensor input, char const *name = NULL);
+ Tensor flat(Tensor const input, char const *name = NULL);
// Add a softmax layer
- Tensor softmax(const Tensor input,
+ Tensor softmax(Tensor const input,
int dim = -1,
bool last_layer = false,
+ DataType data_type = DT_NONE,
char const *name = NULL);
// Create input tensors and constants
- Tensor transpose(const Tensor input,
+ Tensor transpose(Tensor const input,
                    std::vector<int> const &perm,
char const *name = NULL);
- Tensor reduce_sum(const Tensor input,
+ Tensor reduce_sum(Tensor const input,
                     std::vector<int> const &axes,
bool keepdims = false,
char const *name = nullptr);
- Tensor reshape(const Tensor input,
+ Tensor reshape(Tensor const input,
                  std::vector<int> const &shape,
char const *name = NULL);
- Tensor reverse(const Tensor input, int axis, char const *name = NULL);
- void top_k(const Tensor input,
+ Tensor reverse(Tensor const input, int axis, char const *name = NULL);
+ void top_k(Tensor const input,
Tensor *outputs,
int k,
bool sorted,
char const *name = NULL);
- Tensor multihead_attention(const Tensor query,
- const Tensor key,
- const Tensor value,
+ Tensor arg_top_k(Tensor const input,
+ // Tensor *outputs,
+ int k,
+ bool sorted,
+ bool speculative_decoding,
+ char const *name = NULL);
+ Tensor argmax(Tensor const input, bool beam_search, char const *name = NULL);
+ Tensor sampling(Tensor const input, float top_p, char const *name = NULL);
+ Tensor multihead_attention(Tensor const query,
+ Tensor const key,
+ Tensor const value,
int embed_dim,
int num_heads,
int kdim = 0,
@@ -560,8 +733,127 @@ class FFModel {
bool bias = true,
bool add_bias_kv = false,
bool add_zero_attn = false,
+ DataType data_type = DT_NONE,
Initializer *kernel_initializer = NULL,
char const *name = NULL);
+ Tensor inc_multihead_self_attention(Tensor const input,
+ int embed_dim,
+ int num_heads,
+ int kdim = 0,
+ int vdim = 0,
+ float dropout = 0.0f,
+ bool bias = false,
+ bool add_bias_kv = false,
+ bool add_zero_attn = false,
+ DataType data_type = DT_NONE,
+ Initializer *kernel_initializer = NULL,
+ bool apply_rotary_embedding = false,
+ bool scaling_query = false,
+ float scaling_factor = 1.0f,
+ bool qk_prod_scaling = true,
+ bool position_bias = false,
+ char const *name = NULL);
+ Tensor
+ spec_inc_multihead_self_attention(Tensor const input,
+ int embed_dim,
+ int num_heads,
+ int kdim = 0,
+ int vdim = 0,
+ float dropout = 0.0f,
+ bool bias = false,
+ bool add_bias_kv = false,
+ bool add_zero_attn = false,
+ DataType data_type = DT_NONE,
+ Initializer *kernel_initializer = NULL,
+ bool apply_rotary_embedding = false,
+ bool scaling_query = false,
+ float scaling_factor = 1.0f,
+ bool qk_prod_scaling = true,
+ bool position_bias = false,
+ char const *name = NULL);
+ Tensor inc_multihead_self_attention_verify(
+ Tensor const input,
+ int embed_dim,
+ int num_heads,
+ int kdim = 0,
+ int vdim = 0,
+ float dropout = 0.0f,
+ bool bias = false,
+ bool add_bias_kv = false,
+ bool add_zero_attn = false,
+ DataType data_type = DT_NONE,
+ Initializer *kernel_initializer = NULL,
+ bool apply_rotary_embedding = false,
+ bool scaling_query = false,
+ float scaling_factor = 1.0f,
+ bool qk_prod_scaling = true,
+ bool position_bias = false,
+ char const *name = NULL);
+ Tensor inc_multiquery_self_attention(Tensor const input,
+ int embed_dim,
+ int num_q_heads,
+ int num_kv_heads,
+ int kdim = 0,
+ int vdim = 0,
+ float dropout = 0.0f,
+ bool bias = false,
+ bool add_bias_kv = false,
+ bool add_zero_attn = false,
+ DataType data_type = DT_NONE,
+ Initializer *kernel_initializer = NULL,
+ bool apply_rotary_embedding = false,
+ bool scaling_query = false,
+ float scaling_factor = 1.0f,
+ bool qk_prod_scaling = true,
+ bool position_bias = false,
+ char const *name = NULL);
+ Tensor
+ spec_inc_multiquery_self_attention(Tensor const input,
+ int embed_dim,
+ int num_q_heads,
+ int num_kv_heads,
+ int kdim = 0,
+ int vdim = 0,
+ float dropout = 0.0f,
+ bool bias = false,
+ bool add_bias_kv = false,
+ bool add_zero_attn = false,
+ DataType data_type = DT_NONE,
+ Initializer *kernel_initializer = NULL,
+ bool apply_rotary_embedding = false,
+ bool scaling_query = false,
+ float scaling_factor = 1.0f,
+ bool qk_prod_scaling = true,
+ bool position_bias = false,
+ char const *name = NULL);
+ Tensor inc_multiquery_self_attention_verify(
+ Tensor const input,
+ int embed_dim,
+ int num_q_heads,
+ int num_kv_heads,
+ int kdim = 0,
+ int vdim = 0,
+ float dropout = 0.0f,
+ bool bias = false,
+ bool add_bias_kv = false,
+ bool add_zero_attn = false,
+ DataType data_type = DT_NONE,
+ Initializer *kernel_initializer = NULL,
+ bool apply_rotary_embedding = false,
+ bool scaling_query = false,
+ float scaling_factor = 1.0f,
+ bool qk_prod_scaling = true,
+ bool position_bias = false,
+ char const *name = NULL);
+ // ========================================
+ // PEFT Layers
+ // ========================================
+ PEFTModelID *add_lora_layer(LoraLinearConfig const peft_config);
+ // ========================================
+ // Inference APIs
+ // ========================================
+  std::vector<GenerationResult> generate(std::vector<Request> const &requests);
+
Tensor create_tensor_legion_ordering(int num_dim,
int const dims[],
DataType data_type,
@@ -570,7 +862,7 @@ class FFModel {
bool create_grad = true);
ParallelTensor
create_parallel_tensor_legion_ordering(int num_dim,
- const ParallelDim dims[],
+ ParallelDim const dims[],
DataType data_type,
Op const *owner_op = NULL,
int owner_idx = 0,
@@ -583,7 +875,7 @@ class FFModel {
int owner_idx = 0,
bool create_grad = true);
ParallelTensor create_parallel_tensor(int num_dim,
- const ParallelDim dims[],
+ ParallelDim const dims[],
DataType data_type,
Op const *owner_op = NULL,
int owner_idx = 0,
@@ -596,7 +888,7 @@ class FFModel {
int owner_idx = 0,
bool create_grad = true);
  template <int NDIM>
- ParallelTensor create_parallel_tensor(const ParallelDim dims[],
+ ParallelTensor create_parallel_tensor(ParallelDim const dims[],
DataType data_type,
Op const *owner_op = NULL,
int owner_idx = 0,
@@ -620,7 +912,7 @@ class FFModel {
ParameterSyncType sync_type = ParameterSyncType::NONE);
  template <int NDIM>
ParallelParameter create_parallel_weight(
- const ParallelDim dims[],
+ ParallelDim const dims[],
DataType data_type,
Op const *owner_op = NULL,
bool create_grad = true,
@@ -628,7 +920,7 @@ class FFModel {
ParameterSyncType sync_type = ParameterSyncType::NONE);
ParallelParameter create_parallel_weight(
int numdim,
- const ParallelDim dims[],
+ ParallelDim const dims[],
DataType data_type,
Op const *owner_op = NULL,
bool create_grad = true,
@@ -636,7 +928,7 @@ class FFModel {
ParameterSyncType sync_type = ParameterSyncType::NONE);
ParallelParameter create_parallel_weight_legion_ordering(
int numdim,
- const ParallelDim dims[],
+ ParallelDim const dims[],
DataType data_type,
Op const *owner_op = NULL,
bool create_grad = true,
@@ -645,7 +937,7 @@ class FFModel {
void map_tensor(ParallelTensor tensor, Op const *parallel_op);
void map_weight(ParallelTensor tensor, Op const *parallel_op);
- bool get_parallel_tensor_from_tensor(const Tensor tensor,
+ bool get_parallel_tensor_from_tensor(Tensor const tensor,
                                       ParallelTensor &parallel_tensor) const;
template
@@ -686,13 +978,14 @@ class FFModel {
// Internal PCG::Node creation APIs
// ========================================
  template <typename T>
- PCG::Node get_or_create_node(const typename T::Input &input,
+ PCG::Node get_or_create_node(typename T::Input const &input,
                               typename T::Params const &params) {
using Params = typename T::Params;
auto input_shapes = get_input_shape(input);
if (!params.is_valid(input_shapes)) {
+ printf("!params.is_valid(input_shapes)\n");
return PCG::Node::INVALID_NODE;
}
@@ -700,7 +993,7 @@ class FFModel {
std::pair::type, Params> key{
input_shapes, params};
- auto &cache = get::type, Params>,
T *>>(this->cached_ops);
auto const &it = cache.find(key);
@@ -715,50 +1008,50 @@ class FFModel {
return this->new_node(op);
}
- PCG::Node get_or_create_noop_node(const ParallelTensor input);
+ PCG::Node get_or_create_noop_node(ParallelTensor const input);
PCG::Node get_or_create_input_node(ParallelTensorShape const &);
PCG::Node get_or_create_fused_parallel_node(
- const ParallelTensor input,
+ ParallelTensor const input,
      std::vector<ParallelOpInfo> const &parallel_ops);
- PCG::Node get_or_create_parallel_op_node(const ParallelTensor input,
+ PCG::Node get_or_create_parallel_op_node(ParallelTensor const input,
ParallelOpInfo const &);
// ========================================
// Internal APIs that should not be invoked from applications
// ========================================
void create_disjoint_partition(int num_dims,
- const ParallelDim dims[],
+ ParallelDim const dims[],
Legion::IndexSpace const &part_is,
                                 Legion::LogicalRegion const &region,
Legion::LogicalPartition &part);
template
void create_disjoint_partition_with_dim2(
- const ParallelDim dims[],
+ ParallelDim const dims[],
Legion::IndexSpaceT const &part_is,
      Legion::LogicalRegion const &region,
Legion::LogicalPartition &part);
void create_aliased_partition(int num_dims,
- const ParallelDim dims[],
+ ParallelDim const dims[],
int aliased_dim,
Legion::IndexSpace const &part_is,
                                Legion::LogicalRegion const &region,
Legion::LogicalPartition &part);
template
void create_aliased_partition_with_dim2(
- const ParallelDim dims[],
+ ParallelDim const dims[],
int aliased_dim,
Legion::IndexSpaceT const &part_is,
      Legion::LogicalRegion const &region,
Legion::LogicalPartition &part);
template
- void create_disjoint_partition(const ParallelTensor tensor,
+ void create_disjoint_partition(ParallelTensor const tensor,
Legion::IndexSpaceT const &part_is,
Legion::LogicalPartition &part_fwd,
Legion::LogicalPartition &part_bwd);
template
void create_data_parallel_partition_with_diff_dims(
- const ParallelTensor tensor,
+ ParallelTensor const tensor,
Legion::IndexSpaceT const &task_is,
Legion::LogicalPartition &part_fwd,
Legion::LogicalPartition &part_bwd);
@@ -775,8 +1068,14 @@ class FFModel {
                             std::vector<Legion::PhysicalRegion> const &regions,
Legion::Context ctx,
Legion::Runtime *runtime);
+ // ========================================
+ // Internal APIs that should not be invoked from applications
+ // ========================================
void reset_metrics();
void init_operators();
+ void init_operators_inference(
+      std::vector<ParallelTensor> const &batch_inputs,
+      std::vector<ParallelTensor> const &batch_outputs);
void prefetch();
void forward(int seq_length = -1);
void compute_metrics();
@@ -784,8 +1083,17 @@ class FFModel {
void backward(int seq_length = -1);
void update();
void unified_update();
- bool apply_fusion(std::vector const &operators,
- std::vector &new_operators);
+ // bool apply_fusion(std::vector const &operators,
+ // std::vector &new_operators);
+ bool apply_fusion(
+ std::vector const &operators,
+ std::vector &new_operators,
+ std::unordered_map>
+ *parallel_tensor_mapping = nullptr);
+ bool check_operators_integrity(
+ std::vector const &old_operators,
+ std::unordered_map>
+ *pt_mapping = nullptr);
Op *get_final_operator() const;
void compile(LossType loss_type,
               std::vector<MetricsType> const &metrics,
@@ -794,6 +1102,9 @@ class FFModel {
LossType loss_type,
               std::vector<MetricsType> const &metrics,
CompMode comp_mode = COMP_MODE_TRAINING);
+ void compile_inference();
+ void set_transformer_layer_id(int id);
+ void set_position_offset(int offset);
void graph_optimize(size_t budget,
bool only_data_parallel,
                      std::unique_ptr<PCG::Graph> &best_graph,
@@ -812,6 +1123,7 @@ class FFModel {
bool use_propagation) const;
#ifdef FF_USE_NCCL
ncclComm_t *find_nccl_comms(MachineView const &view) const;
+ void finish_nccl_comms();
#endif
#ifdef FF_USE_PROPAGATE
  void propagate(std::map const &current,
@@ -827,15 +1139,18 @@ class FFModel {
std::unordered_map>>
get_bwd_edge_map() const;
- // Internal funcitons
+ // Internal functions
Legion::IndexSpace get_or_create_task_is(ParallelConfig const &pc);
Legion::IndexSpace get_or_create_task_is(MachineView const &view);
Legion::IndexSpace get_or_create_task_is(Legion::Domain const &domain);
- Legion::IndexSpace get_or_create_task_is(const ParallelTensor);
+ Legion::IndexSpace get_or_create_task_is(ParallelTensor const);
Legion::IndexSpace get_task_is(Legion::Domain const &domain) const;
Legion::IndexSpace get_task_is(ParallelConfig const &pc) const;
Legion::IndexSpace get_task_is(MachineView const &view) const;
bool is_transformer_block(int layer_idx) const;
+ bool need_to_add_combine(int layer_idx) const;
+ bool need_to_add_allreduce(int layer_idx) const;
+ bool need_to_add_parallel_identity(int layer_idx) const;
bool is_mlp_block(int layer_idx) const;
void create_operators_from_layers();
Op *create_operator_from_layer(Layer *layer,
@@ -850,8 +1165,11 @@ class FFModel {
void clear_graph_search_cache();
public:
- size_t op_global_guid, layer_global_guid;
+ size_t op_global_guid, layer_global_guid, peft_model_global_guid;
size_t tensor_global_guid, parallel_tensor_global_guid, node_global_guid;
+ size_t current_transformer_layer_id;
+ // positional embedding start offset
+ int position_offset;
FFConfig config;
FFIterationConfig iter_config;
Optimizer *optimizer;
@@ -868,6 +1186,12 @@ class FFModel {
  std::vector<Layer *> layers;
  std::vector<Op *> operators;
  std::vector<Parameter> parameters;
+ // PEFT related
+ std::unordered_map base_layer_to_peft_layer;
+ std::unordered_map> peft_layer_to_peft_id;
+ std::unordered_map peft_configs;
+ // std::vector peft_operators;
+
FFHandler handlers[MAX_NUM_WORKERS];
Legion::Future current_metrics;
// Cached operators: key: operator hash, value: operator pointer
@@ -897,6 +1221,9 @@ class FFModel {
ElementUnary *>,
std::unordered_map,
Embedding *>,
+ std::unordered_map<
+ std::pair, ExpertsParams>,
+ Experts *>,
std::unordered_map, Flat *>,
std::unordered_map<
std::pair,
@@ -908,8 +1235,25 @@ class FFModel {
Group_by *>,
std::unordered_map,
LayerNorm *>,
+ std::unordered_map,
+ ResidualLayerNormParams>,
+ ResidualLayerNorm *>,
+ std::unordered_map<
+ std::pair,
+ AddBiasResidualLayerNormParams>,
+ AddBiasResidualLayerNorm *>,
+ std::unordered_map<
+ std::pair,
+ SigmoidSiluMultiParams>,
+ SigmoidSiluMulti *>,
std::unordered_map,
Linear *>,
+ std::unordered_map<
+ std::pair,
+ LoraLinearParams>,
+ LoraLinear *>,
std::unordered_map,
Pool2D *>,
std::unordered_map,
MultiHeadAttentionParams>,
MultiHeadAttention *>,
+ std::unordered_map<
+ std::pair,
+ IncMultiHeadSelfAttention *>,
+ std::unordered_map,
+ BeamTopK *>,
+ std::unordered_map,
+ Sampling *>,
+ std::unordered_map,
+ ArgMax *>,
+ std::unordered_map<
+ std::pair,
+ SpecIncMultiHeadSelfAttention *>,
+ std::unordered_map<
+ std::pair,
+ TreeIncMultiHeadSelfAttention *>,
std::unordered_map,
Reduce *>,
std::unordered_map,
@@ -925,8 +1284,16 @@ class FFModel {
std::unordered_map,
Softmax *>,
std::unordered_map, TopK *>,
+ std::unordered_map,
+ ArgTopK *>,
std::unordered_map,
Transpose *>,
+ std::unordered_map,
+ RMSNorm *>,
+ std::unordered_map<
+ std::pair,
+ ResidualRMSNormParams>,
+ ResidualRMSNorm *>,
std::unordered_map,
Repartition *>,
std::unordered_map,
@@ -937,12 +1304,18 @@ class FFModel {
AllReduce *>,
std::unordered_map,
Combine *>,
+ std::unordered_map,
+ AllReduce *>,
+ std::unordered_map,
+ ParallelIdentity *>,
std::unordered_map,
FusedParallelOp *>>
cached_ops;
std::unordered_map cached_noop_ops;
std::unordered_map cached_input_ops;
std::vector all_valid_views;
+ int model_id; // unique incremental id assigned to each model. Used in the
+ // inference_debugging mode.
#ifdef FF_USE_NCCL
std::unordered_map view_hash_to_nccl_comms;
#endif
@@ -971,6 +1344,9 @@ class FFModel {
ElementUnary *
unary(OperatorType op, char const *name = NULL, float scalar = 0.0);
PCG::Node new_node(Op *);
+ static int model_counter; // number of instantiated FFModel objects. Used to
+ // assign a unique incremental id to each model.
+ // Used in the inference_debugging mode.
};
class UtilityTasks {
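
The FFModel changes above introduce the serving-side entry points: compile_inference(), the incremental and speculative attention layers, add_lora_layer() for PEFT adapters, and a batched generate(). The sketch below shows how a driver program might wire them together; it is illustrative only, and the Request/GenerationResult contents plus the default-constructed LoraLinearConfig are assumptions rather than APIs confirmed by this diff.

```cpp
// Hypothetical driver sketch for the inference APIs declared in model.h above.
// Only the FFModel member functions come from this diff; the request/result
// types and their construction are assumptions made for illustration.
#include "flexflow/model.h"

using namespace FlexFlow;

void serve_once(FFModel &ff) {
  // Optionally register a LoRA adapter on the base model (PEFT API).
  LoraLinearConfig peft_config; // assumed default-constructible for the sketch
  PEFTModelID *peft_model_id = ff.add_lora_layer(peft_config);
  (void)peft_model_id;

  // Build the inference version of the graph (no loss/metrics needed).
  ff.compile_inference();

  // Submit a batch of requests and block on the generated results.
  std::vector<Request> requests(1); // assumed request type, filled elsewhere
  std::vector<GenerationResult> results = ff.generate(requests);
  (void)results;
}
```
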
diff --git a/include/flexflow/op_meta.h b/include/flexflow/op_meta.h
index 512844db92..d31c12b16c 100644
--- a/include/flexflow/op_meta.h
+++ b/include/flexflow/op_meta.h
@@ -9,13 +9,19 @@ class Op;
class OpMeta {
public:
- OpMeta(FFHandler _handle);
+ // OpMeta(FFHandler _handle);
OpMeta(FFHandler _handle, Op const *op);
public:
FFHandler handle;
bool profiling; // Measure the run time of the task
- bool trainableInputs[MAX_NUM_INPUTS];
+ bool inference_debugging;
+ int decoding_step;
+ int bwd_step;
+ char op_name[MAX_OPNAME];
+ LayerID layer_guid;
+ bool trainable_inputs[MAX_NUM_INPUTS];
+ bool reset_input_grads[MAX_NUM_INPUTS];
DataType input_type[MAX_NUM_INPUTS];
DataType weight_type[MAX_NUM_WEIGHTS];
DataType output_type[MAX_NUM_OUTPUTS];
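
op_meta.h now carries per-operator debugging state (decoding_step, bwd_step, op_name, layer_guid) alongside the renamed trainable_inputs/reset_input_grads arrays. A plausible sketch of how the remaining constructor could populate these fields from the owning Op is shown below; the actual definition lives in the runtime sources and may differ.

```cpp
// Sketch only: one way the OpMeta(FFHandler, Op const *) constructor could
// initialize the new bookkeeping fields. Not taken from this diff.
#include <cstring>

FlexFlow::OpMeta::OpMeta(FFHandler _handle, Op const *op)
    : handle(_handle), profiling(op->profiling),
      inference_debugging(op->inference_debugging) {
  // Per-operator step counters used to name per-step tensor dumps.
  decoding_step = 0;
  bwd_step = 0;
  // Keep the layer identity and operator name so dumps can be grouped by layer.
  layer_guid = op->layer_guid;
  std::strncpy(op_name, op->name, MAX_OPNAME);
  for (int i = 0; i < op->numInputs; i++) {
    trainable_inputs[i] = op->trainable_inputs[i];
    reset_input_grads[i] = op->reset_input_grads[i];
  }
}
```
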
diff --git a/include/flexflow/operator.h b/include/flexflow/operator.h
index 3fd84ce55b..1a5af67b36 100644
--- a/include/flexflow/operator.h
+++ b/include/flexflow/operator.h
@@ -1,15 +1,27 @@
#ifndef _OPERATOR_H
#define _OPERATOR_H
+#include "flexflow/accessor.h"
+#include "flexflow/batch_config.h"
#include "flexflow/fftype.h"
#include "flexflow/machine_view.h"
#include "flexflow/parallel_tensor.h"
#include "flexflow/utils/dot/record_formatter.h"
+#include <filesystem>
 #include <vector>
+namespace fs = std::filesystem;
+
+#include <sys/stat.h>
+#include <sys/types.h>
+#if defined(FF_USE_CUDA) || defined(FF_USE_HIP_CUDA)
+#include "flexflow/utils/cuda_helper.h"
+#else
+#include "flexflow/utils/hip_helper.h"
+#endif
namespace FlexFlow {
-extern LegionRuntime::Logger::Category log_measure;
+extern Legion::Logger log_measure;
class OpMeta;
class Simulator;
@@ -19,11 +31,38 @@ enum class MappingRecordType { INPUT_OUTPUT, INPUT_WEIGHT };
enum class MappingOperation { PARTITION, REPLICATE };
+fs::path get_dst_folder(std::string const &subdir,
+ int step_idx = 0,
+ int shard_idx = 0,
+ bool before_kernel = false);
+
+/** @brief A class to keep track of a dimension relation between two tensors
+ * used by an operator.
+ *
+ * Dimension relations are one-to-one mappings between the dimensions of the
+ * input, weights, and output tensors of an operator. Introduced in the Unity
+ * paper, dimension relations allow FlexFlow to keep track of an operator's
+ * parallelization plans as part of the Parallel Computation Graph (PCG).
+ *
+ * Each ParallelDimMappingRecord only keeps track of a single dimension
+ * relation.
+ *
+ * ParallelDimMappingRecord objects must be initialized with a
+ * MappingRecordType, which can be INPUT_OUTPUT, if the ParallelDimMappingRecord
+ * is tracking a dimension relation between the input and the output tensor, or
+ * INPUT_WEIGHT, if the ParallelDimMappingRecord is tracking a dimension
+ * relation between the input tensor and the weights tensor.
+ *
+ */
class ParallelDimMappingRecord {
private:
ParallelDimMappingRecord(MappingRecordType);
public:
+ /**
+ * @brief We disable this constructor because ParallelDimMappingRecord objects
+ * must specify the MappingRecordType upon creation.
+ */
ParallelDimMappingRecord() = delete;
static ParallelDimMappingRecord input_output_record(
@@ -160,6 +199,7 @@ class Op {
const ParallelTensor input4 = NULL);
Op(int guid,
bool profiling,
+ bool inference_debugging,
OperatorType otype,
DataType dtype,
char const *name,
@@ -185,9 +225,182 @@ class Op {
virtual bool get_weight_parameter(TNParameter, DIMParameter, int *) const;
// Pure virtual functions that must be implemented
virtual void init(FFModel const &) = 0;
+ virtual void init_inference(FFModel const &,
+                              std::vector<ParallelTensor> const &,
+                              std::vector<ParallelTensor> const &,
+ MachineView const *mv = nullptr) {
+ assert(false);
+ };
virtual void forward(FFModel const &) = 0;
virtual void backward(FFModel const &) = 0;
+ // Pure virtual functions for inference
+ virtual Legion::FutureMap inference(FFModel const &,
+ BatchConfigFuture const &,
+                                     std::vector<ParallelTensor> const &,
+                                     std::vector<ParallelTensor> const &,
+ MachineView const *mv = nullptr) {
+ assert(false);
+ Legion::FutureMap empty_map;
+ return empty_map;
+ };
+  virtual Legion::FutureMap peft_bwd(FFModel const &,
+                                     BatchConfigFuture const &,
+                                     std::vector<ParallelTensor> const &,
+                                     std::vector<ParallelTensor> const &,
+                                     MachineView const *mv = nullptr) {
+    assert(false);
+    Legion::FutureMap empty_map;
+    return empty_map;
+  };
virtual void print_layer(FFModel const &model) = 0;
+  template <typename OpMetaType>
+ static std::string get_op_name_without_uid(OpMetaType *m) {
+ std::string op_name_without_uid = std::string(m->op_name);
+ size_t last_underscore = op_name_without_uid.length();
+ for (int i = op_name_without_uid.length() - 1; i > 0; i--) {
+ if (!(std::isdigit(m->op_name[i]) || m->op_name[i] == '_')) {
+ break;
+ } else if (m->op_name[i] == '_') {
+ last_underscore = i;
+ }
+ }
+ if (last_underscore < op_name_without_uid.length()) {
+ op_name_without_uid.erase(last_underscore);
+ }
+ return op_name_without_uid;
+ }
+  template <typename OpMetaType>
+ static void save_inference_tensors_to_file(
+ OpMetaType *m,
+ int shard_id,
+ BatchConfig const *bc,
+      std::vector<GenericTensorAccessorR> input_tensors,
+      std::vector<GenericTensorAccessorR> weight_tensors,
+      std::vector<GenericTensorAccessorR> output_tensors,
+ bool fwd_pass = true,
+ bool before_kernel = false) {
+ // get operator name and print it
+ std::string op_name_without_uid = get_op_name_without_uid(m);
+ std::cout << (fwd_pass ? "INF " : "BWD ") << op_name_without_uid
+ << std::endl;
+ // build the path to save the tensor
+ fs::path dst_filepath;
+ if (fwd_pass) {
+ dst_filepath =
+ get_dst_folder("fwd", m->decoding_step, shard_id, before_kernel);
+ } else {
+ dst_filepath =
+ get_dst_folder("bwd", m->bwd_step, shard_id, before_kernel);
+ }
+ if (m->layer_guid.model_id > 0) {
+ assert(false && "Model ID > 0 not supported yet");
+ }
+ std::string layername = "layers." +
+ std::to_string(m->layer_guid.transformer_layer_id) +
+ "." + op_name_without_uid;
+ dst_filepath /= layername;
+
+ // save batch config, if passed
+ if (bc != nullptr) {
+ bc->save_to_file(dst_filepath.string() + ".batch_config");
+ }
+
+ // save all inputs
+ for (int i = 0; i < input_tensors.size(); i++) {
+ std::string filename = dst_filepath.string() + ".input_";
+ if (fwd_pass) {
+ filename += std::to_string(i);
+ } else {
+ filename += "gradient_" + std::to_string(i);
+ }
+ if (input_tensors[i].data_type == DT_FLOAT) {
+ save_tensor(input_tensors[i].get_float_ptr(),
+ input_tensors[i].domain.get_volume(),
+ filename.c_str());
+ } else if (input_tensors[i].data_type == DT_HALF) {
+ save_tensor(input_tensors[i].get_half_ptr(),
+ input_tensors[i].domain.get_volume(),
+ filename.c_str());
+ } else if (input_tensors[i].data_type == DT_INT32) {
+ save_tensor(input_tensors[i].get_int32_ptr(),
+ input_tensors[i].domain.get_volume(),
+ filename.c_str());
+ } else if (input_tensors[i].data_type == DT_INT64) {
+ save_tensor(input_tensors[i].get_int64_ptr(),
+ input_tensors[i].domain.get_volume(),
+ filename.c_str());
+ } else {
+ assert(false && "Tensor data type not supported");
+ }
+ }
+
+ // only dump the weights in the forward pass, at the first step
+ // note that we do not save the weight gradients, since we only support
+ // finetuning LoRA weights, which are not FF tensors.
+ if (fwd_pass && m->decoding_step == 0) {
+ fs::path dst_filepath_weights =
+ get_dst_folder("weights", m->decoding_step, shard_id, before_kernel) /
+ layername;
+ for (int i = 0; i < weight_tensors.size(); i++) {
+ std::string filename =
+ dst_filepath_weights.string() + ".weight_" + std::to_string(i);
+ if (weight_tensors[i].data_type == DT_FLOAT) {
+ save_tensor(weight_tensors[i].get_float_ptr(),
+ weight_tensors[i].domain.get_volume(),
+ filename.c_str());
+ } else if (weight_tensors[i].data_type == DT_HALF) {
+ save_tensor(weight_tensors[i].get_half_ptr(),
+ weight_tensors[i].domain.get_volume(),
+ filename.c_str());
+ } else if (weight_tensors[i].data_type == DT_INT32) {
+ save_tensor(weight_tensors[i].get_int32_ptr(),
+ weight_tensors[i].domain.get_volume(),
+ filename.c_str());
+ } else if (weight_tensors[i].data_type == DT_INT64) {
+ save_tensor(weight_tensors[i].get_int64_ptr(),
+ weight_tensors[i].domain.get_volume(),
+ filename.c_str());
+ } else {
+ assert(false && "Tensor data type not supported");
+ }
+ }
+ }
+
+ // save all outputs
+ for (int i = 0; i < output_tensors.size(); i++) {
+ std::string filename = dst_filepath.string() + ".output_";
+ if (fwd_pass) {
+ filename += std::to_string(i);
+ } else {
+ filename += "gradient_" + std::to_string(i);
+ }
+ if (output_tensors[i].data_type == DT_FLOAT) {
+ save_tensor(output_tensors[i].get_float_ptr(),
+ output_tensors[i].domain.get_volume(),
+ filename.c_str());
+ } else if (output_tensors[i].data_type == DT_HALF) {
+ save_tensor(output_tensors[i].get_half_ptr(),
+ output_tensors[i].domain.get_volume(),
+ filename.c_str());
+ } else if (output_tensors[i].data_type == DT_INT32) {
+ save_tensor(output_tensors[i].get_int32_ptr(),
+ output_tensors[i].domain.get_volume(),
+ filename.c_str());
+ } else if (output_tensors[i].data_type == DT_INT64) {
+ save_tensor(output_tensors[i].get_int64_ptr(),
+ output_tensors[i].domain.get_volume(),
+ filename.c_str());
+ } else {
+ assert(false && "Tensor data type not supported");
+ }
+ }
+ // increase count of decoding steps
+ if (!before_kernel) {
+ if (fwd_pass) {
+ m->decoding_step++;
+ } else {
+ m->bwd_step++;
+ }
+ }
+ }
virtual bool measure_operator_cost(Simulator *sim,
MachineView const &mv,
CostMetrics &cost_metrics) const = 0;
@@ -239,15 +452,29 @@ class Op {
                                 std::vector<Legion::PhysicalRegion> const &regions,
Legion::Context ctx,
Legion::Runtime *runtime);
+ static void
+ finish_nccl_comms_task(Legion::Task const *task,
+                         std::vector<Legion::PhysicalRegion> const &regions,
+ Legion::Context ctx,
+ Legion::Runtime *runtime);
#endif
protected:
void set_argumentmap_for_init(FFModel const &ff, Legion::ArgumentMap &argmap);
+ void set_argumentmap_for_init_inference(FFModel const &ff,
+ Legion::ArgumentMap &argmap,
+ ParallelTensor const output0);
void set_argumentmap_for_forward(FFModel const &ff,
Legion::ArgumentMap &argmap);
+ void set_argumentmap_for_inference(FFModel const &ff,
+ Legion::ArgumentMap &argmap,
+ ParallelTensor const output0);
void set_argumentmap_for_backward(FFModel const &ff,
Legion::ArgumentMap &argmap);
void set_opmeta_from_futuremap(FFModel const &ff,
Legion::FutureMap const &fm);
+ void set_opmeta_from_futuremap_inference(FFModel const &ff,
+ Legion::FutureMap const &fm,
+ ParallelTensor const output0);
void solve_parallel_dim_mappings(
std::vector const &inputs,
std::vector const &weights,
@@ -265,10 +492,14 @@ class Op {
ParallelTensor outputs[MAX_NUM_OUTPUTS];
ParallelTensor inputs[MAX_NUM_INPUTS];
ParallelParameter weights[MAX_NUM_WEIGHTS];
- bool trainableInputs[MAX_NUM_INPUTS];
+ bool trainable_inputs[MAX_NUM_INPUTS];
+ bool reset_input_grads[MAX_NUM_INPUTS];
OpMeta *meta[MAX_NUM_WORKERS];
+ std::map inference_meta;
int numInputs, numWeights, numOutputs;
bool profiling;
+ bool inference_debugging;
+ bool add_bias_only_once;
#ifdef FF_USE_NCCL
ncclUniqueId ncclId;
#endif
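
save_inference_tensors_to_file() above is the hook individual operators call from their inference (or peft_bwd) tasks when inference_debugging is enabled; it writes one file per input, weight, and output tensor under the folder returned by get_dst_folder(). A simplified calling sketch follows; SomeOpMeta and the accessor plumbing are placeholders, and the accessor element type is assumed to be GenericTensorAccessorR since the template arguments were elided in this hunk.

```cpp
// Placeholder sketch of the per-operator debugging hook. SomeOpMeta stands in
// for a concrete OpMeta subclass; only the gating pattern is the point here.
void run_inference_step(SomeOpMeta *m,
                        int shard_id,
                        BatchConfig const *bc,
                        GenericTensorAccessorR const &input,
                        GenericTensorAccessorR const &weight,
                        GenericTensorAccessorR const &output) {
  // ... launch the operator's kernel here ...
  if (m->inference_debugging) {
    // Dumps land in get_dst_folder("fwd", m->decoding_step, shard_id) under a
    // "layers.<transformer_layer_id>.<op_name>" prefix, then the step counter
    // is advanced inside the helper.
    Op::save_inference_tensors_to_file(
        m, shard_id, bc, {input}, {weight}, {output});
  }
}
```
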
diff --git a/include/flexflow/operator_params.h b/include/flexflow/operator_params.h
index 84653ac9ca..673f78ad46 100644
--- a/include/flexflow/operator_params.h
+++ b/include/flexflow/operator_params.h
@@ -1,32 +1,47 @@
#ifndef _OPERATOR_PARAMS_H
#define _OPERATOR_PARAMS_H
+#include "flexflow/ops/add_bias_residual_layer_norm_params.h"
#include "flexflow/ops/aggregate_params.h"
#include "flexflow/ops/aggregate_spec_params.h"
+#include "flexflow/ops/arg_topk_params.h"
+#include "flexflow/ops/argmax_params.h"
#include "flexflow/ops/attention_params.h"
#include "flexflow/ops/batch_matmul_params.h"
+#include "flexflow/ops/beam_topk_params.h"
#include "flexflow/ops/cast_params.h"
#include "flexflow/ops/concat_params.h"
-#include "flexflow/parallel_ops/allreduce_params.h"
#include "flexflow/ops/conv_2d_params.h"
#include "flexflow/ops/dropout_params.h"
#include "flexflow/ops/element_binary_params.h"
#include "flexflow/ops/element_unary_params.h"
#include "flexflow/ops/embedding_params.h"
+#include "flexflow/ops/experts_params.h"
#include "flexflow/ops/flat_params.h"
#include "flexflow/ops/gather_params.h"
#include "flexflow/ops/groupby_params.h"
+#include "flexflow/ops/inc_multihead_self_attention_params.h"
#include "flexflow/ops/layer_norm_params.h"
#include "flexflow/ops/linear_params.h"
+#include "flexflow/ops/lora_linear_params.h"
#include "flexflow/ops/pool_2d_params.h"
#include "flexflow/ops/reduce_params.h"
#include "flexflow/ops/reshape_params.h"
+#include "flexflow/ops/residual_layer_norm_params.h"
+#include "flexflow/ops/residual_rms_norm_params.h"
+#include "flexflow/ops/rms_norm_params.h"
+#include "flexflow/ops/sampling_params.h"
+#include "flexflow/ops/sigmoid_silu_multi_params.h"
#include "flexflow/ops/softmax_params.h"
+#include "flexflow/ops/spec_inc_multihead_self_attention_params.h"
#include "flexflow/ops/split_params.h"
#include "flexflow/ops/topk_params.h"
#include "flexflow/ops/transpose_params.h"
+#include "flexflow/ops/tree_inc_multihead_self_attention_params.h"
+#include "flexflow/parallel_ops/allreduce_params.h"
#include "flexflow/parallel_ops/combine_params.h"
#include "flexflow/parallel_ops/fused_parallel_op_params.h"
+#include "flexflow/parallel_ops/parallel_identity_params.h"
#include "flexflow/parallel_ops/partition_params.h"
#include "flexflow/parallel_ops/reduction_params.h"
#include "flexflow/parallel_ops/replicate_params.h"
@@ -50,13 +65,26 @@ using OperatorParameters = mp::variant;
 tl::optional<OperatorParameters> get_op_parameters(Op const *op);
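
Every operator header pulled in above contributes a *Params struct to the OperatorParameters variant; FFModel::get_or_create_node() (earlier in this diff) keys its cached_ops lookup on the pair of input shapes and params, calling params.is_valid() before creating a node. The sketch below shows the general shape such a struct takes; the field names are placeholders, not a real FlexFlow struct.

```cpp
// Illustrative placeholder for the Params pattern used by get_or_create_node().
struct ExampleOpParams {
  LayerID layer_guid;
  int out_dim;
  char name[MAX_OPNAME];
  // Checked against the input shape(s) before a PCG node is created or reused.
  bool is_valid(ParallelTensorShape const &input) const {
    return input.is_valid() && out_dim > 0;
  }
};
// Params types must also be equality-comparable and hashable so that
// (input shape, params) can serve as the cached_ops key.
bool operator==(ExampleOpParams const &lhs, ExampleOpParams const &rhs);
```
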
diff --git a/include/flexflow/ops/add_bias_residual_layer_norm.h b/include/flexflow/ops/add_bias_residual_layer_norm.h
new file mode 100644
index 0000000000..9510ac0f28
--- /dev/null
+++ b/include/flexflow/ops/add_bias_residual_layer_norm.h
@@ -0,0 +1,165 @@
+#pragma once
+
+#include "flexflow/inference.h"
+#include "flexflow/model.h"
+#include "flexflow/utils/memory_allocator.h"
+namespace FlexFlow {
+
+class AddBiasResidualLayerNormMeta;
+
+class AddBiasResidualLayerNorm : public Op {
+public:
+ using Params = AddBiasResidualLayerNormParams;
+  using Input = std::pair<ParallelTensor, ParallelTensor>;
+ AddBiasResidualLayerNorm(FFModel &model,
+ Params const ¶ms,
+ Input const &inputs,
+ char const *name = nullptr,
+ bool allocate_weights = false);
+ AddBiasResidualLayerNorm(FFModel &model,
+ LayerID const &_layer_guid,
+ const ParallelTensor _input,
+ const ParallelTensor _residual,
+                           std::vector<int> const &axes,
+ bool _elementwise_affine,
+ bool _use_bias,
+ float _eps,
+ bool _inplace_residual,
+ bool allocate_weights,
+ char const *name);
+ void map_output_tensors(FFModel &ff) override;
+ void init(FFModel const &) override;
+ void init_inference(FFModel const &,
+                      std::vector<ParallelTensor> const &,
+                      std::vector<ParallelTensor> const &,
+ MachineView const *mv = nullptr) override;
+ void forward(FFModel const &) override;
+ void backward(FFModel const &) override;
+ Legion::FutureMap inference(FFModel const &,
+ BatchConfigFuture const &,
+                              std::vector<ParallelTensor> const &,
+                              std::vector<ParallelTensor> const &,
+ MachineView const *mv = nullptr) override;
+ Legion::FutureMap peft_bwd(FFModel const &,
+ BatchConfigFuture const &,
+                             std::vector<ParallelTensor> const &,
+                             std::vector<ParallelTensor> const &,
+ MachineView const *mv = nullptr) override;
+ void print_layer(FFModel const &model) override {
+ assert(0);
+ }
+ static Op *
+ create_operator_from_layer(FFModel &model,
+ Layer const *layer,
+                             std::vector<ParallelTensor> const &inputs);
+ void serialize(Legion::Serializer &) const override;
+ static PCG::Node deserialize(FFModel &ff,
+ Legion::Deserializer &d,
+ ParallelTensor inputs[],
+ int num_inputs);
+
+ AddBiasResidualLayerNormParams get_params() const;
+
+ static OpMeta *init_task(Legion::Task const *task,
+                            std::vector<Legion::PhysicalRegion> const &regions,
+ Legion::Context ctx,
+ Legion::Runtime *runtime);
+ static void inference_task(Legion::Task const *task,
+                             std::vector<Legion::PhysicalRegion> const &regions,
+ Legion::Context ctx,
+ Legion::Runtime *runtime);
+ static void backward_task(Legion::Task const *task,
+                            std::vector<Legion::PhysicalRegion> const &regions,
+ Legion::Context ctx,
+ Legion::Runtime *runtime);
+ static void peft_bwd_task(Legion::Task const *task,
+                            std::vector<Legion::PhysicalRegion> const &regions,
+ Legion::Context ctx,
+ Legion::Runtime *runtime);
+ bool measure_operator_cost(Simulator *sim,
+ MachineView const &pc,
+ CostMetrics &cost_metrics) const override;
+  template <typename T>
+ static void inference_kernel(AddBiasResidualLayerNormMeta const *m,
+ int attn_bias_dim,
+ int residual_volume,
+ T const *input_ptr,
+ T const *attn_bias_ptr,
+ T const *residual_ptr,
+ T *added_output_ptr,
+ T *output_ptr,
+ T const *gamma_ptr,
+ T const *beta_ptr,
+ ffStream_t stream);
+ static void inference_kernel_wrapper(AddBiasResidualLayerNormMeta *m,
+ BatchConfig const *bc,
+ GenericTensorAccessorR const &input,
+ GenericTensorAccessorR const &attn_bias,
+ GenericTensorAccessorR const &residual,
+ GenericTensorAccessorW &added_output,
+ GenericTensorAccessorW &output,
+ GenericTensorAccessorR const &gamma,
+ GenericTensorAccessorR const &beta);
+  template <typename T>
+ static void backward_kernel(AddBiasResidualLayerNormMeta const *m,
+ T const *output_grad_ptr,
+ T const *added_output_ptr,
+ T *input_grad_ptr,
+ T *residual_grad_ptr,
+ T *attn_bias_grad_ptr,
+ T const *gamma_ptr,
+ T *gamma_grad_ptr,
+ T *beta_grad_ptr,
+ ffStream_t stream);
+ static void
+ backward_kernel_wrapper(AddBiasResidualLayerNormMeta const *m,
+ GenericTensorAccessorR const &output_grad,
+ GenericTensorAccessorR &added_output,
+ GenericTensorAccessorW &input_grad,
+ GenericTensorAccessorW const &residual_grad,
+ GenericTensorAccessorW const &attn_bias_grad,
+ GenericTensorAccessorR const &gamma,
+ GenericTensorAccessorW const &gamma_grad,
+ GenericTensorAccessorW const &beta_grad);
+  template <typename T>
+ static void peft_bwd_kernel(AddBiasResidualLayerNormMeta const *m,
+ T const *output_grad_ptr,
+ T *input_grad_ptr,
+ T *residual_grad_ptr,
+ T const *gamma_ptr,
+ ffStream_t stream);
+ static void
+ peft_bwd_kernel_wrapper(AddBiasResidualLayerNormMeta const *m,
+ GenericTensorAccessorR const &output_grad,
+ GenericTensorAccessorW &input_grad,
+ GenericTensorAccessorW const &residual_grad,
+ GenericTensorAccessorR const &gamma);
+
+public:
+ bool elementwise_affine, use_bias;
+ int64_t effective_batch_size, effective_num_elements;
+ float eps;
+ bool inplace_residual;
+  std::vector<int> axes;
+};
+
+class AddBiasResidualLayerNormMeta : public OpMeta {
+public:
+ AddBiasResidualLayerNormMeta(FFHandler handle,
+ AddBiasResidualLayerNorm const *ln,
+ MemoryAllocator &gpu_mem_allocator);
+ ~AddBiasResidualLayerNormMeta(void);
+
+public:
+ bool elementwise_affine, use_bias;
+ int64_t effective_batch_size, effective_num_elements;
+ float eps;
+ bool inplace_residual;
+ void *mean_ptr, *rstd_ptr, *ds_ptr, *db_ptr, *scale_ptr, *bias_ptr;
+ Realm::RegionInstance reserveInst;
+ // PEFT related fields
+ void *input_activation;
+ size_t allocated_peft_buffer_size = 0;
+};
+
+}; // namespace FlexFlow
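
For reference, the operator declared above fuses three steps: added_output = input + attn_bias + residual, followed by a standard layer norm of added_output with the optional gamma/beta affine terms. The scalar sketch below spells out that math as inferred from the kernel signatures; it is not the GPU implementation.

```cpp
#include <cmath>
#include <vector>

// Scalar reference for the fused add-bias + residual + layer-norm step, assuming
// one row of `dim` elements; the real kernels operate on batched GPU tensors.
void add_bias_residual_layer_norm_ref(std::vector<float> const &input,
                                      std::vector<float> const &attn_bias,
                                      std::vector<float> const &residual,
                                      std::vector<float> const &gamma,
                                      std::vector<float> const &beta,
                                      float eps,
                                      std::vector<float> &added_output,
                                      std::vector<float> &output) {
  size_t dim = input.size();
  added_output.resize(dim);
  output.resize(dim);
  float mean = 0.0f, var = 0.0f;
  for (size_t i = 0; i < dim; i++) {
    added_output[i] = input[i] + attn_bias[i] + residual[i];
    mean += added_output[i];
  }
  mean /= dim;
  for (size_t i = 0; i < dim; i++) {
    float d = added_output[i] - mean;
    var += d * d;
  }
  var /= dim;
  float rstd = 1.0f / std::sqrt(var + eps);
  for (size_t i = 0; i < dim; i++) {
    output[i] = (added_output[i] - mean) * rstd * gamma[i] + beta[i];
  }
}
```
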
diff --git a/include/flexflow/ops/add_bias_residual_layer_norm_params.h b/include/flexflow/ops/add_bias_residual_layer_norm_params.h
new file mode 100644
index 0000000000..840f521b01
--- /dev/null
+++ b/include/flexflow/ops/add_bias_residual_layer_norm_params.h
@@ -0,0 +1,31 @@
+#pragma once
+
+#include "flexflow/ffconst.h"
+#include "flexflow/fftype.h"
+#include "flexflow/parallel_tensor.h"
+
+namespace FlexFlow {
+
+struct AddBiasResidualLayerNormParams {
+ LayerID layer_guid;
+  std::vector<int> axes;
+ bool elementwise_affine;
+ float eps;
+ bool use_bias;
+ bool inplace_residual;
+ char name[MAX_OPNAME];
+ bool is_valid(
+ std::pair