diff --git a/.github/README.md b/.github/README.md index a8846260c8..528df18faf 100644 --- a/.github/README.md +++ b/.github/README.md @@ -72,7 +72,7 @@ ff.init( Second, we specify the LLM to serve and the SSM(s) used to accelerate LLM serving. The list of supported LLMs and SSMs is available at [supported models](#supported-llms-and-ssms). ```python # Specify the LLM -llm = ff.LLM("decapoda-research/llama-7b-hf") +llm = ff.LLM("meta-llama/Llama-2-7b-hf") # Specify a list of SSMs (just one in this case) ssms=[] @@ -116,7 +116,7 @@ ff.init( ) # Create the FlexFlow LLM -llm = ff.LLM("decapoda-research/llama-7b-hf") +llm = ff.LLM("meta-llama/Llama-2-7b-hf") # Create the sampling configs generation_config = ff.GenerationConfig( @@ -152,7 +152,7 @@ A C++ example is available at [this folder](../inference/spec_infer/). After bui * `-ll:gpu`: number of GPU processors to use on each node for serving an LLM (default: 0) * `-ll:fsize`: size of device memory on each GPU in MB * `-ll:zsize`: size of zero-copy memory (pinned DRAM with direct GPU access) in MB. FlexFlow Serve keeps a replica of the LLM parameters on zero-copy memory, and therefore requires that the zero-copy memory is sufficient for storing the LLM parameters. -* `-llm-model`: the LLM model ID from HuggingFace (e.g. "decapoda-research/llama-7b-hf") +* `-llm-model`: the LLM model ID from HuggingFace (e.g. "meta-llama/Llama-2-7b-hf") * `-ssm-model`: the SSM model ID from HuggingFace (e.g. "JackFram/llama-160m"). You can use multiple `-ssm-model`s in the command line to launch multiple SSMs. * `-cache-folder`: the folder * `-data-parallelism-degree`, `-tensor-parallelism-degree` and `-pipeline-parallelism-degree`: parallelization degrees in the data, tensor, and pipeline dimensions. Their product must equal the number of GPUs available on the machine. When any of the three parallelism degree arguments is omitted, a default value of 1 will be used. @@ -162,7 +162,7 @@ A C++ example is available at [this folder](../inference/spec_infer/). After bui For example, you can use the following command line to serve a LLaMA-7B or LLaMA-13B model on 4 GPUs and use two collectively boost-tuned LLaMA-68M models for speculative inference. 
```bash -./inference/spec_infer/spec_infer -ll:gpu 4 -ll:fsize 14000 -ll:zsize 30000 -llm-model decapoda-research/llama-7b-hf -ssm-model JackFram/llama-68m -prompt /path/to/prompt.json -tensor-parallelism-degree 4 --fusion +./inference/spec_infer/spec_infer -ll:gpu 4 -ll:fsize 14000 -ll:zsize 30000 -llm-model meta-llama/Llama-2-7b-hf -ssm-model JackFram/llama-68m -prompt /path/to/prompt.json -tensor-parallelism-degree 4 --fusion ``` @@ -193,7 +193,7 @@ Below is a list of models that we have explicitly tested and for which a SSM may | Model | Model id on HuggingFace | Boost-tuned SSMs | | :---- | :---- | :---- | -| LLaMA-7B | decapoda-research/llama-7b-hf | [LLaMA-68M](https://huggingface.co/JackFram/llama-68m) , [LLaMA-160M](https://huggingface.co/JackFram/llama-160m) | +| LLaMA-7B | meta-llama/Llama-2-7b-hf | [LLaMA-68M](https://huggingface.co/JackFram/llama-68m) , [LLaMA-160M](https://huggingface.co/JackFram/llama-160m) | | LLaMA-13B | decapoda-research/llama-13b-hf | [LLaMA-68M](https://huggingface.co/JackFram/llama-68m) , [LLaMA-160M](https://huggingface.co/JackFram/llama-160m) | | LLaMA-30B | decapoda-research/llama-30b-hf | [LLaMA-68M](https://huggingface.co/JackFram/llama-68m) , [LLaMA-160M](https://huggingface.co/JackFram/llama-160m) | | LLaMA-65B | decapoda-research/llama-65b-hf | [LLaMA-68M](https://huggingface.co/JackFram/llama-68m) , [LLaMA-160M](https://huggingface.co/JackFram/llama-160m) | diff --git a/.github/workflows/gpu-ci-skip.yml b/.github/workflows/gpu-ci-skip.yml index 6a18e56bd1..f4cb950931 100644 --- a/.github/workflows/gpu-ci-skip.yml +++ b/.github/workflows/gpu-ci-skip.yml @@ -15,7 +15,7 @@ on: - ".github/workflows/gpu-ci.yml" - "tests/cpp_gpu_tests.sh" - "tests/inference_tests.sh" - - "tests/multi_gpu_tests.sh" + - "tests/training_tests.sh" - "tests/python_interface_test.sh" workflow_dispatch: @@ -44,8 +44,8 @@ jobs: steps: - run: 'echo "No gpu-ci required"' - gpu-ci-flexflow: - name: Single Machine, Multiple GPUs Tests + training-tests: + name: Training Tests runs-on: ubuntu-20.04 # if: ${{ github.event_name != 'pull_request' || github.base_ref != 'inference' }} needs: inference-tests diff --git a/.github/workflows/gpu-ci.yml b/.github/workflows/gpu-ci.yml index 4a43a3dee7..3901d6b5f7 100644 --- a/.github/workflows/gpu-ci.yml +++ b/.github/workflows/gpu-ci.yml @@ -15,7 +15,7 @@ on: - ".github/workflows/gpu-ci.yml" - "tests/cpp_gpu_tests.sh" - "tests/inference_tests.sh" - - "tests/multi_gpu_tests.sh" + - "tests/training_tests.sh" - "tests/python_interface_test.sh" push: branches: @@ -34,7 +34,7 @@ on: - ".github/workflows/gpu-ci.yml" - "tests/cpp_gpu_tests.sh" - "tests/inference_tests.sh" - - "tests/multi_gpu_tests.sh" + - "tests/training_tests.sh" - "tests/python_interface_test.sh" workflow_dispatch: @@ -141,7 +141,8 @@ jobs: run: shell: bash -l {0} # required to use an activated conda environment env: - CONDA: "3" + CONDA: "3" + HUGGINGFACE_TOKEN: ${{ secrets.HUGGINGFACE_TOKEN }} needs: gpu-ci-concierge container: image: ghcr.io/flexflow/flexflow-environment-cuda-11.8:latest @@ -185,7 +186,7 @@ jobs: export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:$CONDA_PREFIX/lib # GPT tokenizer test - ./tests/gpt_tokenizer_test.sh + # ./tests/gpt_tokenizer_test.sh # Inference tests source ./build/set_python_envs.sh @@ -209,8 +210,8 @@ jobs: if: always() run: sudo rm -rf ~/.cache - gpu-ci-flexflow: - name: Single Machine, Multiple GPUs Tests + training-tests: + name: Training Tests runs-on: [self-hosted, gpu] # skip this time-consuming test for PRs to the inference branch # if: ${{ 
github.event_name != 'pull_request' || github.base_ref != 'inference' }} @@ -266,5 +267,5 @@ jobs: # C++ tests ./tests/cpp_gpu_tests.sh 4 # Python tests - ./tests/multi_gpu_tests.sh 4 + ./tests/training_tests.sh 4 diff --git a/.github/workflows/multinode-test.yml b/.github/workflows/multinode-test.yml index ca2b47df27..226f953b38 100644 --- a/.github/workflows/multinode-test.yml +++ b/.github/workflows/multinode-test.yml @@ -78,7 +78,7 @@ jobs: export OMPI_ALLOW_RUN_AS_ROOT=1 export OMPI_ALLOW_RUN_AS_ROOT_CONFIRM=1 export OMPI_MCA_btl_vader_single_copy_mechanism=none - ./tests/multi_gpu_tests.sh 2 2 + ./tests/training_tests.sh 2 2 multinode-gpu-test-ucx: name: Multinode GPU Test with UCX @@ -129,7 +129,7 @@ jobs: export OMPI_ALLOW_RUN_AS_ROOT=1 export OMPI_ALLOW_RUN_AS_ROOT_CONFIRM=1 export OMPI_MCA_btl_vader_single_copy_mechanism=none - ./tests/multi_gpu_tests.sh 2 2 + ./tests/training_tests.sh 2 2 multinode-gpu-test-native-ucx: name: Multinode GPU Test with native UCX @@ -177,7 +177,7 @@ jobs: export OMPI_ALLOW_RUN_AS_ROOT=1 export OMPI_ALLOW_RUN_AS_ROOT_CONFIRM=1 export OMPI_MCA_btl_vader_single_copy_mechanism=none - ./tests/multi_gpu_tests.sh 2 2 + ./tests/training_tests.sh 2 2 notify-slack: name: Notify Slack in case of failure diff --git a/CMakeLists.txt b/CMakeLists.txt index f9ce66a0f1..3732d5ff6f 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -32,6 +32,19 @@ if(NOT CMAKE_BUILD_TYPE AND NOT CMAKE_CONFIGURATION_TYPES) STRING "Choose the type of build." FORCE) endif() +if(INSTALL_DIR) + message(STATUS "INSTALL_DIR: ${INSTALL_DIR}") + set(CMAKE_INSTALL_PREFIX ${INSTALL_DIR} CACHE PATH "Installation directory" FORCE) +else() + # Install DIR not set. Use default, unless a conda environment is active + if (DEFINED ENV{CONDA_PREFIX} AND NOT FF_BUILD_FROM_PYPI) + set(CONDA_PREFIX $ENV{CONDA_PREFIX}) + # Set CMAKE_INSTALL_PREFIX to the Conda environment's installation path + set(CMAKE_INSTALL_PREFIX ${CONDA_PREFIX} CACHE PATH "Installation directory" FORCE) + message(STATUS "Active conda environment detected. Setting CMAKE_INSTALL_PREFIX: ${CMAKE_INSTALL_PREFIX}") + endif() +endif() + # do not disable assertions even if in release mode set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} -UNDEBUG") diff --git a/INSTALL.md b/INSTALL.md index a197df24ed..1734319540 100644 --- a/INSTALL.md +++ b/INSTALL.md @@ -97,7 +97,7 @@ source ./build/set_python_envs.sh cd "$FF_HOME" ./python/flexflow_python examples/python/native/mnist_mlp.py -ll:py 1 -ll:gpu 1 -ll:fsize -ll:zsize ``` -A script to run all the Python examples is available at `tests/multi_gpu_tests.sh` +A script to run all the Python examples is available at `tests/training_tests.sh` ### Run FlexFlow C++ examples diff --git a/SERVE.md b/SERVE.md index 60d0b566f0..f6e34750cd 100644 --- a/SERVE.md +++ b/SERVE.md @@ -32,7 +32,7 @@ ff.init( Second, we specify the LLM to serve and the SSM(s) used to accelerate LLM serving. The list of supported LLMs and SSMs is available at [supported models](#supported-llms-and-ssms). ```python # Specify the LLM -llm = ff.LLM("decapoda-research/llama-7b-hf") +llm = ff.LLM("meta-llama/Llama-2-7b-hf") # Specify a list of SSMs (just one in this case) ssms=[] @@ -78,7 +78,7 @@ ff.init( ) # Create the FlexFlow LLM -llm = ff.LLM("decapoda-research/llama-7b-hf") +llm = ff.LLM("meta-llama/Llama-2-7b-hf") # Create the sampling configs generation_config = ff.GenerationConfig( @@ -116,7 +116,7 @@ A C++ example is available at [this folder](../inference/spec_infer/). 
After bui * `-ll:gpu`: number of GPU processors to use on each node for serving an LLM (default: 0) * `-ll:fsize`: size of device memory on each GPU in MB * `-ll:zsize`: size of zero-copy memory (pinned DRAM with direct GPU access) in MB. FlexFlow Serve keeps a replica of the LLM parameters on zero-copy memory, and therefore requires that the zero-copy memory is sufficient for storing the LLM parameters. -* `-llm-model`: the LLM model ID from HuggingFace (e.g. "decapoda-research/llama-7b-hf") +* `-llm-model`: the LLM model ID from HuggingFace (e.g. "meta-llama/Llama-2-7b-hf") * `-ssm-model`: the SSM model ID from HuggingFace (e.g. "JackFram/llama-160m"). You can use multiple `-ssm-model`s in the command line to launch multiple SSMs. * `-cache-folder`: the folder * `-data-parallelism-degree`, `-tensor-parallelism-degree` and `-pipeline-parallelism-degree`: parallelization degrees in the data, tensor, and pipeline dimensions. Their product must equal the number of GPUs available on the machine. When any of the three parallelism degree arguments is omitted, a default value of 1 will be used. @@ -126,7 +126,7 @@ A C++ example is available at [this folder](../inference/spec_infer/). After bui For example, you can use the following command line to serve a LLaMA-7B or LLaMA-13B model on 4 GPUs and use two collectively boost-tuned LLaMA-68M models for speculative inference. ```bash -./inference/spec_infer/spec_infer -ll:gpu 4 -ll:fsize 14000 -ll:zsize 30000 -llm-model decapoda-research/llama-7b-hf -ssm-model JackFram/llama-68m -prompt /path/to/prompt.json -tensor-parallelism-degree 4 --fusion +./inference/spec_infer/spec_infer -ll:gpu 4 -ll:fsize 14000 -ll:zsize 30000 -llm-model meta-llama/Llama-2-7b-hf -ssm-model JackFram/llama-68m -prompt /path/to/prompt.json -tensor-parallelism-degree 4 --fusion ``` @@ -157,7 +157,7 @@ Below is a list of models that we have explicitly tested and for which a SSM may | Model | Model id on HuggingFace | Boost-tuned SSMs | | :---- | :---- | :---- | -| LLaMA-7B | decapoda-research/llama-7b-hf | [LLaMA-68M](https://huggingface.co/JackFram/llama-68m) , [LLaMA-160M](https://huggingface.co/JackFram/llama-160m) | +| LLaMA-7B | meta-llama/Llama-2-7b-hf | [LLaMA-68M](https://huggingface.co/JackFram/llama-68m) , [LLaMA-160M](https://huggingface.co/JackFram/llama-160m) | | LLaMA-13B | decapoda-research/llama-13b-hf | [LLaMA-68M](https://huggingface.co/JackFram/llama-68m) , [LLaMA-160M](https://huggingface.co/JackFram/llama-160m) | | LLaMA-30B | decapoda-research/llama-30b-hf | [LLaMA-68M](https://huggingface.co/JackFram/llama-68m) , [LLaMA-160M](https://huggingface.co/JackFram/llama-160m) | | LLaMA-65B | decapoda-research/llama-65b-hf | [LLaMA-68M](https://huggingface.co/JackFram/llama-68m) , [LLaMA-160M](https://huggingface.co/JackFram/llama-160m) | diff --git a/conda/environment.yml b/conda/environment.yml index 9ae0dc9c7a..48cd8ddb33 100644 --- a/conda/environment.yml +++ b/conda/environment.yml @@ -3,7 +3,7 @@ channels: - defaults - conda-forge dependencies: - - python>=3.6 + - python>=3.6,<3.12 - cffi>=1.11.0 - Pillow - pybind11 diff --git a/conda/flexflow.yml b/conda/flexflow.yml index 3e39407bfa..89421db758 100644 --- a/conda/flexflow.yml +++ b/conda/flexflow.yml @@ -3,7 +3,7 @@ channels: - defaults - conda-forge dependencies: - - python>=3.6 + - python>=3.6,<3.12 - cffi>=1.11.0 - Pillow - pybind11 diff --git a/config/config.inc b/config/config.inc index 7f1f0ffcf4..5a7bde5ce9 100644 --- a/config/config.inc +++ b/config/config.inc @@ -24,7 +24,7 @@ fi #set 
installation dir if [ -n "$INSTALL_DIR" ]; then - SET_INSTALL_DIR="-DCMAKE_INSTALL_PREFIX=${INSTALL_DIR}" + SET_INSTALL_DIR="-DINSTALL_DIR=${INSTALL_DIR}" fi if [ "$INFERENCE_TESTS" = "ON" ]; then diff --git a/include/flexflow/ffconst.h b/include/flexflow/ffconst.h index 6fe52e6892..fb12adf2d3 100644 --- a/include/flexflow/ffconst.h +++ b/include/flexflow/ffconst.h @@ -195,11 +195,10 @@ enum OperatorType { enum ModelType { UNKNOWN = 3001, LLAMA = 3002, - LLAMA2 = 3003, - OPT = 3004, - FALCON = 3005, - STARCODER = 3006, - MPT = 3007 + OPT = 3003, + FALCON = 3004, + STARCODER = 3005, + MPT = 3006 }; enum PMParameter { diff --git a/inference/MODEL_WEIGHTS.md b/inference/MODEL_WEIGHTS.md index e46e6b45d1..d78fb37be9 100644 --- a/inference/MODEL_WEIGHTS.md +++ b/inference/MODEL_WEIGHTS.md @@ -2,7 +2,7 @@ To convert the weights of a HuggingFace LLM to SpecInfer's weight format, we fir ```python from transformers import AutoModelForCausalLM -model = AutoModelForCausalLM.from_pretrained("decapoda-research/llama-7b-hf") +model = AutoModelForCausalLM.from_pretrained("meta-llama/Llama-2-7b-hf") for name, params in model.named_parameters(): for name, params in model.named_parameters(): diff --git a/inference/incr_decoding/incr_decoding.cc b/inference/incr_decoding/incr_decoding.cc index 90d1902716..7c4cef0973 100644 --- a/inference/incr_decoding/incr_decoding.cc +++ b/inference/incr_decoding/incr_decoding.cc @@ -186,14 +186,7 @@ void FlexFlow::top_level_task(Task const *task, auto architectures = model_config["architectures"]; for (auto const &str : architectures) { if (str == "LlamaForCausalLM" || str == "LLaMAForCausalLM") { - std::string nameOrPath = model_config["_name_or_path"]; - // TODO: support LLAMA-2 models not from Meta - bool llama2 = nameOrPath.find("meta-llama/Llama-2") == 0; - if (llama2) { - model_type = ModelType::LLAMA2; - } else { - model_type = ModelType::LLAMA; - } + model_type = ModelType::LLAMA; break; } else if (str == "OPTForCausalLM") { model_type = ModelType::OPT; @@ -229,7 +222,7 @@ void FlexFlow::top_level_task(Task const *task, rm->register_output_filepath(file_paths.output_file_path); FFModel model(ffconfig, ffconfig.cpu_offload); - if (model_type == ModelType::LLAMA || model_type == ModelType::LLAMA2) { + if (model_type == ModelType::LLAMA) { LLAMA::create_llama_model(model, config_filepath, weights_filepath, diff --git a/inference/python/incr_decoding.py b/inference/python/incr_decoding.py index 3621ee83a3..4a146ab503 100644 --- a/inference/python/incr_decoding.py +++ b/inference/python/incr_decoding.py @@ -43,7 +43,7 @@ def get_configs(): # required parameters "num_gpus": 4, "memory_per_gpu": 14000, - "zero_copy_memory_per_node": 30000, + "zero_copy_memory_per_node": 40000, # optional parameters "num_cpus": 4, "legion_utility_processors": 4, @@ -108,7 +108,7 @@ def main(): prompts = [s for s in json.load(open(configs.prompt))] results = llm.generate(prompts) else: - result = llm.generate("Here are some travel tips for Tokyo:\n") + result = llm.generate("Three tips for staying healthy are: ") if __name__ == "__main__": diff --git a/inference/python/spec_infer.py b/inference/python/spec_infer.py index 3d0f1a1c0e..c9fb5cc7bb 100644 --- a/inference/python/spec_infer.py +++ b/inference/python/spec_infer.py @@ -43,7 +43,7 @@ def get_configs(): # required parameters "num_gpus": 4, "memory_per_gpu": 14000, - "zero_copy_memory_per_node": 30000, + "zero_copy_memory_per_node": 40000, # optional parameters "num_cpus": 4, "legion_utility_processors": 4, @@ -60,7 +60,7 @@ def 
get_configs(): } llm_configs = { # required llm arguments - "llm_model": "decapoda-research/llama-7b-hf", + "llm_model": "meta-llama/Llama-2-7b-hf", # optional llm parameters "cache_path": "", "refresh_cache": False, @@ -68,7 +68,7 @@ def get_configs(): "ssms": [ { # required ssm parameter - "ssm_model": "JackFram/llama-160m-base", + "ssm_model": "JackFram/llama-160m", # optional ssm parameters "cache_path": "", "refresh_cache": False, @@ -154,7 +154,7 @@ def main(): prompts = [s for s in json.load(open(configs.prompt))] results = llm.generate(prompts) else: - result = llm.generate("Here are some travel tips for Tokyo:\n") + result = llm.generate("Three tips for staying healthy are: ") if __name__ == "__main__": diff --git a/inference/spec_infer/spec_infer.cc b/inference/spec_infer/spec_infer.cc index 98b5ec4633..8b0eb926d9 100644 --- a/inference/spec_infer/spec_infer.cc +++ b/inference/spec_infer/spec_infer.cc @@ -166,14 +166,7 @@ void get_model_meta(FilePaths &file_paths, auto architectures = llm_model_config["architectures"]; for (auto const &str : architectures) { if (str == "LlamaForCausalLM" || str == "LLaMAForCausalLM") { - std::string nameOrPath = llm_model_config["_name_or_path"]; - // TODO: support LLAMA-2 models not from Meta - bool llama2 = nameOrPath.find("meta-llama/Llama-2") == 0; - if (llama2) { - model_metadata.llm_model_type = ModelType::LLAMA2; - } else { - model_metadata.llm_model_type = ModelType::LLAMA; - } + model_metadata.llm_model_type = ModelType::LLAMA; break; } else if (str == "OPTForCausalLM") { model_metadata.llm_model_type = ModelType::OPT; @@ -223,14 +216,7 @@ void get_model_meta(FilePaths &file_paths, auto architectures = ssm_model_config["architectures"]; for (auto const &str : architectures) { if (str == "LlamaForCausalLM" || str == "LLaMAForCausalLM") { - std::string nameOrPath = ssm_model_config["_name_or_path"]; - // TODO: support LLAMA-2 models not from Meta - bool llama2 = nameOrPath.find("meta-llama/Llama-2") == 0; - if (llama2) { - ssm_model_type = ModelType::LLAMA2; - } else { - ssm_model_type = ModelType::LLAMA; - } + ssm_model_type = ModelType::LLAMA; break; } else if (str == "OPTForCausalLM") { ssm_model_type = ModelType::OPT; @@ -318,8 +304,7 @@ void FlexFlow::top_level_task(Task const *task, // Create LLM model FFModel tree_model(ffconfig, ffconfig.cpu_offload); - if (model_metadata.llm_model_type == ModelType::LLAMA || - model_metadata.llm_model_type == ModelType::LLAMA2) { + if (model_metadata.llm_model_type == ModelType::LLAMA) { LLAMA::create_llama_model(tree_model, model_metadata.llm_model_config_path, model_metadata.llm_weights_path, @@ -363,8 +348,7 @@ void FlexFlow::top_level_task(Task const *task, for (int ssm_id = 0; ssm_id < num_ssms; ssm_id++) { FFModel &beam_model = ssm_models[ssm_id]; - if (model_metadata.ssm_model_types[ssm_id] == ModelType::LLAMA || - model_metadata.ssm_model_types[ssm_id] == ModelType::LLAMA2) { + if (model_metadata.ssm_model_types[ssm_id] == ModelType::LLAMA) { LLAMA::create_llama_model(beam_model, model_metadata.ssm_model_config_paths[ssm_id], model_metadata.ssm_model_weights_paths[ssm_id], diff --git a/inference/utils/compress_llama_weights.py b/inference/utils/compress_llama_weights.py index c92ae6aca9..daaee9c9d5 100644 --- a/inference/utils/compress_llama_weights.py +++ b/inference/utils/compress_llama_weights.py @@ -91,7 +91,7 @@ def decompress(packed_data, config): if __name__ == "__main__": # torch.set_default_tensor_type(torch.HalfTensor) # torch.set_default_tensor_type(torch.cuda.HalfTensor) - 
model = AutoModelForCausalLM.from_pretrained("decapoda-research/llama-7b-hf") + model = AutoModelForCausalLM.from_pretrained("meta-llama/Llama-2-7b-hf") config = CompressionConfig( num_bits=8, group_size=32, group_dim=0, symmetric=False) for name, params in model.named_parameters(): diff --git a/python/flexflow/serve/serve.py b/python/flexflow/serve/serve.py index e0e1b2e155..24cf9efb30 100644 --- a/python/flexflow/serve/serve.py +++ b/python/flexflow/serve/serve.py @@ -112,7 +112,7 @@ def __init__( ): """Create the LLM object - :param model_name: The name of the HuggingFace model to use. E.g. 'decapoda-research/llama-7b-hf' + :param model_name: The name of the HuggingFace model to use. E.g. 'meta-llama/Llama-2-7b-hf' :type model_name: str :param data_type: The data type to use for the tensors (e.g. DataType.DT_FLOAT for full precision, or DataType.DT_HALF for half precision), defaults to DataType.DT_HALF :type data_type: DataType, optional @@ -447,7 +447,7 @@ def __init__( ): """Create the SSM object - :param model_name: The name of the HuggingFace model to use. E.g. 'decapoda-research/llama-7b-hf' + :param model_name: The name of the HuggingFace model to use. E.g. 'meta-llama/Llama-2-7b-hf' :type model_name: str :param data_type: The data type to use for the tensors (e.g. DataType.DT_FLOAT for full precision, or DataType.DT_HALF for half precision), defaults to DataType.DT_HALF :type data_type: DataType, optional diff --git a/python/flexflow/type.py b/python/flexflow/type.py index 9caecdde54..994a85f57e 100644 --- a/python/flexflow/type.py +++ b/python/flexflow/type.py @@ -75,11 +75,10 @@ class InferenceMode(Enum): class ModelType(Enum): UNKNOWN = 3001 LLAMA = 3002 - LLAMA2 = 3003 - OPT = 3004 - FALCON = 3005 - STARCODER = 3006 - MPT = 3007 + OPT = 3003 + FALCON = 3004 + STARCODER = 3005 + MPT = 3006 class OpType(Enum): diff --git a/src/runtime/request_manager.cc b/src/runtime/request_manager.cc index 0b89010ab1..df8d43bc38 100644 --- a/src/runtime/request_manager.cc +++ b/src/runtime/request_manager.cc @@ -117,7 +117,7 @@ void RequestManager::register_tokenizer(ModelType type, this->eos_token_id = eos_token_id; std::string tokenizer_folder = (!path.empty() && path.back() != '/') ? 
path + '/' : path; - if (model_type == ModelType::LLAMA || model_type == ModelType::LLAMA2) { + if (model_type == ModelType::LLAMA) { bool path_to_file = !path.empty() && (path.size() >= strlen("tokenizer.model")) && path.find("tokenizer.model") == @@ -492,6 +492,12 @@ BatchConfig RequestManager::prepare_next_batch(BatchConfig const &old_bc, old_bc.requestsInfo[i].request_guid, request.tokens.size()); std::string output = this->tokenizer_->Decode(request.tokens); + // Unlike Huggingface, the sentencepiece C++ library automatically + // removes the BOS token + if (model_type == ModelType::LLAMA && + request.tokens.at(0) == bos_token_id) { + output = " " + output; + } { // update generation result and trigger future @@ -743,6 +749,12 @@ BeamSearchBatchConfig request.guid, request.tokens.size()); std::string output = this->tokenizer_->Decode(request.tokens); + // Unlike Huggingface, the sentencepiece C++ library automatically + // removes the BOS token + if (model_type == ModelType::LLAMA && + request.tokens.at(0) == bos_token_id) { + output = " " + output; + } { // update generation result and trigger future GenerationResult &gr = request_generation_results[request.guid]; @@ -854,6 +866,12 @@ BeamSearchBatchConfig } } std::string output = this->tokenizer_->Decode(request.tokens); + // Unlike Huggingface, the sentencepiece C++ library automatically + // removes the BOS token + if (model_type == ModelType::LLAMA && + request.tokens.at(0) == bos_token_id) { + output = " " + output; + } log_req_mgr.print("Output: %s", output.c_str()); } } else if (request.status == Request::PENDING) { @@ -887,6 +905,12 @@ BeamSearchBatchConfig // Token Info std::string output = this->tokenizer_->Decode(request.tokens); + // Unlike Huggingface, the sentencepiece C++ library automatically removes + // the BOS token + if (model_type == ModelType::LLAMA && + request.tokens.at(0) == bos_token_id) { + output = " " + output; + } log_req_mgr.print("Output: %s", output.c_str()); } else { assert(false); diff --git a/tests/inference/cpp_inference_tests.sh b/tests/inference/cpp_inference_tests.sh index 42a6db09d8..8beea55999 100755 --- a/tests/inference/cpp_inference_tests.sh +++ b/tests/inference/cpp_inference_tests.sh @@ -10,9 +10,9 @@ cd "${BASH_SOURCE[0]%/*}" ############################################################################################### # LLAMA -../../build/inference/spec_infer/spec_infer -ll:gpu 4 -ll:fsize 14000 -ll:zsize 30000 --fusion --use-full-precision -llm-model decapoda-research/llama-7b-hf -ssm-model JackFram/llama-160m-base -prompt ../../inference/prompt/test.json -output-file ../../inference/output/spec_inference_llama.txt -pipeline-parallelism-degree 4 +../../build/inference/spec_infer/spec_infer -ll:gpu 4 -ll:fsize 14000 -ll:zsize 30000 --fusion --use-full-precision -llm-model meta-llama/Llama-2-7b-hf -ssm-model JackFram/llama-160m -prompt ../../inference/prompt/test.json -output-file ../../inference/output/spec_inference_llama.txt -pipeline-parallelism-degree 4 # LLAMA (half precision) -../../build/inference/spec_infer/spec_infer -ll:gpu 4 -ll:fsize 14000 -ll:zsize 30000 --fusion -llm-model decapoda-research/llama-7b-hf -ssm-model JackFram/llama-160m-base -prompt ../../inference/prompt/test.json -output-file ../../inference/output/spec_inference_llama_half.txt -pipeline-parallelism-degree 4 +../../build/inference/spec_infer/spec_infer -ll:gpu 4 -ll:fsize 14000 -ll:zsize 30000 --fusion -llm-model meta-llama/Llama-2-7b-hf -ssm-model JackFram/llama-160m -prompt 
../../inference/prompt/test.json -output-file ../../inference/output/spec_inference_llama_half.txt -pipeline-parallelism-degree 4 # OPT ../../build/inference/spec_infer/spec_infer -ll:gpu 4 -ll:fsize 14000 -ll:zsize 30000 --fusion --use-full-precision -llm-model facebook/opt-6.7b -ssm-model facebook/opt-125m -prompt ../../inference/prompt/test.json -output-file ../../inference/output/spec_inference_opt.txt -pipeline-parallelism-degree 4 @@ -22,9 +22,9 @@ cd "${BASH_SOURCE[0]%/*}" # Tensor parallelism tests if [ "$TENSOR_PARALLELISM_TESTS" = "ON" ]; then # LLAMA - ../../build/inference/spec_infer/spec_infer -ll:gpu 4 -ll:fsize 14000 -ll:zsize 30000 --fusion --use-full-precision -llm-model decapoda-research/llama-7b-hf -ssm-model JackFram/llama-160m-base -prompt ../../inference/prompt/test.json -output-file ../../inference/output/spec_inference_llama_tp.txt -pipeline-parallelism-degree 2 -tensor-parallelism-degree 2 + ../../build/inference/spec_infer/spec_infer -ll:gpu 4 -ll:fsize 14000 -ll:zsize 30000 --fusion --use-full-precision -llm-model meta-llama/Llama-2-7b-hf -ssm-model JackFram/llama-160m -prompt ../../inference/prompt/test.json -output-file ../../inference/output/spec_inference_llama_tp.txt -pipeline-parallelism-degree 2 -tensor-parallelism-degree 2 # LLAMA (half precision) - ../../build/inference/spec_infer/spec_infer -ll:gpu 4 -ll:fsize 14000 -ll:zsize 30000 --fusion -llm-model decapoda-research/llama-7b-hf -ssm-model JackFram/llama-160m-base -prompt ../../inference/prompt/test.json -output-file ../../inference/output/spec_inference_llama_half_tp.txt -pipeline-parallelism-degree 2 -tensor-parallelism-degree 2 + ../../build/inference/spec_infer/spec_infer -ll:gpu 4 -ll:fsize 14000 -ll:zsize 30000 --fusion -llm-model meta-llama/Llama-2-7b-hf -ssm-model JackFram/llama-160m -prompt ../../inference/prompt/test.json -output-file ../../inference/output/spec_inference_llama_half_tp.txt -pipeline-parallelism-degree 2 -tensor-parallelism-degree 2 # OPT ../../build/inference/spec_infer/spec_infer -ll:gpu 4 -ll:fsize 14000 -ll:zsize 30000 --fusion --use-full-precision -llm-model facebook/opt-6.7b -ssm-model facebook/opt-125m -prompt ../../inference/prompt/test.json -output-file ../../inference/output/spec_inference_opt_tp.txt -pipeline-parallelism-degree 2 -tensor-parallelism-degree 2 @@ -37,14 +37,17 @@ fi ############################################################################################### # LLAMA (small model) -../../build/inference/incr_decoding/incr_decoding -ll:gpu 4 -ll:fsize 14000 -ll:zsize 30000 --fusion --use-full-precision -llm-model JackFram/llama-160m-base -prompt ../../inference/prompt/test.json -output-file ../../inference/output/incr_decoding_llama_160M.txt -pipeline-parallelism-degree 4 +../../build/inference/incr_decoding/incr_decoding -ll:gpu 4 -ll:fsize 14000 -ll:zsize 30000 --fusion --use-full-precision -llm-model JackFram/llama-160m -prompt ../../inference/prompt/test.json -output-file ../../inference/output/incr_decoding_llama_160M.txt -pipeline-parallelism-degree 4 + +../../build/inference/incr_decoding/incr_decoding -ll:gpu 1 -ll:fsize 14000 -ll:zsize 30000 --fusion --use-full-precision -llm-model JackFram/llama-160m -prompt ../../inference/prompt/test.json -output-file ../../inference/output/incr_decoding_llama_160M.txt -pipeline-parallelism-degree 1 + # LLAMA (small model, half precision) -../../build/inference/incr_decoding/incr_decoding -ll:gpu 4 -ll:fsize 14000 -ll:zsize 30000 --fusion -llm-model JackFram/llama-160m-base -prompt 
../../inference/prompt/test.json -output-file ../../inference/output/incr_decoding_llama_160M_half.txt -pipeline-parallelism-degree 4 +../../build/inference/incr_decoding/incr_decoding -ll:gpu 4 -ll:fsize 14000 -ll:zsize 30000 --fusion -llm-model JackFram/llama-160m -prompt ../../inference/prompt/test.json -output-file ../../inference/output/incr_decoding_llama_160M_half.txt -pipeline-parallelism-degree 4 # LLAMA (big model) -../../build/inference/incr_decoding/incr_decoding -ll:gpu 4 -ll:fsize 14000 -ll:zsize 30000 --fusion --use-full-precision -llm-model decapoda-research/llama-7b-hf -prompt ../../inference/prompt/test.json -output-file ../../inference/output/incr_decoding_llama_7B.txt -pipeline-parallelism-degree 4 +../../build/inference/incr_decoding/incr_decoding -ll:gpu 4 -ll:fsize 14000 -ll:zsize 30000 --fusion --use-full-precision -llm-model meta-llama/Llama-2-7b-hf -prompt ../../inference/prompt/test.json -output-file ../../inference/output/incr_decoding_llama_2_7B.txt -pipeline-parallelism-degree 4 # LLAMA (big model, half precision) -../../build/inference/incr_decoding/incr_decoding -ll:gpu 4 -ll:fsize 14000 -ll:zsize 30000 --fusion -llm-model decapoda-research/llama-7b-hf -prompt ../../inference/prompt/test.json -output-file ../../inference/output/incr_decoding_llama_7B_half.txt -pipeline-parallelism-degree 4 +../../build/inference/incr_decoding/incr_decoding -ll:gpu 4 -ll:fsize 14000 -ll:zsize 30000 --fusion -llm-model meta-llama/Llama-2-7b-hf -prompt ../../inference/prompt/test.json -output-file ../../inference/output/incr_decoding_llama_2_7B_half.txt -pipeline-parallelism-degree 4 # OPT (small model) ../../build/inference/incr_decoding/incr_decoding -ll:gpu 4 -ll:fsize 14000 -ll:zsize 30000 --fusion --use-full-precision -llm-model facebook/opt-125m -prompt ../../inference/prompt/test.json -output-file ../../inference/output/incr_decoding_opt_125M.txt -pipeline-parallelism-degree 4 @@ -57,9 +60,9 @@ fi ../../build/inference/incr_decoding/incr_decoding -ll:gpu 4 -ll:fsize 14000 -ll:zsize 30000 --fusion -llm-model facebook/opt-6.7b -prompt ../../inference/prompt/test.json -output-file ../../inference/output/incr_decoding_opt_6B_half.txt -pipeline-parallelism-degree 4 # Falcon (full precision) -# ../../build/inference/incr_decoding/incr_decoding -ll:gpu 4 -ll:fsize 14000 -ll:zsize 30000 --fusion --use-full-precision -llm-model tiiuae/falcon-7b -prompt ../../inference/prompt/test.json -output-file ../../inference/output/incr_decoding_falcon_7B.txt -pipeline-parallelism-degree 4 +../../build/inference/incr_decoding/incr_decoding -ll:gpu 4 -ll:fsize 14000 -ll:zsize 40000 --fusion --use-full-precision -llm-model tiiuae/falcon-7b -prompt ../../inference/prompt/test.json -output-file ../../inference/output/incr_decoding_falcon_7B.txt -pipeline-parallelism-degree 4 # Falcon (half precision) -../../build/inference/incr_decoding/incr_decoding -ll:gpu 4 -ll:fsize 14000 -ll:zsize 30000 --fusion -llm-model tiiuae/falcon-7b -prompt ../../inference/prompt/test.json -output-file ../../inference/output/incr_decoding_falcon_7B.txt -pipeline-parallelism-degree 4 +# ../../build/inference/incr_decoding/incr_decoding -ll:gpu 4 -ll:fsize 14000 -ll:zsize 30000 --fusion -llm-model tiiuae/falcon-7b -prompt ../../inference/prompt/test.json -output-file ../../inference/output/incr_decoding_falcon_7B.txt -pipeline-parallelism-degree 4 # # StarCoder (full precision) # ../../build/inference/incr_decoding/incr_decoding -ll:gpu 4 -ll:fsize 14000 -ll:zsize 30000 --fusion --use-full-precision -llm-model 
bigcode/starcoderbase-7b -prompt ../../inference/prompt/test.json -output-file ../../inference/output/incr_decoding_starcoder_7B.txt -pipeline-parallelism-degree 4 @@ -69,16 +72,16 @@ fi # Tensor parallelism tests if [ "$TENSOR_PARALLELISM_TESTS" = "ON" ]; then # LLAMA (small model) - ../../build/inference/incr_decoding/incr_decoding -ll:gpu 4 -ll:fsize 14000 -ll:zsize 30000 --fusion --use-full-precision -llm-model JackFram/llama-160m-base -prompt ../../inference/prompt/test.json -output-file ../../inference/output/incr_decoding_llama_160M_tp.txt -pipeline-parallelism-degree 2 -tensor-parallelism-degree 2 - ../../build/inference/incr_decoding/incr_decoding -ll:gpu 4 -ll:fsize 14000 -ll:zsize 30000 --fusion --use-full-precision -llm-model JackFram/llama-160m-base -prompt ../../inference/prompt/test.json -output-file ../../inference/output/incr_decoding_llama_160M_tp4.txt -pipeline-parallelism-degree 1 -tensor-parallelism-degree 4 + ../../build/inference/incr_decoding/incr_decoding -ll:gpu 4 -ll:fsize 14000 -ll:zsize 30000 --fusion --use-full-precision -llm-model JackFram/llama-160m -prompt ../../inference/prompt/test.json -output-file ../../inference/output/incr_decoding_llama_160M_tp.txt -pipeline-parallelism-degree 2 -tensor-parallelism-degree 2 + ../../build/inference/incr_decoding/incr_decoding -ll:gpu 4 -ll:fsize 14000 -ll:zsize 30000 --fusion --use-full-precision -llm-model JackFram/llama-160m -prompt ../../inference/prompt/test.json -output-file ../../inference/output/incr_decoding_llama_160M_tp4.txt -pipeline-parallelism-degree 1 -tensor-parallelism-degree 4 # LLAMA (small model, half precision) - ../../build/inference/incr_decoding/incr_decoding -ll:gpu 4 -ll:fsize 14000 -ll:zsize 30000 --fusion -llm-model JackFram/llama-160m-base -prompt ../../inference/prompt/test.json -output-file ../../inference/output/incr_decoding_llama_160M_half_tp.txt -pipeline-parallelism-degree 2 -tensor-parallelism-degree 2 - ../../build/inference/incr_decoding/incr_decoding -ll:gpu 4 -ll:fsize 14000 -ll:zsize 30000 --fusion -llm-model JackFram/llama-160m-base -prompt ../../inference/prompt/test.json -output-file ../../inference/output/incr_decoding_llama_160M_half_tp4.txt -pipeline-parallelism-degree 1 -tensor-parallelism-degree 4 + ../../build/inference/incr_decoding/incr_decoding -ll:gpu 4 -ll:fsize 14000 -ll:zsize 30000 --fusion -llm-model JackFram/llama-160m -prompt ../../inference/prompt/test.json -output-file ../../inference/output/incr_decoding_llama_160M_half_tp.txt -pipeline-parallelism-degree 2 -tensor-parallelism-degree 2 + ../../build/inference/incr_decoding/incr_decoding -ll:gpu 4 -ll:fsize 14000 -ll:zsize 30000 --fusion -llm-model JackFram/llama-160m -prompt ../../inference/prompt/test.json -output-file ../../inference/output/incr_decoding_llama_160M_half_tp4.txt -pipeline-parallelism-degree 1 -tensor-parallelism-degree 4 # LLAMA (big model) - ../../build/inference/incr_decoding/incr_decoding -ll:gpu 4 -ll:fsize 14000 -ll:zsize 30000 --fusion --use-full-precision -llm-model decapoda-research/llama-7b-hf -prompt ../../inference/prompt/test.json -output-file ../../inference/output/incr_decoding_llama_7B_tp.txt -pipeline-parallelism-degree 2 -tensor-parallelism-degree 2 + ../../build/inference/incr_decoding/incr_decoding -ll:gpu 4 -ll:fsize 14000 -ll:zsize 30000 --fusion --use-full-precision -llm-model meta-llama/Llama-2-7b-hf -prompt ../../inference/prompt/test.json -output-file ../../inference/output/incr_decoding_llama_2_7B_tp.txt -pipeline-parallelism-degree 2 -tensor-parallelism-degree 2 
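These tensor-parallelism runs are later compared against the pipeline-parallelism-only outputs: full-precision files must match exactly once the first few header lines are dropped, while half-precision files are only required to agree on a prefix of tokens. The sketch below is a rough Python rendering of that comparison logic; the paths, the number of skipped header lines, and the token count are illustrative, and the script's actual helpers (such as `check_partial_token_match`) are defined outside this excerpt.

```python
# Rough Python rendering of the shell alignment checks used in this script:
# `diff <(tail -n +3 A) <(tail -n +3 B)` for full precision, and a
# first-N-token comparison for half precision. Paths, the number of skipped
# header lines, and the token count are illustrative only.
from pathlib import Path


def read_body(path, skip_lines=2):
    """Return the file contents with the first `skip_lines` header lines removed."""
    lines = Path(path).read_text().splitlines()
    return "\n".join(lines[skip_lines:])


def exact_match(file_a, file_b, skip_lines=2):
    """Full-precision check: outputs must be identical after the headers."""
    return read_body(file_a, skip_lines) == read_body(file_b, skip_lines)


def prefix_token_match(file_a, file_b, num_tokens=20):
    """Half-precision check: only the first few tokens need to agree."""
    return (read_body(file_a).split()[:num_tokens]
            == read_body(file_b).split()[:num_tokens])


if __name__ == "__main__":
    base = "../../inference/output/"
    print(exact_match(base + "incr_decoding_llama_2_7B_tp.txt",
                      base + "incr_decoding_llama_2_7B.txt"))
```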
# LLAMA (big model, half precision) - ../../build/inference/incr_decoding/incr_decoding -ll:gpu 4 -ll:fsize 14000 -ll:zsize 30000 --fusion -llm-model decapoda-research/llama-7b-hf -prompt ../../inference/prompt/test.json -output-file ../../inference/output/incr_decoding_llama_7B_half_tp.txt -pipeline-parallelism-degree 2 -tensor-parallelism-degree 2 + ../../build/inference/incr_decoding/incr_decoding -ll:gpu 4 -ll:fsize 14000 -ll:zsize 30000 --fusion -llm-model meta-llama/Llama-2-7b-hf -prompt ../../inference/prompt/test.json -output-file ../../inference/output/incr_decoding_llama_2_7B_half_tp.txt -pipeline-parallelism-degree 2 -tensor-parallelism-degree 2 # OPT (small model) ../../build/inference/incr_decoding/incr_decoding -ll:gpu 4 -ll:fsize 14000 -ll:zsize 30000 --fusion --use-full-precision -llm-model facebook/opt-125m -prompt ../../inference/prompt/test.json -output-file ../../inference/output/incr_decoding_opt_125M_tp.txt -pipeline-parallelism-degree 2 -tensor-parallelism-degree 2 @@ -179,22 +182,22 @@ function compare_decoding_steps_spec_infer_incr_decoding { ############ Alignment between speculative inference and incremental decoding ################# # Full precision -diff <(tail -n +3 "../../inference/output/incr_decoding_llama_7B.txt") <(tail -n +3 "../../inference/output/spec_inference_llama.txt") +diff <(tail -n +3 "../../inference/output/incr_decoding_llama_2_7B.txt") <(tail -n +3 "../../inference/output/spec_inference_llama.txt") diff <(tail -n +3 "../../inference/output/incr_decoding_opt_6B.txt") <(tail -n +3 "../../inference/output/spec_inference_opt.txt") # Half precision -check_partial_token_match "../../inference/output/incr_decoding_llama_7B_half.txt" "../../inference/output/spec_inference_llama_half.txt" +check_partial_token_match "../../inference/output/incr_decoding_llama_2_7B_half.txt" "../../inference/output/spec_inference_llama_half.txt" check_partial_token_match "../../inference/output/incr_decoding_opt_6B_half.txt" "../../inference/output/spec_inference_opt_half.txt" # Speed test: speculative inference should be at very least 1.5x faster than incremental decoding # Full precision -#compare_speed_spec_infer_incr_decoding "../../inference/output/incr_decoding_llama_7B.txt" "../../inference/output/spec_inference_llama.txt" +#compare_speed_spec_infer_incr_decoding "../../inference/output/incr_decoding_llama_2_7B.txt" "../../inference/output/spec_inference_llama.txt" #compare_speed_spec_infer_incr_decoding "../../inference/output/incr_decoding_opt_6B.txt" "../../inference/output/spec_inference_opt.txt" -compare_decoding_steps_spec_infer_incr_decoding "../../inference/output/incr_decoding_llama_7B.txt" "../../inference/output/spec_inference_llama.txt" +compare_decoding_steps_spec_infer_incr_decoding "../../inference/output/incr_decoding_llama_2_7B.txt" "../../inference/output/spec_inference_llama.txt" compare_decoding_steps_spec_infer_incr_decoding "../../inference/output/incr_decoding_opt_6B.txt" "../../inference/output/spec_inference_opt.txt" # Half precision -#compare_speed_spec_infer_incr_decoding "../../inference/output/incr_decoding_llama_7B_half.txt" "../../inference/output/spec_inference_llama_half.txt" +#compare_speed_spec_infer_incr_decoding "../../inference/output/incr_decoding_llama_2_7B_half.txt" "../../inference/output/spec_inference_llama_half.txt" #compare_speed_spec_infer_incr_decoding "../../inference/output/incr_decoding_opt_6B_half.txt" "../../inference/output/spec_inference_opt_half.txt" -compare_decoding_steps_spec_infer_incr_decoding 
"../../inference/output/incr_decoding_llama_7B_half.txt" "../../inference/output/spec_inference_llama_half.txt" +compare_decoding_steps_spec_infer_incr_decoding "../../inference/output/incr_decoding_llama_2_7B_half.txt" "../../inference/output/spec_inference_llama_half.txt" compare_decoding_steps_spec_infer_incr_decoding "../../inference/output/incr_decoding_opt_6B_half.txt" "../../inference/output/spec_inference_opt_half.txt" ############ Alignment between tensor model parallelism and pipeline parallelism only ################# @@ -205,8 +208,8 @@ if [ "$TENSOR_PARALLELISM_TESTS" = "ON" ]; then check_partial_token_match "../../inference/output/spec_inference_opt_half_tp.txt" "../../inference/output/spec_inference_opt_half.txt" diff <(tail -n +3 "../../inference/output/incr_decoding_llama_160M_tp.txt") <(tail -n +3 "../../inference/output/incr_decoding_llama_160M.txt") check_partial_token_match "../../inference/output/incr_decoding_llama_160M_half_tp.txt" "../../inference/output/incr_decoding_llama_160M_half.txt" - diff <(tail -n +3 "../../inference/output/incr_decoding_llama_7B_tp.txt") <(tail -n +3 "../../inference/output/incr_decoding_llama_7B.txt") - check_partial_token_match "../../inference/output/incr_decoding_llama_7B_half_tp.txt" "../../inference/output/incr_decoding_llama_7B_half.txt" + diff <(tail -n +3 "../../inference/output/incr_decoding_llama_2_7B_tp.txt") <(tail -n +3 "../../inference/output/incr_decoding_llama_2_7B.txt") + check_partial_token_match "../../inference/output/incr_decoding_llama_2_7B_half_tp.txt" "../../inference/output/incr_decoding_llama_2_7B_half.txt" diff <(tail -n +3 "../../inference/output/incr_decoding_opt_125M_tp.txt") <(tail -n +3 "../../inference/output/incr_decoding_opt_125M.txt") check_partial_token_match "../../inference/output/incr_decoding_opt_125M_half_tp.txt" "../../inference/output/incr_decoding_opt_125M_half.txt" diff <(tail -n +3 "../../inference/output/incr_decoding_opt_6B_tp.txt") <(tail -n +3 "../../inference/output/incr_decoding_opt_6B.txt") @@ -216,16 +219,16 @@ fi ######################### Alignment tests with HuggingFace #################################### # LLAMA (small model, full precision) -python3 ./huggingface_inference.py --model-name "JackFram/llama-160m-base" --use-full-precision --prompt-file "../../inference/prompt/test.json" --output-file "../../inference/output/huggingface_llama_160M.txt" --gpu +python3 ./huggingface_inference.py --model-name "JackFram/llama-160m" --use-full-precision --prompt-file "../../inference/prompt/test.json" --output-file "../../inference/output/huggingface_llama_160M.txt" --gpu # LLAMA (small model, half precision) -python3 ./huggingface_inference.py --model-name "JackFram/llama-160m-base" --prompt-file "../../inference/prompt/test.json" --output-file "../../inference/output/huggingface_llama_160M_half.txt" --gpu +python3 ./huggingface_inference.py --model-name "JackFram/llama-160m" --prompt-file "../../inference/prompt/test.json" --output-file "../../inference/output/huggingface_llama_160M_half.txt" --gpu # LLAMA (big model, full precision) -python3 ./huggingface_inference.py --model-name "decapoda-research/llama-7b-hf" --use-full-precision --prompt-file "../../inference/prompt/test.json" --output-file "../../inference/output/huggingface_llama_7B.txt" +python3 ./huggingface_inference.py --model-name "meta-llama/Llama-2-7b-hf" --use-full-precision --prompt-file "../../inference/prompt/test.json" --output-file "../../inference/output/huggingface_llama_2_7B.txt" # LLAMA (big model, half 
precision) -python3 ./huggingface_inference.py --model-name "decapoda-research/llama-7b-hf" --prompt-file "../../inference/prompt/test.json" --output-file "../../inference/output/huggingface_llama_7B_half.txt" --gpu +python3 ./huggingface_inference.py --model-name "meta-llama/Llama-2-7b-hf" --prompt-file "../../inference/prompt/test.json" --output-file "../../inference/output/huggingface_llama_2_7B_half.txt" --gpu # OPT (small model, full precision) python3 ./huggingface_inference.py --model-name "facebook/opt-125m" --use-full-precision --prompt-file "../../inference/prompt/test.json" --output-file "../../inference/output/huggingface_opt_125M.txt" --gpu --max-length 128 @@ -243,14 +246,14 @@ python3 ./huggingface_inference.py --model-name "facebook/opt-6.7b" --use-full-p python3 ./huggingface_inference.py --model-name "tiiuae/falcon-7b" --use-full-precision --prompt-file "../../inference/prompt/test.json" --output-file "../../inference/output/huggingface_falcon_7B.txt" --max-length 128 -diff <(tail -n +2 "../../inference/output/huggingface_llama_160M.txt") <(tail -n +5 "../../inference/output/incr_decoding_llama_160M.txt") -diff <(tail -n +2 "../../inference/output/huggingface_llama_160M_half.txt" | tr -s '[:space:]' '\n' | head -n 20) <(tail -n +5 "../../inference/output/incr_decoding_llama_160M_half.txt" | tr -s '[:space:]' '\n' | head -n 20) -diff <(tail -n +2 "../../inference/output/huggingface_llama_7B.txt") <(tail -n +5 "../../inference/output/incr_decoding_llama_7B.txt") -diff <(tail -n +2 "../../inference/output/huggingface_llama_7B_half.txt" | tr -s '[:space:]' '\n' | head -n 20) <(tail -n +5 "../../inference/output/incr_decoding_llama_7B_half.txt" | tr -s '[:space:]' '\n' | head -n 20) +diff "../../inference/output/huggingface_llama_160M.txt" <(tail -n +4 "../../inference/output/incr_decoding_llama_160M.txt") +diff <( < ../../inference/output/huggingface_llama_160M_half.txt tr -s '[:space:]' '\n' | head -n 20) <(tail -n +4 "../../inference/output/incr_decoding_llama_160M_half.txt" | tr -s '[:space:]' '\n' | head -n 20) +diff "../../inference/output/huggingface_llama_2_7B.txt" <(tail -n +4 "../../inference/output/incr_decoding_llama_2_7B.txt") +diff <( < ../../inference/output/huggingface_llama_2_7B_half.txt tr -s '[:space:]' '\n' | head -n 20) <(tail -n +4 "../../inference/output/incr_decoding_llama_2_7B_half.txt" | tr -s '[:space:]' '\n' | head -n 20) -diff <(tail -n +2 "../../inference/output/huggingface_opt_125M.txt") <(tail -n +5 "../../inference/output/incr_decoding_opt_125M.txt") -diff <(tail -n +2 "../../inference/output/huggingface_opt_125M_half.txt" | tr -s '[:space:]' '\n' | head -n 20) <(tail -n +5 "../../inference/output/incr_decoding_opt_125M_half.txt" | tr -s '[:space:]' '\n' | head -n 20) -diff <(tail -n +2 "../../inference/output/huggingface_opt_6B.txt") <(tail -n +5 "../../inference/output/incr_decoding_opt_6B.txt") -# diff <(tail -n +2 "../../inference/output/huggingface_opt_6B_half.txt") <(tail -n +5 "../../inference/output/incr_decoding_opt_6B_half.txt") -diff <(tail -n +2 "../../inference/output/huggingface_falcon_7B.txt") <(tail -n +5 "../../inference/output/incr_decoding_falcon_7B.txt") +diff "../../inference/output/huggingface_opt_125M.txt" <(tail -n +4 "../../inference/output/incr_decoding_opt_125M.txt") +diff <( < ../../inference/output/huggingface_opt_125M_half.txt tr -s '[:space:]' '\n' | head -n 20) <(tail -n +4 "../../inference/output/incr_decoding_opt_125M_half.txt" | tr -s '[:space:]' '\n' | head -n 20) +diff 
"../../inference/output/huggingface_opt_6B.txt" <(tail -n +4 "../../inference/output/incr_decoding_opt_6B.txt") +# diff "../../inference/output/huggingface_opt_6B_half.txt" <(tail -n +4 "../../inference/output/incr_decoding_opt_6B_half.txt") +diff "../../inference/output/huggingface_falcon_7B.txt" <(tail -n +4 "../../inference/output/incr_decoding_falcon_7B.txt") diff --git a/tests/inference/huggingface_inference.py b/tests/inference/huggingface_inference.py index fee215f4c4..5b533bf3c0 100644 --- a/tests/inference/huggingface_inference.py +++ b/tests/inference/huggingface_inference.py @@ -2,7 +2,14 @@ import json import os import torch -from transformers import AutoModelForCausalLM, AutoTokenizer, AutoConfig, LlamaTokenizer +from transformers import ( + AutoModelForCausalLM, + AutoTokenizer, + AutoConfig, + LlamaTokenizer, + GenerationConfig, +) + def main(): # Change working dir to folder storing this script @@ -19,6 +26,7 @@ def main(): parser.add_argument( "--use-full-precision", action="store_true", help="Use full precision" ) + parser.add_argument("--do-sample", action="store_true", help="Use sampling") parser.add_argument("--gpu", action="store_true", help="Run on GPU") args = parser.parse_args() # Check if max-length is greater than 0 @@ -54,13 +62,19 @@ def main(): tokenizer = LlamaTokenizer.from_pretrained(args.model_name, use_fast=True) else: tokenizer = AutoTokenizer.from_pretrained(args.model_name) + generation_config = GenerationConfig.from_pretrained(args.model_name) + generation_config.do_sample = args.do_sample # Generate output with open(args.output_file, "w") as f: for i, prompt in enumerate(prompt_list): - batch = tokenizer( - prompt, return_tensors="pt", add_special_tokens=True - ).to(device) - generated = model.generate(batch["input_ids"], max_length=args.max_length) + batch = tokenizer(prompt, return_tensors="pt", add_special_tokens=True).to( + device + ) + generated = model.generate( + batch["input_ids"], + max_length=args.max_length, + generation_config=generation_config, + ) out = tokenizer.decode(generated[0]) # Write output to file out_str = out if i == (len(prompt_list) - 1) else out + "\n" diff --git a/tests/inference/python_inference_tests.sh b/tests/inference/python_inference_tests.sh index 64c61ba0dc..3544f58e26 100755 --- a/tests/inference/python_inference_tests.sh +++ b/tests/inference/python_inference_tests.sh @@ -108,40 +108,40 @@ function compare_decoding_steps_spec_infer_incr_decoding { ############ Alignment between speculative inference and incremental decoding ################# # Full precision -diff <(tail -n +3 "../../inference/output/incr_dec-python-llama-7b-hf-full_prec-1_tp_4_pp.txt") <(tail -n +3 "../../inference/output/spec_infer-python-llama-7b-hf-full_prec-1_tp_4_pp.txt") +diff <(tail -n +3 "../../inference/output/incr_dec-python-llama-2-7b-hf-full_prec-1_tp_4_pp.txt") <(tail -n +3 "../../inference/output/spec_infer-python-llama-2-7b-hf-full_prec-1_tp_4_pp.txt") diff <(tail -n +3 "../../inference/output/incr_dec-python-opt-6.7b-full_prec-1_tp_4_pp.txt") <(tail -n +3 "../../inference/output/spec_infer-python-opt-6.7b-full_prec-1_tp_4_pp.txt") # Half precision -check_partial_token_match "../../inference/output/incr_dec-python-llama-7b-hf-half_prec-1_tp_4_pp.txt" "../../inference/output/spec_infer-python-llama-7b-hf-half_prec-1_tp_4_pp.txt" +check_partial_token_match "../../inference/output/incr_dec-python-llama-2-7b-hf-half_prec-1_tp_4_pp.txt" "../../inference/output/spec_infer-python-llama-2-7b-hf-half_prec-1_tp_4_pp.txt" 
check_partial_token_match "../../inference/output/incr_dec-python-opt-6.7b-half_prec-1_tp_4_pp.txt" "../../inference/output/spec_infer-python-opt-6.7b-half_prec-1_tp_4_pp.txt" # Speed test: speculative inference should be at very least 1.5x faster than incremental decoding # Full precision -compare_decoding_steps_spec_infer_incr_decoding "../../inference/output/incr_dec-python-llama-7b-hf-full_prec-1_tp_4_pp.txt" "../../inference/output/spec_infer-python-llama-7b-hf-full_prec-1_tp_4_pp.txt" +compare_decoding_steps_spec_infer_incr_decoding "../../inference/output/incr_dec-python-llama-2-7b-hf-full_prec-1_tp_4_pp.txt" "../../inference/output/spec_infer-python-llama-2-7b-hf-full_prec-1_tp_4_pp.txt" compare_decoding_steps_spec_infer_incr_decoding "../../inference/output/incr_dec-python-opt-6.7b-full_prec-1_tp_4_pp.txt" "../../inference/output/spec_infer-python-opt-6.7b-full_prec-1_tp_4_pp.txt" # Half precision -compare_decoding_steps_spec_infer_incr_decoding "../../inference/output/incr_dec-python-llama-7b-hf-half_prec-1_tp_4_pp.txt" "../../inference/output/spec_infer-python-llama-7b-hf-half_prec-1_tp_4_pp.txt" +compare_decoding_steps_spec_infer_incr_decoding "../../inference/output/incr_dec-python-llama-2-7b-hf-half_prec-1_tp_4_pp.txt" "../../inference/output/spec_infer-python-llama-2-7b-hf-half_prec-1_tp_4_pp.txt" compare_decoding_steps_spec_infer_incr_decoding "../../inference/output/incr_dec-python-opt-6.7b-half_prec-1_tp_4_pp.txt" "../../inference/output/spec_infer-python-opt-6.7b-half_prec-1_tp_4_pp.txt" ############ Alignment between tensor model parallelism and pipeline parallelism only ################# ## Specinfer # LLAMA -diff <(tail -n +3 "../../inference/output/spec_infer-python-llama-7b-hf-full_prec-2_tp_2_pp.txt") <(tail -n +3 "../../inference/output/spec_infer-python-llama-7b-hf-full_prec-1_tp_4_pp.txt") -check_partial_token_match "../../inference/output/spec_infer-python-llama-7b-hf-half_prec-2_tp_2_pp.txt" "../../inference/output/spec_infer-python-llama-7b-hf-half_prec-1_tp_4_pp.txt" +diff <(tail -n +3 "../../inference/output/spec_infer-python-llama-2-7b-hf-full_prec-2_tp_2_pp.txt") <(tail -n +3 "../../inference/output/spec_infer-python-llama-2-7b-hf-full_prec-1_tp_4_pp.txt") +check_partial_token_match "../../inference/output/spec_infer-python-llama-2-7b-hf-half_prec-2_tp_2_pp.txt" "../../inference/output/spec_infer-python-llama-2-7b-hf-half_prec-1_tp_4_pp.txt" # OPT diff <(tail -n +3 "../../inference/output/spec_infer-python-opt-6.7b-full_prec-2_tp_2_pp.txt") <(tail -n +3 "../../inference/output/spec_infer-python-opt-6.7b-full_prec-1_tp_4_pp.txt") check_partial_token_match "../../inference/output/spec_infer-python-opt-6.7b-half_prec-2_tp_2_pp.txt" "../../inference/output/spec_infer-python-opt-6.7b-half_prec-1_tp_4_pp.txt" ## Incremental decoding # Small LLAMA -diff <(tail -n +3 "../../inference/output/incr_dec-python-llama-160m-base-full_prec-2_tp_2_pp.txt") <(tail -n +3 "../../inference/output/incr_dec-python-llama-160m-base-full_prec-1_tp_4_pp.txt") -check_partial_token_match "../../inference/output/incr_dec-python-llama-160m-base-half_prec-2_tp_2_pp.txt" "../../inference/output/incr_dec-python-llama-160m-base-half_prec-1_tp_4_pp.txt" -diff <(tail -n +3 "../../inference/output/incr_dec-python-llama-160m-base-full_prec-4_tp_1_pp.txt") <(tail -n +3 "../../inference/output/incr_dec-python-llama-160m-base-full_prec-1_tp_4_pp.txt") -check_partial_token_match "../../inference/output/incr_dec-python-llama-160m-base-half_prec-4_tp_1_pp.txt" 
"../../inference/output/incr_dec-python-llama-160m-base-half_prec-1_tp_4_pp.txt" +diff <(tail -n +3 "../../inference/output/incr_dec-python-llama-160m-full_prec-2_tp_2_pp.txt") <(tail -n +3 "../../inference/output/incr_dec-python-llama-160m-full_prec-1_tp_4_pp.txt") +check_partial_token_match "../../inference/output/incr_dec-python-llama-160m-half_prec-2_tp_2_pp.txt" "../../inference/output/incr_dec-python-llama-160m-half_prec-1_tp_4_pp.txt" +diff <(tail -n +3 "../../inference/output/incr_dec-python-llama-160m-full_prec-4_tp_1_pp.txt") <(tail -n +3 "../../inference/output/incr_dec-python-llama-160m-full_prec-1_tp_4_pp.txt") +check_partial_token_match "../../inference/output/incr_dec-python-llama-160m-half_prec-4_tp_1_pp.txt" "../../inference/output/incr_dec-python-llama-160m-half_prec-1_tp_4_pp.txt" # Big LLAMA -diff <(tail -n +3 "../../inference/output/incr_dec-python-llama-7b-hf-full_prec-2_tp_2_pp.txt") <(tail -n +3 "../../inference/output/incr_dec-python-llama-7b-hf-full_prec-1_tp_4_pp.txt") -check_partial_token_match "../../inference/output/incr_dec-python-llama-7b-hf-half_prec-2_tp_2_pp.txt" "../../inference/output/incr_dec-python-llama-7b-hf-half_prec-1_tp_4_pp.txt" -#diff <(tail -n +3 "../../inference/output/incr_dec-python-llama-7b-hf-full_prec-4_tp_1_pp.txt") <(tail -n +3 "../../inference/output/incr_dec-python-llama-7b-hf-full_prec-1_tp_4_pp.txt") -#check_partial_token_match "../../inference/output/incr_dec-python-llama-7b-hf-half_prec-4_tp_1_pp.txt" "../../inference/output/incr_dec-python-llama-7b-hf-half_prec-1_tp_4_pp.txt" +diff <(tail -n +3 "../../inference/output/incr_dec-python-llama-2-7b-hf-full_prec-2_tp_2_pp.txt") <(tail -n +3 "../../inference/output/incr_dec-python-llama-2-7b-hf-full_prec-1_tp_4_pp.txt") +check_partial_token_match "../../inference/output/incr_dec-python-llama-2-7b-hf-half_prec-2_tp_2_pp.txt" "../../inference/output/incr_dec-python-llama-2-7b-hf-half_prec-1_tp_4_pp.txt" +#diff <(tail -n +3 "../../inference/output/incr_dec-python-llama-2-7b-hf-full_prec-4_tp_1_pp.txt") <(tail -n +3 "../../inference/output/incr_dec-python-llama-2-7b-hf-full_prec-1_tp_4_pp.txt") +#check_partial_token_match "../../inference/output/incr_dec-python-llama-2-7b-hf-half_prec-4_tp_1_pp.txt" "../../inference/output/incr_dec-python-llama-2-7b-hf-half_prec-1_tp_4_pp.txt" # Small OPT diff <(tail -n +3 "../../inference/output/incr_dec-python-opt-125m-full_prec-2_tp_2_pp.txt") <(tail -n +3 "../../inference/output/incr_dec-python-opt-125m-full_prec-1_tp_4_pp.txt") check_partial_token_match "../../inference/output/incr_dec-python-opt-125m-half_prec-2_tp_2_pp.txt" "../../inference/output/incr_dec-python-opt-125m-half_prec-1_tp_4_pp.txt" @@ -157,16 +157,16 @@ check_partial_token_match "../../inference/output/incr_dec-python-opt-6.7b-half_ ######################### Alignment tests with HuggingFace #################################### # LLAMA (small model, full precision) -python3 ./huggingface_inference.py --model-name "JackFram/llama-160m-base" --use-full-precision --prompt-file "../../inference/prompt/test.json" --output-file "../../inference/output/huggingface_llama_160M.txt" --gpu +python3 ./huggingface_inference.py --model-name "JackFram/llama-160m" --use-full-precision --prompt-file "../../inference/prompt/test.json" --output-file "../../inference/output/huggingface_llama_160M.txt" --gpu # LLAMA (small model, half precision) -python3 ./huggingface_inference.py --model-name "JackFram/llama-160m-base" --prompt-file "../../inference/prompt/test.json" --output-file 
"../../inference/output/huggingface_llama_160M_half.txt" --gpu +python3 ./huggingface_inference.py --model-name "JackFram/llama-160m" --prompt-file "../../inference/prompt/test.json" --output-file "../../inference/output/huggingface_llama_160M_half.txt" --gpu # LLAMA (big model, full precision) -python3 ./huggingface_inference.py --model-name "decapoda-research/llama-7b-hf" --use-full-precision --prompt-file "../../inference/prompt/test.json" --output-file "../../inference/output/huggingface_llama_7B.txt" +python3 ./huggingface_inference.py --model-name "meta-llama/Llama-2-7b-hf" --use-full-precision --prompt-file "../../inference/prompt/test.json" --output-file "../../inference/output/huggingface_llama_7B.txt" # LLAMA (big model, half precision) -python3 ./huggingface_inference.py --model-name "decapoda-research/llama-7b-hf" --prompt-file "../../inference/prompt/test.json" --output-file "../../inference/output/huggingface_llama_7B_half.txt" --gpu +python3 ./huggingface_inference.py --model-name "meta-llama/Llama-2-7b-hf" --prompt-file "../../inference/prompt/test.json" --output-file "../../inference/output/huggingface_llama_7B_half.txt" --gpu # OPT (small model, full precision) python3 ./huggingface_inference.py --model-name "facebook/opt-125m" --use-full-precision --prompt-file "../../inference/prompt/test.json" --output-file "../../inference/output/huggingface_opt_125M.txt" --gpu --max-length 128 @@ -183,13 +183,13 @@ python3 ./huggingface_inference.py --model-name "facebook/opt-6.7b" --use-full-p # Falcon (full precision) python3 ./huggingface_inference.py --model-name "tiiuae/falcon-7b" --use-full-precision --prompt-file "../../inference/prompt/test.json" --output-file "../../inference/output/huggingface_falcon_7B.txt" --max-length 128 -diff <(tail -n +2 "../../inference/output/huggingface_llama_160M.txt") <(tail -n +5 "../../inference/output/incr_dec-python-llama-160m-base-full_prec-1_tp_4_pp.txt") -diff <(tail -n +2 "../../inference/output/huggingface_llama_160M_half.txt" | tr -s '[:space:]' '\n' | head -n 20) <(tail -n +5 "../../inference/output/incr_dec-python-llama-160m-base-half_prec-1_tp_4_pp.txt" | tr -s '[:space:]' '\n' | head -n 20) -diff <(tail -n +2 "../../inference/output/huggingface_llama_7B.txt") <(tail -n +5 "../../inference/output/incr_dec-python-llama-7b-hf-full_prec-1_tp_4_pp.txt") -diff <(tail -n +2 "../../inference/output/huggingface_llama_7B_half.txt" | tr -s '[:space:]' '\n' | head -n 20) <(tail -n +5 "../../inference/output/incr_dec-python-llama-7b-hf-half_prec-1_tp_4_pp.txt" | tr -s '[:space:]' '\n' | head -n 20) +diff "../../inference/output/huggingface_llama_160M.txt" <(tail -n +4 "../../inference/output/incr_dec-python-llama-160m-full_prec-1_tp_4_pp.txt") +diff <( < ../../inference/output/huggingface_llama_160M_half.txt tr -s '[:space:]' '\n' | head -n 20) <(tail -n +4 "../../inference/output/incr_dec-python-llama-160m-half_prec-1_tp_4_pp.txt" | tr -s '[:space:]' '\n' | head -n 20) +diff "../../inference/output/huggingface_llama_7B.txt" <(tail -n +4 "../../inference/output/incr_dec-python-llama-2-7b-hf-full_prec-1_tp_4_pp.txt") +diff <( < ../../inference/output/huggingface_llama_7B_half.txt tr -s '[:space:]' '\n' | head -n 20) <(tail -n +4 "../../inference/output/incr_dec-python-llama-2-7b-hf-half_prec-1_tp_4_pp.txt" | tr -s '[:space:]' '\n' | head -n 20) -diff <(tail -n +2 "../../inference/output/huggingface_opt_125M.txt") <(tail -n +5 "../../inference/output/incr_dec-python-opt-125m-full_prec-1_tp_4_pp.txt") -diff <(tail -n +2 
"../../inference/output/huggingface_opt_125M_half.txt" | tr -s '[:space:]' '\n' | head -n 20) <(tail -n +5 "../../inference/output/incr_dec-python-opt-125m-half_prec-1_tp_4_pp.txt" | tr -s '[:space:]' '\n' | head -n 20) -diff <(tail -n +2 "../../inference/output/huggingface_opt_6B.txt") <(tail -n +5 "../../inference/output/incr_dec-python-opt-6.7b-full_prec-1_tp_4_pp.txt") -#diff <(tail -n +2 "../../inference/output/huggingface_opt_6B_half.txt") <(tail -n +5 "../../inference/output/incr_dec-python-opt-6.7b-half_prec-1_tp_4_pp.txt") -diff <(tail -n +2 "../../inference/output/huggingface_falcon_7B.txt") <(tail -n +5 "../../inference/output/incr_dec-python-falcon-7b-half_prec-1_tp_4_pp.txt") +diff "../../inference/output/huggingface_opt_125M.txt" <(tail -n +4 "../../inference/output/incr_dec-python-opt-125m-full_prec-1_tp_4_pp.txt") +diff <( < ../../inference/output/huggingface_opt_125M_half.txt tr -s '[:space:]' '\n' | head -n 20) <(tail -n +4 "../../inference/output/incr_dec-python-opt-125m-half_prec-1_tp_4_pp.txt" | tr -s '[:space:]' '\n' | head -n 20) +diff "../../inference/output/huggingface_opt_6B.txt" <(tail -n +4 "../../inference/output/incr_dec-python-opt-6.7b-full_prec-1_tp_4_pp.txt") +#diff "../../inference/output/huggingface_opt_6B_half.txt" <(tail -n +4 "../../inference/output/incr_dec-python-opt-6.7b-half_prec-1_tp_4_pp.txt") +diff "../../inference/output/huggingface_falcon_7B.txt" <(tail -n +4 "../../inference/output/incr_dec-python-falcon-7b-full_prec-1_tp_4_pp.txt") diff --git a/tests/inference/python_test_configs/generate_configs.py b/tests/inference/python_test_configs/generate_configs.py index 8efe8999c4..ebaadade32 100644 --- a/tests/inference/python_test_configs/generate_configs.py +++ b/tests/inference/python_test_configs/generate_configs.py @@ -6,7 +6,7 @@ # required parameters "num_gpus": 4, "memory_per_gpu": 14000, - "zero_copy_memory_per_node": 30000, + "zero_copy_memory_per_node": 40000, # optional parameters "num_cpus": 4, "legion_utility_processors": 4, @@ -35,7 +35,7 @@ "ssms": [ { # required ssm parameter - "ssm_model": "JackFram/llama-160m-base", + "ssm_model": "JackFram/llama-160m", # optional ssm parameters "cache_path": "", "refresh_cache": False, @@ -47,12 +47,16 @@ ff_init_configs.update(llm_configs) # Test parameters to fill in -llama_models = ["decapoda-research/llama-7b-hf", "JackFram/llama-160m-base"] +llama_models = ["meta-llama/Llama-2-7b-hf", "JackFram/llama-160m"] opt_models = ["facebook/opt-6.7b", "facebook/opt-125m"] -falcon_models = ["tiiuae/falcon-7b",] -mpt_models = ["mosaicml/mpt-7b", ] +falcon_models = [ + "tiiuae/falcon-7b", +] +mpt_models = [ + "mosaicml/mpt-7b", +] # starcoder_models = ["bigcode/starcoderbase-7b",] -parallelism_settings = [(1,4), (2,2), (4,1)] +parallelism_settings = [(1, 4), (2, 2), (4, 1)] # The paths below should be with respect to the folder from which the tests are launched (FF_HOME/tests/inference) prompt_file = "../../inference/prompt/test.json" @@ -69,7 +73,6 @@ for model_name in all_models: for full_precision in (True, False): for parallelism_degrees in parallelism_settings: - tp, pp = parallelism_degrees # Tensor parallelism not supported by small Falcon model atm @@ -79,14 +82,21 @@ if tp > 2 and ("7b" in model_name or "6.7b" in model_name): continue - if full_precision and ("falcon" in model_name or "starcoder" in model_name): + # Run Falcon only in full precision, Starcoder only in half precision + if (not full_precision and "falcon" in model_name) or (full_precision and "starcoder" in model_name): continue 
-
+
            _, after_slash = model_name.rsplit("/", maxsplit=1)
-           filename = "incr_dec-" + "python-" + after_slash + ("-full_prec-" if full_precision else "-half_prec-") + f"{tp}_tp_{pp}_pp"
+           filename = (
+               "incr_dec-"
+               + "python-"
+               + after_slash.lower()
+               + ("-full_prec-" if full_precision else "-half_prec-")
+               + f"{tp}_tp_{pp}_pp"
+           )
            test_configs_file = "./" + filename + ".json"
-           output_file = os.path.join(output_folder, filename+".txt")
-
+           output_file = os.path.join(output_folder, filename + ".txt")
+
            ff_init_configs["tensor_parallelism_degree"] = tp
            ff_init_configs["pipeline_parallelism_degree"] = pp
            ff_init_configs["llm_model"] = model_name
@@ -110,17 +120,23 @@
                continue
 
            _, after_slash = big_model.rsplit("/", maxsplit=1)
-           filename = "spec_infer-" + "python-" + after_slash + ("-full_prec-" if full_precision else "-half_prec-") + f"{tp}_tp_{pp}_pp"
+           filename = (
+               "spec_infer-"
+               + "python-"
+               + after_slash.lower()
+               + ("-full_prec-" if full_precision else "-half_prec-")
+               + f"{tp}_tp_{pp}_pp"
+           )
            test_configs_file = "./" + filename + ".json"
-           output_file = os.path.join(output_folder, filename+".txt")
-
+           output_file = os.path.join(output_folder, filename + ".txt")
+
            ff_init_configs["tensor_parallelism_degree"] = tp
            ff_init_configs["pipeline_parallelism_degree"] = pp
            ff_init_configs["llm_model"] = big_model
            ff_init_configs["full_precision"] = full_precision
            ff_init_configs["output_file"] = output_file
            ff_init_configs["prompt"] = prompt_file
-
+
            ssm_configs["ssms"][0]["ssm_model"] = small_model
            ssm_configs["ssms"][0]["full_precision"] = full_precision
            ff_init_configs.update(ssm_configs)
diff --git a/tests/inference_tests.sh b/tests/inference_tests.sh
index c757dd5ee6..895b74c798 100755
--- a/tests/inference_tests.sh
+++ b/tests/inference_tests.sh
@@ -16,6 +16,12 @@ CPP_INFERENCE_TESTS=${CPP_INFERENCE_TESTS:-OFF}
 # Enable model parallelism tests in C++, if desired
 TENSOR_PARALLELISM_TESTS=${TENSOR_PARALLELISM_TESTS:-OFF}
 
+# Token to access private huggingface models (e.g. LLAMA-2)
+HUGGINGFACE_TOKEN=${HUGGINGFACE_TOKEN:-none}
+if [[ "$HUGGINGFACE_TOKEN" != "none" ]]; then
+    huggingface-cli login --token "$HUGGINGFACE_TOKEN"
+fi
+
 # Clean up before test (just in case)
 cleanup
 
@@ -24,7 +30,7 @@ pip3 install protobuf==3.20.3
 
 # Create test prompt file
 mkdir -p ../inference/prompt
-echo '["Give three tips for staying healthy."]' > ../inference/prompt/test.json
+echo '["Three tips for staying healthy are: "]' > ../inference/prompt/test.json
 
 # Create output folder
 mkdir -p ../inference/output
@@ -38,7 +44,7 @@ if [[ "$PYTHON_INFERENCE_TESTS" == "ON" ]]; then
 fi
 if [[ "$CPP_INFERENCE_TESTS" == "ON" ]]; then
     # Manually download the weights in both half and full precision
-    python3 ../inference/utils/download_hf_model.py "decapoda-research/llama-7b-hf" "JackFram/llama-160m-base" "facebook/opt-6.7b" "facebook/opt-125m" "tiiuae/falcon-7b"
+    python3 ../inference/utils/download_hf_model.py "meta-llama/Llama-2-7b-hf" "JackFram/llama-160m" "facebook/opt-6.7b" "facebook/opt-125m" "tiiuae/falcon-7b"
     echo "Running C++ inference tests..."
     ./inference/cpp_inference_tests.sh
 fi
diff --git a/tests/multinode_helpers/mpi_wrapper1.sh b/tests/multinode_helpers/mpi_wrapper1.sh
index 87d17d11a3..076fd2d66c 100755
--- a/tests/multinode_helpers/mpi_wrapper1.sh
+++ b/tests/multinode_helpers/mpi_wrapper1.sh
@@ -8,5 +8,5 @@ if [ -z "$GPUS" ]; then echo "GPUS variable is not defined, aborting tests"; exi
 
 # We need to wrap the instruction below in its own script because MPI throws an error if we try
 # to run "mpirun" more than once in the same script. Hence, we cannot simply call "mpirun" in the
-# multi_gpu_tests.sh script
+# training_tests.sh script
 mpirun -np "$NUM_NODES" "$FF_HOME"/tests/multinode_helpers/mpi_wrapper2.sh "$@"
diff --git a/tests/python_interface_test.sh b/tests/python_interface_test.sh
index 4f83918a49..5ce4d9803b 100755
--- a/tests/python_interface_test.sh
+++ b/tests/python_interface_test.sh
@@ -14,13 +14,13 @@ check_python_interface() {
 
    # Generate configs JSON files
    test_params=$(jq -n --arg num_gpus "$GPUS" --arg memory_per_gpu "$FSIZE" --arg zero_copy_memory_per_node "$ZSIZE" --arg batch_size "$BATCHSIZE" --arg only_data_parallel "$ONLY_DATA_PARALLEL" '{"num_gpus":$num_gpus,"memory_per_gpu":$memory_per_gpu,"zero_copy_memory_per_node":$zero_copy_memory_per_node,"batch_size":$batch_size,"only_data_parallel":$only_data_parallel}')
-   mkdir -p /tmp/flexflow/multi_gpu_tests
-   echo "$test_params" > /tmp/flexflow/multi_gpu_tests/test_params.json
+   mkdir -p /tmp/flexflow/training_tests
+   echo "$test_params" > /tmp/flexflow/training_tests/test_params.json
 
    if [[ "$interpreter" == "python" ]]; then
        EXE="python"
        echo "Running a single-GPU Python test to check the Python interface (native python interpreter)"
-       $EXE "$FF_HOME"/examples/python/keras/seq_mnist_mlp.py -config-file /tmp/flexflow/multi_gpu_tests/test_params.json
+       $EXE "$FF_HOME"/examples/python/keras/seq_mnist_mlp.py -config-file /tmp/flexflow/training_tests/test_params.json
    elif [[ "$interpreter" == "flexflow_python" ]]; then
        if [[ "$installation_status" == "before-installation" ]]; then
            EXE="$BUILD_FOLDER"/flexflow_python
diff --git a/tests/multi_gpu_tests.sh b/tests/training_tests.sh
similarity index 61%
rename from tests/multi_gpu_tests.sh
rename to tests/training_tests.sh
index 3a6f6467df..2d1f00883b 100755
--- a/tests/multi_gpu_tests.sh
+++ b/tests/training_tests.sh
@@ -33,57 +33,57 @@ test_params_5_epochs=$(echo "$test_params" | jq '. + {"epochs": 5}')
 test_params_40_epochs=$(echo "$test_params" | jq '. + {"epochs": 40}')
 test_params_5_epochs_no_batch_size=$(echo "$test_params_5_epochs" | jq 'del(.batch_size)')
 test_params_40_epochs_no_batch_size=$(echo "$test_params_40_epochs" | jq 'del(.batch_size)')
-mkdir -p /tmp/flexflow/multi_gpu_tests
-echo "$test_params" > /tmp/flexflow/multi_gpu_tests/test_params.json
-echo "$test_params_5_epochs" > /tmp/flexflow/multi_gpu_tests/test_params_5_epochs.json
-echo "$test_params_5_epochs_no_batch_size" > /tmp/flexflow/multi_gpu_tests/test_params_5_epochs_no_batch_size.json
-echo "$test_params_40_epochs_no_batch_size" > /tmp/flexflow/multi_gpu_tests/test_params_40_epochs_no_batch_size.json
+mkdir -p /tmp/flexflow/training_tests
+echo "$test_params" > /tmp/flexflow/training_tests/test_params.json
+echo "$test_params_5_epochs" > /tmp/flexflow/training_tests/test_params_5_epochs.json
+echo "$test_params_5_epochs_no_batch_size" > /tmp/flexflow/training_tests/test_params_5_epochs_no_batch_size.json
+echo "$test_params_40_epochs_no_batch_size" > /tmp/flexflow/training_tests/test_params_40_epochs_no_batch_size.json
 
 #Sequential model tests
-$EXE "$FF_HOME"/examples/python/keras/seq_mnist_mlp.py -config-file /tmp/flexflow/multi_gpu_tests/test_params.json
-$EXE "$FF_HOME"/examples/python/keras/seq_mnist_cnn.py -config-file /tmp/flexflow/multi_gpu_tests/test_params.json
-#$EXE "$FF_HOME"/examples/python/keras/seq_reuters_mlp.py -config-file /tmp/flexflow/multi_gpu_tests/test_params.json
-$EXE "$FF_HOME"/examples/python/keras/seq_cifar10_cnn.py -config-file /tmp/flexflow/multi_gpu_tests/test_params.json
-$EXE "$FF_HOME"/examples/python/keras/seq_mnist_mlp_net2net.py -config-file /tmp/flexflow/multi_gpu_tests/test_params.json
-$EXE "$FF_HOME"/examples/python/keras/seq_mnist_cnn_net2net.py -config-file /tmp/flexflow/multi_gpu_tests/test_params.json
-$EXE "$FF_HOME"/examples/python/keras/seq_mnist_cnn_nested.py -config-file /tmp/flexflow/multi_gpu_tests/test_params.json
+$EXE "$FF_HOME"/examples/python/keras/seq_mnist_mlp.py -config-file /tmp/flexflow/training_tests/test_params.json
+$EXE "$FF_HOME"/examples/python/keras/seq_mnist_cnn.py -config-file /tmp/flexflow/training_tests/test_params.json
+#$EXE "$FF_HOME"/examples/python/keras/seq_reuters_mlp.py -config-file /tmp/flexflow/training_tests/test_params.json
+$EXE "$FF_HOME"/examples/python/keras/seq_cifar10_cnn.py -config-file /tmp/flexflow/training_tests/test_params.json
+$EXE "$FF_HOME"/examples/python/keras/seq_mnist_mlp_net2net.py -config-file /tmp/flexflow/training_tests/test_params.json
+$EXE "$FF_HOME"/examples/python/keras/seq_mnist_cnn_net2net.py -config-file /tmp/flexflow/training_tests/test_params.json
+$EXE "$FF_HOME"/examples/python/keras/seq_mnist_cnn_nested.py -config-file /tmp/flexflow/training_tests/test_params.json
 
 #Keras other
-$EXE "$FF_HOME"/examples/python/keras/callback.py -config-file /tmp/flexflow/multi_gpu_tests/test_params.json
-$EXE "$FF_HOME"/examples/python/keras/unary.py -config-file /tmp/flexflow/multi_gpu_tests/test_params.json
-$EXE "$FF_HOME"/examples/python/keras/reshape.py -config-file /tmp/flexflow/multi_gpu_tests/test_params.json
-$EXE "$FF_HOME"/examples/python/keras/elementwise_mul_broadcast.py -config-file /tmp/flexflow/multi_gpu_tests/test_params.json
-$EXE "$FF_HOME"/examples/python/keras/reduce_sum.py -config-file /tmp/flexflow/multi_gpu_tests/test_params.json
-$EXE "$FF_HOME"/examples/python/keras/identity_loss.py -config-file /tmp/flexflow/multi_gpu_tests/test_params.json
-$EXE "$FF_HOME"/examples/python/keras/elementwise_max_min.py -config-file /tmp/flexflow/multi_gpu_tests/test_params.json
-$EXE "$FF_HOME"/examples/python/keras/rsqrt.py -config-file /tmp/flexflow/multi_gpu_tests/test_params.json
-$EXE "$FF_HOME"/examples/python/keras/gather.py -config-file /tmp/flexflow/multi_gpu_tests/test_params.json
-$EXE "$FF_HOME"/examples/python/keras/regularizer.py -config-file /tmp/flexflow/multi_gpu_tests/test_params.json
+$EXE "$FF_HOME"/examples/python/keras/callback.py -config-file /tmp/flexflow/training_tests/test_params.json
+$EXE "$FF_HOME"/examples/python/keras/unary.py -config-file /tmp/flexflow/training_tests/test_params.json
+$EXE "$FF_HOME"/examples/python/keras/reshape.py -config-file /tmp/flexflow/training_tests/test_params.json
+$EXE "$FF_HOME"/examples/python/keras/elementwise_mul_broadcast.py -config-file /tmp/flexflow/training_tests/test_params.json
+$EXE "$FF_HOME"/examples/python/keras/reduce_sum.py -config-file /tmp/flexflow/training_tests/test_params.json
+$EXE "$FF_HOME"/examples/python/keras/identity_loss.py -config-file /tmp/flexflow/training_tests/test_params.json
+$EXE "$FF_HOME"/examples/python/keras/elementwise_max_min.py -config-file /tmp/flexflow/training_tests/test_params.json
+$EXE "$FF_HOME"/examples/python/keras/rsqrt.py -config-file /tmp/flexflow/training_tests/test_params.json
+$EXE "$FF_HOME"/examples/python/keras/gather.py -config-file /tmp/flexflow/training_tests/test_params.json
+$EXE "$FF_HOME"/examples/python/keras/regularizer.py -config-file /tmp/flexflow/training_tests/test_params.json
 
 #Functional API
-$EXE "$FF_HOME"/examples/python/keras/func_mnist_mlp.py -config-file /tmp/flexflow/multi_gpu_tests/test_params.json
-$EXE "$FF_HOME"/examples/python/keras/func_mnist_mlp_concat.py -config-file /tmp/flexflow/multi_gpu_tests/test_params.json
-$EXE "$FF_HOME"/examples/python/keras/func_mnist_mlp_concat2.py -config-file /tmp/flexflow/multi_gpu_tests/test_params.json
-$EXE "$FF_HOME"/examples/python/keras/func_mnist_cnn.py -config-file /tmp/flexflow/multi_gpu_tests/test_params.json
-$EXE "$FF_HOME"/examples/python/keras/func_mnist_cnn_concat.py -config-file /tmp/flexflow/multi_gpu_tests/test_params.json
-$EXE "$FF_HOME"/examples/python/keras/func_cifar10_cnn.py -config-file /tmp/flexflow/multi_gpu_tests/test_params.json
-$EXE "$FF_HOME"/examples/python/keras/func_cifar10_cnn_nested.py -config-file /tmp/flexflow/multi_gpu_tests/test_params.json
-$EXE "$FF_HOME"/examples/python/keras/func_cifar10_alexnet.py -config-file /tmp/flexflow/multi_gpu_tests/test_params.json
-$EXE "$FF_HOME"/examples/python/keras/func_mnist_mlp_net2net.py -config-file /tmp/flexflow/multi_gpu_tests/test_params.json
-$EXE "$FF_HOME"/examples/python/keras/func_cifar10_cnn_net2net.py -config-file /tmp/flexflow/multi_gpu_tests/test_params.json
+$EXE "$FF_HOME"/examples/python/keras/func_mnist_mlp.py -config-file /tmp/flexflow/training_tests/test_params.json
+$EXE "$FF_HOME"/examples/python/keras/func_mnist_mlp_concat.py -config-file /tmp/flexflow/training_tests/test_params.json
+$EXE "$FF_HOME"/examples/python/keras/func_mnist_mlp_concat2.py -config-file /tmp/flexflow/training_tests/test_params.json
+$EXE "$FF_HOME"/examples/python/keras/func_mnist_cnn.py -config-file /tmp/flexflow/training_tests/test_params.json
+$EXE "$FF_HOME"/examples/python/keras/func_mnist_cnn_concat.py -config-file /tmp/flexflow/training_tests/test_params.json
+$EXE "$FF_HOME"/examples/python/keras/func_cifar10_cnn.py -config-file /tmp/flexflow/training_tests/test_params.json
+$EXE "$FF_HOME"/examples/python/keras/func_cifar10_cnn_nested.py -config-file /tmp/flexflow/training_tests/test_params.json
+$EXE "$FF_HOME"/examples/python/keras/func_cifar10_alexnet.py -config-file /tmp/flexflow/training_tests/test_params.json
+$EXE "$FF_HOME"/examples/python/keras/func_mnist_mlp_net2net.py -config-file /tmp/flexflow/training_tests/test_params.json
+$EXE "$FF_HOME"/examples/python/keras/func_cifar10_cnn_net2net.py -config-file /tmp/flexflow/training_tests/test_params.json
 
 #Python
-$EXE "$FF_HOME"/examples/python/native/print_layers.py -config-file /tmp/flexflow/multi_gpu_tests/test_params_5_epochs.json
-$EXE "$FF_HOME"/examples/python/native/split.py -config-file /tmp/flexflow/multi_gpu_tests/test_params.json
-$EXE "$FF_HOME"/examples/python/native/alexnet.py -config-file /tmp/flexflow/multi_gpu_tests/test_params_40_epochs_no_batch_size.json
-$EXE "$FF_HOME"/examples/python/native/mnist_mlp.py -config-file /tmp/flexflow/multi_gpu_tests/test_params_5_epochs.json
-$EXE "$FF_HOME"/examples/python/native/mnist_cnn.py -config-file /tmp/flexflow/multi_gpu_tests/test_params_5_epochs.json
-$EXE "$FF_HOME"/examples/python/native/cifar10_cnn.py -config-file /tmp/flexflow/multi_gpu_tests/test_params_40_epochs_no_batch_size.json
-$EXE "$FF_HOME"/examples/python/native/cifar10_cnn_attach.py -config-file /tmp/flexflow/multi_gpu_tests/test_params_5_epochs_no_batch_size.json
-$EXE "$FF_HOME"/examples/python/native/mnist_mlp_attach.py -config-file /tmp/flexflow/multi_gpu_tests/test_params_5_epochs_no_batch_size.json
+$EXE "$FF_HOME"/examples/python/native/print_layers.py -config-file /tmp/flexflow/training_tests/test_params_5_epochs.json
+$EXE "$FF_HOME"/examples/python/native/split.py -config-file /tmp/flexflow/training_tests/test_params.json
+$EXE "$FF_HOME"/examples/python/native/alexnet.py -config-file /tmp/flexflow/training_tests/test_params_40_epochs_no_batch_size.json
+$EXE "$FF_HOME"/examples/python/native/mnist_mlp.py -config-file /tmp/flexflow/training_tests/test_params_5_epochs.json
+$EXE "$FF_HOME"/examples/python/native/mnist_cnn.py -config-file /tmp/flexflow/training_tests/test_params_5_epochs.json
+$EXE "$FF_HOME"/examples/python/native/cifar10_cnn.py -config-file /tmp/flexflow/training_tests/test_params_40_epochs_no_batch_size.json
+$EXE "$FF_HOME"/examples/python/native/cifar10_cnn_attach.py -config-file /tmp/flexflow/training_tests/test_params_5_epochs_no_batch_size.json
+$EXE "$FF_HOME"/examples/python/native/mnist_mlp_attach.py -config-file /tmp/flexflow/training_tests/test_params_5_epochs_no_batch_size.json
 
 #Possible crash
-$EXE "$FF_HOME"/examples/python/keras/func_cifar10_cnn_concat.py -config-file /tmp/flexflow/multi_gpu_tests/test_params.json
-$EXE "$FF_HOME"/examples/python/keras/func_cifar10_cnn_concat_model.py -config-file /tmp/flexflow/multi_gpu_tests/test_params.json
-$EXE "$FF_HOME"/examples/python/keras/func_cifar10_cnn_concat_seq_model.py -config-file /tmp/flexflow/multi_gpu_tests/test_params.json
-$EXE "$FF_HOME"/examples/python/native/cifar10_cnn_concat.py -config-file /tmp/flexflow/multi_gpu_tests/test_params_40_epochs_no_batch_size.json
+$EXE "$FF_HOME"/examples/python/keras/func_cifar10_cnn_concat.py -config-file /tmp/flexflow/training_tests/test_params.json
+$EXE "$FF_HOME"/examples/python/keras/func_cifar10_cnn_concat_model.py -config-file /tmp/flexflow/training_tests/test_params.json
+$EXE "$FF_HOME"/examples/python/keras/func_cifar10_cnn_concat_seq_model.py -config-file /tmp/flexflow/training_tests/test_params.json
+$EXE "$FF_HOME"/examples/python/native/cifar10_cnn_concat.py -config-file /tmp/flexflow/training_tests/test_params_40_epochs_no_batch_size.json
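
Reviewer note: since the diff now requires a HuggingFace token to fetch the gated meta-llama/Llama-2-7b-hf weights, the sketch below shows how a developer might reproduce the updated Python inference tests locally. This is not part of the patch; the token value and FF_HOME path are placeholders, and the working directory assumption (FF_HOME/tests) is inferred from the relative paths used in the scripts above, so it may need adjusting for a particular build layout.

```bash
# Minimal local sketch (assumes FF_HOME points at a FlexFlow checkout with a built tree).
export HUGGINGFACE_TOKEN=hf_xxxxxxxxxxxx   # placeholder; read by tests/inference_tests.sh for `huggingface-cli login`
export FF_HOME=/path/to/FlexFlow           # placeholder checkout path

cd "$FF_HOME"/tests
# Run only the Python inference tests; the C++ tests stay off unless CPP_INFERENCE_TESTS=ON.
PYTHON_INFERENCE_TESTS=ON CPP_INFERENCE_TESTS=OFF ./inference_tests.sh
```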